File: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Warning: line 2023, column 15: Called C++ object pointer is uninitialized
1 | //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | /// \file | |||
10 | /// SI Implementation of TargetInstrInfo. | |||
11 | // | |||
12 | //===----------------------------------------------------------------------===// | |||
13 | ||||
14 | #include "SIInstrInfo.h" | |||
15 | #include "AMDGPU.h" | |||
16 | #include "AMDGPUInstrInfo.h" | |||
17 | #include "GCNHazardRecognizer.h" | |||
18 | #include "GCNSubtarget.h" | |||
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
20 | #include "SIMachineFunctionInfo.h" | |||
21 | #include "llvm/Analysis/ValueTracking.h" | |||
22 | #include "llvm/CodeGen/LiveVariables.h" | |||
23 | #include "llvm/CodeGen/MachineDominators.h" | |||
24 | #include "llvm/CodeGen/RegisterScavenging.h" | |||
25 | #include "llvm/CodeGen/ScheduleDAG.h" | |||
26 | #include "llvm/IR/DiagnosticInfo.h" | |||
27 | #include "llvm/IR/IntrinsicsAMDGPU.h" | |||
28 | #include "llvm/Support/CommandLine.h" | |||
29 | #include "llvm/Target/TargetMachine.h" | |||
30 | ||||
31 | using namespace llvm; | |||
32 | ||||
33 | #define DEBUG_TYPE"si-instr-info" "si-instr-info" | |||
34 | ||||
35 | #define GET_INSTRINFO_CTOR_DTOR | |||
36 | #include "AMDGPUGenInstrInfo.inc" | |||
37 | ||||
38 | namespace llvm { | |||
39 | ||||
40 | class AAResults; | |||
41 | ||||
42 | namespace AMDGPU { | |||
43 | #define GET_D16ImageDimIntrinsics_IMPL | |||
44 | #define GET_ImageDimIntrinsicTable_IMPL | |||
45 | #define GET_RsrcIntrinsics_IMPL | |||
46 | #include "AMDGPUGenSearchableTables.inc" | |||
47 | } | |||
48 | } | |||
49 | ||||
50 | ||||
51 | // Must be at least 4 to be able to branch over minimum unconditional branch | |||
52 | // code. This is only for making it possible to write reasonably small tests for | |||
53 | // long branches. | |||
54 | static cl::opt<unsigned> | |||
55 | BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), | |||
56 | cl::desc("Restrict range of branch instructions (DEBUG)")); | |||
57 | ||||
58 | static cl::opt<bool> Fix16BitCopies( | |||
59 | "amdgpu-fix-16-bit-physreg-copies", | |||
60 | cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), | |||
61 | cl::init(true), | |||
62 | cl::ReallyHidden); | |||
63 | ||||
64 | SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) | |||
65 | : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), | |||
66 | RI(ST), ST(ST) { | |||
67 | SchedModel.init(&ST); | |||
68 | } | |||
69 | ||||
70 | //===----------------------------------------------------------------------===// | |||
71 | // TargetInstrInfo callbacks | |||
72 | //===----------------------------------------------------------------------===// | |||
73 | ||||
74 | static unsigned getNumOperandsNoGlue(SDNode *Node) { | |||
75 | unsigned N = Node->getNumOperands(); | |||
76 | while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) | |||
77 | --N; | |||
78 | return N; | |||
79 | } | |||
80 | ||||
81 | /// Returns true if both nodes have the same value for the given | |||
82 | /// operand \p Op, or if both nodes do not have this operand. | |||
83 | static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { | |||
84 | unsigned Opc0 = N0->getMachineOpcode(); | |||
85 | unsigned Opc1 = N1->getMachineOpcode(); | |||
86 | ||||
87 | int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); | |||
88 | int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); | |||
89 | ||||
90 | if (Op0Idx == -1 && Op1Idx == -1) | |||
91 | return true; | |||
92 | ||||
93 | ||||
94 | if ((Op0Idx == -1 && Op1Idx != -1) || | |||
95 | (Op1Idx == -1 && Op0Idx != -1)) | |||
96 | return false; | |||
97 | ||||
98 | // getNamedOperandIdx returns the index for the MachineInstr's operands, | |||
99 | // which includes the result as the first operand. We are indexing into the | |||
100 | // MachineSDNode's operands, so we need to skip the result operand to get | |||
101 | // the real index. | |||
102 | --Op0Idx; | |||
103 | --Op1Idx; | |||
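| // For example, with a single result, a named operand at MachineInstr index 2 | |||
| // corresponds to MachineSDNode operand index 1 once the result is skipped. | |||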
104 | ||||
105 | return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); | |||
106 | } | |||
107 | ||||
108 | bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, | |||
109 | AAResults *AA) const { | |||
110 | // TODO: The generic check fails for VALU instructions that should be | |||
111 | // rematerializable due to implicit reads of exec. We really want all of the | |||
112 | // generic logic for this except for that exec check. | |||
113 | switch (MI.getOpcode()) { | |||
114 | case AMDGPU::V_MOV_B32_e32: | |||
115 | case AMDGPU::V_MOV_B32_e64: | |||
116 | case AMDGPU::V_MOV_B64_PSEUDO: | |||
117 | case AMDGPU::V_ACCVGPR_READ_B32_e64: | |||
118 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
119 | // No implicit operands. | |||
120 | return MI.getNumOperands() == MI.getDesc().getNumOperands(); | |||
121 | default: | |||
122 | return false; | |||
123 | } | |||
124 | } | |||
125 | ||||
126 | bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, | |||
127 | int64_t &Offset0, | |||
128 | int64_t &Offset1) const { | |||
129 | if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) | |||
130 | return false; | |||
131 | ||||
132 | unsigned Opc0 = Load0->getMachineOpcode(); | |||
133 | unsigned Opc1 = Load1->getMachineOpcode(); | |||
134 | ||||
135 | // Make sure both are actually loads. | |||
136 | if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) | |||
137 | return false; | |||
138 | ||||
139 | if (isDS(Opc0) && isDS(Opc1)) { | |||
140 | ||||
141 | // FIXME: Handle this case: | |||
142 | if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) | |||
143 | return false; | |||
144 | ||||
145 | // Check base reg. | |||
146 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
147 | return false; | |||
148 | ||||
149 | // Skip read2 / write2 variants for simplicity. | |||
150 | // TODO: We should report true if the used offsets are adjacent (excluding | |||
151 | // st64 versions). | |||
152 | int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
153 | int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
154 | if (Offset0Idx == -1 || Offset1Idx == -1) | |||
155 | return false; | |||
156 | ||||
157 | // XXX - be careful of dataless loads | |||
158 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
159 | // include the output in the operand list, but SDNodes don't, we need to | |||
160 | // subtract the number of defs from the index. | |||
161 | Offset0Idx -= get(Opc0).NumDefs; | |||
162 | Offset1Idx -= get(Opc1).NumDefs; | |||
163 | Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); | |||
164 | Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); | |||
165 | return true; | |||
166 | } | |||
167 | ||||
168 | if (isSMRD(Opc0) && isSMRD(Opc1)) { | |||
169 | // Skip time and cache invalidation instructions. | |||
170 | if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || | |||
171 | AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) | |||
172 | return false; | |||
173 | ||||
174 | assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); | |||
175 | ||||
176 | // Check base reg. | |||
177 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
178 | return false; | |||
179 | ||||
180 | const ConstantSDNode *Load0Offset = | |||
181 | dyn_cast<ConstantSDNode>(Load0->getOperand(1)); | |||
182 | const ConstantSDNode *Load1Offset = | |||
183 | dyn_cast<ConstantSDNode>(Load1->getOperand(1)); | |||
184 | ||||
185 | if (!Load0Offset || !Load1Offset) | |||
186 | return false; | |||
187 | ||||
188 | Offset0 = Load0Offset->getZExtValue(); | |||
189 | Offset1 = Load1Offset->getZExtValue(); | |||
190 | return true; | |||
191 | } | |||
192 | ||||
193 | // MUBUF and MTBUF can access the same addresses. | |||
194 | if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { | |||
195 | ||||
196 | // MUBUF and MTBUF have vaddr at different indices. | |||
197 | if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || | |||
198 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || | |||
199 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) | |||
200 | return false; | |||
201 | ||||
202 | int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
203 | int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
204 | ||||
205 | if (OffIdx0 == -1 || OffIdx1 == -1) | |||
206 | return false; | |||
207 | ||||
208 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
209 | // include the output in the operand list, but SDNodes don't, we need to | |||
210 | // subtract the number of defs from the index. | |||
211 | OffIdx0 -= get(Opc0).NumDefs; | |||
212 | OffIdx1 -= get(Opc1).NumDefs; | |||
213 | ||||
214 | SDValue Off0 = Load0->getOperand(OffIdx0); | |||
215 | SDValue Off1 = Load1->getOperand(OffIdx1); | |||
216 | ||||
217 | // The offset might be a FrameIndexSDNode. | |||
218 | if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) | |||
219 | return false; | |||
220 | ||||
221 | Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); | |||
222 | Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); | |||
223 | return true; | |||
224 | } | |||
225 | ||||
226 | return false; | |||
227 | } | |||
228 | ||||
229 | static bool isStride64(unsigned Opc) { | |||
230 | switch (Opc) { | |||
231 | case AMDGPU::DS_READ2ST64_B32: | |||
232 | case AMDGPU::DS_READ2ST64_B64: | |||
233 | case AMDGPU::DS_WRITE2ST64_B32: | |||
234 | case AMDGPU::DS_WRITE2ST64_B64: | |||
235 | return true; | |||
236 | default: | |||
237 | return false; | |||
238 | } | |||
239 | } | |||
240 | ||||
241 | bool SIInstrInfo::getMemOperandsWithOffsetWidth( | |||
242 | const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, | |||
243 | int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, | |||
244 | const TargetRegisterInfo *TRI) const { | |||
245 | if (!LdSt.mayLoadOrStore()) | |||
246 | return false; | |||
247 | ||||
248 | unsigned Opc = LdSt.getOpcode(); | |||
249 | OffsetIsScalable = false; | |||
250 | const MachineOperand *BaseOp, *OffsetOp; | |||
251 | int DataOpIdx; | |||
252 | ||||
253 | if (isDS(LdSt)) { | |||
254 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); | |||
255 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
256 | if (OffsetOp) { | |||
257 | // Normal, single offset LDS instruction. | |||
258 | if (!BaseOp) { | |||
259 | // DS_CONSUME/DS_APPEND use M0 for the base address. | |||
260 | // TODO: find the implicit use operand for M0 and use that as BaseOp? | |||
261 | return false; | |||
262 | } | |||
263 | BaseOps.push_back(BaseOp); | |||
264 | Offset = OffsetOp->getImm(); | |||
265 | // Get appropriate operand, and compute width accordingly. | |||
266 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
267 | if (DataOpIdx == -1) | |||
268 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
269 | Width = getOpSize(LdSt, DataOpIdx); | |||
270 | } else { | |||
271 | // The 2 offset instructions use offset0 and offset1 instead. We can treat | |||
272 | // these as a load with a single offset if the 2 offsets are consecutive. | |||
273 | // We will use this for some partially aligned loads. | |||
274 | const MachineOperand *Offset0Op = | |||
275 | getNamedOperand(LdSt, AMDGPU::OpName::offset0); | |||
276 | const MachineOperand *Offset1Op = | |||
277 | getNamedOperand(LdSt, AMDGPU::OpName::offset1); | |||
278 | ||||
279 | unsigned Offset0 = Offset0Op->getImm(); | |||
280 | unsigned Offset1 = Offset1Op->getImm(); | |||
281 | if (Offset0 + 1 != Offset1) | |||
282 | return false; | |||
283 | ||||
284 | // Each of these offsets is in element sized units, so we need to convert | |||
285 | // to bytes of the individual reads. | |||
286 | ||||
287 | unsigned EltSize; | |||
288 | if (LdSt.mayLoad()) | |||
289 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; | |||
290 | else { | |||
291 | assert(LdSt.mayStore()); | |||
292 | int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
293 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; | |||
294 | } | |||
295 | ||||
296 | if (isStride64(Opc)) | |||
297 | EltSize *= 64; | |||
298 | ||||
299 | BaseOps.push_back(BaseOp); | |||
300 | Offset = EltSize * Offset0; | |||
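| // For example, a DS_READ2_B32 with offset0 = 4 and offset1 = 5 reads two | |||
| // 32-bit elements (EltSize = 4), so it is reported as one access at byte | |||
| // offset 16 with an 8-byte width. | |||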
301 | // Get appropriate operand(s), and compute width accordingly. | |||
302 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
303 | if (DataOpIdx == -1) { | |||
304 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
305 | Width = getOpSize(LdSt, DataOpIdx); | |||
306 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); | |||
307 | Width += getOpSize(LdSt, DataOpIdx); | |||
308 | } else { | |||
309 | Width = getOpSize(LdSt, DataOpIdx); | |||
310 | } | |||
311 | } | |||
312 | return true; | |||
313 | } | |||
314 | ||||
315 | if (isMUBUF(LdSt) || isMTBUF(LdSt)) { | |||
316 | const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); | |||
317 | if (SOffset && SOffset->isReg()) { | |||
318 | // We can only handle this if it's a stack access, as any other resource | |||
319 | // would require reporting multiple base registers. | |||
320 | const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
321 | if (AddrReg && !AddrReg->isFI()) | |||
322 | return false; | |||
323 | ||||
324 | const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); | |||
325 | const SIMachineFunctionInfo *MFI | |||
326 | = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>(); | |||
327 | if (RSrc->getReg() != MFI->getScratchRSrcReg()) | |||
328 | return false; | |||
329 | ||||
330 | const MachineOperand *OffsetImm = | |||
331 | getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
332 | BaseOps.push_back(RSrc); | |||
333 | BaseOps.push_back(SOffset); | |||
334 | Offset = OffsetImm->getImm(); | |||
335 | } else { | |||
336 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); | |||
337 | if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL | |||
338 | return false; | |||
339 | BaseOps.push_back(BaseOp); | |||
340 | ||||
341 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
342 | if (BaseOp) | |||
343 | BaseOps.push_back(BaseOp); | |||
344 | ||||
345 | const MachineOperand *OffsetImm = | |||
346 | getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
347 | Offset = OffsetImm->getImm(); | |||
348 | if (SOffset) // soffset can be an inline immediate. | |||
349 | Offset += SOffset->getImm(); | |||
350 | } | |||
351 | // Get appropriate operand, and compute width accordingly. | |||
352 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
353 | if (DataOpIdx == -1) | |||
354 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
355 | Width = getOpSize(LdSt, DataOpIdx); | |||
356 | return true; | |||
357 | } | |||
358 | ||||
359 | if (isMIMG(LdSt)) { | |||
360 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); | |||
361 | BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); | |||
362 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); | |||
363 | if (VAddr0Idx >= 0) { | |||
364 | // GFX10 possible NSA encoding. | |||
365 | for (int I = VAddr0Idx; I < SRsrcIdx; ++I) | |||
366 | BaseOps.push_back(&LdSt.getOperand(I)); | |||
367 | } else { | |||
368 | BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); | |||
369 | } | |||
370 | Offset = 0; | |||
371 | // Get appropriate operand, and compute width accordingly. | |||
372 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
373 | Width = getOpSize(LdSt, DataOpIdx); | |||
374 | return true; | |||
375 | } | |||
376 | ||||
377 | if (isSMRD(LdSt)) { | |||
378 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); | |||
379 | if (!BaseOp) // e.g. S_MEMTIME | |||
380 | return false; | |||
381 | BaseOps.push_back(BaseOp); | |||
382 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
383 | Offset = OffsetOp ? OffsetOp->getImm() : 0; | |||
384 | // Get appropriate operand, and compute width accordingly. | |||
385 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); | |||
386 | Width = getOpSize(LdSt, DataOpIdx); | |||
387 | return true; | |||
388 | } | |||
389 | ||||
390 | if (isFLAT(LdSt)) { | |||
391 | // Instructions have either vaddr or saddr or both or none. | |||
392 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
393 | if (BaseOp) | |||
394 | BaseOps.push_back(BaseOp); | |||
395 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); | |||
396 | if (BaseOp) | |||
397 | BaseOps.push_back(BaseOp); | |||
398 | Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); | |||
399 | // Get appropriate operand, and compute width accordingly. | |||
400 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
401 | if (DataOpIdx == -1) | |||
402 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
403 | Width = getOpSize(LdSt, DataOpIdx); | |||
404 | return true; | |||
405 | } | |||
406 | ||||
407 | return false; | |||
408 | } | |||
409 | ||||
410 | static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, | |||
411 | ArrayRef<const MachineOperand *> BaseOps1, | |||
412 | const MachineInstr &MI2, | |||
413 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
414 | // Only examine the first "base" operand of each instruction, on the | |||
415 | // assumption that it represents the real base address of the memory access. | |||
416 | // Other operands are typically offsets or indices from this base address. | |||
417 | if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) | |||
418 | return true; | |||
419 | ||||
420 | if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) | |||
421 | return false; | |||
422 | ||||
423 | auto MO1 = *MI1.memoperands_begin(); | |||
424 | auto MO2 = *MI2.memoperands_begin(); | |||
425 | if (MO1->getAddrSpace() != MO2->getAddrSpace()) | |||
426 | return false; | |||
427 | ||||
428 | auto Base1 = MO1->getValue(); | |||
429 | auto Base2 = MO2->getValue(); | |||
430 | if (!Base1 || !Base2) | |||
431 | return false; | |||
432 | Base1 = getUnderlyingObject(Base1); | |||
433 | Base2 = getUnderlyingObject(Base2); | |||
434 | ||||
435 | if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) | |||
436 | return false; | |||
437 | ||||
438 | return Base1 == Base2; | |||
439 | } | |||
440 | ||||
441 | bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, | |||
442 | ArrayRef<const MachineOperand *> BaseOps2, | |||
443 | unsigned NumLoads, | |||
444 | unsigned NumBytes) const { | |||
445 | // If the mem ops (to be clustered) do not have the same base ptr, then they | |||
446 | // should not be clustered | |||
447 | if (!BaseOps1.empty() && !BaseOps2.empty()) { | |||
448 | const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); | |||
449 | const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); | |||
450 | if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) | |||
451 | return false; | |||
452 | } else if (!BaseOps1.empty() || !BaseOps2.empty()) { | |||
453 | // If only one base op is empty, they do not have the same base ptr | |||
454 | return false; | |||
455 | } | |||
456 | ||||
457 | // To avoid register pressure, on average the number of DWORDs loaded | |||
458 | // together by all clustered mem ops should not exceed 8. This is an | |||
459 | // empirical value based on certain observations and performance-related | |||
460 | // experiments. | |||
461 | // The good thing about this heuristic is that it avoids clustering too many | |||
462 | // sub-word loads and also avoids clustering wide loads. Below is a brief | |||
463 | // summary of how the heuristic behaves for various `LoadSize` values. | |||
464 | // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops | |||
465 | // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops | |||
466 | // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops | |||
467 | // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops | |||
468 | // (5) LoadSize >= 17: do not cluster | |||
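| // For example, four 6-byte loads give LoadSize = 6 and | |||
| // NumDWORDs = ((6 + 3) / 4) * 4 = 8, so they may still be clustered, while | |||
| // four 12-byte loads give NumDWORDs = 12 and are not clustered. | |||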
469 | const unsigned LoadSize = NumBytes / NumLoads; | |||
470 | const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; | |||
471 | return NumDWORDs <= 8; | |||
472 | } | |||
473 | ||||
474 | // FIXME: This behaves strangely. If, for example, you have 32 load + stores, | |||
475 | // the first 16 loads will be interleaved with the stores, and the next 16 will | |||
476 | // be clustered as expected. It should really split into two batches of 16 stores. | |||
477 | // | |||
478 | // Loads are clustered until this returns false, rather than trying to schedule | |||
479 | // groups of stores. This also means we have to deal with saying different | |||
480 | // address space loads should be clustered, and ones which might cause bank | |||
481 | // conflicts. | |||
482 | // | |||
483 | // This might be deprecated so it might not be worth that much effort to fix. | |||
484 | bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, | |||
485 | int64_t Offset0, int64_t Offset1, | |||
486 | unsigned NumLoads) const { | |||
487 | assert(Offset1 > Offset0 && | |||
488 | "Second offset should be larger than first offset!"); | |||
489 | // If we have less than 16 loads in a row, and the offsets are within 64 | |||
490 | // bytes, then schedule together. | |||
491 | ||||
492 | // A cacheline is 64 bytes (for global memory). | |||
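| // For example, two loads at offsets 0 and 48 are scheduled near each other, | |||
| // while loads at offsets 0 and 128 are not. | |||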
493 | return (NumLoads <= 16 && (Offset1 - Offset0) < 64); | |||
494 | } | |||
495 | ||||
496 | static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, | |||
497 | MachineBasicBlock::iterator MI, | |||
498 | const DebugLoc &DL, MCRegister DestReg, | |||
499 | MCRegister SrcReg, bool KillSrc, | |||
500 | const char *Msg = "illegal SGPR to VGPR copy") { | |||
501 | MachineFunction *MF = MBB.getParent(); | |||
502 | DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); | |||
503 | LLVMContext &C = MF->getFunction().getContext(); | |||
504 | C.diagnose(IllegalCopy); | |||
505 | ||||
506 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) | |||
507 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
508 | } | |||
509 | ||||
510 | /// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible | |||
511 | /// to directly copy, so an intermediate VGPR needs to be used. | |||
512 | static void indirectCopyToAGPR(const SIInstrInfo &TII, | |||
513 | MachineBasicBlock &MBB, | |||
514 | MachineBasicBlock::iterator MI, | |||
515 | const DebugLoc &DL, MCRegister DestReg, | |||
516 | MCRegister SrcReg, bool KillSrc, | |||
517 | RegScavenger &RS, | |||
518 | Register ImpDefSuperReg = Register(), | |||
519 | Register ImpUseSuperReg = Register()) { | |||
520 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
521 | ||||
522 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || | |||
523 | AMDGPU::AGPR_32RegClass.contains(SrcReg)); | |||
524 | ||||
525 | // First try to find defining accvgpr_write to avoid temporary registers. | |||
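| // For example, if SrcReg was produced by "$agpr0 = V_ACCVGPR_WRITE_B32 $vgpr5" | |||
| // and $vgpr5 is not clobbered before MI, the copy can be emitted as another | |||
| // V_ACCVGPR_WRITE_B32 from $vgpr5 instead of scavenging a temporary VGPR. | |||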
526 | for (auto Def = MI, E = MBB.begin(); Def != E; ) { | |||
527 | --Def; | |||
528 | if (!Def->definesRegister(SrcReg, &RI)) | |||
529 | continue; | |||
530 | if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) | |||
531 | break; | |||
532 | ||||
533 | MachineOperand &DefOp = Def->getOperand(1); | |||
534 | assert(DefOp.isReg() || DefOp.isImm()); | |||
535 | ||||
536 | if (DefOp.isReg()) { | |||
537 | // Check that the register source operand is not clobbered before MI. | |||
538 | // Immediate operands are always safe to propagate. | |||
539 | bool SafeToPropagate = true; | |||
540 | for (auto I = Def; I != MI && SafeToPropagate; ++I) | |||
541 | if (I->modifiesRegister(DefOp.getReg(), &RI)) | |||
542 | SafeToPropagate = false; | |||
543 | ||||
544 | if (!SafeToPropagate) | |||
545 | break; | |||
546 | ||||
547 | DefOp.setIsKill(false); | |||
548 | } | |||
549 | ||||
550 | MachineInstrBuilder Builder = | |||
551 | BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
552 | .add(DefOp); | |||
553 | if (ImpDefSuperReg) | |||
554 | Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
555 | ||||
556 | if (ImpUseSuperReg) { | |||
557 | Builder.addReg(ImpUseSuperReg, | |||
558 | getKillRegState(KillSrc) | RegState::Implicit); | |||
559 | } | |||
560 | ||||
561 | return; | |||
562 | } | |||
563 | ||||
564 | RS.enterBasicBlock(MBB); | |||
565 | RS.forward(MI); | |||
566 | ||||
567 | // Ideally we want to have three registers for a long reg_sequence copy | |||
568 | // to hide 2 waitstates between v_mov_b32 and accvgpr_write. | |||
569 | unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, | |||
570 | *MBB.getParent()); | |||
571 | ||||
572 | // Registers in the sequence are allocated contiguously so we can just | |||
573 | // use register number to pick one of three round-robin temps. | |||
574 | unsigned RegNo = DestReg % 3; | |||
575 | Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
576 | if (!Tmp) | |||
577 | report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); | |||
578 | RS.setRegUsed(Tmp); | |||
579 | ||||
580 | if (!TII.getSubtarget().hasGFX90AInsts()) { | |||
581 | // Only loop through if there are any free registers left; otherwise the | |||
582 | // scavenger may report a fatal error when there is no emergency spill slot, | |||
583 | // or spill using the slot. | |||
584 | while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { | |||
585 | Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
586 | if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) | |||
587 | break; | |||
588 | Tmp = Tmp2; | |||
589 | RS.setRegUsed(Tmp); | |||
590 | } | |||
591 | } | |||
592 | ||||
593 | // Insert copy to temporary VGPR. | |||
594 | unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; | |||
595 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { | |||
596 | TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
597 | } else { | |||
598 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); | |||
599 | } | |||
600 | ||||
601 | MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) | |||
602 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
603 | if (ImpUseSuperReg) { | |||
604 | UseBuilder.addReg(ImpUseSuperReg, | |||
605 | getKillRegState(KillSrc) | RegState::Implicit); | |||
606 | } | |||
607 | ||||
608 | MachineInstrBuilder DefBuilder | |||
609 | = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
610 | .addReg(Tmp, RegState::Kill); | |||
611 | ||||
612 | if (ImpDefSuperReg) | |||
613 | DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
614 | } | |||
615 | ||||
616 | static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, | |||
617 | MachineBasicBlock::iterator MI, const DebugLoc &DL, | |||
618 | MCRegister DestReg, MCRegister SrcReg, bool KillSrc, | |||
619 | const TargetRegisterClass *RC, bool Forward) { | |||
620 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
621 | ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4); | |||
622 | MachineBasicBlock::iterator I = MI; | |||
623 | MachineInstr *FirstMI = nullptr, *LastMI = nullptr; | |||
624 | ||||
625 | for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { | |||
626 | int16_t SubIdx = BaseIndices[Idx]; | |||
627 | Register Reg = RI.getSubReg(DestReg, SubIdx); | |||
628 | unsigned Opcode = AMDGPU::S_MOV_B32; | |||
629 | ||||
630 | // Is SGPR aligned? If so try to combine with next. | |||
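| // For example, a copy of s[4:7] from s[8:11] is emitted as two S_MOV_B64s | |||
| // (s[4:5] = s[8:9], then s[6:7] = s[10:11]) because both halves are | |||
| // even-aligned; unaligned pairs fall back to individual S_MOV_B32s. | |||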
631 | Register Src = RI.getSubReg(SrcReg, SubIdx); | |||
632 | bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; | |||
633 | bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; | |||
634 | if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { | |||
635 | // Can use SGPR64 copy | |||
636 | unsigned Channel = RI.getChannelFromSubReg(SubIdx); | |||
637 | SubIdx = RI.getSubRegFromChannel(Channel, 2); | |||
638 | Opcode = AMDGPU::S_MOV_B64; | |||
639 | Idx++; | |||
640 | } | |||
641 | ||||
642 | LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
643 | .addReg(RI.getSubReg(SrcReg, SubIdx)) | |||
644 | .addReg(SrcReg, RegState::Implicit); | |||
645 | ||||
646 | if (!FirstMI) | |||
647 | FirstMI = LastMI; | |||
648 | ||||
649 | if (!Forward) | |||
650 | I--; | |||
651 | } | |||
652 | ||||
653 | assert(FirstMI && LastMI); | |||
654 | if (!Forward) | |||
655 | std::swap(FirstMI, LastMI); | |||
656 | ||||
657 | FirstMI->addOperand( | |||
658 | MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); | |||
659 | ||||
660 | if (KillSrc) | |||
661 | LastMI->addRegisterKilled(SrcReg, &RI); | |||
662 | } | |||
663 | ||||
664 | void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, | |||
665 | MachineBasicBlock::iterator MI, | |||
666 | const DebugLoc &DL, MCRegister DestReg, | |||
667 | MCRegister SrcReg, bool KillSrc) const { | |||
668 | const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); | |||
669 | ||||
670 | // FIXME: This is a hack to resolve copies between 16-bit and 32-bit | |||
671 | // registers until all patterns are fixed. | |||
672 | if (Fix16BitCopies && | |||
673 | ((RI.getRegSizeInBits(*RC) == 16) ^ | |||
674 | (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { | |||
675 | MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; | |||
676 | MCRegister Super = RI.get32BitRegister(RegToFix); | |||
677 | assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); | |||
678 | RegToFix = Super; | |||
679 | ||||
680 | if (DestReg == SrcReg) { | |||
681 | // Insert empty bundle since ExpandPostRA expects an instruction here. | |||
682 | BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); | |||
683 | return; | |||
684 | } | |||
685 | ||||
686 | RC = RI.getPhysRegClass(DestReg); | |||
687 | } | |||
688 | ||||
689 | if (RC == &AMDGPU::VGPR_32RegClass) { | |||
690 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || | |||
691 | AMDGPU::SReg_32RegClass.contains(SrcReg) || | |||
692 | AMDGPU::AGPR_32RegClass.contains(SrcReg)); | |||
693 | unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? | |||
694 | AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32; | |||
695 | BuildMI(MBB, MI, DL, get(Opc), DestReg) | |||
696 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
697 | return; | |||
698 | } | |||
699 | ||||
700 | if (RC == &AMDGPU::SReg_32_XM0RegClass || | |||
701 | RC == &AMDGPU::SReg_32RegClass) { | |||
702 | if (SrcReg == AMDGPU::SCC) { | |||
703 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) | |||
704 | .addImm(1) | |||
705 | .addImm(0); | |||
706 | return; | |||
707 | } | |||
708 | ||||
709 | if (DestReg == AMDGPU::VCC_LO) { | |||
710 | if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
711 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) | |||
712 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
713 | } else { | |||
714 | // FIXME: Hack until VReg_1 removed. | |||
715 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); | |||
716 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
717 | .addImm(0) | |||
718 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
719 | } | |||
720 | ||||
721 | return; | |||
722 | } | |||
723 | ||||
724 | if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
725 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
726 | return; | |||
727 | } | |||
728 | ||||
729 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
730 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
731 | return; | |||
732 | } | |||
733 | ||||
734 | if (RC == &AMDGPU::SReg_64RegClass) { | |||
735 | if (SrcReg == AMDGPU::SCC) { | |||
736 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) | |||
737 | .addImm(1) | |||
738 | .addImm(0); | |||
739 | return; | |||
740 | } | |||
741 | ||||
742 | if (DestReg == AMDGPU::VCC) { | |||
743 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
744 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) | |||
745 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
746 | } else { | |||
747 | // FIXME: Hack until VReg_1 removed. | |||
748 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); | |||
749 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
750 | .addImm(0) | |||
751 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
752 | } | |||
753 | ||||
754 | return; | |||
755 | } | |||
756 | ||||
757 | if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
758 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
759 | return; | |||
760 | } | |||
761 | ||||
762 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
763 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
764 | return; | |||
765 | } | |||
766 | ||||
767 | if (DestReg == AMDGPU::SCC) { | |||
768 | // Copying 64-bit or 32-bit sources to SCC barely makes sense, | |||
769 | // but SelectionDAG emits such copies for i1 sources. | |||
770 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
771 | // This copy can only be produced by patterns | |||
772 | // with explicit SCC, which are known to be enabled | |||
773 | // only for subtargets with S_CMP_LG_U64 present. | |||
774 | assert(ST.hasScalarCompareEq64()); | |||
775 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64)) | |||
776 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
777 | .addImm(0); | |||
778 | } else { | |||
779 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); | |||
780 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) | |||
781 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
782 | .addImm(0); | |||
783 | } | |||
784 | ||||
785 | return; | |||
786 | } | |||
787 | ||||
788 | if (RC == &AMDGPU::AGPR_32RegClass) { | |||
789 | if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { | |||
790 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
791 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
792 | return; | |||
793 | } | |||
794 | ||||
795 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { | |||
796 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) | |||
797 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
798 | return; | |||
799 | } | |||
800 | ||||
801 | // FIXME: Pass should maintain scavenger to avoid scan through the block on | |||
802 | // every AGPR spill. | |||
803 | RegScavenger RS; | |||
804 | indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS); | |||
805 | return; | |||
806 | } | |||
807 | ||||
808 | const unsigned Size = RI.getRegSizeInBits(*RC); | |||
809 | if (Size == 16) { | |||
810 | assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
811 | AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || | |||
812 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
813 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); | |||
814 | ||||
815 | bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); | |||
816 | bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); | |||
817 | bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
818 | bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
819 | bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || | |||
820 | AMDGPU::SReg_LO16RegClass.contains(DestReg) || | |||
821 | AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
822 | bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
823 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
824 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
825 | MCRegister NewDestReg = RI.get32BitRegister(DestReg); | |||
826 | MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); | |||
827 | ||||
828 | if (IsSGPRDst) { | |||
829 | if (!IsSGPRSrc) { | |||
830 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
831 | return; | |||
832 | } | |||
833 | ||||
834 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) | |||
835 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
836 | return; | |||
837 | } | |||
838 | ||||
839 | if (IsAGPRDst || IsAGPRSrc) { | |||
840 | if (!DstLow || !SrcLow) { | |||
841 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
842 | "Cannot use hi16 subreg with an AGPR!"); | |||
843 | } | |||
844 | ||||
845 | copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); | |||
846 | return; | |||
847 | } | |||
848 | ||||
849 | if (IsSGPRSrc && !ST.hasSDWAScalar()) { | |||
850 | if (!DstLow || !SrcLow) { | |||
851 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
852 | "Cannot use hi16 subreg on VI!"); | |||
853 | } | |||
854 | ||||
855 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) | |||
856 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
857 | return; | |||
858 | } | |||
859 | ||||
860 | auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) | |||
861 | .addImm(0) // src0_modifiers | |||
862 | .addReg(NewSrcReg) | |||
863 | .addImm(0) // clamp | |||
864 | .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
865 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
866 | .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) | |||
867 | .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
868 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
869 | .addReg(NewDestReg, RegState::Implicit | RegState::Undef); | |||
870 | // First implicit operand is $exec. | |||
871 | MIB->tieOperands(0, MIB->getNumOperands() - 1); | |||
872 | return; | |||
873 | } | |||
874 | ||||
875 | if (RC->hasSuperClassEq(&AMDGPU::VReg_64RegClass) && | |||
876 | !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { | |||
877 | if (ST.hasPackedFP32Ops()) { | |||
878 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) | |||
879 | .addImm(SISrcMods::OP_SEL_1) | |||
880 | .addReg(SrcReg) | |||
881 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) | |||
882 | .addReg(SrcReg) | |||
883 | .addImm(0) // op_sel_lo | |||
884 | .addImm(0) // op_sel_hi | |||
885 | .addImm(0) // neg_lo | |||
886 | .addImm(0) // neg_hi | |||
887 | .addImm(0) // clamp | |||
888 | .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); | |||
889 | return; | |||
890 | } | |||
891 | } | |||
892 | ||||
893 | const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); | |||
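| // Copy forward (low sub-registers first) when DestReg starts at or below | |||
| // SrcReg, so overlapping source sub-registers are read before they are | |||
| // overwritten; otherwise copy in reverse order. | |||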
894 | if (RI.isSGPRClass(RC)) { | |||
895 | if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { | |||
896 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
897 | return; | |||
898 | } | |||
899 | expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward); | |||
900 | return; | |||
901 | } | |||
902 | ||||
903 | unsigned EltSize = 4; | |||
904 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
905 | if (RI.hasAGPRs(RC)) { | |||
906 | Opcode = (RI.hasVGPRs(RI.getPhysRegClass(SrcReg))) ? | |||
907 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; | |||
908 | } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { | |||
909 | Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
910 | } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && | |||
911 | !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { | |||
912 | // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. | |||
913 | if (ST.hasPackedFP32Ops()) { | |||
914 | Opcode = AMDGPU::V_PK_MOV_B32; | |||
915 | EltSize = 8; | |||
916 | } | |||
917 | } | |||
918 | ||||
919 | // For the cases where we need an intermediate instruction/temporary register | |||
920 | // (destination is an AGPR), we need a scavenger. | |||
921 | // | |||
922 | // FIXME: The pass should maintain this for us so we don't have to re-scan the | |||
923 | // whole block for every handled copy. | |||
924 | std::unique_ptr<RegScavenger> RS; | |||
925 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) | |||
926 | RS.reset(new RegScavenger()); | |||
927 | ||||
928 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); | |||
929 | ||||
930 | // If there is an overlap, we can't kill the super-register on the last | |||
931 | // instruction, since it will also kill the components made live by this def. | |||
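| // For example, when copying v[0:1] from v[1:2], killing v[1:2] on the final | |||
| // sub-copy would also kill v1, which this copy has just defined as part of | |||
| // v[0:1]. | |||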
932 | const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); | |||
933 | ||||
934 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
935 | unsigned SubIdx; | |||
936 | if (Forward) | |||
937 | SubIdx = SubIndices[Idx]; | |||
938 | else | |||
939 | SubIdx = SubIndices[SubIndices.size() - Idx - 1]; | |||
940 | ||||
941 | bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; | |||
942 | ||||
943 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
944 | Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register(); | |||
945 | Register ImpUseSuper = SrcReg; | |||
946 | indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), | |||
947 | RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, | |||
948 | ImpDefSuper, ImpUseSuper); | |||
949 | } else if (Opcode == AMDGPU::V_PK_MOV_B32) { | |||
950 | Register DstSubReg = RI.getSubReg(DestReg, SubIdx); | |||
951 | Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); | |||
952 | MachineInstrBuilder MIB = | |||
953 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) | |||
954 | .addImm(SISrcMods::OP_SEL_1) | |||
955 | .addReg(SrcSubReg) | |||
956 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) | |||
957 | .addReg(SrcSubReg) | |||
958 | .addImm(0) // op_sel_lo | |||
959 | .addImm(0) // op_sel_hi | |||
960 | .addImm(0) // neg_lo | |||
961 | .addImm(0) // neg_hi | |||
962 | .addImm(0) // clamp | |||
963 | .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
964 | if (Idx == 0) | |||
965 | MIB.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
966 | } else { | |||
967 | MachineInstrBuilder Builder = | |||
968 | BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
969 | .addReg(RI.getSubReg(SrcReg, SubIdx)); | |||
970 | if (Idx == 0) | |||
971 | Builder.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
972 | ||||
973 | Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
974 | } | |||
975 | } | |||
976 | } | |||
977 | ||||
978 | int SIInstrInfo::commuteOpcode(unsigned Opcode) const { | |||
979 | int NewOpc; | |||
980 | ||||
981 | // Try to map original to commuted opcode | |||
982 | NewOpc = AMDGPU::getCommuteRev(Opcode); | |||
983 | if (NewOpc != -1) | |||
984 | // Check if the commuted (REV) opcode exists on the target. | |||
985 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
986 | ||||
987 | // Try to map commuted to original opcode | |||
988 | NewOpc = AMDGPU::getCommuteOrig(Opcode); | |||
989 | if (NewOpc != -1) | |||
990 | // Check if the original (non-REV) opcode exists on the target. | |||
991 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
992 | ||||
993 | return Opcode; | |||
994 | } | |||
995 | ||||
996 | void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, | |||
997 | MachineBasicBlock::iterator MI, | |||
998 | const DebugLoc &DL, unsigned DestReg, | |||
999 | int64_t Value) const { | |||
1000 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
1001 | const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); | |||
1002 | if (RegClass == &AMDGPU::SReg_32RegClass || | |||
1003 | RegClass == &AMDGPU::SGPR_32RegClass || | |||
1004 | RegClass == &AMDGPU::SReg_32_XM0RegClass || | |||
1005 | RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { | |||
1006 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
1007 | .addImm(Value); | |||
1008 | return; | |||
1009 | } | |||
1010 | ||||
1011 | if (RegClass == &AMDGPU::SReg_64RegClass || | |||
1012 | RegClass == &AMDGPU::SGPR_64RegClass || | |||
1013 | RegClass == &AMDGPU::SReg_64_XEXECRegClass) { | |||
1014 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
1015 | .addImm(Value); | |||
1016 | return; | |||
1017 | } | |||
1018 | ||||
1019 | if (RegClass == &AMDGPU::VGPR_32RegClass) { | |||
1020 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) | |||
1021 | .addImm(Value); | |||
1022 | return; | |||
1023 | } | |||
1024 | if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { | |||
1025 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) | |||
1026 | .addImm(Value); | |||
1027 | return; | |||
1028 | } | |||
1029 | ||||
1030 | unsigned EltSize = 4; | |||
1031 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
1032 | if (RI.isSGPRClass(RegClass)) { | |||
1033 | if (RI.getRegSizeInBits(*RegClass) > 32) { | |||
1034 | Opcode = AMDGPU::S_MOV_B64; | |||
1035 | EltSize = 8; | |||
1036 | } else { | |||
1037 | Opcode = AMDGPU::S_MOV_B32; | |||
1038 | EltSize = 4; | |||
1039 | } | |||
1040 | } | |||
1041 | ||||
1042 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); | |||
1043 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
1044 | int64_t IdxValue = Idx == 0 ? Value : 0; | |||
1045 | ||||
1046 | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, | |||
1047 | get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); | |||
1048 | Builder.addImm(IdxValue); | |||
1049 | } | |||
1050 | } | |||
1051 | ||||
1052 | const TargetRegisterClass * | |||
1053 | SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { | |||
1054 | return &AMDGPU::VGPR_32RegClass; | |||
1055 | } | |||
1056 | ||||
1057 | void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, | |||
1058 | MachineBasicBlock::iterator I, | |||
1059 | const DebugLoc &DL, Register DstReg, | |||
1060 | ArrayRef<MachineOperand> Cond, | |||
1061 | Register TrueReg, | |||
1062 | Register FalseReg) const { | |||
1063 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
1064 | const TargetRegisterClass *BoolXExecRC = | |||
1065 | RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
1066 | assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && | |||
1067 | "Not a VGPR32 reg"); | |||
1068 | ||||
1069 | if (Cond.size() == 1) { | |||
1070 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1071 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1072 | .add(Cond[0]); | |||
1073 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1074 | .addImm(0) | |||
1075 | .addReg(FalseReg) | |||
1076 | .addImm(0) | |||
1077 | .addReg(TrueReg) | |||
1078 | .addReg(SReg); | |||
1079 | } else if (Cond.size() == 2) { | |||
1080 | assert(Cond[0].isImm() && "Cond[0] is not an immediate"); | |||
1081 | switch (Cond[0].getImm()) { | |||
1082 | case SIInstrInfo::SCC_TRUE: { | |||
1083 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1084 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1085 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1086 | .addImm(1) | |||
1087 | .addImm(0); | |||
1088 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1089 | .addImm(0) | |||
1090 | .addReg(FalseReg) | |||
1091 | .addImm(0) | |||
1092 | .addReg(TrueReg) | |||
1093 | .addReg(SReg); | |||
1094 | break; | |||
1095 | } | |||
1096 | case SIInstrInfo::SCC_FALSE: { | |||
1097 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1098 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1099 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1100 | .addImm(0) | |||
1101 | .addImm(1); | |||
1102 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1103 | .addImm(0) | |||
1104 | .addReg(FalseReg) | |||
1105 | .addImm(0) | |||
1106 | .addReg(TrueReg) | |||
1107 | .addReg(SReg); | |||
1108 | break; | |||
1109 | } | |||
1110 | case SIInstrInfo::VCCNZ: { | |||
1111 | MachineOperand RegOp = Cond[1]; | |||
1112 | RegOp.setImplicit(false); | |||
1113 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1114 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1115 | .add(RegOp); | |||
1116 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1117 | .addImm(0) | |||
1118 | .addReg(FalseReg) | |||
1119 | .addImm(0) | |||
1120 | .addReg(TrueReg) | |||
1121 | .addReg(SReg); | |||
1122 | break; | |||
1123 | } | |||
1124 | case SIInstrInfo::VCCZ: { | |||
1125 | MachineOperand RegOp = Cond[1]; | |||
1126 | RegOp.setImplicit(false); | |||
1127 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1128 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1129 | .add(RegOp); | |||
1130 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1131 | .addImm(0) | |||
1132 | .addReg(TrueReg) | |||
1133 | .addImm(0) | |||
1134 | .addReg(FalseReg) | |||
1135 | .addReg(SReg); | |||
1136 | break; | |||
1137 | } | |||
1138 | case SIInstrInfo::EXECNZ: { | |||
1139 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1140 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1141 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1142 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1143 | .addImm(0); | |||
1144 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1145 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1146 | .addImm(1) | |||
1147 | .addImm(0); | |||
1148 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1149 | .addImm(0) | |||
1150 | .addReg(FalseReg) | |||
1151 | .addImm(0) | |||
1152 | .addReg(TrueReg) | |||
1153 | .addReg(SReg); | |||
1154 | break; | |||
1155 | } | |||
1156 | case SIInstrInfo::EXECZ: { | |||
1157 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1158 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1159 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1160 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1161 | .addImm(0); | |||
1162 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1163 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1164 | .addImm(0) | |||
1165 | .addImm(1); | |||
1166 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1167 | .addImm(0) | |||
1168 | .addReg(FalseReg) | |||
1169 | .addImm(0) | |||
1170 | .addReg(TrueReg) | |||
1171 | .addReg(SReg); | |||
1172 |       llvm_unreachable("Unhandled branch predicate EXECZ"); | |||
1173 | break; | |||
1174 | } | |||
1175 | default: | |||
1176 |       llvm_unreachable("invalid branch predicate"); | |||
1177 | } | |||
1178 | } else { | |||
1179 |     llvm_unreachable("Can only handle Cond size 1 or 2"); | |||
1180 | } | |||
1181 | } | |||
1182 | ||||
1183 | Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, | |||
1184 | MachineBasicBlock::iterator I, | |||
1185 | const DebugLoc &DL, | |||
1186 | Register SrcReg, int Value) const { | |||
1187 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1188 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1189 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) | |||
1190 | .addImm(Value) | |||
1191 | .addReg(SrcReg); | |||
1192 | ||||
1193 | return Reg; | |||
1194 | } | |||
1195 | ||||
1196 | Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, | |||
1197 | MachineBasicBlock::iterator I, | |||
1198 | const DebugLoc &DL, | |||
1199 | Register SrcReg, int Value) const { | |||
1200 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1201 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1202 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) | |||
1203 | .addImm(Value) | |||
1204 | .addReg(SrcReg); | |||
1205 | ||||
1206 | return Reg; | |||
1207 | } | |||
1208 | ||||
1209 | unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { | |||
1210 | ||||
1211 | if (RI.hasAGPRs(DstRC)) | |||
1212 | return AMDGPU::COPY; | |||
1213 | if (RI.getRegSizeInBits(*DstRC) == 32) { | |||
1214 | return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; | |||
1215 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { | |||
1216 | return AMDGPU::S_MOV_B64; | |||
1217 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { | |||
1218 | return AMDGPU::V_MOV_B64_PSEUDO; | |||
1219 | } | |||
1220 | return AMDGPU::COPY; | |||
1221 | } | |||
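// A minimal standalone sketch of the selection in getMovOpcode() above,
// assuming only the cases shown there: AGPR classes fall back to COPY,
// 32- and 64-bit SGPR/VGPR classes pick the matching MOV, and anything
// wider stays a COPY.  The enum and helper name are illustrative, not
// LLVM API.
enum class MovOpc { Copy, SMovB32, VMovB32e32, SMovB64, VMovB64Pseudo };

static MovOpc pickMovOpcode(bool HasAGPRs, bool IsSGPRClass,
                            unsigned RegSizeInBits) {
  if (HasAGPRs)
    return MovOpc::Copy;                 // AGPR moves are left as COPY
  if (RegSizeInBits == 32)
    return IsSGPRClass ? MovOpc::SMovB32 : MovOpc::VMovB32e32;
  if (RegSizeInBits == 64)
    return IsSGPRClass ? MovOpc::SMovB64 : MovOpc::VMovB64Pseudo;
  return MovOpc::Copy;                   // wider classes fall back to COPY
}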
1222 | ||||
1223 | const MCInstrDesc & | |||
1224 | SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, | |||
1225 | bool IsIndirectSrc) const { | |||
1226 | if (IsIndirectSrc) { | |||
1227 | if (VecSize <= 32) // 4 bytes | |||
1228 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); | |||
1229 | if (VecSize <= 64) // 8 bytes | |||
1230 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); | |||
1231 | if (VecSize <= 96) // 12 bytes | |||
1232 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); | |||
1233 | if (VecSize <= 128) // 16 bytes | |||
1234 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); | |||
1235 | if (VecSize <= 160) // 20 bytes | |||
1236 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); | |||
1237 | if (VecSize <= 256) // 32 bytes | |||
1238 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); | |||
1239 | if (VecSize <= 512) // 64 bytes | |||
1240 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); | |||
1241 | if (VecSize <= 1024) // 128 bytes | |||
1242 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); | |||
1243 | ||||
1244 |     llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos"); | |||
1245 | } | |||
1246 | ||||
1247 | if (VecSize <= 32) // 4 bytes | |||
1248 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); | |||
1249 | if (VecSize <= 64) // 8 bytes | |||
1250 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); | |||
1251 | if (VecSize <= 96) // 12 bytes | |||
1252 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); | |||
1253 | if (VecSize <= 128) // 16 bytes | |||
1254 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); | |||
1255 | if (VecSize <= 160) // 20 bytes | |||
1256 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); | |||
1257 | if (VecSize <= 256) // 32 bytes | |||
1258 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); | |||
1259 | if (VecSize <= 512) // 64 bytes | |||
1260 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); | |||
1261 | if (VecSize <= 1024) // 128 bytes | |||
1262 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); | |||
1263 | ||||
1264 |   llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos"); | |||
1265 | } | |||
1266 | ||||
1267 | static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { | |||
1268 | if (VecSize <= 32) // 4 bytes | |||
1269 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1270 | if (VecSize <= 64) // 8 bytes | |||
1271 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1272 | if (VecSize <= 96) // 12 bytes | |||
1273 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1274 | if (VecSize <= 128) // 16 bytes | |||
1275 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1276 | if (VecSize <= 160) // 20 bytes | |||
1277 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1278 | if (VecSize <= 256) // 32 bytes | |||
1279 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1280 | if (VecSize <= 512) // 64 bytes | |||
1281 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1282 | if (VecSize <= 1024) // 128 bytes | |||
1283 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1284 | ||||
1285 |   llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); | |||
1286 | } | |||
1287 | ||||
1288 | static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { | |||
1289 | if (VecSize <= 32) // 4 bytes | |||
1290 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1291 | if (VecSize <= 64) // 8 bytes | |||
1292 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1293 | if (VecSize <= 96) // 12 bytes | |||
1294 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1295 | if (VecSize <= 128) // 16 bytes | |||
1296 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1297 | if (VecSize <= 160) // 20 bytes | |||
1298 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1299 | if (VecSize <= 256) // 32 bytes | |||
1300 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1301 | if (VecSize <= 512) // 64 bytes | |||
1302 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1303 | if (VecSize <= 1024) // 128 bytes | |||
1304 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1305 | ||||
1306 |   llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); | |||
1307 | } | |||
1308 | ||||
1309 | static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { | |||
1310 | if (VecSize <= 64) // 8 bytes | |||
1311 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; | |||
1312 | if (VecSize <= 128) // 16 bytes | |||
1313 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; | |||
1314 | if (VecSize <= 256) // 32 bytes | |||
1315 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; | |||
1316 | if (VecSize <= 512) // 64 bytes | |||
1317 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; | |||
1318 | if (VecSize <= 1024) // 128 bytes | |||
1319 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; | |||
1320 | ||||
1321 |   llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); | |||
1322 | } | |||
1323 | ||||
1324 | const MCInstrDesc & | |||
1325 | SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, | |||
1326 | bool IsSGPR) const { | |||
1327 | if (IsSGPR) { | |||
1328 | switch (EltSize) { | |||
1329 | case 32: | |||
1330 | return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); | |||
1331 | case 64: | |||
1332 | return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); | |||
1333 | default: | |||
1334 |       llvm_unreachable("invalid reg indexing elt size"); | |||
1335 | } | |||
1336 | } | |||
1337 | ||||
1338 |   assert(EltSize == 32 && "invalid reg indexing elt size"); | |||
1339 | return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); | |||
1340 | } | |||
1341 | ||||
1342 | static unsigned getSGPRSpillSaveOpcode(unsigned Size) { | |||
1343 | switch (Size) { | |||
1344 | case 4: | |||
1345 | return AMDGPU::SI_SPILL_S32_SAVE; | |||
1346 | case 8: | |||
1347 | return AMDGPU::SI_SPILL_S64_SAVE; | |||
1348 | case 12: | |||
1349 | return AMDGPU::SI_SPILL_S96_SAVE; | |||
1350 | case 16: | |||
1351 | return AMDGPU::SI_SPILL_S128_SAVE; | |||
1352 | case 20: | |||
1353 | return AMDGPU::SI_SPILL_S160_SAVE; | |||
1354 | case 24: | |||
1355 | return AMDGPU::SI_SPILL_S192_SAVE; | |||
1356 | case 32: | |||
1357 | return AMDGPU::SI_SPILL_S256_SAVE; | |||
1358 | case 64: | |||
1359 | return AMDGPU::SI_SPILL_S512_SAVE; | |||
1360 | case 128: | |||
1361 | return AMDGPU::SI_SPILL_S1024_SAVE; | |||
1362 | default: | |||
1363 |     llvm_unreachable("unknown register size"); | |||
1364 | } | |||
1365 | } | |||
1366 | ||||
1367 | static unsigned getVGPRSpillSaveOpcode(unsigned Size) { | |||
1368 | switch (Size) { | |||
1369 | case 4: | |||
1370 | return AMDGPU::SI_SPILL_V32_SAVE; | |||
1371 | case 8: | |||
1372 | return AMDGPU::SI_SPILL_V64_SAVE; | |||
1373 | case 12: | |||
1374 | return AMDGPU::SI_SPILL_V96_SAVE; | |||
1375 | case 16: | |||
1376 | return AMDGPU::SI_SPILL_V128_SAVE; | |||
1377 | case 20: | |||
1378 | return AMDGPU::SI_SPILL_V160_SAVE; | |||
1379 | case 24: | |||
1380 | return AMDGPU::SI_SPILL_V192_SAVE; | |||
1381 | case 32: | |||
1382 | return AMDGPU::SI_SPILL_V256_SAVE; | |||
1383 | case 64: | |||
1384 | return AMDGPU::SI_SPILL_V512_SAVE; | |||
1385 | case 128: | |||
1386 | return AMDGPU::SI_SPILL_V1024_SAVE; | |||
1387 | default: | |||
1388 |     llvm_unreachable("unknown register size"); | |||
1389 | } | |||
1390 | } | |||
1391 | ||||
1392 | static unsigned getAGPRSpillSaveOpcode(unsigned Size) { | |||
1393 | switch (Size) { | |||
1394 | case 4: | |||
1395 | return AMDGPU::SI_SPILL_A32_SAVE; | |||
1396 | case 8: | |||
1397 | return AMDGPU::SI_SPILL_A64_SAVE; | |||
1398 | case 12: | |||
1399 | return AMDGPU::SI_SPILL_A96_SAVE; | |||
1400 | case 16: | |||
1401 | return AMDGPU::SI_SPILL_A128_SAVE; | |||
1402 | case 20: | |||
1403 | return AMDGPU::SI_SPILL_A160_SAVE; | |||
1404 | case 24: | |||
1405 | return AMDGPU::SI_SPILL_A192_SAVE; | |||
1406 | case 32: | |||
1407 | return AMDGPU::SI_SPILL_A256_SAVE; | |||
1408 | case 64: | |||
1409 | return AMDGPU::SI_SPILL_A512_SAVE; | |||
1410 | case 128: | |||
1411 | return AMDGPU::SI_SPILL_A1024_SAVE; | |||
1412 | default: | |||
1413 |     llvm_unreachable("unknown register size"); | |||
1414 | } | |||
1415 | } | |||
1416 | ||||
1417 | void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, | |||
1418 | MachineBasicBlock::iterator MI, | |||
1419 | Register SrcReg, bool isKill, | |||
1420 | int FrameIndex, | |||
1421 | const TargetRegisterClass *RC, | |||
1422 | const TargetRegisterInfo *TRI) const { | |||
1423 | MachineFunction *MF = MBB.getParent(); | |||
1424 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1425 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1426 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1427 | ||||
1428 | MachinePointerInfo PtrInfo | |||
1429 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1430 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1431 | PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), | |||
1432 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1433 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1434 | ||||
1435 | if (RI.isSGPRClass(RC)) { | |||
1436 | MFI->setHasSpilledSGPRs(); | |||
1437 |     assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); | |||
1438 |     assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && | |||
1439 |            SrcReg != AMDGPU::EXEC && "exec should not be spilled"); | |||
1440 | ||||
1441 | // We are only allowed to create one new instruction when spilling | |||
1442 |     // registers, so we need to use a pseudo instruction for spilling SGPRs. | |||
1443 | const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); | |||
1444 | ||||
1445 |     // The SGPR spill/restore instructions only work on numbered SGPRs, so we | |||
1446 |     // need to make sure we are using the correct register class. | |||
1447 | if (SrcReg.isVirtual() && SpillSize == 4) { | |||
1448 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1449 | MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1450 | } | |||
1451 | ||||
1452 | BuildMI(MBB, MI, DL, OpDesc) | |||
1453 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1454 | .addFrameIndex(FrameIndex) // addr | |||
1455 | .addMemOperand(MMO) | |||
1456 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1457 | ||||
1458 | if (RI.spillSGPRToVGPR()) | |||
1459 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1460 | return; | |||
1461 | } | |||
1462 | ||||
1463 | unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize) | |||
1464 | : getVGPRSpillSaveOpcode(SpillSize); | |||
1465 | MFI->setHasSpilledVGPRs(); | |||
1466 | ||||
1467 | BuildMI(MBB, MI, DL, get(Opcode)) | |||
1468 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1469 | .addFrameIndex(FrameIndex) // addr | |||
1470 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1471 | .addImm(0) // offset | |||
1472 | .addMemOperand(MMO); | |||
1473 | } | |||
1474 | ||||
1475 | static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { | |||
1476 | switch (Size) { | |||
1477 | case 4: | |||
1478 | return AMDGPU::SI_SPILL_S32_RESTORE; | |||
1479 | case 8: | |||
1480 | return AMDGPU::SI_SPILL_S64_RESTORE; | |||
1481 | case 12: | |||
1482 | return AMDGPU::SI_SPILL_S96_RESTORE; | |||
1483 | case 16: | |||
1484 | return AMDGPU::SI_SPILL_S128_RESTORE; | |||
1485 | case 20: | |||
1486 | return AMDGPU::SI_SPILL_S160_RESTORE; | |||
1487 | case 24: | |||
1488 | return AMDGPU::SI_SPILL_S192_RESTORE; | |||
1489 | case 32: | |||
1490 | return AMDGPU::SI_SPILL_S256_RESTORE; | |||
1491 | case 64: | |||
1492 | return AMDGPU::SI_SPILL_S512_RESTORE; | |||
1493 | case 128: | |||
1494 | return AMDGPU::SI_SPILL_S1024_RESTORE; | |||
1495 | default: | |||
1496 |     llvm_unreachable("unknown register size"); | |||
1497 | } | |||
1498 | } | |||
1499 | ||||
1500 | static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { | |||
1501 | switch (Size) { | |||
1502 | case 4: | |||
1503 | return AMDGPU::SI_SPILL_V32_RESTORE; | |||
1504 | case 8: | |||
1505 | return AMDGPU::SI_SPILL_V64_RESTORE; | |||
1506 | case 12: | |||
1507 | return AMDGPU::SI_SPILL_V96_RESTORE; | |||
1508 | case 16: | |||
1509 | return AMDGPU::SI_SPILL_V128_RESTORE; | |||
1510 | case 20: | |||
1511 | return AMDGPU::SI_SPILL_V160_RESTORE; | |||
1512 | case 24: | |||
1513 | return AMDGPU::SI_SPILL_V192_RESTORE; | |||
1514 | case 32: | |||
1515 | return AMDGPU::SI_SPILL_V256_RESTORE; | |||
1516 | case 64: | |||
1517 | return AMDGPU::SI_SPILL_V512_RESTORE; | |||
1518 | case 128: | |||
1519 | return AMDGPU::SI_SPILL_V1024_RESTORE; | |||
1520 | default: | |||
1521 |     llvm_unreachable("unknown register size"); | |||
1522 | } | |||
1523 | } | |||
1524 | ||||
1525 | static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { | |||
1526 | switch (Size) { | |||
1527 | case 4: | |||
1528 | return AMDGPU::SI_SPILL_A32_RESTORE; | |||
1529 | case 8: | |||
1530 | return AMDGPU::SI_SPILL_A64_RESTORE; | |||
1531 | case 12: | |||
1532 | return AMDGPU::SI_SPILL_A96_RESTORE; | |||
1533 | case 16: | |||
1534 | return AMDGPU::SI_SPILL_A128_RESTORE; | |||
1535 | case 20: | |||
1536 | return AMDGPU::SI_SPILL_A160_RESTORE; | |||
1537 | case 24: | |||
1538 | return AMDGPU::SI_SPILL_A192_RESTORE; | |||
1539 | case 32: | |||
1540 | return AMDGPU::SI_SPILL_A256_RESTORE; | |||
1541 | case 64: | |||
1542 | return AMDGPU::SI_SPILL_A512_RESTORE; | |||
1543 | case 128: | |||
1544 | return AMDGPU::SI_SPILL_A1024_RESTORE; | |||
1545 | default: | |||
1546 |     llvm_unreachable("unknown register size"); | |||
1547 | } | |||
1548 | } | |||
1549 | ||||
1550 | void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, | |||
1551 | MachineBasicBlock::iterator MI, | |||
1552 | Register DestReg, int FrameIndex, | |||
1553 | const TargetRegisterClass *RC, | |||
1554 | const TargetRegisterInfo *TRI) const { | |||
1555 | MachineFunction *MF = MBB.getParent(); | |||
1556 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1557 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1558 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1559 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1560 | ||||
1561 | MachinePointerInfo PtrInfo | |||
1562 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1563 | ||||
1564 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1565 | PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), | |||
1566 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1567 | ||||
1568 | if (RI.isSGPRClass(RC)) { | |||
1569 | MFI->setHasSpilledSGPRs(); | |||
1570 |     assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); | |||
1571 |     assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && | |||
1572 |            DestReg != AMDGPU::EXEC && "exec should not be spilled"); | |||
1573 | ||||
1574 | // FIXME: Maybe this should not include a memoperand because it will be | |||
1575 | // lowered to non-memory instructions. | |||
1576 | const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); | |||
1577 | if (DestReg.isVirtual() && SpillSize == 4) { | |||
1578 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1579 | MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1580 | } | |||
1581 | ||||
1582 | if (RI.spillSGPRToVGPR()) | |||
1583 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1584 | BuildMI(MBB, MI, DL, OpDesc, DestReg) | |||
1585 | .addFrameIndex(FrameIndex) // addr | |||
1586 | .addMemOperand(MMO) | |||
1587 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1588 | ||||
1589 | return; | |||
1590 | } | |||
1591 | ||||
1592 | unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) | |||
1593 | : getVGPRSpillRestoreOpcode(SpillSize); | |||
1594 | BuildMI(MBB, MI, DL, get(Opcode), DestReg) | |||
1595 | .addFrameIndex(FrameIndex) // vaddr | |||
1596 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1597 | .addImm(0) // offset | |||
1598 | .addMemOperand(MMO); | |||
1599 | } | |||
1600 | ||||
1601 | void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, | |||
1602 | MachineBasicBlock::iterator MI) const { | |||
1603 | insertNoops(MBB, MI, 1); | |||
1604 | } | |||
1605 | ||||
1606 | void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, | |||
1607 | MachineBasicBlock::iterator MI, | |||
1608 | unsigned Quantity) const { | |||
1609 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1610 | while (Quantity > 0) { | |||
1611 | unsigned Arg = std::min(Quantity, 8u); | |||
1612 | Quantity -= Arg; | |||
1613 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); | |||
1614 | } | |||
1615 | } | |||
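// A minimal standalone sketch of the chunking in insertNoops() above,
// assuming only what the loop shows: each S_NOP encodes "imm + 1" wait
// states with imm in [0, 7], so a request for N noops becomes ceil(N / 8)
// instructions.  chunkNoops is an illustrative helper, not LLVM API.
#include <algorithm>
#include <vector>

static std::vector<unsigned> chunkNoops(unsigned Quantity) {
  std::vector<unsigned> Imms;            // one entry per emitted S_NOP
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    Imms.push_back(Arg - 1);             // S_NOP imm = wait states - 1
  }
  return Imms;                           // e.g. chunkNoops(19) -> {7, 7, 2}
}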
1616 | ||||
1617 | void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { | |||
1618 | auto MF = MBB.getParent(); | |||
1619 | SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); | |||
1620 | ||||
1621 |   assert(Info->isEntryFunction()); | |||
1622 | ||||
1623 | if (MBB.succ_empty()) { | |||
1624 | bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); | |||
1625 | if (HasNoTerminator) { | |||
1626 | if (Info->returnsVoid()) { | |||
1627 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); | |||
1628 | } else { | |||
1629 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); | |||
1630 | } | |||
1631 | } | |||
1632 | } | |||
1633 | } | |||
1634 | ||||
1635 | unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { | |||
1636 | switch (MI.getOpcode()) { | |||
1637 | default: return 1; // FIXME: Do wait states equal cycles? | |||
1638 | ||||
1639 | case AMDGPU::S_NOP: | |||
1640 | return MI.getOperand(0).getImm() + 1; | |||
1641 | } | |||
1642 | } | |||
1643 | ||||
1644 | bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { | |||
1645 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
1646 | MachineBasicBlock &MBB = *MI.getParent(); | |||
1647 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1648 | switch (MI.getOpcode()) { | |||
| ||||
1649 | default: return TargetInstrInfo::expandPostRAPseudo(MI); | |||
1650 | case AMDGPU::S_MOV_B64_term: | |||
1651 | // This is only a terminator to get the correct spill code placement during | |||
1652 | // register allocation. | |||
1653 | MI.setDesc(get(AMDGPU::S_MOV_B64)); | |||
1654 | break; | |||
1655 | ||||
1656 | case AMDGPU::S_MOV_B32_term: | |||
1657 | // This is only a terminator to get the correct spill code placement during | |||
1658 | // register allocation. | |||
1659 | MI.setDesc(get(AMDGPU::S_MOV_B32)); | |||
1660 | break; | |||
1661 | ||||
1662 | case AMDGPU::S_XOR_B64_term: | |||
1663 | // This is only a terminator to get the correct spill code placement during | |||
1664 | // register allocation. | |||
1665 | MI.setDesc(get(AMDGPU::S_XOR_B64)); | |||
1666 | break; | |||
1667 | ||||
1668 | case AMDGPU::S_XOR_B32_term: | |||
1669 | // This is only a terminator to get the correct spill code placement during | |||
1670 | // register allocation. | |||
1671 | MI.setDesc(get(AMDGPU::S_XOR_B32)); | |||
1672 | break; | |||
1673 | case AMDGPU::S_OR_B64_term: | |||
1674 | // This is only a terminator to get the correct spill code placement during | |||
1675 | // register allocation. | |||
1676 | MI.setDesc(get(AMDGPU::S_OR_B64)); | |||
1677 | break; | |||
1678 | case AMDGPU::S_OR_B32_term: | |||
1679 | // This is only a terminator to get the correct spill code placement during | |||
1680 | // register allocation. | |||
1681 | MI.setDesc(get(AMDGPU::S_OR_B32)); | |||
1682 | break; | |||
1683 | ||||
1684 | case AMDGPU::S_ANDN2_B64_term: | |||
1685 | // This is only a terminator to get the correct spill code placement during | |||
1686 | // register allocation. | |||
1687 | MI.setDesc(get(AMDGPU::S_ANDN2_B64)); | |||
1688 | break; | |||
1689 | ||||
1690 | case AMDGPU::S_ANDN2_B32_term: | |||
1691 | // This is only a terminator to get the correct spill code placement during | |||
1692 | // register allocation. | |||
1693 | MI.setDesc(get(AMDGPU::S_ANDN2_B32)); | |||
1694 | break; | |||
1695 | ||||
1696 | case AMDGPU::S_AND_B64_term: | |||
1697 | // This is only a terminator to get the correct spill code placement during | |||
1698 | // register allocation. | |||
1699 | MI.setDesc(get(AMDGPU::S_AND_B64)); | |||
1700 | break; | |||
1701 | ||||
1702 | case AMDGPU::S_AND_B32_term: | |||
1703 | // This is only a terminator to get the correct spill code placement during | |||
1704 | // register allocation. | |||
1705 | MI.setDesc(get(AMDGPU::S_AND_B32)); | |||
1706 | break; | |||
1707 | ||||
1708 | case AMDGPU::V_MOV_B64_PSEUDO: { | |||
1709 | Register Dst = MI.getOperand(0).getReg(); | |||
1710 | Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); | |||
1711 | Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); | |||
1712 | ||||
1713 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
1714 | // FIXME: Will this work for 64-bit floating point immediates? | |||
1715 |     assert(!SrcOp.isFPImm()); | |||
1716 | if (SrcOp.isImm()) { | |||
1717 | APInt Imm(64, SrcOp.getImm()); | |||
1718 | APInt Lo(32, Imm.getLoBits(32).getZExtValue()); | |||
1719 | APInt Hi(32, Imm.getHiBits(32).getZExtValue()); | |||
1720 | if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { | |||
1721 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) | |||
1722 | .addImm(SISrcMods::OP_SEL_1) | |||
1723 | .addImm(Lo.getSExtValue()) | |||
1724 | .addImm(SISrcMods::OP_SEL_1) | |||
1725 | .addImm(Lo.getSExtValue()) | |||
1726 | .addImm(0) // op_sel_lo | |||
1727 | .addImm(0) // op_sel_hi | |||
1728 | .addImm(0) // neg_lo | |||
1729 | .addImm(0) // neg_hi | |||
1730 | .addImm(0); // clamp | |||
1731 | } else { | |||
1732 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1733 | .addImm(Lo.getZExtValue()) | |||
1734 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1735 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1736 | .addImm(Hi.getZExtValue()) | |||
1737 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1738 | } | |||
1739 | } else { | |||
1740 |       assert(SrcOp.isReg()); | |||
1741 | if (ST.hasPackedFP32Ops() && | |||
1742 | !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { | |||
1743 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) | |||
1744 | .addImm(SISrcMods::OP_SEL_1) // src0_mod | |||
1745 | .addReg(SrcOp.getReg()) | |||
1746 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod | |||
1747 | .addReg(SrcOp.getReg()) | |||
1748 | .addImm(0) // op_sel_lo | |||
1749 | .addImm(0) // op_sel_hi | |||
1750 | .addImm(0) // neg_lo | |||
1751 | .addImm(0) // neg_hi | |||
1752 | .addImm(0); // clamp | |||
1753 | } else { | |||
1754 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1755 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) | |||
1756 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1757 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1758 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) | |||
1759 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1760 | } | |||
1761 | } | |||
1762 | MI.eraseFromParent(); | |||
1763 | break; | |||
1764 | } | |||
1765 | case AMDGPU::V_MOV_B64_DPP_PSEUDO: { | |||
1766 | expandMovDPP64(MI); | |||
1767 | break; | |||
1768 | } | |||
1769 | case AMDGPU::V_SET_INACTIVE_B32: { | |||
1770 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
1771 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1772 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
1773 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
1774 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) | |||
1775 | .add(MI.getOperand(2)); | |||
1776 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
1777 | .addReg(Exec); | |||
1778 | MI.eraseFromParent(); | |||
1779 | break; | |||
1780 | } | |||
1781 | case AMDGPU::V_SET_INACTIVE_B64: { | |||
1782 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
1783 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1784 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
1785 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
1786 | MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), | |||
1787 | MI.getOperand(0).getReg()) | |||
1788 | .add(MI.getOperand(2)); | |||
1789 | expandPostRAPseudo(*Copy); | |||
1790 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
1791 | .addReg(Exec); | |||
1792 | MI.eraseFromParent(); | |||
1793 | break; | |||
1794 | } | |||
1795 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
1796 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
1797 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
1798 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
1799 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
1800 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
1801 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
1802 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
1803 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
1804 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
1805 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
1806 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
1807 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
1808 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
1809 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
1810 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
1811 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: | |||
1812 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: | |||
1813 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: | |||
1814 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: | |||
1815 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { | |||
1816 | const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); | |||
1817 | ||||
1818 | unsigned Opc; | |||
1819 | if (RI.hasVGPRs(EltRC)) { | |||
1820 | Opc = AMDGPU::V_MOVRELD_B32_e32; | |||
1821 | } else { | |||
1822 | Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 | |||
1823 | : AMDGPU::S_MOVRELD_B32; | |||
1824 | } | |||
1825 | ||||
1826 | const MCInstrDesc &OpDesc = get(Opc); | |||
1827 | Register VecReg = MI.getOperand(0).getReg(); | |||
1828 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1829 | unsigned SubReg = MI.getOperand(3).getImm(); | |||
1830 |     assert(VecReg == MI.getOperand(1).getReg()); | |||
1831 | ||||
1832 | MachineInstrBuilder MIB = | |||
1833 | BuildMI(MBB, MI, DL, OpDesc) | |||
1834 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1835 | .add(MI.getOperand(2)) | |||
1836 | .addReg(VecReg, RegState::ImplicitDefine) | |||
1837 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
1838 | ||||
1839 | const int ImpDefIdx = | |||
1840 | OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); | |||
1841 | const int ImpUseIdx = ImpDefIdx + 1; | |||
1842 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
1843 | MI.eraseFromParent(); | |||
1844 | break; | |||
1845 | } | |||
1846 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: | |||
1847 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: | |||
1848 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: | |||
1849 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: | |||
1850 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: | |||
1851 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: | |||
1852 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: | |||
1853 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { | |||
1854 |     assert(ST.useVGPRIndexMode()); | |||
1855 | Register VecReg = MI.getOperand(0).getReg(); | |||
1856 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1857 | Register Idx = MI.getOperand(3).getReg(); | |||
1858 | Register SubReg = MI.getOperand(4).getImm(); | |||
1859 | ||||
1860 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
1861 | .addReg(Idx) | |||
1862 | .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); | |||
1863 | SetOn->getOperand(3).setIsUndef(); | |||
1864 | ||||
1865 | const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect); | |||
1866 | MachineInstrBuilder MIB = | |||
1867 | BuildMI(MBB, MI, DL, OpDesc) | |||
1868 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1869 | .add(MI.getOperand(2)) | |||
1870 | .addReg(VecReg, RegState::ImplicitDefine) | |||
1871 | .addReg(VecReg, | |||
1872 | RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
1873 | ||||
1874 | const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); | |||
1875 | const int ImpUseIdx = ImpDefIdx + 1; | |||
1876 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
1877 | ||||
1878 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
1879 | ||||
1880 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
1881 | ||||
1882 | MI.eraseFromParent(); | |||
1883 | break; | |||
1884 | } | |||
1885 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: | |||
1886 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: | |||
1887 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: | |||
1888 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: | |||
1889 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: | |||
1890 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: | |||
1891 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: | |||
1892 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { | |||
1893 |     assert(ST.useVGPRIndexMode()); | |||
1894 | Register Dst = MI.getOperand(0).getReg(); | |||
1895 | Register VecReg = MI.getOperand(1).getReg(); | |||
1896 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1897 | Register Idx = MI.getOperand(2).getReg(); | |||
1898 | Register SubReg = MI.getOperand(3).getImm(); | |||
1899 | ||||
1900 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
1901 | .addReg(Idx) | |||
1902 | .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); | |||
1903 | SetOn->getOperand(3).setIsUndef(); | |||
1904 | ||||
1905 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32)) | |||
1906 | .addDef(Dst) | |||
1907 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1908 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)) | |||
1909 | .addReg(AMDGPU::M0, RegState::Implicit); | |||
1910 | ||||
1911 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
1912 | ||||
1913 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
1914 | ||||
1915 | MI.eraseFromParent(); | |||
1916 | break; | |||
1917 | } | |||
1918 | case AMDGPU::SI_PC_ADD_REL_OFFSET: { | |||
1919 | MachineFunction &MF = *MBB.getParent(); | |||
1920 | Register Reg = MI.getOperand(0).getReg(); | |||
1921 | Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); | |||
1922 | Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); | |||
1923 | ||||
1924 | // Create a bundle so these instructions won't be re-ordered by the | |||
1925 | // post-RA scheduler. | |||
1926 | MIBundleBuilder Bundler(MBB, MI); | |||
1927 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); | |||
1928 | ||||
1929 | // Add 32-bit offset from this instruction to the start of the | |||
1930 | // constant data. | |||
1931 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) | |||
1932 | .addReg(RegLo) | |||
1933 | .add(MI.getOperand(1))); | |||
1934 | ||||
1935 | MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) | |||
1936 | .addReg(RegHi); | |||
1937 | MIB.add(MI.getOperand(2)); | |||
1938 | ||||
1939 | Bundler.append(MIB); | |||
1940 | finalizeBundle(MBB, Bundler.begin()); | |||
1941 | ||||
1942 | MI.eraseFromParent(); | |||
1943 | break; | |||
1944 | } | |||
1945 | case AMDGPU::ENTER_STRICT_WWM: { | |||
1946 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
1947 | // Whole Wave Mode is entered. | |||
1948 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1949 | : AMDGPU::S_OR_SAVEEXEC_B64)); | |||
1950 | break; | |||
1951 | } | |||
1952 | case AMDGPU::ENTER_STRICT_WQM: { | |||
1953 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
1954 | // STRICT_WQM is entered. | |||
1955 | const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1956 | const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; | |||
1957 | const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
1958 | BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); | |||
1959 | BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); | |||
1960 | ||||
1961 | MI.eraseFromParent(); | |||
1962 | break; | |||
1963 | } | |||
1964 | case AMDGPU::EXIT_STRICT_WWM: | |||
1965 | case AMDGPU::EXIT_STRICT_WQM: { | |||
1966 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
1967 |     // WWM/STRICT_WQM is exited. | |||
1968 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); | |||
1969 | break; | |||
1970 | } | |||
1971 | } | |||
1972 | return true; | |||
1973 | } | |||
1974 | ||||
1975 | std::pair<MachineInstr*, MachineInstr*> | |||
1976 | SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { | |||
1977 |   assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); | |||
1978 | ||||
1979 | MachineBasicBlock &MBB = *MI.getParent(); | |||
1980 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1981 | MachineFunction *MF = MBB.getParent(); | |||
1982 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1983 | Register Dst = MI.getOperand(0).getReg(); | |||
1984 | unsigned Part = 0; | |||
1985 | MachineInstr *Split[2]; | |||
1986 | ||||
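  // Note: the range-based loop below iterates exactly twice (sub0, then
  // sub1), so Split[0] and Split[1] are both assigned before the
  // REG_SEQUENCE at the bottom of the function reads them.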
1987 | for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { | |||
1988 | auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); | |||
1989 | if (Dst.isPhysical()) { | |||
1990 | MovDPP.addDef(RI.getSubReg(Dst, Sub)); | |||
1991 | } else { | |||
1992 |       assert(MRI.isSSA()); | |||
1993 | auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
1994 | MovDPP.addDef(Tmp); | |||
1995 | } | |||
1996 | ||||
1997 | for (unsigned I = 1; I <= 2; ++I) { // old and src operands. | |||
1998 | const MachineOperand &SrcOp = MI.getOperand(I); | |||
1999 |       assert(!SrcOp.isFPImm()); | |||
2000 | if (SrcOp.isImm()) { | |||
2001 | APInt Imm(64, SrcOp.getImm()); | |||
2002 | Imm.ashrInPlace(Part * 32); | |||
2003 | MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); | |||
2004 | } else { | |||
2005 |         assert(SrcOp.isReg()); | |||
2006 | Register Src = SrcOp.getReg(); | |||
2007 | if (Src.isPhysical()) | |||
2008 | MovDPP.addReg(RI.getSubReg(Src, Sub)); | |||
2009 | else | |||
2010 | MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); | |||
2011 | } | |||
2012 | } | |||
2013 | ||||
2014 | for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) | |||
2015 | MovDPP.addImm(MI.getOperand(I).getImm()); | |||
2016 | ||||
2017 | Split[Part] = MovDPP; | |||
2018 | ++Part; | |||
2019 | } | |||
2020 | ||||
2021 | if (Dst.isVirtual()) | |||
2022 | BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) | |||
2023 | .addReg(Split[0]->getOperand(0).getReg()) | |||
| ||||
2024 | .addImm(AMDGPU::sub0) | |||
2025 | .addReg(Split[1]->getOperand(0).getReg()) | |||
2026 | .addImm(AMDGPU::sub1); | |||
2027 | ||||
2028 | MI.eraseFromParent(); | |||
2029 | return std::make_pair(Split[0], Split[1]); | |||
2030 | } | |||
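// A minimal standalone sketch of the per-half immediate split used in
// expandMovDPP64() above (and in the V_MOV_B64_PSEUDO expansion): part 0
// takes bits [31:0] and part 1 takes bits [63:32] of the 64-bit source.
// splitImm64 is an illustrative helper, not LLVM API.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  uint32_t Lo = static_cast<uint32_t>(Imm);        // value for the sub0 half
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32);  // value for the sub1 half
  return {Lo, Hi};
}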
2031 | ||||
2032 | bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, | |||
2033 | MachineOperand &Src0, | |||
2034 | unsigned Src0OpName, | |||
2035 | MachineOperand &Src1, | |||
2036 | unsigned Src1OpName) const { | |||
2037 | MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); | |||
2038 | if (!Src0Mods) | |||
2039 | return false; | |||
2040 | ||||
2041 | MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); | |||
2042 |   assert(Src1Mods && | |||
2043 |          "All commutable instructions have both src0 and src1 modifiers"); | |||
2044 | ||||
2045 | int Src0ModsVal = Src0Mods->getImm(); | |||
2046 | int Src1ModsVal = Src1Mods->getImm(); | |||
2047 | ||||
2048 | Src1Mods->setImm(Src0ModsVal); | |||
2049 | Src0Mods->setImm(Src1ModsVal); | |||
2050 | return true; | |||
2051 | } | |||
2052 | ||||
2053 | static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, | |||
2054 | MachineOperand &RegOp, | |||
2055 | MachineOperand &NonRegOp) { | |||
2056 | Register Reg = RegOp.getReg(); | |||
2057 | unsigned SubReg = RegOp.getSubReg(); | |||
2058 | bool IsKill = RegOp.isKill(); | |||
2059 | bool IsDead = RegOp.isDead(); | |||
2060 | bool IsUndef = RegOp.isUndef(); | |||
2061 | bool IsDebug = RegOp.isDebug(); | |||
2062 | ||||
2063 | if (NonRegOp.isImm()) | |||
2064 | RegOp.ChangeToImmediate(NonRegOp.getImm()); | |||
2065 | else if (NonRegOp.isFI()) | |||
2066 | RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); | |||
2067 | else if (NonRegOp.isGlobal()) { | |||
2068 | RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), | |||
2069 | NonRegOp.getTargetFlags()); | |||
2070 | } else | |||
2071 | return nullptr; | |||
2072 | ||||
2073 | // Make sure we don't reinterpret a subreg index in the target flags. | |||
2074 | RegOp.setTargetFlags(NonRegOp.getTargetFlags()); | |||
2075 | ||||
2076 | NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); | |||
2077 | NonRegOp.setSubReg(SubReg); | |||
2078 | ||||
2079 | return &MI; | |||
2080 | } | |||
2081 | ||||
2082 | MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, | |||
2083 | unsigned Src0Idx, | |||
2084 | unsigned Src1Idx) const { | |||
2085 |   assert(!NewMI && "this should never be used"); | |||
2086 | ||||
2087 | unsigned Opc = MI.getOpcode(); | |||
2088 | int CommutedOpcode = commuteOpcode(Opc); | |||
2089 | if (CommutedOpcode == -1) | |||
2090 | return nullptr; | |||
2091 | ||||
2092 |   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == | |||
2093 |              static_cast<int>(Src0Idx) && | |||
2094 |          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == | |||
2095 |              static_cast<int>(Src1Idx) && | |||
2096 |          "inconsistency with findCommutedOpIndices"); | |||
2097 | ||||
2098 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
2099 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
2100 | ||||
2101 | MachineInstr *CommutedMI = nullptr; | |||
2102 | if (Src0.isReg() && Src1.isReg()) { | |||
2103 | if (isOperandLegal(MI, Src1Idx, &Src0)) { | |||
2104 | // Be sure to copy the source modifiers to the right place. | |||
2105 | CommutedMI | |||
2106 | = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); | |||
2107 | } | |||
2108 | ||||
2109 | } else if (Src0.isReg() && !Src1.isReg()) { | |||
2110 | // src0 should always be able to support any operand type, so no need to | |||
2111 | // check operand legality. | |||
2112 | CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); | |||
2113 | } else if (!Src0.isReg() && Src1.isReg()) { | |||
2114 | if (isOperandLegal(MI, Src1Idx, &Src0)) | |||
2115 | CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); | |||
2116 | } else { | |||
2117 |     // FIXME: Found two non-register operands to commute. This does happen. | |||
2118 | return nullptr; | |||
2119 | } | |||
2120 | ||||
2121 | if (CommutedMI) { | |||
2122 | swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, | |||
2123 | Src1, AMDGPU::OpName::src1_modifiers); | |||
2124 | ||||
2125 | CommutedMI->setDesc(get(CommutedOpcode)); | |||
2126 | } | |||
2127 | ||||
2128 | return CommutedMI; | |||
2129 | } | |||
2130 | ||||
2131 | // This needs to be implemented because the source modifiers may be inserted | |||
2132 | // between the true commutable operands, and the base | |||
2133 | // TargetInstrInfo::commuteInstruction uses it. | |||
2134 | bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, | |||
2135 | unsigned &SrcOpIdx0, | |||
2136 | unsigned &SrcOpIdx1) const { | |||
2137 | return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); | |||
2138 | } | |||
2139 | ||||
2140 | bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, | |||
2141 | unsigned &SrcOpIdx1) const { | |||
2142 | if (!Desc.isCommutable()) | |||
2143 | return false; | |||
2144 | ||||
2145 | unsigned Opc = Desc.getOpcode(); | |||
2146 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
2147 | if (Src0Idx == -1) | |||
2148 | return false; | |||
2149 | ||||
2150 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
2151 | if (Src1Idx == -1) | |||
2152 | return false; | |||
2153 | ||||
2154 | return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); | |||
2155 | } | |||
2156 | ||||
2157 | bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, | |||
2158 | int64_t BrOffset) const { | |||
2159 | // BranchRelaxation should never have to check s_setpc_b64 because its dest | |||
2160 | // block is unanalyzable. | |||
2161 | assert(BranchOp != AMDGPU::S_SETPC_B64); | |||
2162 | ||||
2163 | // Convert to dwords. | |||
2164 | BrOffset /= 4; | |||
2165 | ||||
2166 | // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is | |||
2167 | // from the next instruction. | |||
2168 | BrOffset -= 1; | |||
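| // For example, a forward byte offset of 0x20000 becomes 0x8000 dwords and | |||
| // then 0x7fff after this adjustment, which still fits in a signed SIMM16. | |||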
2169 | ||||
2170 | return isIntN(BranchOffsetBits, BrOffset); | |||
2171 | } | |||
2172 | ||||
2173 | MachineBasicBlock *SIInstrInfo::getBranchDestBlock( | |||
2174 | const MachineInstr &MI) const { | |||
2175 | if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { | |||
2176 | // This would be a difficult analysis to perform, but such a branch is always | |||
2177 | // legal so there's no need to analyze it. | |||
2178 | return nullptr; | |||
2179 | } | |||
2180 | ||||
2181 | return MI.getOperand(0).getMBB(); | |||
2182 | } | |||
2183 | ||||
2184 | unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, | |||
2185 | MachineBasicBlock &DestBB, | |||
2186 | const DebugLoc &DL, | |||
2187 | int64_t BrOffset, | |||
2188 | RegScavenger *RS) const { | |||
2189 | assert(RS && "RegScavenger required for long branching"); | |||
2190 | assert(MBB.empty() && | |||
2191 | "new block should be inserted for expanding unconditional branch"); | |||
2192 | assert(MBB.pred_size() == 1); | |||
2193 | ||||
2194 | MachineFunction *MF = MBB.getParent(); | |||
2195 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2196 | ||||
2197 | // FIXME: Virtual register workaround for RegScavenger not working with empty | |||
2198 | // blocks. | |||
2199 | Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
2200 | ||||
2201 | auto I = MBB.end(); | |||
2202 | ||||
2203 | // We need to compute the offset relative to the instruction immediately after | |||
2204 | // s_getpc_b64. Insert pc arithmetic code before last terminator. | |||
2205 | MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); | |||
2206 | ||||
2207 | // TODO: Handle > 32-bit block address. | |||
2208 | if (BrOffset >= 0) { | |||
2209 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) | |||
2210 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) | |||
2211 | .addReg(PCReg, 0, AMDGPU::sub0) | |||
2212 | .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD); | |||
2213 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) | |||
2214 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) | |||
2215 | .addReg(PCReg, 0, AMDGPU::sub1) | |||
2216 | .addImm(0); | |||
2217 | } else { | |||
2218 | // Backwards branch. | |||
2219 | BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) | |||
2220 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) | |||
2221 | .addReg(PCReg, 0, AMDGPU::sub0) | |||
2222 | .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD); | |||
2223 | BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) | |||
2224 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) | |||
2225 | .addReg(PCReg, 0, AMDGPU::sub1) | |||
2226 | .addImm(0); | |||
2227 | } | |||
2228 | ||||
2229 | // Insert the indirect branch after the other terminator. | |||
2230 | BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) | |||
2231 | .addReg(PCReg); | |||
2232 | ||||
2233 | // FIXME: If spilling is necessary, this will fail because this scavenger has | |||
2234 | // no emergency stack slots. It is non-trivial to spill in this situation, | |||
2235 | // because the restore code needs to be specially placed after the | |||
2236 | // jump. BranchRelaxation then needs to be made aware of the newly inserted | |||
2237 | // block. | |||
2238 | // | |||
2239 | // If a spill is needed for the pc register pair, we need to insert a spill | |||
2240 | // restore block right before the destination block, and insert a short branch | |||
2241 | // into the old destination block's fallthrough predecessor. | |||
2242 | // e.g.: | |||
2243 | // | |||
2244 | // s_cbranch_scc0 skip_long_branch: | |||
2245 | // | |||
2246 | // long_branch_bb: | |||
2247 | // spill s[8:9] | |||
2248 | // s_getpc_b64 s[8:9] | |||
2249 | // s_add_u32 s8, s8, restore_bb | |||
2250 | // s_addc_u32 s9, s9, 0 | |||
2251 | // s_setpc_b64 s[8:9] | |||
2252 | // | |||
2253 | // skip_long_branch: | |||
2254 | // foo; | |||
2255 | // | |||
2256 | // ..... | |||
2257 | // | |||
2258 | // dest_bb_fallthrough_predecessor: | |||
2259 | // bar; | |||
2260 | // s_branch dest_bb | |||
2261 | // | |||
2262 | // restore_bb: | |||
2263 | // restore s[8:9] | |||
2264 | // fallthrough dest_bb | |||
2265 | // | |||
2266 | // dest_bb: | |||
2267 | // buzz; | |||
2268 | ||||
2269 | RS->enterBasicBlockEnd(MBB); | |||
2270 | Register Scav = RS->scavengeRegisterBackwards( | |||
2271 | AMDGPU::SReg_64RegClass, | |||
2272 | MachineBasicBlock::iterator(GetPC), false, 0); | |||
2273 | MRI.replaceRegWith(PCReg, Scav); | |||
2274 | MRI.clearVirtRegs(); | |||
2275 | RS->setRegUsed(Scav); | |||
2276 | ||||
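| // Worst-case size in bytes: s_getpc_b64 (4) + s_add_u32/s_sub_u32 with a | |||
| // 32-bit literal (8) + s_addc_u32/s_subb_u32 (4) + s_setpc_b64 (4). | |||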
2277 | return 4 + 8 + 4 + 4; | |||
2278 | } | |||
2279 | ||||
2280 | unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { | |||
2281 | switch (Cond) { | |||
2282 | case SIInstrInfo::SCC_TRUE: | |||
2283 | return AMDGPU::S_CBRANCH_SCC1; | |||
2284 | case SIInstrInfo::SCC_FALSE: | |||
2285 | return AMDGPU::S_CBRANCH_SCC0; | |||
2286 | case SIInstrInfo::VCCNZ: | |||
2287 | return AMDGPU::S_CBRANCH_VCCNZ; | |||
2288 | case SIInstrInfo::VCCZ: | |||
2289 | return AMDGPU::S_CBRANCH_VCCZ; | |||
2290 | case SIInstrInfo::EXECNZ: | |||
2291 | return AMDGPU::S_CBRANCH_EXECNZ; | |||
2292 | case SIInstrInfo::EXECZ: | |||
2293 | return AMDGPU::S_CBRANCH_EXECZ; | |||
2294 | default: | |||
2295 | llvm_unreachable("invalid branch predicate")::llvm::llvm_unreachable_internal("invalid branch predicate", "/build/llvm-toolchain-snapshot-13~++20210308111132+66e3a4abe99c/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2295); | |||
2296 | } | |||
2297 | } | |||
2298 | ||||
2299 | SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { | |||
2300 | switch (Opcode) { | |||
2301 | case AMDGPU::S_CBRANCH_SCC0: | |||
2302 | return SCC_FALSE; | |||
2303 | case AMDGPU::S_CBRANCH_SCC1: | |||
2304 | return SCC_TRUE; | |||
2305 | case AMDGPU::S_CBRANCH_VCCNZ: | |||
2306 | return VCCNZ; | |||
2307 | case AMDGPU::S_CBRANCH_VCCZ: | |||
2308 | return VCCZ; | |||
2309 | case AMDGPU::S_CBRANCH_EXECNZ: | |||
2310 | return EXECNZ; | |||
2311 | case AMDGPU::S_CBRANCH_EXECZ: | |||
2312 | return EXECZ; | |||
2313 | default: | |||
2314 | return INVALID_BR; | |||
2315 | } | |||
2316 | } | |||
2317 | ||||
2318 | bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, | |||
2319 | MachineBasicBlock::iterator I, | |||
2320 | MachineBasicBlock *&TBB, | |||
2321 | MachineBasicBlock *&FBB, | |||
2322 | SmallVectorImpl<MachineOperand> &Cond, | |||
2323 | bool AllowModify) const { | |||
2324 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2325 | // Unconditional Branch | |||
2326 | TBB = I->getOperand(0).getMBB(); | |||
2327 | return false; | |||
2328 | } | |||
2329 | ||||
2330 | MachineBasicBlock *CondBB = nullptr; | |||
2331 | ||||
2332 | if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
2333 | CondBB = I->getOperand(1).getMBB(); | |||
2334 | Cond.push_back(I->getOperand(0)); | |||
2335 | } else { | |||
2336 | BranchPredicate Pred = getBranchPredicate(I->getOpcode()); | |||
2337 | if (Pred == INVALID_BR) | |||
2338 | return true; | |||
2339 | ||||
2340 | CondBB = I->getOperand(0).getMBB(); | |||
2341 | Cond.push_back(MachineOperand::CreateImm(Pred)); | |||
2342 | Cond.push_back(I->getOperand(1)); // Save the branch register. | |||
2343 | } | |||
2344 | ++I; | |||
2345 | ||||
2346 | if (I == MBB.end()) { | |||
2347 | // Conditional branch followed by fall-through. | |||
2348 | TBB = CondBB; | |||
2349 | return false; | |||
2350 | } | |||
2351 | ||||
2352 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2353 | TBB = CondBB; | |||
2354 | FBB = I->getOperand(0).getMBB(); | |||
2355 | return false; | |||
2356 | } | |||
2357 | ||||
2358 | return true; | |||
2359 | } | |||
2360 | ||||
2361 | bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, | |||
2362 | MachineBasicBlock *&FBB, | |||
2363 | SmallVectorImpl<MachineOperand> &Cond, | |||
2364 | bool AllowModify) const { | |||
2365 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); | |||
2366 | auto E = MBB.end(); | |||
2367 | if (I == E) | |||
2368 | return false; | |||
2369 | ||||
2370 | // Skip over the instructions that are artificial terminators for special | |||
2371 | // exec management. | |||
2372 | while (I != E && !I->isBranch() && !I->isReturn() && | |||
2373 | I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { | |||
2374 | switch (I->getOpcode()) { | |||
2375 | case AMDGPU::SI_MASK_BRANCH: | |||
2376 | case AMDGPU::S_MOV_B64_term: | |||
2377 | case AMDGPU::S_XOR_B64_term: | |||
2378 | case AMDGPU::S_OR_B64_term: | |||
2379 | case AMDGPU::S_ANDN2_B64_term: | |||
2380 | case AMDGPU::S_AND_B64_term: | |||
2381 | case AMDGPU::S_MOV_B32_term: | |||
2382 | case AMDGPU::S_XOR_B32_term: | |||
2383 | case AMDGPU::S_OR_B32_term: | |||
2384 | case AMDGPU::S_ANDN2_B32_term: | |||
2385 | case AMDGPU::S_AND_B32_term: | |||
2386 | break; | |||
2387 | case AMDGPU::SI_IF: | |||
2388 | case AMDGPU::SI_ELSE: | |||
2389 | case AMDGPU::SI_KILL_I1_TERMINATOR: | |||
2390 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: | |||
2391 | // FIXME: It's messy that these need to be considered here at all. | |||
2392 | return true; | |||
2393 | default: | |||
2394 | llvm_unreachable("unexpected non-branch terminator inst")::llvm::llvm_unreachable_internal("unexpected non-branch terminator inst" , "/build/llvm-toolchain-snapshot-13~++20210308111132+66e3a4abe99c/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2394); | |||
2395 | } | |||
2396 | ||||
2397 | ++I; | |||
2398 | } | |||
2399 | ||||
2400 | if (I == E) | |||
2401 | return false; | |||
2402 | ||||
2403 | if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) | |||
2404 | return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); | |||
2405 | ||||
2406 | ++I; | |||
2407 | ||||
2408 | // TODO: Should be able to treat as fallthrough? | |||
2409 | if (I == MBB.end()) | |||
2410 | return true; | |||
2411 | ||||
2412 | if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) | |||
2413 | return true; | |||
2414 | ||||
2415 | MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); | |||
2416 | ||||
2417 | // Specifically handle the case where the conditional branch is to the same | |||
2418 | // destination as the mask branch. e.g. | |||
2419 | // | |||
2420 | // si_mask_branch BB8 | |||
2421 | // s_cbranch_execz BB8 | |||
2422 | // s_cbranch BB9 | |||
2423 | // | |||
2424 | // This is required to understand divergent loops which may need the branches | |||
2425 | // to be relaxed. | |||
2426 | if (TBB != MaskBrDest || Cond.empty()) | |||
2427 | return true; | |||
2428 | ||||
2429 | auto Pred = Cond[0].getImm(); | |||
2430 | return (Pred != EXECZ && Pred != EXECNZ); | |||
2431 | } | |||
2432 | ||||
2433 | unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, | |||
2434 | int *BytesRemoved) const { | |||
2435 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); | |||
2436 | ||||
2437 | unsigned Count = 0; | |||
2438 | unsigned RemovedSize = 0; | |||
2439 | while (I != MBB.end()) { | |||
2440 | MachineBasicBlock::iterator Next = std::next(I); | |||
2441 | if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { | |||
2442 | I = Next; | |||
2443 | continue; | |||
2444 | } | |||
2445 | ||||
2446 | RemovedSize += getInstSizeInBytes(*I); | |||
2447 | I->eraseFromParent(); | |||
2448 | ++Count; | |||
2449 | I = Next; | |||
2450 | } | |||
2451 | ||||
2452 | if (BytesRemoved) | |||
2453 | *BytesRemoved = RemovedSize; | |||
2454 | ||||
2455 | return Count; | |||
2456 | } | |||
2457 | ||||
2458 | // Copy the flags onto the implicit condition register operand. | |||
2459 | static void preserveCondRegFlags(MachineOperand &CondReg, | |||
2460 | const MachineOperand &OrigCond) { | |||
2461 | CondReg.setIsUndef(OrigCond.isUndef()); | |||
2462 | CondReg.setIsKill(OrigCond.isKill()); | |||
2463 | } | |||
2464 | ||||
2465 | unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, | |||
2466 | MachineBasicBlock *TBB, | |||
2467 | MachineBasicBlock *FBB, | |||
2468 | ArrayRef<MachineOperand> Cond, | |||
2469 | const DebugLoc &DL, | |||
2470 | int *BytesAdded) const { | |||
2471 | if (!FBB && Cond.empty()) { | |||
2472 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2473 | .addMBB(TBB); | |||
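| // On subtargets with the branch-offset-0x3f hardware bug, relaxation may | |||
| // need to pad the branch with an extra instruction, so assume the larger | |||
| // size in that case. | |||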
2474 | if (BytesAdded) | |||
2475 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2476 | return 1; | |||
2477 | } | |||
2478 | ||||
2479 | if (Cond.size() == 1 && Cond[0].isReg()) { | |||
2480 | BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) | |||
2481 | .add(Cond[0]) | |||
2482 | .addMBB(TBB); | |||
2483 | return 1; | |||
2484 | } | |||
2485 | ||||
2486 | assert(TBB && Cond[0].isImm()); | |||
2487 | ||||
2488 | unsigned Opcode | |||
2489 | = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); | |||
2490 | ||||
2491 | if (!FBB) { | |||
2492 | ||||
2493 | MachineInstr *CondBr = | |||
2494 | BuildMI(&MBB, DL, get(Opcode)) | |||
2495 | .addMBB(TBB); | |||
2496 | ||||
2497 | // Copy the flags onto the implicit condition register operand. | |||
2498 | preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); | |||
2499 | fixImplicitOperands(*CondBr); | |||
2500 | ||||
2501 | if (BytesAdded) | |||
2502 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2503 | return 1; | |||
2504 | } | |||
2505 | ||||
2506 | assert(TBB && FBB); | |||
2507 | ||||
2508 | MachineInstr *CondBr = | |||
2509 | BuildMI(&MBB, DL, get(Opcode)) | |||
2510 | .addMBB(TBB); | |||
2511 | fixImplicitOperands(*CondBr); | |||
2512 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2513 | .addMBB(FBB); | |||
2514 | ||||
2515 | MachineOperand &CondReg = CondBr->getOperand(1); | |||
2516 | CondReg.setIsUndef(Cond[1].isUndef()); | |||
2517 | CondReg.setIsKill(Cond[1].isKill()); | |||
2518 | ||||
2519 | if (BytesAdded) | |||
2520 | *BytesAdded = ST.hasOffset3fBug() ? 16 : 8; | |||
2521 | ||||
2522 | return 2; | |||
2523 | } | |||
2524 | ||||
2525 | bool SIInstrInfo::reverseBranchCondition( | |||
2526 | SmallVectorImpl<MachineOperand> &Cond) const { | |||
2527 | if (Cond.size() != 2) { | |||
2528 | return true; | |||
2529 | } | |||
2530 | ||||
2531 | if (Cond[0].isImm()) { | |||
2532 | Cond[0].setImm(-Cond[0].getImm()); | |||
2533 | return false; | |||
2534 | } | |||
2535 | ||||
2536 | return true; | |||
2537 | } | |||
2538 | ||||
2539 | bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, | |||
2540 | ArrayRef<MachineOperand> Cond, | |||
2541 | Register DstReg, Register TrueReg, | |||
2542 | Register FalseReg, int &CondCycles, | |||
2543 | int &TrueCycles, int &FalseCycles) const { | |||
2544 | switch (Cond[0].getImm()) { | |||
2545 | case VCCNZ: | |||
2546 | case VCCZ: { | |||
2547 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2548 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2549 | if (MRI.getRegClass(FalseReg) != RC) | |||
2550 | return false; | |||
2551 | ||||
2552 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2553 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2554 | ||||
2555 | // Limit to equal cost for branch vs. N v_cndmask_b32s. | |||
2556 | return RI.hasVGPRs(RC) && NumInsts <= 6; | |||
2557 | } | |||
2558 | case SCC_TRUE: | |||
2559 | case SCC_FALSE: { | |||
2560 | // FIXME: We could insert for VGPRs if we could replace the original compare | |||
2561 | // with a vector one. | |||
2562 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2563 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2564 | if (MRI.getRegClass(FalseReg) != RC) | |||
2565 | return false; | |||
2566 | ||||
2567 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2568 | ||||
2569 | // Sizes that are a multiple of 64 bits (even NumInsts) can use s_cselect_b64. | |||
2570 | if (NumInsts % 2 == 0) | |||
2571 | NumInsts /= 2; | |||
2572 | ||||
2573 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2574 | return RI.isSGPRClass(RC); | |||
2575 | } | |||
2576 | default: | |||
2577 | return false; | |||
2578 | } | |||
2579 | } | |||
2580 | ||||
2581 | void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, | |||
2582 | MachineBasicBlock::iterator I, const DebugLoc &DL, | |||
2583 | Register DstReg, ArrayRef<MachineOperand> Cond, | |||
2584 | Register TrueReg, Register FalseReg) const { | |||
2585 | BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); | |||
2586 | if (Pred == VCCZ || Pred == SCC_FALSE) { | |||
2587 | Pred = static_cast<BranchPredicate>(-Pred); | |||
2588 | std::swap(TrueReg, FalseReg); | |||
2589 | } | |||
2590 | ||||
2591 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2592 | const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); | |||
2593 | unsigned DstSize = RI.getRegSizeInBits(*DstRC); | |||
2594 | ||||
2595 | if (DstSize == 32) { | |||
2596 | MachineInstr *Select; | |||
2597 | if (Pred == SCC_TRUE) { | |||
2598 | Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) | |||
2599 | .addReg(TrueReg) | |||
2600 | .addReg(FalseReg); | |||
2601 | } else { | |||
2602 | // Instruction's operands are backwards from what is expected. | |||
2603 | Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) | |||
2604 | .addReg(FalseReg) | |||
2605 | .addReg(TrueReg); | |||
2606 | } | |||
2607 | ||||
2608 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2609 | return; | |||
2610 | } | |||
2611 | ||||
2612 | if (DstSize == 64 && Pred == SCC_TRUE) { | |||
2613 | MachineInstr *Select = | |||
2614 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) | |||
2615 | .addReg(TrueReg) | |||
2616 | .addReg(FalseReg); | |||
2617 | ||||
2618 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2619 | return; | |||
2620 | } | |||
2621 | ||||
2622 | static const int16_t Sub0_15[] = { | |||
2623 | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, | |||
2624 | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, | |||
2625 | AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, | |||
2626 | AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, | |||
2627 | }; | |||
2628 | ||||
2629 | static const int16_t Sub0_15_64[] = { | |||
2630 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, | |||
2631 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, | |||
2632 | AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, | |||
2633 | AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, | |||
2634 | }; | |||
2635 | ||||
2636 | unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; | |||
2637 | const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; | |||
2638 | const int16_t *SubIndices = Sub0_15; | |||
2639 | int NElts = DstSize / 32; | |||
2640 | ||||
2641 | // 64-bit select is only available for SALU. | |||
2642 | // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. | |||
2643 | if (Pred == SCC_TRUE) { | |||
2644 | if (NElts % 2) { | |||
2645 | SelOp = AMDGPU::S_CSELECT_B32; | |||
2646 | EltRC = &AMDGPU::SGPR_32RegClass; | |||
2647 | } else { | |||
2648 | SelOp = AMDGPU::S_CSELECT_B64; | |||
2649 | EltRC = &AMDGPU::SGPR_64RegClass; | |||
2650 | SubIndices = Sub0_15_64; | |||
2651 | NElts /= 2; | |||
2652 | } | |||
2653 | } | |||
2654 | ||||
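| // Emit one select per element and reassemble the result with a REG_SEQUENCE. | |||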
2655 | MachineInstrBuilder MIB = BuildMI( | |||
2656 | MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); | |||
2657 | ||||
2658 | I = MIB->getIterator(); | |||
2659 | ||||
2660 | SmallVector<Register, 8> Regs; | |||
2661 | for (int Idx = 0; Idx != NElts; ++Idx) { | |||
2662 | Register DstElt = MRI.createVirtualRegister(EltRC); | |||
2663 | Regs.push_back(DstElt); | |||
2664 | ||||
2665 | unsigned SubIdx = SubIndices[Idx]; | |||
2666 | ||||
2667 | MachineInstr *Select; | |||
2668 | if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { | |||
2669 | Select = | |||
2670 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2671 | .addReg(FalseReg, 0, SubIdx) | |||
2672 | .addReg(TrueReg, 0, SubIdx); | |||
2673 | } else { | |||
2674 | Select = | |||
2675 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2676 | .addReg(TrueReg, 0, SubIdx) | |||
2677 | .addReg(FalseReg, 0, SubIdx); | |||
2678 | } | |||
2679 | ||||
2680 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2681 | fixImplicitOperands(*Select); | |||
2682 | ||||
2683 | MIB.addReg(DstElt) | |||
2684 | .addImm(SubIdx); | |||
2685 | } | |||
2686 | } | |||
2687 | ||||
2688 | bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { | |||
2689 | switch (MI.getOpcode()) { | |||
2690 | case AMDGPU::V_MOV_B32_e32: | |||
2691 | case AMDGPU::V_MOV_B32_e64: | |||
2692 | case AMDGPU::V_MOV_B64_PSEUDO: { | |||
2693 | // If there are additional implicit register operands, this may be used for | |||
2694 | // register indexing so the source register operand isn't simply copied. | |||
2695 | unsigned NumOps = MI.getDesc().getNumOperands() + | |||
2696 | MI.getDesc().getNumImplicitUses(); | |||
2697 | ||||
2698 | return MI.getNumOperands() == NumOps; | |||
2699 | } | |||
2700 | case AMDGPU::S_MOV_B32: | |||
2701 | case AMDGPU::S_MOV_B64: | |||
2702 | case AMDGPU::COPY: | |||
2703 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
2704 | case AMDGPU::V_ACCVGPR_READ_B32_e64: | |||
2705 | case AMDGPU::V_ACCVGPR_MOV_B32: | |||
2706 | return true; | |||
2707 | default: | |||
2708 | return false; | |||
2709 | } | |||
2710 | } | |||
2711 | ||||
2712 | unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( | |||
2713 | unsigned Kind) const { | |||
2714 | switch (Kind) { | |||
2715 | case PseudoSourceValue::Stack: | |||
2716 | case PseudoSourceValue::FixedStack: | |||
2717 | return AMDGPUAS::PRIVATE_ADDRESS; | |||
2718 | case PseudoSourceValue::ConstantPool: | |||
2719 | case PseudoSourceValue::GOT: | |||
2720 | case PseudoSourceValue::JumpTable: | |||
2721 | case PseudoSourceValue::GlobalValueCallEntry: | |||
2722 | case PseudoSourceValue::ExternalSymbolCallEntry: | |||
2723 | case PseudoSourceValue::TargetCustom: | |||
2724 | return AMDGPUAS::CONSTANT_ADDRESS; | |||
2725 | } | |||
2726 | return AMDGPUAS::FLAT_ADDRESS; | |||
2727 | } | |||
2728 | ||||
2729 | static void removeModOperands(MachineInstr &MI) { | |||
2730 | unsigned Opc = MI.getOpcode(); | |||
2731 | int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2732 | AMDGPU::OpName::src0_modifiers); | |||
2733 | int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2734 | AMDGPU::OpName::src1_modifiers); | |||
2735 | int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2736 | AMDGPU::OpName::src2_modifiers); | |||
2737 | ||||
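| // Remove the highest-indexed operand first so the other indices stay valid. | |||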
2738 | MI.RemoveOperand(Src2ModIdx); | |||
2739 | MI.RemoveOperand(Src1ModIdx); | |||
2740 | MI.RemoveOperand(Src0ModIdx); | |||
2741 | } | |||
2742 | ||||
2743 | bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, | |||
2744 | Register Reg, MachineRegisterInfo *MRI) const { | |||
2745 | if (!MRI->hasOneNonDBGUse(Reg)) | |||
2746 | return false; | |||
2747 | ||||
2748 | switch (DefMI.getOpcode()) { | |||
2749 | default: | |||
2750 | return false; | |||
2751 | case AMDGPU::S_MOV_B64: | |||
2752 | // TODO: We could fold 64-bit immediates, but this gets complicated | |||
2753 | // when there are sub-registers. | |||
2754 | return false; | |||
2755 | ||||
2756 | case AMDGPU::V_MOV_B32_e32: | |||
2757 | case AMDGPU::S_MOV_B32: | |||
2758 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
2759 | break; | |||
2760 | } | |||
2761 | ||||
2762 | const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); | |||
2763 | assert(ImmOp); | |||
2764 | // FIXME: We could handle FrameIndex values here. | |||
2765 | if (!ImmOp->isImm()) | |||
2766 | return false; | |||
2767 | ||||
2768 | unsigned Opc = UseMI.getOpcode(); | |||
2769 | if (Opc == AMDGPU::COPY) { | |||
2770 | Register DstReg = UseMI.getOperand(0).getReg(); | |||
2771 | bool Is16Bit = getOpSize(UseMI, 0) == 2; | |||
2772 | bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); | |||
2773 | unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; | |||
2774 | APInt Imm(32, ImmOp->getImm()); | |||
2775 | ||||
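| // A copy from the hi16 subregister uses the high half of the immediate. | |||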
2776 | if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) | |||
2777 | Imm = Imm.ashr(16); | |||
2778 | ||||
2779 | if (RI.isAGPR(*MRI, DstReg)) { | |||
2780 | if (!isInlineConstant(Imm)) | |||
2781 | return false; | |||
2782 | NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; | |||
2783 | } | |||
2784 | ||||
2785 | if (Is16Bit) { | |||
2786 | if (isVGPRCopy) | |||
2787 | return false; // Do not clobber vgpr_hi16 | |||
2788 | ||||
2789 | if (DstReg.isVirtual() && | |||
2790 | UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) | |||
2791 | return false; | |||
2792 | ||||
2793 | UseMI.getOperand(0).setSubReg(0); | |||
2794 | if (DstReg.isPhysical()) { | |||
2795 | DstReg = RI.get32BitRegister(DstReg); | |||
2796 | UseMI.getOperand(0).setReg(DstReg); | |||
2797 | } | |||
2798 | assert(UseMI.getOperand(1).getReg().isVirtual()); | |||
2799 | } | |||
2800 | ||||
2801 | UseMI.setDesc(get(NewOpc)); | |||
2802 | UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); | |||
2803 | UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); | |||
2804 | return true; | |||
2805 | } | |||
2806 | ||||
2807 | if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
2808 | Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
2809 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2810 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) { | |||
2811 | // Don't fold if we are using source or output modifiers. The new VOP2 | |||
2812 | // instructions don't have them. | |||
2813 | if (hasAnyModifiersSet(UseMI)) | |||
2814 | return false; | |||
2815 | ||||
2816 | // If this is a free constant, there's no reason to do this. | |||
2817 | // TODO: We could fold this here instead of letting SIFoldOperands do it | |||
2818 | // later. | |||
2819 | MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); | |||
2820 | ||||
2821 | // Any src operand can be used for the legality check. | |||
2822 | if (isInlineConstant(UseMI, *Src0, *ImmOp)) | |||
2823 | return false; | |||
2824 | ||||
2825 | bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
2826 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; | |||
2827 | bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2828 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64; | |||
2829 | MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); | |||
2830 | MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); | |||
2831 | ||||
2832 | // Multiplied part is the constant: Use v_madmk_{f16, f32}. | |||
2833 | // We should only expect these to be on src0 due to canonicalizations. | |||
2834 | if (Src0->isReg() && Src0->getReg() == Reg) { | |||
2835 | if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) | |||
2836 | return false; | |||
2837 | ||||
2838 | if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) | |||
2839 | return false; | |||
2840 | ||||
2841 | unsigned NewOpc = | |||
2842 | IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) | |||
2843 | : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); | |||
2844 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
2845 | return false; | |||
2846 | ||||
2847 | // We need to swap operands 0 and 1 since madmk constant is at operand 1. | |||
2848 | ||||
2849 | const int64_t Imm = ImmOp->getImm(); | |||
2850 | ||||
2851 | // FIXME: This would be a lot easier if we could return a new instruction | |||
2852 | // instead of having to modify in place. | |||
2853 | ||||
2854 | // Remove these first since they are at the end. | |||
2855 | UseMI.RemoveOperand( | |||
2856 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); | |||
2857 | UseMI.RemoveOperand( | |||
2858 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); | |||
2859 | ||||
2860 | Register Src1Reg = Src1->getReg(); | |||
2861 | unsigned Src1SubReg = Src1->getSubReg(); | |||
2862 | Src0->setReg(Src1Reg); | |||
2863 | Src0->setSubReg(Src1SubReg); | |||
2864 | Src0->setIsKill(Src1->isKill()); | |||
2865 | ||||
2866 | if (Opc == AMDGPU::V_MAC_F32_e64 || | |||
2867 | Opc == AMDGPU::V_MAC_F16_e64 || | |||
2868 | Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2869 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
2870 | UseMI.untieRegOperand( | |||
2871 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
2872 | ||||
2873 | Src1->ChangeToImmediate(Imm); | |||
2874 | ||||
2875 | removeModOperands(UseMI); | |||
2876 | UseMI.setDesc(get(NewOpc)); | |||
2877 | ||||
2878 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); | |||
2879 | if (DeleteDef) | |||
2880 | DefMI.eraseFromParent(); | |||
2881 | ||||
2882 | return true; | |||
2883 | } | |||
2884 | ||||
2885 | // Added part is the constant: Use v_madak_{f16, f32}. | |||
2886 | if (Src2->isReg() && Src2->getReg() == Reg) { | |||
2887 | // Not allowed to use constant bus for another operand. | |||
2888 | // We can however allow an inline immediate as src0. | |||
2889 | bool Src0Inlined = false; | |||
2890 | if (Src0->isReg()) { | |||
2891 | // Try to inline constant if possible. | |||
2892 | // If the def is a move of an immediate and it has a single use, | |||
2893 | // folding it here saves a VGPR. | |||
2894 | MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); | |||
2895 | if (Def && Def->isMoveImmediate() && | |||
2896 | isInlineConstant(Def->getOperand(1)) && | |||
2897 | MRI->hasOneUse(Src0->getReg())) { | |||
2898 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
2899 | Src0Inlined = true; | |||
2900 | } else if ((Src0->getReg().isPhysical() && | |||
2901 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
2902 | RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || | |||
2903 | (Src0->getReg().isVirtual() && | |||
2904 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
2905 | RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) | |||
2906 | return false; | |||
2907 | // VGPR is okay as Src0 - fallthrough | |||
2908 | } | |||
2909 | ||||
2910 | if (Src1->isReg() && !Src0Inlined) { | |||
2911 | // We have one slot for an inlinable constant so far - try to fill it | |||
2912 | MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); | |||
2913 | if (Def && Def->isMoveImmediate() && | |||
2914 | isInlineConstant(Def->getOperand(1)) && | |||
2915 | MRI->hasOneUse(Src1->getReg()) && | |||
2916 | commuteInstruction(UseMI)) { | |||
2917 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
2918 | } else if ((Src1->getReg().isPhysical() && | |||
2919 | RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || | |||
2920 | (Src1->getReg().isVirtual() && | |||
2921 | RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) | |||
2922 | return false; | |||
2923 | // VGPR is okay as Src1 - fallthrough | |||
2924 | } | |||
2925 | ||||
2926 | unsigned NewOpc = | |||
2927 | IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) | |||
2928 | : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); | |||
2929 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
2930 | return false; | |||
2931 | ||||
2932 | const int64_t Imm = ImmOp->getImm(); | |||
2933 | ||||
2934 | // FIXME: This would be a lot easier if we could return a new instruction | |||
2935 | // instead of having to modify in place. | |||
2936 | ||||
2937 | // Remove these first since they are at the end. | |||
2938 | UseMI.RemoveOperand( | |||
2939 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); | |||
2940 | UseMI.RemoveOperand( | |||
2941 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); | |||
2942 | ||||
2943 | if (Opc == AMDGPU::V_MAC_F32_e64 || | |||
2944 | Opc == AMDGPU::V_MAC_F16_e64 || | |||
2945 | Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2946 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
2947 | UseMI.untieRegOperand( | |||
2948 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
2949 | ||||
2950 | // ChangeToImmediate adds Src2 back to the instruction. | |||
2951 | Src2->ChangeToImmediate(Imm); | |||
2952 | ||||
2953 | // These come before src2. | |||
2954 | removeModOperands(UseMI); | |||
2955 | UseMI.setDesc(get(NewOpc)); | |||
2956 | // UseMI may have been commuted and we now have an SGPR as src1. If so, | |||
2957 | // the combination of an inlined constant and an SGPR is illegal, so | |||
2958 | // legalize the operands. | |||
2959 | legalizeOperands(UseMI); | |||
2960 | ||||
2961 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); | |||
2962 | if (DeleteDef) | |||
2963 | DefMI.eraseFromParent(); | |||
2964 | ||||
2965 | return true; | |||
2966 | } | |||
2967 | } | |||
2968 | ||||
2969 | return false; | |||
2970 | } | |||
2971 | ||||
2972 | static bool | |||
2973 | memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, | |||
2974 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
2975 | if (BaseOps1.size() != BaseOps2.size()) | |||
2976 | return false; | |||
2977 | for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { | |||
2978 | if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) | |||
2979 | return false; | |||
2980 | } | |||
2981 | return true; | |||
2982 | } | |||
2983 | ||||
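| // Two accesses [Offset, Offset + Width) are disjoint if the lower one ends | |||
| // at or before the higher one begins. | |||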
2984 | static bool offsetsDoNotOverlap(int WidthA, int OffsetA, | |||
2985 | int WidthB, int OffsetB) { | |||
2986 | int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; | |||
2987 | int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; | |||
2988 | int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; | |||
2989 | return LowOffset + LowWidth <= HighOffset; | |||
2990 | } | |||
2991 | ||||
2992 | bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, | |||
2993 | const MachineInstr &MIb) const { | |||
2994 | SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; | |||
2995 | int64_t Offset0, Offset1; | |||
2996 | unsigned Dummy0, Dummy1; | |||
2997 | bool Offset0IsScalable, Offset1IsScalable; | |||
2998 | if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, | |||
2999 | Dummy0, &RI) || | |||
3000 | !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, | |||
3001 | Dummy1, &RI)) | |||
3002 | return false; | |||
3003 | ||||
3004 | if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) | |||
3005 | return false; | |||
3006 | ||||
3007 | if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { | |||
3008 | // FIXME: Handle ds_read2 / ds_write2. | |||
3009 | return false; | |||
3010 | } | |||
3011 | unsigned Width0 = MIa.memoperands().front()->getSize(); | |||
3012 | unsigned Width1 = MIb.memoperands().front()->getSize(); | |||
3013 | return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); | |||
3014 | } | |||
3015 | ||||
3016 | bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, | |||
3017 | const MachineInstr &MIb) const { | |||
3018 | assert(MIa.mayLoadOrStore() && | |||
3019 | "MIa must load from or modify a memory location"); | |||
3020 | assert(MIb.mayLoadOrStore() && | |||
3021 | "MIb must load from or modify a memory location"); | |||
3022 | ||||
3023 | if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) | |||
3024 | return false; | |||
3025 | ||||
3026 | // XXX - Can we relax this between address spaces? | |||
3027 | if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) | |||
3028 | return false; | |||
3029 | ||||
3030 | // TODO: Should we check the address space from the MachineMemOperand? That | |||
3031 | // would allow us to distinguish objects we know don't alias based on the | |||
3032 | // underlying address space, even if it was lowered to a different one, | |||
3033 | // e.g. private accesses lowered to use MUBUF instructions on a scratch | |||
3034 | // buffer. | |||
3035 | if (isDS(MIa)) { | |||
3036 | if (isDS(MIb)) | |||
3037 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3038 | ||||
3039 | return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); | |||
3040 | } | |||
3041 | ||||
3042 | if (isMUBUF(MIa) || isMTBUF(MIa)) { | |||
3043 | if (isMUBUF(MIb) || isMTBUF(MIb)) | |||
3044 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3045 | ||||
3046 | return !isFLAT(MIb) && !isSMRD(MIb); | |||
3047 | } | |||
3048 | ||||
3049 | if (isSMRD(MIa)) { | |||
3050 | if (isSMRD(MIb)) | |||
3051 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3052 | ||||
3053 | return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); | |||
3054 | } | |||
3055 | ||||
3056 | if (isFLAT(MIa)) { | |||
3057 | if (isFLAT(MIb)) | |||
3058 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3059 | ||||
3060 | return false; | |||
3061 | } | |||
3062 | ||||
3063 | return false; | |||
3064 | } | |||
3065 | ||||
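| // If MO is a register whose unique def is a v_mov_b32 of an immediate, return | |||
| // that immediate; otherwise return 0 to signal that nothing can be folded. | |||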
3066 | static int64_t getFoldableImm(const MachineOperand* MO) { | |||
3067 | if (!MO->isReg()) | |||
3068 | return 0; | |||
3069 | const MachineFunction *MF = MO->getParent()->getParent()->getParent(); | |||
3070 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
3071 | auto Def = MRI.getUniqueVRegDef(MO->getReg()); | |||
3072 | if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && | |||
3073 | Def->getOperand(1).isImm()) | |||
3074 | return Def->getOperand(1).getImm(); | |||
3075 | return 0; | |||
3076 | } | |||
3077 | ||||
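| // Keep LiveVariables consistent: registers killed by MI are now killed by | |||
| // NewMI instead. | |||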
3078 | static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, | |||
3079 | MachineInstr &NewMI) { | |||
3080 | if (LV) { | |||
3081 | unsigned NumOps = MI.getNumOperands(); | |||
3082 | for (unsigned I = 1; I < NumOps; ++I) { | |||
3083 | MachineOperand &Op = MI.getOperand(I); | |||
3084 | if (Op.isReg() && Op.isKill()) | |||
3085 | LV->replaceKillInstruction(Op.getReg(), MI, NewMI); | |||
3086 | } | |||
3087 | } | |||
3088 | } | |||
3089 | ||||
3090 | MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, | |||
3091 | MachineInstr &MI, | |||
3092 | LiveVariables *LV) const { | |||
3093 | unsigned Opc = MI.getOpcode(); | |||
3094 | bool IsF16 = false; | |||
3095 | bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3096 | Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || | |||
3097 | Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; | |||
3098 | bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; | |||
3099 | ||||
3100 | switch (Opc) { | |||
3101 | default: | |||
3102 | return nullptr; | |||
3103 | case AMDGPU::V_MAC_F16_e64: | |||
3104 | case AMDGPU::V_FMAC_F16_e64: | |||
3105 | IsF16 = true; | |||
3106 | LLVM_FALLTHROUGH; | |||
3107 | case AMDGPU::V_MAC_F32_e64: | |||
3108 | case AMDGPU::V_FMAC_F32_e64: | |||
3109 | case AMDGPU::V_FMAC_F64_e64: | |||
3110 | break; | |||
3111 | case AMDGPU::V_MAC_F16_e32: | |||
3112 | case AMDGPU::V_FMAC_F16_e32: | |||
3113 | IsF16 = true; | |||
3114 | LLVM_FALLTHROUGH; | |||
3115 | case AMDGPU::V_MAC_F32_e32: | |||
3116 | case AMDGPU::V_FMAC_F32_e32: | |||
3117 | case AMDGPU::V_FMAC_F64_e32: { | |||
3118 | int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3119 | AMDGPU::OpName::src0); | |||
3120 | const MachineOperand *Src0 = &MI.getOperand(Src0Idx); | |||
3121 | if (!Src0->isReg() && !Src0->isImm()) | |||
3122 | return nullptr; | |||
3123 | ||||
3124 | if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) | |||
3125 | return nullptr; | |||
3126 | ||||
3127 | break; | |||
3128 | } | |||
3129 | } | |||
3130 | ||||
3131 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
3132 | const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); | |||
3133 | const MachineOperand *Src0Mods = | |||
3134 | getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); | |||
3135 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3136 | const MachineOperand *Src1Mods = | |||
3137 | getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); | |||
3138 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3139 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
3140 | const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3141 | MachineInstrBuilder MIB; | |||
3142 | ||||
3143 | if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && | |||
3144 | // If we have an SGPR input, we will violate the constant bus restriction. | |||
3145 | (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || | |||
3146 | !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { | |||
3147 | if (auto Imm = getFoldableImm(Src2)) { | |||
3148 | unsigned NewOpc = | |||
3149 | IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) | |||
3150 | : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); | |||
3151 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3152 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3153 | .add(*Dst) | |||
3154 | .add(*Src0) | |||
3155 | .add(*Src1) | |||
3156 | .addImm(Imm); | |||
3157 | updateLiveVariables(LV, MI, *MIB); | |||
3158 | return MIB; | |||
3159 | } | |||
3160 | } | |||
3161 | unsigned NewOpc = IsFMA | |||
3162 | ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) | |||
3163 | : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); | |||
3164 | if (auto Imm = getFoldableImm(Src1)) { | |||
3165 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3166 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3167 | .add(*Dst) | |||
3168 | .add(*Src0) | |||
3169 | .addImm(Imm) | |||
3170 | .add(*Src2); | |||
3171 | updateLiveVariables(LV, MI, *MIB); | |||
3172 | return MIB; | |||
3173 | } | |||
3174 | } | |||
3175 | if (auto Imm = getFoldableImm(Src0)) { | |||
3176 | if (pseudoToMCOpcode(NewOpc) != -1 && | |||
3177 | isOperandLegal( | |||
3178 | MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), | |||
3179 | Src1)) { | |||
3180 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3181 | .add(*Dst) | |||
3182 | .add(*Src1) | |||
3183 | .addImm(Imm) | |||
3184 | .add(*Src2); | |||
3185 | updateLiveVariables(LV, MI, *MIB); | |||
3186 | return MIB; | |||
3187 | } | |||
3188 | } | |||
3189 | } | |||
3190 | ||||
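| // No madak/madmk form applied; build the full three-address MAD/FMA with | |||
| // explicit source modifier, clamp and omod operands. | |||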
3191 | unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 | |||
3192 | : IsF64 ? AMDGPU::V_FMA_F64_e64 | |||
3193 | : AMDGPU::V_FMA_F32_e64) | |||
3194 | : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); | |||
3195 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3196 | return nullptr; | |||
3197 | ||||
3198 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3199 | .add(*Dst) | |||
3200 | .addImm(Src0Mods ? Src0Mods->getImm() : 0) | |||
3201 | .add(*Src0) | |||
3202 | .addImm(Src1Mods ? Src1Mods->getImm() : 0) | |||
3203 | .add(*Src1) | |||
3204 | .addImm(0) // Src mods | |||
3205 | .add(*Src2) | |||
3206 | .addImm(Clamp ? Clamp->getImm() : 0) | |||
3207 | .addImm(Omod ? Omod->getImm() : 0); | |||
3208 | updateLiveVariables(LV, MI, *MIB); | |||
3209 | return MIB; | |||
3210 | } | |||
3211 | ||||
3212 | // It's not generally safe to move VALU instructions across these since it will | |||
3213 | // start using the register as a base index rather than directly. | |||
3214 | // XXX - Why isn't hasSideEffects sufficient for these? | |||
3215 | static bool changesVGPRIndexingMode(const MachineInstr &MI) { | |||
3216 | switch (MI.getOpcode()) { | |||
3217 | case AMDGPU::S_SET_GPR_IDX_ON: | |||
3218 | case AMDGPU::S_SET_GPR_IDX_MODE: | |||
3219 | case AMDGPU::S_SET_GPR_IDX_OFF: | |||
3220 | return true; | |||
3221 | default: | |||
3222 | return false; | |||
3223 | } | |||
3224 | } | |||
3225 | ||||
3226 | bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, | |||
3227 | const MachineBasicBlock *MBB, | |||
3228 | const MachineFunction &MF) const { | |||
3229 | // We skip the check for SP writes that the base implementation performs; it | |||
3230 | // was apparently added due to compile-time concerns. | |||
3231 | // | |||
3232 | // TODO: Do we really want this barrier? It triggers unnecessary hazard nops | |||
3233 | // but is probably avoidable. | |||
3234 | ||||
3235 | // Copied from base implementation. | |||
3236 | // Terminators and labels can't be scheduled around. | |||
3237 | if (MI.isTerminator() || MI.isPosition()) | |||
3238 | return true; | |||
3239 | ||||
3240 | // INLINEASM_BR can jump to another block | |||
3241 | if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) | |||
3242 | return true; | |||
3243 | ||||
3244 | // Target-independent instructions do not have an implicit-use of EXEC, even | |||
3245 | // when they operate on VGPRs. Treating EXEC modifications as scheduling | |||
3246 | // boundaries prevents incorrect movements of such instructions. | |||
3247 | return MI.modifiesRegister(AMDGPU::EXEC, &RI) || | |||
3248 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || | |||
3249 | MI.getOpcode() == AMDGPU::S_SETREG_B32 || | |||
3250 | changesVGPRIndexingMode(MI); | |||
3251 | } | |||
3252 | ||||
3253 | bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { | |||
3254 | return Opcode == AMDGPU::DS_ORDERED_COUNT || | |||
3255 | Opcode == AMDGPU::DS_GWS_INIT || | |||
3256 | Opcode == AMDGPU::DS_GWS_SEMA_V || | |||
3257 | Opcode == AMDGPU::DS_GWS_SEMA_BR || | |||
3258 | Opcode == AMDGPU::DS_GWS_SEMA_P || | |||
3259 | Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || | |||
3260 | Opcode == AMDGPU::DS_GWS_BARRIER; | |||
3261 | } | |||
3262 | ||||
3263 | bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { | |||
3264 | // Skip the full operand and register alias search that modifiesRegister | |||
3265 | // does. Only a handful of instructions touch this register, it is only an | |||
3266 | // implicit def, and it doesn't alias any other registers. | |||
3267 | if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { | |||
3268 | for (; ImpDef && *ImpDef; ++ImpDef) { | |||
3269 | if (*ImpDef == AMDGPU::MODE) | |||
3270 | return true; | |||
3271 | } | |||
3272 | } | |||
3273 | ||||
3274 | return false; | |||
3275 | } | |||
3276 | ||||
3277 | bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { | |||
3278 | unsigned Opcode = MI.getOpcode(); | |||
3279 | ||||
3280 | if (MI.mayStore() && isSMRD(MI)) | |||
3281 | return true; // scalar store or atomic | |||
3282 | ||||
3283 | // This will terminate the function when other lanes may need to continue. | |||
3284 | if (MI.isReturn()) | |||
3285 | return true; | |||
3286 | ||||
3287 | // These instructions cause shader I/O that may cause hardware lockups | |||
3288 | // when executed with an empty EXEC mask. | |||
3289 | // | |||
3290 | // Note: exp with VM = DONE = 0 is automatically skipped by hardware when | |||
3291 | // EXEC = 0, but checking for that case here seems not worth it | |||
3292 | // given the typical code patterns. | |||
3293 | if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || | |||
3294 | isEXP(Opcode) || | |||
3295 | Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || | |||
3296 | Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) | |||
3297 | return true; | |||
3298 | ||||
3299 | if (MI.isCall() || MI.isInlineAsm()) | |||
3300 | return true; // conservative assumption | |||
3301 | ||||
3302 | // A mode change is a scalar operation that influences vector instructions. | |||
3303 | if (modifiesModeRegister(MI)) | |||
3304 | return true; | |||
3305 | ||||
3306 | // These are like SALU instructions in terms of effects, so it's questionable | |||
3307 | // whether we should return true for those. | |||
3308 | // | |||
3309 | // However, executing them with EXEC = 0 causes them to operate on undefined | |||
3310 | // data, which we avoid by returning true here. | |||
3311 | if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || | |||
3312 | Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) | |||
3313 | return true; | |||
3314 | ||||
3315 | return false; | |||
3316 | } | |||
3317 | ||||
3318 | bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, | |||
3319 | const MachineInstr &MI) const { | |||
3320 | if (MI.isMetaInstruction()) | |||
3321 | return false; | |||
3322 | ||||
3323 | // This won't read exec if this is an SGPR->SGPR copy. | |||
3324 | if (MI.isCopyLike()) { | |||
3325 | if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) | |||
3326 | return true; | |||
3327 | ||||
3328 | // Make sure this isn't copying exec as a normal operand | |||
3329 | return MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3330 | } | |||
3331 | ||||
3332 | // Make a conservative assumption about the callee. | |||
3333 | if (MI.isCall()) | |||
3334 | return true; | |||
3335 | ||||
3336 | // Be conservative with any unhandled generic opcodes. | |||
3337 | if (!isTargetSpecificOpcode(MI.getOpcode())) | |||
3338 | return true; | |||
3339 | ||||
3340 | return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3341 | } | |||
3342 | ||||
3343 | bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { | |||
3344 | switch (Imm.getBitWidth()) { | |||
3345 | case 1: // This likely will be a condition code mask. | |||
3346 | return true; | |||
3347 | ||||
3348 | case 32: | |||
3349 | return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), | |||
3350 | ST.hasInv2PiInlineImm()); | |||
3351 | case 64: | |||
3352 | return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), | |||
3353 | ST.hasInv2PiInlineImm()); | |||
3354 | case 16: | |||
3355 | return ST.has16BitInsts() && | |||
3356 | AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), | |||
3357 | ST.hasInv2PiInlineImm()); | |||
3358 | default: | |||
3359 | llvm_unreachable("invalid bitwidth")::llvm::llvm_unreachable_internal("invalid bitwidth", "/build/llvm-toolchain-snapshot-13~++20210308111132+66e3a4abe99c/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3359); | |||
3360 | } | |||
3361 | } | |||
3362 | ||||
3363 | bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, | |||
3364 | uint8_t OperandType) const { | |||
3365 | if (!MO.isImm() || | |||
3366 | OperandType < AMDGPU::OPERAND_SRC_FIRST || | |||
3367 | OperandType > AMDGPU::OPERAND_SRC_LAST) | |||
3368 | return false; | |||
3369 | ||||
3370 | // MachineOperand provides no way to tell the true operand size, since it only | |||
3371 | // records a 64-bit value. We need to know the size to determine if a 32-bit | |||
3372 | // floating point immediate bit pattern is legal for an integer immediate. It | |||
3373 | // would be for any 32-bit integer operand, but would not be for a 64-bit one. | |||
3374 | ||||
3375 | int64_t Imm = MO.getImm(); | |||
3376 | switch (OperandType) { | |||
3377 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
3378 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
3379 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
3380 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
3381 | case AMDGPU::OPERAND_REG_IMM_V2FP32: | |||
3382 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: | |||
3383 | case AMDGPU::OPERAND_REG_IMM_V2INT32: | |||
3384 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: | |||
3385 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
3386 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { | |||
3387 | int32_t Trunc = static_cast<int32_t>(Imm); | |||
3388 | return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); | |||
3389 | } | |||
3390 | case AMDGPU::OPERAND_REG_IMM_INT64: | |||
3391 | case AMDGPU::OPERAND_REG_IMM_FP64: | |||
3392 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
3393 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
3394 | case AMDGPU::OPERAND_REG_INLINE_AC_FP64: | |||
3395 | return AMDGPU::isInlinableLiteral64(MO.getImm(), | |||
3396 | ST.hasInv2PiInlineImm()); | |||
3397 | case AMDGPU::OPERAND_REG_IMM_INT16: | |||
3398 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
3399 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
3400 | // We would expect inline immediates to not be concerned with an integer/fp | |||
3401 | // distinction. However, in the case of 16-bit integer operations, the | |||
3402 | // "floating point" values appear to not work. It seems read the low 16-bits | |||
3403 | // of 32-bit immediates, which happens to always work for the integer | |||
3404 | // values. | |||
3405 | // | |||
3406 | // See llvm bugzilla 46302. | |||
3407 | // | |||
3408 | // TODO: Theoretically we could use op-sel to use the high bits of the | |||
3409 | // 32-bit FP values. | |||
3410 | return AMDGPU::isInlinableIntLiteral(Imm); | |||
3411 | case AMDGPU::OPERAND_REG_IMM_V2INT16: | |||
3412 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: | |||
3413 | case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: | |||
3414 | // This suffers the same problem as the scalar 16-bit cases. | |||
3415 | return AMDGPU::isInlinableIntLiteralV216(Imm); | |||
3416 | case AMDGPU::OPERAND_REG_IMM_FP16: | |||
3417 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
3418 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { | |||
3419 | if (isInt<16>(Imm) || isUInt<16>(Imm)) { | |||
3420 | // A few special case instructions have 16-bit operands on subtargets | |||
3421 | // where 16-bit instructions are not legal. | |||
3422 | // TODO: Do the 32-bit immediates work? We shouldn't really need to handle | |||
3423 | // constants in these cases. | |||
3424 | int16_t Trunc = static_cast<int16_t>(Imm); | |||
3425 | return ST.has16BitInsts() && | |||
3426 | AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); | |||
3427 | } | |||
3428 | ||||
3429 | return false; | |||
3430 | } | |||
3431 | case AMDGPU::OPERAND_REG_IMM_V2FP16: | |||
3432 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: | |||
3433 | case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { | |||
3434 | uint32_t Trunc = static_cast<uint32_t>(Imm); | |||
3435 | return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); | |||
3436 | } | |||
3437 | default: | |||
3438 | llvm_unreachable("invalid bitwidth"); | |||
3439 | } | |||
3440 | } | |||
3441 | ||||
3442 | bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, | |||
3443 | const MCOperandInfo &OpInfo) const { | |||
3444 | switch (MO.getType()) { | |||
3445 | case MachineOperand::MO_Register: | |||
3446 | return false; | |||
3447 | case MachineOperand::MO_Immediate: | |||
3448 | return !isInlineConstant(MO, OpInfo); | |||
3449 | case MachineOperand::MO_FrameIndex: | |||
3450 | case MachineOperand::MO_MachineBasicBlock: | |||
3451 | case MachineOperand::MO_ExternalSymbol: | |||
3452 | case MachineOperand::MO_GlobalAddress: | |||
3453 | case MachineOperand::MO_MCSymbol: | |||
3454 | return true; | |||
3455 | default: | |||
3456 | llvm_unreachable("unexpected operand type"); | |||
3457 | } | |||
3458 | } | |||
3459 | ||||
3460 | static bool compareMachineOp(const MachineOperand &Op0, | |||
3461 | const MachineOperand &Op1) { | |||
3462 | if (Op0.getType() != Op1.getType()) | |||
3463 | return false; | |||
3464 | ||||
3465 | switch (Op0.getType()) { | |||
3466 | case MachineOperand::MO_Register: | |||
3467 | return Op0.getReg() == Op1.getReg(); | |||
3468 | case MachineOperand::MO_Immediate: | |||
3469 | return Op0.getImm() == Op1.getImm(); | |||
3470 | default: | |||
3471 | llvm_unreachable("Didn't expect to be comparing these operand types"); | |||
3472 | } | |||
3473 | } | |||
3474 | ||||
3475 | bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, | |||
3476 | const MachineOperand &MO) const { | |||
3477 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
3478 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; | |||
3479 | ||||
3480 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); | |||
3481 | ||||
3482 | if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) | |||
3483 | return true; | |||
3484 | ||||
3485 | if (OpInfo.RegClass < 0) | |||
3486 | return false; | |||
3487 | ||||
3488 | if (MO.isImm() && isInlineConstant(MO, OpInfo)) { | |||
3489 | if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && | |||
3490 | OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3491 | AMDGPU::OpName::src2)) | |||
3492 | return false; | |||
3493 | return RI.opCanUseInlineConstant(OpInfo.OperandType); | |||
3494 | } | |||
3495 | ||||
3496 | if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) | |||
3497 | return false; | |||
3498 | ||||
3499 | if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) | |||
3500 | return true; | |||
3501 | ||||
3502 | return ST.hasVOP3Literal(); | |||
3503 | } | |||
3504 | ||||
3505 | bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { | |||
3506 | // GFX90A does not have V_MUL_LEGACY_F32_e32. | |||
3507 | if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) | |||
3508 | return false; | |||
3509 | ||||
3510 | int Op32 = AMDGPU::getVOPe32(Opcode); | |||
3511 | if (Op32 == -1) | |||
3512 | return false; | |||
3513 | ||||
3514 | return pseudoToMCOpcode(Op32) != -1; | |||
3515 | } | |||
3516 | ||||
3517 | bool SIInstrInfo::hasModifiers(unsigned Opcode) const { | |||
3518 | // The src0_modifier operand is present on all instructions | |||
3519 | // that have modifiers. | |||
3520 | ||||
3521 | return AMDGPU::getNamedOperandIdx(Opcode, | |||
3522 | AMDGPU::OpName::src0_modifiers) != -1; | |||
3523 | } | |||
3524 | ||||
3525 | bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, | |||
3526 | unsigned OpName) const { | |||
3527 | const MachineOperand *Mods = getNamedOperand(MI, OpName); | |||
3528 | return Mods && Mods->getImm(); | |||
3529 | } | |||
3530 | ||||
3531 | bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { | |||
3532 | return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || | |||
3533 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || | |||
3534 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || | |||
3535 | hasModifiersSet(MI, AMDGPU::OpName::clamp) || | |||
3536 | hasModifiersSet(MI, AMDGPU::OpName::omod); | |||
3537 | } | |||
3538 | ||||
3539 | bool SIInstrInfo::canShrink(const MachineInstr &MI, | |||
3540 | const MachineRegisterInfo &MRI) const { | |||
3541 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3542 | // Can't shrink instruction with three operands. | |||
3543 | // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add | |||
3544 | // a special case for it. It can only be shrunk if the third operand | |||
3545 | // is vcc, and src0_modifiers and src1_modifiers are not set. | |||
3546 | // We should handle this the same way we handle vopc, by adding | |||
3547 | // a register allocation hint pre-regalloc and then do the shrinking | |||
3548 | // post-regalloc. | |||
3549 | if (Src2) { | |||
3550 | switch (MI.getOpcode()) { | |||
3551 | default: return false; | |||
3552 | ||||
3553 | case AMDGPU::V_ADDC_U32_e64: | |||
3554 | case AMDGPU::V_SUBB_U32_e64: | |||
3555 | case AMDGPU::V_SUBBREV_U32_e64: { | |||
3556 | const MachineOperand *Src1 | |||
3557 | = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3558 | if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) | |||
3559 | return false; | |||
3560 | // Additional verification is needed for sdst/src2. | |||
3561 | return true; | |||
3562 | } | |||
3563 | case AMDGPU::V_MAC_F32_e64: | |||
3564 | case AMDGPU::V_MAC_F16_e64: | |||
3565 | case AMDGPU::V_FMAC_F32_e64: | |||
3566 | case AMDGPU::V_FMAC_F16_e64: | |||
3567 | case AMDGPU::V_FMAC_F64_e64: | |||
3568 | if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || | |||
3569 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) | |||
3570 | return false; | |||
3571 | break; | |||
3572 | ||||
3573 | case AMDGPU::V_CNDMASK_B32_e64: | |||
3574 | break; | |||
3575 | } | |||
3576 | } | |||
3577 | ||||
3578 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3579 | if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || | |||
3580 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) | |||
3581 | return false; | |||
3582 | ||||
3583 | // We don't need to check src0; all input types are legal, so just make sure | |||
3584 | // src0 isn't using any modifiers. | |||
3585 | if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) | |||
3586 | return false; | |||
3587 | ||||
3588 | // Can it be shrunk to a valid 32 bit opcode? | |||
3589 | if (!hasVALU32BitEncoding(MI.getOpcode())) | |||
3590 | return false; | |||
3591 | ||||
3592 | // Check output modifiers | |||
3593 | return !hasModifiersSet(MI, AMDGPU::OpName::omod) && | |||
3594 | !hasModifiersSet(MI, AMDGPU::OpName::clamp); | |||
3595 | } | |||
3596 | ||||
3597 | // Set VCC operand with all flags from \p Orig, except for setting it as | |||
3598 | // implicit. | |||
3599 | static void copyFlagsToImplicitVCC(MachineInstr &MI, | |||
3600 | const MachineOperand &Orig) { | |||
3601 | ||||
3602 | for (MachineOperand &Use : MI.implicit_operands()) { | |||
3603 | if (Use.isUse() && | |||
3604 | (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { | |||
3605 | Use.setIsUndef(Orig.isUndef()); | |||
3606 | Use.setIsKill(Orig.isKill()); | |||
3607 | return; | |||
3608 | } | |||
3609 | } | |||
3610 | } | |||
3611 | ||||
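| // Emit the 32-bit encoded equivalent of MI at MI's position using opcode Op32, | |||
| // copying over dst/src0/src1/src2 (or the implicit VCC use for V_CNDMASK). | |||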
3612 | MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, | |||
3613 | unsigned Op32) const { | |||
3614 | MachineBasicBlock *MBB = MI.getParent(); | |||
3615 | MachineInstrBuilder Inst32 = | |||
3616 | BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) | |||
3617 | .setMIFlags(MI.getFlags()); | |||
3618 | ||||
3619 | // Add the dst operand if the 32-bit encoding also has an explicit $vdst. | |||
3620 | // For VOPC instructions, this is replaced by an implicit def of vcc. | |||
3621 | int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); | |||
3622 | if (Op32DstIdx != -1) { | |||
3623 | // dst | |||
3624 | Inst32.add(MI.getOperand(0)); | |||
3625 | } else { | |||
3626 | assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || | |||
3627 | (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && | |||
3628 | "Unexpected case"); | |||
3629 | } | |||
3630 | ||||
3631 | Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); | |||
3632 | ||||
3633 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3634 | if (Src1) | |||
3635 | Inst32.add(*Src1); | |||
3636 | ||||
3637 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3638 | ||||
3639 | if (Src2) { | |||
3640 | int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); | |||
3641 | if (Op32Src2Idx != -1) { | |||
3642 | Inst32.add(*Src2); | |||
3643 | } else { | |||
3644 | // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is | |||
3645 | // replaced with an implicit read of vcc or vcc_lo. The implicit read | |||
3646 | // of vcc was already added during the initial BuildMI, but we | |||
3647 | // 1) may need to change vcc to vcc_lo to preserve the original register | |||
3648 | // 2) have to preserve the original flags. | |||
3649 | fixImplicitOperands(*Inst32); | |||
3650 | copyFlagsToImplicitVCC(*Inst32, *Src2); | |||
3651 | } | |||
3652 | } | |||
3653 | ||||
3654 | return Inst32; | |||
3655 | } | |||
3656 | ||||
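| // Returns true if MO occupies a slot on the SGPR/constant bus: literals and | |||
| // other non-register operands, and reads of SGPRs, M0 or VCC. VGPR operands | |||
| // and the null register do not count. | |||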
3657 | bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, | |||
3658 | const MachineOperand &MO, | |||
3659 | const MCOperandInfo &OpInfo) const { | |||
3660 | // Literal constants use the constant bus. | |||
3661 | //if (isLiteralConstantLike(MO, OpInfo)) | |||
3662 | // return true; | |||
3663 | if (MO.isImm()) | |||
3664 | return !isInlineConstant(MO, OpInfo); | |||
3665 | ||||
3666 | if (!MO.isReg()) | |||
3667 | return true; // Misc other operands like FrameIndex | |||
3668 | ||||
3669 | if (!MO.isUse()) | |||
3670 | return false; | |||
3671 | ||||
3672 | if (MO.getReg().isVirtual()) | |||
3673 | return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); | |||
3674 | ||||
3675 | // Null is free | |||
3676 | if (MO.getReg() == AMDGPU::SGPR_NULL) | |||
3677 | return false; | |||
3678 | ||||
3679 | // SGPRs use the constant bus | |||
3680 | if (MO.isImplicit()) { | |||
3681 | return MO.getReg() == AMDGPU::M0 || | |||
3682 | MO.getReg() == AMDGPU::VCC || | |||
3683 | MO.getReg() == AMDGPU::VCC_LO; | |||
3684 | } else { | |||
3685 | return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || | |||
3686 | AMDGPU::SReg_64RegClass.contains(MO.getReg()); | |||
3687 | } | |||
3688 | } | |||
3689 | ||||
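| // Return the first implicitly read SGPR that competes for the constant bus | |||
| // (VCC, VCC_LO/HI, M0 or FLAT_SCR), or NoRegister if there is none. | |||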
3690 | static Register findImplicitSGPRRead(const MachineInstr &MI) { | |||
3691 | for (const MachineOperand &MO : MI.implicit_operands()) { | |||
3692 | // We only care about reads. | |||
3693 | if (MO.isDef()) | |||
3694 | continue; | |||
3695 | ||||
3696 | switch (MO.getReg()) { | |||
3697 | case AMDGPU::VCC: | |||
3698 | case AMDGPU::VCC_LO: | |||
3699 | case AMDGPU::VCC_HI: | |||
3700 | case AMDGPU::M0: | |||
3701 | case AMDGPU::FLAT_SCR: | |||
3702 | return MO.getReg(); | |||
3703 | ||||
3704 | default: | |||
3705 | break; | |||
3706 | } | |||
3707 | } | |||
3708 | ||||
3709 | return AMDGPU::NoRegister; | |||
3710 | } | |||
3711 | ||||
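| // Whether the verifier should expect an implicit EXEC read on MI: true for | |||
| // VALU instructions other than V_READLANE/V_WRITELANE, false for pre-ISel, | |||
| // generic, SALU and SMRD instructions, and true for everything else. | |||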
3712 | static bool shouldReadExec(const MachineInstr &MI) { | |||
3713 | if (SIInstrInfo::isVALU(MI)) { | |||
3714 | switch (MI.getOpcode()) { | |||
3715 | case AMDGPU::V_READLANE_B32: | |||
3716 | case AMDGPU::V_WRITELANE_B32: | |||
3717 | return false; | |||
3718 | } | |||
3719 | ||||
3720 | return true; | |||
3721 | } | |||
3722 | ||||
3723 | if (MI.isPreISelOpcode() || | |||
3724 | SIInstrInfo::isGenericOpcode(MI.getOpcode()) || | |||
3725 | SIInstrInfo::isSALU(MI) || | |||
3726 | SIInstrInfo::isSMRD(MI)) | |||
3727 | return false; | |||
3728 | ||||
3729 | return true; | |||
3730 | } | |||
3731 | ||||
3732 | static bool isSubRegOf(const SIRegisterInfo &TRI, | |||
3733 | const MachineOperand &SuperVec, | |||
3734 | const MachineOperand &SubReg) { | |||
3735 | if (SubReg.getReg().isPhysical()) | |||
3736 | return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); | |||
3737 | ||||
3738 | return SubReg.getSubReg() != AMDGPU::NoSubRegister && | |||
3739 | SubReg.getReg() == SuperVec.getReg(); | |||
3740 | } | |||
3741 | ||||
3742 | bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, | |||
3743 | StringRef &ErrInfo) const { | |||
3744 | uint16_t Opcode = MI.getOpcode(); | |||
3745 | if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) | |||
3746 | return true; | |||
3747 | ||||
3748 | const MachineFunction *MF = MI.getParent()->getParent(); | |||
3749 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
3750 | ||||
3751 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); | |||
3752 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); | |||
3753 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); | |||
3754 | ||||
3755 | // Make sure the number of operands is correct. | |||
3756 | const MCInstrDesc &Desc = get(Opcode); | |||
3757 | if (!Desc.isVariadic() && | |||
3758 | Desc.getNumOperands() != MI.getNumExplicitOperands()) { | |||
3759 | ErrInfo = "Instruction has wrong number of operands."; | |||
3760 | return false; | |||
3761 | } | |||
3762 | ||||
3763 | if (MI.isInlineAsm()) { | |||
3764 | // Verify register classes for inlineasm constraints. | |||
3765 | for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); | |||
3766 | I != E; ++I) { | |||
3767 | const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); | |||
3768 | if (!RC) | |||
3769 | continue; | |||
3770 | ||||
3771 | const MachineOperand &Op = MI.getOperand(I); | |||
3772 | if (!Op.isReg()) | |||
3773 | continue; | |||
3774 | ||||
3775 | Register Reg = Op.getReg(); | |||
3776 | if (!Reg.isVirtual() && !RC->contains(Reg)) { | |||
3777 | ErrInfo = "inlineasm operand has incorrect register class."; | |||
3778 | return false; | |||
3779 | } | |||
3780 | } | |||
3781 | ||||
3782 | return true; | |||
3783 | } | |||
3784 | ||||
3785 | if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { | |||
3786 | ErrInfo = "missing memory operand from MIMG instruction."; | |||
3787 | return false; | |||
3788 | } | |||
3789 | ||||
3790 | // Make sure the register classes are correct. | |||
3791 | for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { | |||
3792 | const MachineOperand &MO = MI.getOperand(i); | |||
3793 | if (MO.isFPImm()) { | |||
3794 | ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " | |||
3795 | "all fp values to integers."; | |||
3796 | return false; | |||
3797 | } | |||
3798 | ||||
3799 | int RegClass = Desc.OpInfo[i].RegClass; | |||
3800 | ||||
3801 | switch (Desc.OpInfo[i].OperandType) { | |||
3802 | case MCOI::OPERAND_REGISTER: | |||
3803 | if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { | |||
3804 | ErrInfo = "Illegal immediate value for operand."; | |||
3805 | return false; | |||
3806 | } | |||
3807 | break; | |||
3808 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
3809 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
3810 | break; | |||
3811 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
3812 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
3813 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
3814 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
3815 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
3816 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
3817 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
3818 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: | |||
3819 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
3820 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: | |||
3821 | case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { | |||
3822 | if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { | |||
3823 | ErrInfo = "Illegal immediate value for operand."; | |||
3824 | return false; | |||
3825 | } | |||
3826 | break; | |||
3827 | } | |||
3828 | case MCOI::OPERAND_IMMEDIATE: | |||
3829 | case AMDGPU::OPERAND_KIMM32: | |||
3830 | // Check if this operand is an immediate. | |||
3831 | // FrameIndex operands will be replaced by immediates, so they are | |||
3832 | // allowed. | |||
3833 | if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { | |||
3834 | ErrInfo = "Expected immediate, but got non-immediate"; | |||
3835 | return false; | |||
3836 | } | |||
3837 | LLVM_FALLTHROUGH; | |||
3838 | default: | |||
3839 | continue; | |||
3840 | } | |||
3841 | ||||
3842 | if (!MO.isReg()) | |||
3843 | continue; | |||
3844 | Register Reg = MO.getReg(); | |||
3845 | if (!Reg) | |||
3846 | continue; | |||
3847 | ||||
3848 | // FIXME: Ideally we would have separate instruction definitions with the | |||
3849 | // aligned register constraint. | |||
3850 | // FIXME: We do not verify inline asm operands, but custom inline asm | |||
3851 | // verification is broken anyway | |||
3852 | if (ST.needsAlignedVGPRs()) { | |||
3853 | const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); | |||
3854 | const bool IsVGPR = RI.hasVGPRs(RC); | |||
3855 | const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC); | |||
3856 | if ((IsVGPR || IsAGPR) && MO.getSubReg()) { | |||
3857 | const TargetRegisterClass *SubRC = | |||
3858 | RI.getSubRegClass(RC, MO.getSubReg()); | |||
3859 | RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); | |||
3860 | if (RC) | |||
3861 | RC = SubRC; | |||
3862 | } | |||
3863 | ||||
3864 | // Check that this is the aligned version of the class. | |||
3865 | if (!RC || ((IsVGPR && !RC->hasSuperClassEq(RI.getVGPRClassForBitWidth( | |||
3866 | RI.getRegSizeInBits(*RC)))) || | |||
3867 | (IsAGPR && !RC->hasSuperClassEq(RI.getAGPRClassForBitWidth( | |||
3868 | RI.getRegSizeInBits(*RC)))))) { | |||
3869 | ErrInfo = "Subtarget requires even aligned vector registers"; | |||
3870 | return false; | |||
3871 | } | |||
3872 | } | |||
3873 | ||||
3874 | if (RegClass != -1) { | |||
3875 | if (Reg.isVirtual()) | |||
3876 | continue; | |||
3877 | ||||
3878 | const TargetRegisterClass *RC = RI.getRegClass(RegClass); | |||
3879 | if (!RC->contains(Reg)) { | |||
3880 | ErrInfo = "Operand has incorrect register class."; | |||
3881 | return false; | |||
3882 | } | |||
3883 | } | |||
3884 | } | |||
3885 | ||||
3886 | // Verify SDWA | |||
3887 | if (isSDWA(MI)) { | |||
3888 | if (!ST.hasSDWA()) { | |||
3889 | ErrInfo = "SDWA is not supported on this target"; | |||
3890 | return false; | |||
3891 | } | |||
3892 | ||||
3893 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
3894 | ||||
3895 | const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; | |||
3896 | ||||
3897 | for (int OpIdx : OpIndices) { | |||
3898 | if (OpIdx == -1) | |||
3899 | continue; | |||
3900 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
3901 | ||||
3902 | if (!ST.hasSDWAScalar()) { | |||
3903 | // Only VGPRs on VI | |||
3904 | if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { | |||
3905 | ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; | |||
3906 | return false; | |||
3907 | } | |||
3908 | } else { | |||
3909 | // No immediates on GFX9 | |||
3910 | if (!MO.isReg()) { | |||
3911 | ErrInfo = | |||
3912 | "Only reg allowed as operands in SDWA instructions on GFX9+"; | |||
3913 | return false; | |||
3914 | } | |||
3915 | } | |||
3916 | } | |||
3917 | ||||
3918 | if (!ST.hasSDWAOmod()) { | |||
3919 | // No omod allowed on VI | |||
3920 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3921 | if (OMod != nullptr && | |||
3922 | (!OMod->isImm() || OMod->getImm() != 0)) { | |||
3923 | ErrInfo = "OMod not allowed in SDWA instructions on VI"; | |||
3924 | return false; | |||
3925 | } | |||
3926 | } | |||
3927 | ||||
3928 | uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); | |||
3929 | if (isVOPC(BasicOpcode)) { | |||
3930 | if (!ST.hasSDWASdst() && DstIdx != -1) { | |||
3931 | // Only vcc allowed as dst on VI for VOPC | |||
3932 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
3933 | if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { | |||
3934 | ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; | |||
3935 | return false; | |||
3936 | } | |||
3937 | } else if (!ST.hasSDWAOutModsVOPC()) { | |||
3938 | // No clamp allowed on GFX9 for VOPC | |||
3939 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
3940 | if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { | |||
3941 | ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; | |||
3942 | return false; | |||
3943 | } | |||
3944 | ||||
3945 | // No omod allowed on GFX9 for VOPC | |||
3946 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3947 | if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { | |||
3948 | ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; | |||
3949 | return false; | |||
3950 | } | |||
3951 | } | |||
3952 | } | |||
3953 | ||||
3954 | const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
3955 | if (DstUnused && DstUnused->isImm() && | |||
3956 | DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { | |||
3957 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
3958 | if (!Dst.isReg() || !Dst.isTied()) { | |||
3959 | ErrInfo = "Dst register should have tied register"; | |||
3960 | return false; | |||
3961 | } | |||
3962 | ||||
3963 | const MachineOperand &TiedMO = | |||
3964 | MI.getOperand(MI.findTiedOperandIdx(DstIdx)); | |||
3965 | if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { | |||
3966 | ErrInfo = | |||
3967 | "Dst register should be tied to implicit use of preserved register"; | |||
3968 | return false; | |||
3969 | } else if (TiedMO.getReg().isPhysical() && | |||
3970 | Dst.getReg() != TiedMO.getReg()) { | |||
3971 | ErrInfo = "Dst register should use same physical register as preserved"; | |||
3972 | return false; | |||
3973 | } | |||
3974 | } | |||
3975 | } | |||
3976 | ||||
3977 | // Verify MIMG | |||
3978 | if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { | |||
3979 | // Ensure that the return type used is large enough for all the options | |||
3980 | // being used. TFE/LWE require an extra result register. | |||
3981 | const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); | |||
3982 | if (DMask) { | |||
3983 | uint64_t DMaskImm = DMask->getImm(); | |||
3984 | uint32_t RegCount = | |||
3985 | isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); | |||
3986 | const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); | |||
3987 | const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); | |||
3988 | const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); | |||
3989 | ||||
3990 | // Adjust for packed 16 bit values | |||
3991 | if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) | |||
3992 | RegCount >>= 1; | |||
3993 | ||||
3994 | // Adjust if using LWE or TFE | |||
3995 | if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) | |||
3996 | RegCount += 1; | |||
3997 | ||||
3998 | const uint32_t DstIdx = | |||
3999 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); | |||
4000 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
4001 | if (Dst.isReg()) { | |||
4002 | const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); | |||
4003 | uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; | |||
4004 | if (RegCount > DstSize) { | |||
4005 | ErrInfo = "MIMG instruction returns too many registers for dst " | |||
4006 | "register class"; | |||
4007 | return false; | |||
4008 | } | |||
4009 | } | |||
4010 | } | |||
4011 | } | |||
4012 | ||||
4013 | // Verify VOP*. Ignore multiple sgpr operands on writelane. | |||
4014 | if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 | |||
4015 | && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { | |||
4016 | // Only look at the true operands. Only a real operand can use the constant | |||
4017 | // bus, and we don't want to check pseudo-operands like the source modifier | |||
4018 | // flags. | |||
4019 | const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; | |||
4020 | ||||
4021 | unsigned ConstantBusCount = 0; | |||
4022 | unsigned LiteralCount = 0; | |||
4023 | ||||
4024 | if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) | |||
4025 | ++ConstantBusCount; | |||
4026 | ||||
4027 | SmallVector<Register, 2> SGPRsUsed; | |||
4028 | Register SGPRUsed; | |||
4029 | ||||
4030 | for (int OpIdx : OpIndices) { | |||
4031 | if (OpIdx == -1) | |||
4032 | break; | |||
4033 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4034 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { | |||
4035 | if (MO.isReg()) { | |||
4036 | SGPRUsed = MO.getReg(); | |||
4037 | if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) { | |||
4038 | return SGPRUsed != SGPR; | |||
4039 | })) { | |||
4040 | ++ConstantBusCount; | |||
4041 | SGPRsUsed.push_back(SGPRUsed); | |||
4042 | } | |||
4043 | } else { | |||
4044 | ++ConstantBusCount; | |||
4045 | ++LiteralCount; | |||
4046 | } | |||
4047 | } | |||
4048 | } | |||
4049 | ||||
4050 | SGPRUsed = findImplicitSGPRRead(MI); | |||
4051 | if (SGPRUsed != AMDGPU::NoRegister) { | |||
4052 | // Implicit uses may safely overlap true operands | |||
4053 | if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { | |||
4054 | return !RI.regsOverlap(SGPRUsed, SGPR); | |||
4055 | })) { | |||
4056 | ++ConstantBusCount; | |||
4057 | SGPRsUsed.push_back(SGPRUsed); | |||
4058 | } | |||
4059 | } | |||
4060 | ||||
4061 | // v_writelane_b32 is an exception to the constant bus restriction: | |||
4062 | // vsrc0 can be an sgpr, const or m0, and the lane select an sgpr, m0 or inline-const | |||
4063 | if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && | |||
4064 | Opcode != AMDGPU::V_WRITELANE_B32) { | |||
4065 | ErrInfo = "VOP* instruction violates constant bus restriction"; | |||
4066 | return false; | |||
4067 | } | |||
4068 | ||||
4069 | if (isVOP3(MI) && LiteralCount) { | |||
4070 | if (!ST.hasVOP3Literal()) { | |||
4071 | ErrInfo = "VOP3 instruction uses literal"; | |||
4072 | return false; | |||
4073 | } | |||
4074 | if (LiteralCount > 1) { | |||
4075 | ErrInfo = "VOP3 instruction uses more than one literal"; | |||
4076 | return false; | |||
4077 | } | |||
4078 | } | |||
4079 | } | |||
4080 | ||||
4081 | // Special case for writelane - this can break the multiple constant bus rule, | |||
4082 | // but still can't use more than one SGPR register | |||
4083 | if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { | |||
4084 | unsigned SGPRCount = 0; | |||
4085 | Register SGPRUsed = AMDGPU::NoRegister; | |||
4086 | ||||
4087 | for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { | |||
4088 | if (OpIdx == -1) | |||
4089 | break; | |||
4090 | ||||
4091 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4092 | ||||
4093 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { | |||
4094 | if (MO.isReg() && MO.getReg() != AMDGPU::M0) { | |||
4095 | if (MO.getReg() != SGPRUsed) | |||
4096 | ++SGPRCount; | |||
4097 | SGPRUsed = MO.getReg(); | |||
4098 | } | |||
4099 | } | |||
4100 | if (SGPRCount > ST.getConstantBusLimit(Opcode)) { | |||
4101 | ErrInfo = "WRITELANE instruction violates constant bus restriction"; | |||
4102 | return false; | |||
4103 | } | |||
4104 | } | |||
4105 | } | |||
4106 | ||||
4107 | // Verify misc. restrictions on specific instructions. | |||
4108 | if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || | |||
4109 | Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { | |||
4110 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4111 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4112 | const MachineOperand &Src2 = MI.getOperand(Src2Idx); | |||
4113 | if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { | |||
4114 | if (!compareMachineOp(Src0, Src1) && | |||
4115 | !compareMachineOp(Src0, Src2)) { | |||
4116 | ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; | |||
4117 | return false; | |||
4118 | } | |||
4119 | } | |||
4120 | if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & | |||
4121 | SISrcMods::ABS) || | |||
4122 | (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & | |||
4123 | SISrcMods::ABS) || | |||
4124 | (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & | |||
4125 | SISrcMods::ABS)) { | |||
4126 | ErrInfo = "ABS not allowed in VOP3B instructions"; | |||
4127 | return false; | |||
4128 | } | |||
4129 | } | |||
4130 | ||||
4131 | if (isSOP2(MI) || isSOPC(MI)) { | |||
4132 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4133 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4134 | unsigned Immediates = 0; | |||
4135 | ||||
4136 | if (!Src0.isReg() && | |||
4137 | !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) | |||
4138 | Immediates++; | |||
4139 | if (!Src1.isReg() && | |||
4140 | !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) | |||
4141 | Immediates++; | |||
4142 | ||||
4143 | if (Immediates > 1) { | |||
4144 | ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; | |||
4145 | return false; | |||
4146 | } | |||
4147 | } | |||
4148 | ||||
4149 | if (isSOPK(MI)) { | |||
4150 | auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); | |||
4151 | if (Desc.isBranch()) { | |||
4152 | if (!Op->isMBB()) { | |||
4153 | ErrInfo = "invalid branch target for SOPK instruction"; | |||
4154 | return false; | |||
4155 | } | |||
4156 | } else { | |||
4157 | uint64_t Imm = Op->getImm(); | |||
4158 | if (sopkIsZext(MI)) { | |||
4159 | if (!isUInt<16>(Imm)) { | |||
4160 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4161 | return false; | |||
4162 | } | |||
4163 | } else { | |||
4164 | if (!isInt<16>(Imm)) { | |||
4165 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4166 | return false; | |||
4167 | } | |||
4168 | } | |||
4169 | } | |||
4170 | } | |||
4171 | ||||
4172 | if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || | |||
4173 | Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || | |||
4174 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4175 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { | |||
4176 | const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4177 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; | |||
4178 | ||||
4179 | const unsigned StaticNumOps = Desc.getNumOperands() + | |||
4180 | Desc.getNumImplicitUses(); | |||
4181 | const unsigned NumImplicitOps = IsDst ? 2 : 1; | |||
4182 | ||||
4183 | // Allow additional implicit operands. This allows a fixup done by the post | |||
4184 | // RA scheduler where the main implicit operand is killed and implicit-defs | |||
4185 | // are added for sub-registers that remain live after this instruction. | |||
4186 | if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { | |||
4187 | ErrInfo = "missing implicit register operands"; | |||
4188 | return false; | |||
4189 | } | |||
4190 | ||||
4191 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4192 | if (IsDst) { | |||
4193 | if (!Dst->isUse()) { | |||
4194 | ErrInfo = "v_movreld_b32 vdst should be a use operand"; | |||
4195 | return false; | |||
4196 | } | |||
4197 | ||||
4198 | unsigned UseOpIdx; | |||
4199 | if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || | |||
4200 | UseOpIdx != StaticNumOps + 1) { | |||
4201 | ErrInfo = "movrel implicit operands should be tied"; | |||
4202 | return false; | |||
4203 | } | |||
4204 | } | |||
4205 | ||||
4206 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4207 | const MachineOperand &ImpUse | |||
4208 | = MI.getOperand(StaticNumOps + NumImplicitOps - 1); | |||
4209 | if (!ImpUse.isReg() || !ImpUse.isUse() || | |||
4210 | !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { | |||
4211 | ErrInfo = "src0 should be subreg of implicit vector use"; | |||
4212 | return false; | |||
4213 | } | |||
4214 | } | |||
4215 | ||||
4216 | // Make sure we aren't losing exec uses in the td files. This mostly requires | |||
4217 | // being careful when using let Uses to try to add other use registers. | |||
4218 | if (shouldReadExec(MI)) { | |||
4219 | if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { | |||
4220 | ErrInfo = "VALU instruction does not implicitly read exec mask"; | |||
4221 | return false; | |||
4222 | } | |||
4223 | } | |||
4224 | ||||
4225 | if (isSMRD(MI)) { | |||
4226 | if (MI.mayStore()) { | |||
4227 | // The register offset form of scalar stores may only use m0 as the | |||
4228 | // soffset register. | |||
4229 | const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); | |||
4230 | if (Soff && Soff->getReg() != AMDGPU::M0) { | |||
4231 | ErrInfo = "scalar stores must use m0 as offset register"; | |||
4232 | return false; | |||
4233 | } | |||
4234 | } | |||
4235 | } | |||
4236 | ||||
4237 | if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { | |||
4238 | const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
4239 | if (Offset->getImm() != 0) { | |||
4240 | ErrInfo = "subtarget does not support offsets in flat instructions"; | |||
4241 | return false; | |||
4242 | } | |||
4243 | } | |||
4244 | ||||
4245 | if (isMIMG(MI)) { | |||
4246 | const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); | |||
4247 | if (DimOp) { | |||
4248 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, | |||
4249 | AMDGPU::OpName::vaddr0); | |||
4250 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); | |||
4251 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); | |||
4252 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | |||
4253 | AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); | |||
4254 | const AMDGPU::MIMGDimInfo *Dim = | |||
4255 | AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); | |||
4256 | ||||
4257 | if (!Dim) { | |||
4258 | ErrInfo = "dim is out of range"; | |||
4259 | return false; | |||
4260 | } | |||
4261 | ||||
4262 | bool IsA16 = false; | |||
4263 | if (ST.hasR128A16()) { | |||
4264 | const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); | |||
4265 | IsA16 = R128A16->getImm() != 0; | |||
4266 | } else if (ST.hasGFX10A16()) { | |||
4267 | const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); | |||
4268 | IsA16 = A16->getImm() != 0; | |||
4269 | } | |||
4270 | ||||
4271 | bool PackDerivatives = IsA16 || BaseOpcode->G16; | |||
4272 | bool IsNSA = SRsrcIdx - VAddr0Idx > 1; | |||
4273 | ||||
4274 | unsigned AddrWords = BaseOpcode->NumExtraArgs; | |||
4275 | unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + | |||
4276 | (BaseOpcode->LodOrClampOrMip ? 1 : 0); | |||
4277 | if (IsA16) | |||
4278 | AddrWords += (AddrComponents + 1) / 2; | |||
4279 | else | |||
4280 | AddrWords += AddrComponents; | |||
4281 | ||||
4282 | if (BaseOpcode->Gradients) { | |||
4283 | if (PackDerivatives) | |||
4284 | // There are two gradients per coordinate; we pack them separately. | |||
4285 | // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) | |||
4286 | AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; | |||
4287 | else | |||
4288 | AddrWords += Dim->NumGradients; | |||
4289 | } | |||
4290 | ||||
4291 | unsigned VAddrWords; | |||
4292 | if (IsNSA) { | |||
4293 | VAddrWords = SRsrcIdx - VAddr0Idx; | |||
4294 | } else { | |||
4295 | const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); | |||
4296 | VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; | |||
4297 | if (AddrWords > 8) | |||
4298 | AddrWords = 16; | |||
4299 | else if (AddrWords > 4) | |||
4300 | AddrWords = 8; | |||
4301 | else if (AddrWords == 4) | |||
4302 | AddrWords = 4; | |||
4303 | else if (AddrWords == 3) | |||
4304 | AddrWords = 3; | |||
4305 | } | |||
4306 | ||||
4307 | if (VAddrWords != AddrWords) { | |||
4308 | LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords | |||
4309 | << " but got " << VAddrWords << "\n"); | |||
4310 | ErrInfo = "bad vaddr size"; | |||
4311 | return false; | |||
4312 | } | |||
4313 | } | |||
4314 | } | |||
4315 | ||||
4316 | const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); | |||
4317 | if (DppCt) { | |||
4318 | using namespace AMDGPU::DPP; | |||
4319 | ||||
4320 | unsigned DC = DppCt->getImm(); | |||
4321 | if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || | |||
4322 | DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || | |||
4323 | (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || | |||
4324 | (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || | |||
4325 | (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || | |||
4326 | (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || | |||
4327 | (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { | |||
4328 | ErrInfo = "Invalid dpp_ctrl value"; | |||
4329 | return false; | |||
4330 | } | |||
4331 | if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && | |||
4332 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4333 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4334 | "wavefront shifts are not supported on GFX10+"; | |||
4335 | return false; | |||
4336 | } | |||
4337 | if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && | |||
4338 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4339 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4340 | "broadcasts are not supported on GFX10+"; | |||
4341 | return false; | |||
4342 | } | |||
4343 | if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && | |||
4344 | ST.getGeneration() < AMDGPUSubtarget::GFX10) { | |||
4345 | if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && | |||
4346 | DC <= DppCtrl::ROW_NEWBCAST_LAST && | |||
4347 | !ST.hasGFX90AInsts()) { | |||
4348 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4349 | "row_newbroadcast/row_share is not supported before " | |||
4350 | "GFX90A/GFX10"; | |||
4351 | return false; | |||
4352 | } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { | |||
4353 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4354 | "row_share and row_xmask are not supported before GFX10"; | |||
4355 | return false; | |||
4356 | } | |||
4357 | } | |||
4358 | ||||
4359 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
4360 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); | |||
4361 | ||||
4362 | if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && | |||
4363 | ((DstIdx >= 0 && | |||
4364 | (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || | |||
4365 | Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) || | |||
4366 | ((Src0Idx >= 0 && | |||
4367 | (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || | |||
4368 | Desc.OpInfo[Src0Idx].RegClass == | |||
4369 | AMDGPU::VReg_64_Align2RegClassID)))) && | |||
4370 | !AMDGPU::isLegal64BitDPPControl(DC)) { | |||
4371 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4372 | "64 bit dpp only support row_newbcast"; | |||
4373 | return false; | |||
4374 | } | |||
4375 | } | |||
4376 | ||||
4377 | if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { | |||
4378 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4379 | uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0 | |||
4380 | : AMDGPU::OpName::vdata; | |||
4381 | const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); | |||
4382 | const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); | |||
4383 | if (Data && !Data->isReg()) | |||
4384 | Data = nullptr; | |||
4385 | ||||
4386 | if (ST.hasGFX90AInsts()) { | |||
4387 | if (Dst && Data && | |||
4388 | (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { | |||
4389 | ErrInfo = "Invalid register class: " | |||
4390 | "vdata and vdst should be both VGPR or AGPR"; | |||
4391 | return false; | |||
4392 | } | |||
4393 | if (Data && Data2 && | |||
4394 | (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { | |||
4395 | ErrInfo = "Invalid register class: " | |||
4396 | "both data operands should be VGPR or AGPR"; | |||
4397 | return false; | |||
4398 | } | |||
4399 | } else { | |||
4400 | if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || | |||
4401 | (Data && RI.isAGPR(MRI, Data->getReg())) || | |||
4402 | (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { | |||
4403 | ErrInfo = "Invalid register class: " | |||
4404 | "agpr loads and stores not supported on this GPU"; | |||
4405 | return false; | |||
4406 | } | |||
4407 | } | |||
4408 | } | |||
4409 | ||||
4410 | return true; | |||
4411 | } | |||
4412 | ||||
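| // Return the opcode to use when moving the given scalar instruction to the | |||
| // VALU, or INSTRUCTION_LIST_END if there is no replacement. | |||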
4413 | unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { | |||
4414 | switch (MI.getOpcode()) { | |||
4415 | default: return AMDGPU::INSTRUCTION_LIST_END; | |||
4416 | case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; | |||
4417 | case AMDGPU::COPY: return AMDGPU::COPY; | |||
4418 | case AMDGPU::PHI: return AMDGPU::PHI; | |||
4419 | case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; | |||
4420 | case AMDGPU::WQM: return AMDGPU::WQM; | |||
4421 | case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; | |||
4422 | case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; | |||
4423 | case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; | |||
4424 | case AMDGPU::S_MOV_B32: { | |||
4425 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4426 | return MI.getOperand(1).isReg() || | |||
4427 | RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? | |||
4428 | AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; | |||
4429 | } | |||
4430 | case AMDGPU::S_ADD_I32: | |||
4431 | return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; | |||
4432 | case AMDGPU::S_ADDC_U32: | |||
4433 | return AMDGPU::V_ADDC_U32_e32; | |||
4434 | case AMDGPU::S_SUB_I32: | |||
4435 | return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; | |||
4436 | // FIXME: These are not consistently handled, and selected when the carry is | |||
4437 | // used. | |||
4438 | case AMDGPU::S_ADD_U32: | |||
4439 | return AMDGPU::V_ADD_CO_U32_e32; | |||
4440 | case AMDGPU::S_SUB_U32: | |||
4441 | return AMDGPU::V_SUB_CO_U32_e32; | |||
4442 | case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; | |||
4443 | case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; | |||
4444 | case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; | |||
4445 | case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; | |||
4446 | case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; | |||
4447 | case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; | |||
4448 | case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; | |||
4449 | case AMDGPU::S_XNOR_B32: | |||
4450 | return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; | |||
4451 | case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; | |||
4452 | case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; | |||
4453 | case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; | |||
4454 | case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; | |||
4455 | case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; | |||
4456 | case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; | |||
4457 | case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; | |||
4458 | case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; | |||
4459 | case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; | |||
4460 | case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; | |||
4461 | case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; | |||
4462 | case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; | |||
4463 | case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; | |||
4464 | case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; | |||
4465 | case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; | |||
4466 | case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; | |||
4467 | case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; | |||
4468 | case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; | |||
4469 | case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; | |||
4470 | case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; | |||
4471 | case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; | |||
4472 | case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; | |||
4473 | case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; | |||
4474 | case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; | |||
4475 | case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; | |||
4476 | case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; | |||
4477 | case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; | |||
4478 | case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; | |||
4479 | case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; | |||
4480 | case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; | |||
4481 | case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; | |||
4482 | case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; | |||
4483 | case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; | |||
4484 | case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; | |||
4485 | case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; | |||
4486 | case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; | |||
4487 | case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; | |||
4488 | case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; | |||
4489 | } | |||
4490 | llvm_unreachable( | |||
4491 | "Unexpected scalar opcode without corresponding vector one!"); | |||
4492 | } | |||
4493 | ||||
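| // For load/store-like instructions (including DS and MIMG), narrow the | |||
| // combined AGPR/VGPR AV_* operand classes down to the plain VGPR classes, | |||
| // except when IsAllocatable is false on gfx90a with reserved registers | |||
| // already frozen. | |||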
4494 | static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, | |||
4495 | const MachineRegisterInfo &MRI, | |||
4496 | const MCInstrDesc &TID, | |||
4497 | unsigned RCID, | |||
4498 | bool IsAllocatable) { | |||
4499 | if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && | |||
4500 | (TID.mayLoad() || TID.mayStore() || | |||
4501 | (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { | |||
4502 | switch (RCID) { | |||
4503 | case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; | |||
4504 | case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; | |||
4505 | case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID; | |||
4506 | case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID; | |||
4507 | case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID; | |||
4508 | default: | |||
4509 | break; | |||
4510 | } | |||
4511 | } | |||
4512 | return RCID; | |||
4513 | } | |||
4514 | ||||
4515 | const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, | |||
4516 | unsigned OpNum, const TargetRegisterInfo *TRI, | |||
4517 | const MachineFunction &MF) | |||
4518 | const { | |||
4519 | if (OpNum >= TID.getNumOperands()) | |||
4520 | return nullptr; | |||
4521 | auto RegClass = TID.OpInfo[OpNum].RegClass; | |||
4522 | bool IsAllocatable = false; | |||
4523 | if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { | |||
4524 | // vdst and vdata should be both VGPR or AGPR, same for the DS instructions | |||
4525 | // with two data operands. Request register class constainted to VGPR only | |||
4526 | // of both operands present as Machine Copy Propagation can not check this | |||
4527 | // constraint and possibly other passes too. | |||
4528 | // | |||
4529 | // The check is limited to FLAT and DS because atomics in non-flat encoding | |||
4530 | // have their vdst and vdata tied to be the same register. | |||
4531 | const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4532 | AMDGPU::OpName::vdst); | |||
4533 | const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4534 | (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 | |||
4535 | : AMDGPU::OpName::vdata); | |||
4536 | if (DataIdx != -1) { | |||
4537 | IsAllocatable = VDstIdx != -1 || | |||
4538 | AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4539 | AMDGPU::OpName::data1) != -1; | |||
4540 | } | |||
4541 | } | |||
4542 | RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass, | |||
4543 | IsAllocatable); | |||
4544 | return RI.getRegClass(RegClass); | |||
4545 | } | |||
4546 | ||||
4547 | const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, | |||
4548 | unsigned OpNo) const { | |||
4549 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4550 | const MCInstrDesc &Desc = get(MI.getOpcode()); | |||
4551 | if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || | |||
4552 | Desc.OpInfo[OpNo].RegClass == -1) { | |||
4553 | Register Reg = MI.getOperand(OpNo).getReg(); | |||
4554 | ||||
4555 | if (Reg.isVirtual()) | |||
4556 | return MRI.getRegClass(Reg); | |||
4557 | return RI.getPhysRegClass(Reg); | |||
4558 | } | |||
4559 | ||||
4560 | unsigned RCID = Desc.OpInfo[OpNo].RegClass; | |||
4561 | RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true); | |||
4562 | return RI.getRegClass(RCID); | |||
4563 | } | |||
4564 | ||||
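| // Legalize operand OpIdx by copying or moving its current value into a new | |||
| // virtual register of a VGPR class and rewriting the operand to use it. | |||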
4565 | void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { | |||
4566 | MachineBasicBlock::iterator I = MI; | |||
4567 | MachineBasicBlock *MBB = MI.getParent(); | |||
4568 | MachineOperand &MO = MI.getOperand(OpIdx); | |||
4569 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
4570 | unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; | |||
4571 | const TargetRegisterClass *RC = RI.getRegClass(RCID); | |||
4572 | unsigned Size = RI.getRegSizeInBits(*RC); | |||
4573 | unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; | |||
4574 | if (MO.isReg()) | |||
4575 | Opcode = AMDGPU::COPY; | |||
4576 | else if (RI.isSGPRClass(RC)) | |||
4577 | Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; | |||
4578 | ||||
4579 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); | |||
4580 | const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); | |||
4581 | if (RI.getCommonSubClass(VRC64, VRC)) | |||
4582 | VRC = VRC64; | |||
4583 | else | |||
4584 | VRC = &AMDGPU::VGPR_32RegClass; | |||
4585 | ||||
4586 | Register Reg = MRI.createVirtualRegister(VRC); | |||
4587 | DebugLoc DL = MBB->findDebugLoc(I); | |||
4588 | BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); | |||
4589 | MO.ChangeToRegister(Reg, false); | |||
4590 | } | |||
4591 | ||||
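| // Copy subregister SubIdx of SuperReg into a new virtual register of class | |||
| // SubRC, inserting the COPYs before MI, and return the new register. | |||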
4592 | unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, | |||
4593 | MachineRegisterInfo &MRI, | |||
4594 | MachineOperand &SuperReg, | |||
4595 | const TargetRegisterClass *SuperRC, | |||
4596 | unsigned SubIdx, | |||
4597 | const TargetRegisterClass *SubRC) | |||
4598 | const { | |||
4599 | MachineBasicBlock *MBB = MI->getParent(); | |||
4600 | DebugLoc DL = MI->getDebugLoc(); | |||
4601 | Register SubReg = MRI.createVirtualRegister(SubRC); | |||
4602 | ||||
4603 | if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { | |||
4604 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
4605 | .addReg(SuperReg.getReg(), 0, SubIdx); | |||
4606 | return SubReg; | |||
4607 | } | |||
4608 | ||||
4609 | // Just in case the super register is itself a sub-register, copy it to a new | |||
4610 | // value so we don't need to worry about merging its subreg index with the | |||
4611 | // SubIdx passed to this function. The register coalescer should be able to | |||
4612 | // eliminate this extra copy. | |||
4613 | Register NewSuperReg = MRI.createVirtualRegister(SuperRC); | |||
4614 | ||||
4615 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) | |||
4616 | .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); | |||
4617 | ||||
4618 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
4619 | .addReg(NewSuperReg, 0, SubIdx); | |||
4620 | ||||
4621 | return SubReg; | |||
4622 | } | |||
4623 | ||||
4624 | MachineOperand SIInstrInfo::buildExtractSubRegOrImm( | |||
4625 | MachineBasicBlock::iterator MII, | |||
4626 | MachineRegisterInfo &MRI, | |||
4627 | MachineOperand &Op, | |||
4628 | const TargetRegisterClass *SuperRC, | |||
4629 | unsigned SubIdx, | |||
4630 | const TargetRegisterClass *SubRC) const { | |||
4631 | if (Op.isImm()) { | |||
4632 | if (SubIdx == AMDGPU::sub0) | |||
4633 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); | |||
4634 | if (SubIdx == AMDGPU::sub1) | |||
4635 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); | |||
4636 | ||||
4637 | llvm_unreachable("Unhandled register index for immediate"); | |||
4638 | } | |||
4639 | ||||
4640 | unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, | |||
4641 | SubIdx, SubRC); | |||
4642 | return MachineOperand::CreateReg(SubReg, false); | |||
4643 | } | |||
4644 | ||||
4645 | // Change the order of operands from (0, 1, 2) to (0, 2, 1) | |||
4646 | void SIInstrInfo::swapOperands(MachineInstr &Inst) const { | |||
4647 | assert(Inst.getNumExplicitOperands() == 3)((Inst.getNumExplicitOperands() == 3) ? static_cast<void> (0) : __assert_fail ("Inst.getNumExplicitOperands() == 3", "/build/llvm-toolchain-snapshot-13~++20210308111132+66e3a4abe99c/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4647, __PRETTY_FUNCTION__)); | |||
4648 | MachineOperand Op1 = Inst.getOperand(1); | |||
4649 | Inst.RemoveOperand(1); | |||
4650 | Inst.addOperand(Op1); | |||
4651 | } | |||
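
// Editor's sketch (illustrative, not part of this file): removing explicit
// operand 1 and re-appending it rotates (0, 1, 2) into (0, 2, 1), i.e. it
// swaps the two source operands.  The same effect on a plain vector:
//
//   #include <vector>
//
//   static void swapLastTwo(std::vector<int> &Ops) {
//     int Op1 = Ops[1];
//     Ops.erase(Ops.begin() + 1); // analogous to Inst.RemoveOperand(1)
//     Ops.push_back(Op1);         // analogous to Inst.addOperand(Op1)
//   }
//   // {dst, a, b} becomes {dst, b, a}.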
4652 | ||||
4653 | bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, | |||
4654 | const MCOperandInfo &OpInfo, | |||
4655 | const MachineOperand &MO) const { | |||
4656 | if (!MO.isReg()) | |||
4657 | return false; | |||
4658 | ||||
4659 | Register Reg = MO.getReg(); | |||
4660 | ||||
4661 | const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); | |||
4662 | if (Reg.isPhysical()) | |||
4663 | return DRC->contains(Reg); | |||
4664 | ||||
4665 | const TargetRegisterClass *RC = MRI.getRegClass(Reg); | |||
4666 | ||||
4667 | if (MO.getSubReg()) { | |||
4668 | const MachineFunction *MF = MO.getParent()->getParent()->getParent(); | |||
4669 | const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); | |||
4670 | if (!SuperRC) | |||
4671 | return false; | |||
4672 | ||||
4673 | DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); | |||
4674 | if (!DRC) | |||
4675 | return false; | |||
4676 | } | |||
4677 | return RC->hasSuperClassEq(DRC); | |||
4678 | } | |||
4679 | ||||
4680 | bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, | |||
4681 | const MCOperandInfo &OpInfo, | |||
4682 | const MachineOperand &MO) const { | |||
4683 | if (MO.isReg()) | |||
4684 | return isLegalRegOperand(MRI, OpInfo, MO); | |||
4685 | ||||
4686 | // Handle non-register types that are treated like immediates. | |||
4687 |   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); | |||
4688 | return true; | |||
4689 | } | |||
4690 | ||||
4691 | bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, | |||
4692 | const MachineOperand *MO) const { | |||
4693 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
4694 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
4695 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
4696 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; | |||
4697 | const TargetRegisterClass *DefinedRC = | |||
4698 | OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; | |||
4699 | if (!MO) | |||
4700 | MO = &MI.getOperand(OpIdx); | |||
4701 | ||||
4702 | int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); | |||
4703 | int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
4704 | if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { | |||
4705 | if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) | |||
4706 | return false; | |||
4707 | ||||
4708 | SmallDenseSet<RegSubRegPair> SGPRsUsed; | |||
4709 | if (MO->isReg()) | |||
4710 | SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); | |||
4711 | ||||
4712 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { | |||
4713 | if (i == OpIdx) | |||
4714 | continue; | |||
4715 | const MachineOperand &Op = MI.getOperand(i); | |||
4716 | if (Op.isReg()) { | |||
4717 | RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); | |||
4718 | if (!SGPRsUsed.count(SGPR) && | |||
4719 | usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { | |||
4720 | if (--ConstantBusLimit <= 0) | |||
4721 | return false; | |||
4722 | SGPRsUsed.insert(SGPR); | |||
4723 | } | |||
4724 | } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { | |||
4725 | if (--ConstantBusLimit <= 0) | |||
4726 | return false; | |||
4727 | } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && | |||
4728 | isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { | |||
4729 | if (!VOP3LiteralLimit--) | |||
4730 | return false; | |||
4731 | if (--ConstantBusLimit <= 0) | |||
4732 | return false; | |||
4733 | } | |||
4734 | } | |||
4735 | } | |||
4736 | ||||
4737 | if (MO->isReg()) { | |||
4738 |     assert(DefinedRC); | |||
4739 | if (!isLegalRegOperand(MRI, OpInfo, *MO)) | |||
4740 | return false; | |||
4741 | bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); | |||
4742 | if (IsAGPR && !ST.hasMAIInsts()) | |||
4743 | return false; | |||
4744 | unsigned Opc = MI.getOpcode(); | |||
4745 | if (IsAGPR && | |||
4746 | (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && | |||
4747 | (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) | |||
4748 | return false; | |||
4749 |     // Atomics should have vdst and vdata both in VGPRs or both in AGPRs. | |||
4750 | const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
4751 | const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
4752 | isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); | |||
4753 | if ((int)OpIdx == VDstIdx && DataIdx != -1 && | |||
4754 | MI.getOperand(DataIdx).isReg() && | |||
4755 | RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) | |||
4756 | return false; | |||
4757 | if ((int)OpIdx == DataIdx) { | |||
4758 | if (VDstIdx != -1 && | |||
4759 | RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) | |||
4760 | return false; | |||
4761 | // DS instructions with 2 src operands also must have tied RC. | |||
4762 | const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, | |||
4763 | AMDGPU::OpName::data1); | |||
4764 | if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && | |||
4765 | RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) | |||
4766 | return false; | |||
4767 | } | |||
4768 | if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && | |||
4769 | (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && | |||
4770 | RI.isSGPRReg(MRI, MO->getReg())) | |||
4771 | return false; | |||
4772 | return true; | |||
4773 | } | |||
4774 | ||||
4775 | // Handle non-register types that are treated like immediates. | |||
4776 |   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); | |||
4777 | ||||
4778 | if (!DefinedRC) { | |||
4779 | // This operand expects an immediate. | |||
4780 | return true; | |||
4781 | } | |||
4782 | ||||
4783 | return isImmOperandLegal(MI, OpIdx, *MO); | |||
4784 | } | |||
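
// Editor's sketch (illustrative, not part of this file): a minimal model of
// the constant-bus accounting performed above.  Each *distinct* SGPR
// (register plus subregister) and each literal costs one slot; the budget is
// 1 before GFX10 and typically 2 from GFX10.  The real check also handles the
// VOP3 literal limit, KIMM32 operands and the candidate operand itself; all
// names below are hypothetical.
//
//   #include <set>
//   #include <utility>
//   #include <vector>
//
//   struct BusUse {
//     bool IsSGPR;
//     std::pair<unsigned, unsigned> RegSubReg; // valid when IsSGPR
//     bool IsLiteral;                          // valid when !IsSGPR
//   };
//
//   static bool fitsConstantBus(const std::vector<BusUse> &Uses,
//                               int ConstantBusLimit) {
//     std::set<std::pair<unsigned, unsigned>> SGPRsUsed;
//     int Slots = 0;
//     for (const BusUse &U : Uses) {
//       if (U.IsSGPR) {
//         if (SGPRsUsed.insert(U.RegSubReg).second)
//           ++Slots; // a new, unique SGPR costs one slot
//       } else if (U.IsLiteral) {
//         ++Slots;   // every literal costs one slot
//       }
//     }
//     return Slots <= ConstantBusLimit;
//   }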
4785 | ||||
4786 | void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, | |||
4787 | MachineInstr &MI) const { | |||
4788 | unsigned Opc = MI.getOpcode(); | |||
4789 | const MCInstrDesc &InstrDesc = get(Opc); | |||
4790 | ||||
4791 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
4792 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4793 | ||||
4794 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
4795 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4796 | ||||
4797 |   // If there is an implicit SGPR use such as VCC for v_addc_u32/v_subb_u32, | |||
4798 |   // we can only have one constant bus use before GFX10. | |||
4799 | bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; | |||
4800 | if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && | |||
4801 | Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || | |||
4802 | isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) | |||
4803 | legalizeOpWithMove(MI, Src0Idx); | |||
4804 | ||||
4805 | // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for | |||
4806 | // both the value to write (src0) and lane select (src1). Fix up non-SGPR | |||
4807 | // src0/src1 with V_READFIRSTLANE. | |||
4808 | if (Opc == AMDGPU::V_WRITELANE_B32) { | |||
4809 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4810 | if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { | |||
4811 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4812 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4813 | .add(Src0); | |||
4814 | Src0.ChangeToRegister(Reg, false); | |||
4815 | } | |||
4816 | if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { | |||
4817 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4818 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4819 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4820 | .add(Src1); | |||
4821 | Src1.ChangeToRegister(Reg, false); | |||
4822 | } | |||
4823 | return; | |||
4824 | } | |||
4825 | ||||
4826 | // No VOP2 instructions support AGPRs. | |||
4827 | if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) | |||
4828 | legalizeOpWithMove(MI, Src0Idx); | |||
4829 | ||||
4830 | if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) | |||
4831 | legalizeOpWithMove(MI, Src1Idx); | |||
4832 | ||||
4833 |   // VOP2 instructions support all operand types for src0, so we don't need to | |||
4834 |   // check its legality. If src1 is already legal, there is nothing to do. | |||
4835 | if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) | |||
4836 | return; | |||
4837 | ||||
4838 | // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for | |||
4839 | // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane | |||
4840 | // select is uniform. | |||
4841 | if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && | |||
4842 | RI.isVGPR(MRI, Src1.getReg())) { | |||
4843 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4844 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4845 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4846 | .add(Src1); | |||
4847 | Src1.ChangeToRegister(Reg, false); | |||
4848 | return; | |||
4849 | } | |||
4850 | ||||
4851 | // We do not use commuteInstruction here because it is too aggressive and will | |||
4852 | // commute if it is possible. We only want to commute here if it improves | |||
4853 | // legality. This can be called a fairly large number of times so don't waste | |||
4854 | // compile time pointlessly swapping and checking legality again. | |||
4855 | if (HasImplicitSGPR || !MI.isCommutable()) { | |||
4856 | legalizeOpWithMove(MI, Src1Idx); | |||
4857 | return; | |||
4858 | } | |||
4859 | ||||
4860 | // If src0 can be used as src1, commuting will make the operands legal. | |||
4861 | // Otherwise we have to give up and insert a move. | |||
4862 | // | |||
4863 | // TODO: Other immediate-like operand kinds could be commuted if there was a | |||
4864 | // MachineOperand::ChangeTo* for them. | |||
4865 | if ((!Src1.isImm() && !Src1.isReg()) || | |||
4866 | !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { | |||
4867 | legalizeOpWithMove(MI, Src1Idx); | |||
4868 | return; | |||
4869 | } | |||
4870 | ||||
4871 | int CommutedOpc = commuteOpcode(MI); | |||
4872 | if (CommutedOpc == -1) { | |||
4873 | legalizeOpWithMove(MI, Src1Idx); | |||
4874 | return; | |||
4875 | } | |||
4876 | ||||
4877 | MI.setDesc(get(CommutedOpc)); | |||
4878 | ||||
4879 | Register Src0Reg = Src0.getReg(); | |||
4880 | unsigned Src0SubReg = Src0.getSubReg(); | |||
4881 | bool Src0Kill = Src0.isKill(); | |||
4882 | ||||
4883 | if (Src1.isImm()) | |||
4884 | Src0.ChangeToImmediate(Src1.getImm()); | |||
4885 | else if (Src1.isReg()) { | |||
4886 | Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); | |||
4887 | Src0.setSubReg(Src1.getSubReg()); | |||
4888 | } else | |||
4889 |     llvm_unreachable("Should only have register or immediate operands"); | |||
4890 | ||||
4891 | Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); | |||
4892 | Src1.setSubReg(Src0SubReg); | |||
4893 | fixImplicitOperands(MI); | |||
4894 | } | |||
4895 | ||||
4896 | // Legalize VOP3 operands. All operand types are supported for any operand, | |||
4897 | // but only one literal constant is allowed, and only starting from GFX10. | |||
4898 | void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, | |||
4899 | MachineInstr &MI) const { | |||
4900 | unsigned Opc = MI.getOpcode(); | |||
4901 | ||||
4902 | int VOP3Idx[3] = { | |||
4903 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), | |||
4904 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), | |||
4905 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) | |||
4906 | }; | |||
4907 | ||||
4908 | if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || | |||
4909 | Opc == AMDGPU::V_PERMLANEX16_B32_e64) { | |||
4910 | // src1 and src2 must be scalar | |||
4911 | MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); | |||
4912 | MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); | |||
4913 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4914 | if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { | |||
4915 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4916 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4917 | .add(Src1); | |||
4918 | Src1.ChangeToRegister(Reg, false); | |||
4919 | } | |||
4920 | if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { | |||
4921 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4922 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4923 | .add(Src2); | |||
4924 | Src2.ChangeToRegister(Reg, false); | |||
4925 | } | |||
4926 | } | |||
4927 | ||||
4928 | // Find the one SGPR operand we are allowed to use. | |||
4929 | int ConstantBusLimit = ST.getConstantBusLimit(Opc); | |||
4930 | int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
4931 | SmallDenseSet<unsigned> SGPRsUsed; | |||
4932 | Register SGPRReg = findUsedSGPR(MI, VOP3Idx); | |||
4933 | if (SGPRReg != AMDGPU::NoRegister) { | |||
4934 | SGPRsUsed.insert(SGPRReg); | |||
4935 | --ConstantBusLimit; | |||
4936 | } | |||
4937 | ||||
4938 | for (unsigned i = 0; i < 3; ++i) { | |||
4939 | int Idx = VOP3Idx[i]; | |||
4940 | if (Idx == -1) | |||
4941 | break; | |||
4942 | MachineOperand &MO = MI.getOperand(Idx); | |||
4943 | ||||
4944 | if (!MO.isReg()) { | |||
4945 | if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) | |||
4946 | continue; | |||
4947 | ||||
4948 | if (LiteralLimit > 0 && ConstantBusLimit > 0) { | |||
4949 | --LiteralLimit; | |||
4950 | --ConstantBusLimit; | |||
4951 | continue; | |||
4952 | } | |||
4953 | ||||
4954 | --LiteralLimit; | |||
4955 | --ConstantBusLimit; | |||
4956 | legalizeOpWithMove(MI, Idx); | |||
4957 | continue; | |||
4958 | } | |||
4959 | ||||
4960 | if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && | |||
4961 | !isOperandLegal(MI, Idx, &MO)) { | |||
4962 | legalizeOpWithMove(MI, Idx); | |||
4963 | continue; | |||
4964 | } | |||
4965 | ||||
4966 | if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) | |||
4967 | continue; // VGPRs are legal | |||
4968 | ||||
4969 | // We can use one SGPR in each VOP3 instruction prior to GFX10 | |||
4970 | // and two starting from GFX10. | |||
4971 | if (SGPRsUsed.count(MO.getReg())) | |||
4972 | continue; | |||
4973 | if (ConstantBusLimit > 0) { | |||
4974 | SGPRsUsed.insert(MO.getReg()); | |||
4975 | --ConstantBusLimit; | |||
4976 | continue; | |||
4977 | } | |||
4978 | ||||
4979 | // If we make it this far, then the operand is not legal and we must | |||
4980 | // legalize it. | |||
4981 | legalizeOpWithMove(MI, Idx); | |||
4982 | } | |||
4983 | } | |||
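
// Editor's sketch (illustrative, not part of this file): the loop above
// decides, per source, whether the operand can stay or must be copied into a
// VGPR.  A simplified model of that decision, under hypothetical names, with
// SGPRBudget standing for the constant-bus limit and LiteralBudget for VOP3
// literal support (0 before GFX10, 1 from GFX10):
//
//   #include <set>
//   #include <vector>
//
//   enum class SrcKind { VGPR, InlineImm, SGPR, Literal };
//   struct SimpleSrc {
//     SrcKind Kind;
//     unsigned Reg; // meaningful for SGPR sources only
//   };
//
//   static std::vector<bool> mustMoveToVGPR(const std::vector<SimpleSrc> &Srcs,
//                                           int SGPRBudget, int LiteralBudget) {
//     std::set<unsigned> SGPRsUsed;
//     std::vector<bool> Move(Srcs.size(), false);
//     for (unsigned I = 0; I < Srcs.size(); ++I) {
//       const SimpleSrc &S = Srcs[I];
//       if (S.Kind == SrcKind::VGPR || S.Kind == SrcKind::InlineImm)
//         continue; // always legal
//       if (S.Kind == SrcKind::Literal) {
//         if (LiteralBudget > 0 && SGPRBudget > 0) {
//           --LiteralBudget;
//           --SGPRBudget;
//         } else
//           Move[I] = true;
//         continue;
//       }
//       if (SGPRsUsed.count(S.Reg)) // re-using a counted SGPR is free
//         continue;
//       if (SGPRBudget > 0) {
//         SGPRsUsed.insert(S.Reg);
//         --SGPRBudget;
//       } else
//         Move[I] = true;
//     }
//     return Move;
//   }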
4984 | ||||
4985 | Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, | |||
4986 | MachineRegisterInfo &MRI) const { | |||
4987 | const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); | |||
4988 | const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); | |||
4989 | Register DstReg = MRI.createVirtualRegister(SRC); | |||
4990 | unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; | |||
4991 | ||||
4992 | if (RI.hasAGPRs(VRC)) { | |||
4993 | VRC = RI.getEquivalentVGPRClass(VRC); | |||
4994 | Register NewSrcReg = MRI.createVirtualRegister(VRC); | |||
4995 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
4996 | get(TargetOpcode::COPY), NewSrcReg) | |||
4997 | .addReg(SrcReg); | |||
4998 | SrcReg = NewSrcReg; | |||
4999 | } | |||
5000 | ||||
5001 | if (SubRegs == 1) { | |||
5002 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5003 | get(AMDGPU::V_READFIRSTLANE_B32), DstReg) | |||
5004 | .addReg(SrcReg); | |||
5005 | return DstReg; | |||
5006 | } | |||
5007 | ||||
5008 | SmallVector<unsigned, 8> SRegs; | |||
5009 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
5010 | Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5011 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5012 | get(AMDGPU::V_READFIRSTLANE_B32), SGPR) | |||
5013 | .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); | |||
5014 | SRegs.push_back(SGPR); | |||
5015 | } | |||
5016 | ||||
5017 | MachineInstrBuilder MIB = | |||
5018 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5019 | get(AMDGPU::REG_SEQUENCE), DstReg); | |||
5020 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
5021 | MIB.addReg(SRegs[i]); | |||
5022 | MIB.addImm(RI.getSubRegFromChannel(i)); | |||
5023 | } | |||
5024 | return DstReg; | |||
5025 | } | |||
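
// Editor's sketch (illustrative, not part of this file): the function above
// lowers a wide VGPR to SGPRs by issuing one V_READFIRSTLANE_B32 per 32-bit
// channel and recombining the results with REG_SEQUENCE.  A scalar model of
// the per-channel read, assuming at least one lane is active and the value
// is actually uniform so the reads are consistent (names are hypothetical):
//
//   #include <cstdint>
//   #include <vector>
//
//   // Lanes[l][c] is the 32-bit value of channel c in lane l.
//   using WideVGPR = std::vector<std::vector<uint32_t>>;
//
//   static std::vector<uint32_t>
//   readFirstLanePerChannel(const WideVGPR &Lanes, uint64_t Exec,
//                           unsigned NumChannels) {
//     unsigned FirstLane = 0;              // lowest active lane in exec
//     while (!(Exec & (1ull << FirstLane)))
//       ++FirstLane;
//     std::vector<uint32_t> SGPRs(NumChannels);
//     for (unsigned C = 0; C < NumChannels; ++C)
//       SGPRs[C] = Lanes[FirstLane][C];    // one V_READFIRSTLANE_B32 each
//     return SGPRs;                        // joined by REG_SEQUENCE above
//   }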
5026 | ||||
5027 | void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, | |||
5028 | MachineInstr &MI) const { | |||
5029 | ||||
5030 |   // If the pointer is stored in VGPRs, then we need to move it to | |||
5031 |   // SGPRs using v_readfirstlane. This is safe because we only select | |||
5032 |   // loads with uniform pointers to SMRD instructions, so we know the | |||
5033 | // pointer value is uniform. | |||
5034 | MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); | |||
5035 | if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { | |||
5036 | Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); | |||
5037 | SBase->setReg(SGPR); | |||
5038 | } | |||
5039 | MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); | |||
5040 | if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { | |||
5041 | Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); | |||
5042 | SOff->setReg(SGPR); | |||
5043 | } | |||
5044 | } | |||
5045 | ||||
5046 | // FIXME: Remove this when SelectionDAG is obsoleted. | |||
5047 | void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, | |||
5048 | MachineInstr &MI) const { | |||
5049 | if (!isSegmentSpecificFLAT(MI)) | |||
5050 | return; | |||
5051 | ||||
5052 | // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence | |||
5053 |   // analysis thinks they are uniform, so a readfirstlane should be valid. | |||
5054 | MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); | |||
5055 | if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) | |||
5056 | return; | |||
5057 | ||||
5058 | Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); | |||
5059 | SAddr->setReg(ToSGPR); | |||
5060 | } | |||
5061 | ||||
5062 | void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, | |||
5063 | MachineBasicBlock::iterator I, | |||
5064 | const TargetRegisterClass *DstRC, | |||
5065 | MachineOperand &Op, | |||
5066 | MachineRegisterInfo &MRI, | |||
5067 | const DebugLoc &DL) const { | |||
5068 | Register OpReg = Op.getReg(); | |||
5069 | unsigned OpSubReg = Op.getSubReg(); | |||
5070 | ||||
5071 | const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( | |||
5072 | RI.getRegClassForReg(MRI, OpReg), OpSubReg); | |||
5073 | ||||
5074 | // Check if operand is already the correct register class. | |||
5075 | if (DstRC == OpRC) | |||
5076 | return; | |||
5077 | ||||
5078 | Register DstReg = MRI.createVirtualRegister(DstRC); | |||
5079 | MachineInstr *Copy = | |||
5080 | BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); | |||
5081 | ||||
5082 | Op.setReg(DstReg); | |||
5083 | Op.setSubReg(0); | |||
5084 | ||||
5085 | MachineInstr *Def = MRI.getVRegDef(OpReg); | |||
5086 | if (!Def) | |||
5087 | return; | |||
5088 | ||||
5089 | // Try to eliminate the copy if it is copying an immediate value. | |||
5090 | if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) | |||
5091 | FoldImmediate(*Copy, *Def, OpReg, &MRI); | |||
5092 | ||||
5093 | bool ImpDef = Def->isImplicitDef(); | |||
5094 | while (!ImpDef && Def && Def->isCopy()) { | |||
5095 | if (Def->getOperand(1).getReg().isPhysical()) | |||
5096 | break; | |||
5097 | Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); | |||
5098 | ImpDef = Def && Def->isImplicitDef(); | |||
5099 | } | |||
5100 | if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && | |||
5101 | !ImpDef) | |||
5102 | Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); | |||
5103 | } | |||
5104 | ||||
5105 | // Emit the actual waterfall loop, executing the wrapped instruction for each | |||
5106 | // unique value of \p Rsrc across all lanes. In the best case we execute 1 | |||
5107 | // iteration; in the worst case we execute 64 (once per lane). | |||
5108 | static void | |||
5109 | emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, | |||
5110 | MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, | |||
5111 | const DebugLoc &DL, MachineOperand &Rsrc) { | |||
5112 | MachineFunction &MF = *OrigBB.getParent(); | |||
5113 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
5114 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
5115 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5116 | unsigned SaveExecOpc = | |||
5117 | ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; | |||
5118 | unsigned XorTermOpc = | |||
5119 | ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; | |||
5120 | unsigned AndOpc = | |||
5121 | ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; | |||
5122 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5123 | ||||
5124 | MachineBasicBlock::iterator I = LoopBB.begin(); | |||
5125 | ||||
5126 | SmallVector<Register, 8> ReadlanePieces; | |||
5127 | Register CondReg = AMDGPU::NoRegister; | |||
5128 | ||||
5129 | Register VRsrc = Rsrc.getReg(); | |||
5130 | unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); | |||
5131 | ||||
5132 | unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI); | |||
5133 | unsigned NumSubRegs = RegSize / 32; | |||
5134 |   assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); | |||
5135 | ||||
5136 | for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { | |||
5137 | ||||
5138 | Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5139 | Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5140 | ||||
5141 | // Read the next variant <- also loop target. | |||
5142 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) | |||
5143 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx)); | |||
5144 | ||||
5145 |     // Read the next variant's high half. | |||
5146 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) | |||
5147 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1)); | |||
5148 | ||||
5149 | ReadlanePieces.push_back(CurRegLo); | |||
5150 | ReadlanePieces.push_back(CurRegHi); | |||
5151 | ||||
5152 | // Comparison is to be done as 64-bit. | |||
5153 | Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); | |||
5154 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) | |||
5155 | .addReg(CurRegLo) | |||
5156 | .addImm(AMDGPU::sub0) | |||
5157 | .addReg(CurRegHi) | |||
5158 | .addImm(AMDGPU::sub1); | |||
5159 | ||||
5160 | Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); | |||
5161 | auto Cmp = | |||
5162 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) | |||
5163 | .addReg(CurReg); | |||
5164 | if (NumSubRegs <= 2) | |||
5165 | Cmp.addReg(VRsrc); | |||
5166 | else | |||
5167 | Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); | |||
5168 | ||||
5169 |     // Combine the comparison results with AND. | |||
5170 | if (CondReg == AMDGPU::NoRegister) // First. | |||
5171 | CondReg = NewCondReg; | |||
5172 | else { // If not the first, we create an AND. | |||
5173 | Register AndReg = MRI.createVirtualRegister(BoolXExecRC); | |||
5174 | BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) | |||
5175 | .addReg(CondReg) | |||
5176 | .addReg(NewCondReg); | |||
5177 | CondReg = AndReg; | |||
5178 | } | |||
5179 | } // End for loop. | |||
5180 | ||||
5181 | auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc)); | |||
5182 | Register SRsrc = MRI.createVirtualRegister(SRsrcRC); | |||
5183 | ||||
5184 | // Build scalar Rsrc. | |||
5185 | auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc); | |||
5186 | unsigned Channel = 0; | |||
5187 | for (Register Piece : ReadlanePieces) { | |||
5188 | Merge.addReg(Piece) | |||
5189 | .addImm(TRI->getSubRegFromChannel(Channel++)); | |||
5190 | } | |||
5191 | ||||
5192 | // Update Rsrc operand to use the SGPR Rsrc. | |||
5193 | Rsrc.setReg(SRsrc); | |||
5194 | Rsrc.setIsKill(true); | |||
5195 | ||||
5196 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
5197 | MRI.setSimpleHint(SaveExec, CondReg); | |||
5198 | ||||
5199 | // Update EXEC to matching lanes, saving original to SaveExec. | |||
5200 | BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) | |||
5201 | .addReg(CondReg, RegState::Kill); | |||
5202 | ||||
5203 | // The original instruction is here; we insert the terminators after it. | |||
5204 | I = LoopBB.end(); | |||
5205 | ||||
5206 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. | |||
5207 | BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) | |||
5208 | .addReg(Exec) | |||
5209 | .addReg(SaveExec); | |||
5210 | ||||
5211 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); | |||
5212 | } | |||
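
// Editor's sketch (illustrative, not part of this file): a scalar model of
// what the loop above computes.  Each iteration takes the descriptor of the
// first active lane, finds every lane holding the same descriptor, runs the
// wrapped instruction for exactly those lanes, and removes them from exec
// until no lanes remain.  The descriptor is modeled as a single 64-bit value
// and all names are hypothetical.
//
//   #include <cstdint>
//   #include <functional>
//   #include <vector>
//
//   static void waterfall(const std::vector<uint64_t> &RsrcPerLane,
//                         uint64_t Exec,
//                         const std::function<void(uint64_t, uint64_t)> &Run) {
//     while (Exec) {                             // S_CBRANCH_EXECNZ
//       unsigned FirstLane = 0;
//       while (!(Exec & (1ull << FirstLane)))
//         ++FirstLane;
//       uint64_t Rsrc = RsrcPerLane[FirstLane];  // V_READFIRSTLANE_B32 pieces
//       uint64_t Match = 0;
//       for (unsigned L = 0; L < RsrcPerLane.size(); ++L)
//         if ((Exec & (1ull << L)) && RsrcPerLane[L] == Rsrc)
//           Match |= 1ull << L;                  // V_CMP_EQ + AND chain
//       Run(Rsrc, Match);                        // wrapped MI, exec == Match
//       Exec &= ~Match;                          // S_AND_SAVEEXEC + S_XOR_term
//     }
//   }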
5213 | ||||
5214 | // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register | |||
5215 | // with SGPRs by iterating over all unique values across all lanes. | |||
5216 | // Returns the loop basic block that now contains \p MI. | |||
5217 | static MachineBasicBlock * | |||
5218 | loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, | |||
5219 | MachineOperand &Rsrc, MachineDominatorTree *MDT, | |||
5220 | MachineBasicBlock::iterator Begin = nullptr, | |||
5221 | MachineBasicBlock::iterator End = nullptr) { | |||
5222 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5223 | MachineFunction &MF = *MBB.getParent(); | |||
5224 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
5225 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
5226 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5227 | if (!Begin.isValid()) | |||
5228 | Begin = &MI; | |||
5229 | if (!End.isValid()) { | |||
5230 | End = &MI; | |||
5231 | ++End; | |||
5232 | } | |||
5233 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5234 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5235 | unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
5236 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5237 | ||||
5238 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
5239 | ||||
5240 | // Save the EXEC mask | |||
5241 | BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); | |||
5242 | ||||
5243 | // Killed uses in the instruction we are waterfalling around will be | |||
5244 | // incorrect due to the added control-flow. | |||
5245 | MachineBasicBlock::iterator AfterMI = MI; | |||
5246 | ++AfterMI; | |||
5247 | for (auto I = Begin; I != AfterMI; I++) { | |||
5248 | for (auto &MO : I->uses()) { | |||
5249 | if (MO.isReg() && MO.isUse()) { | |||
5250 | MRI.clearKillFlags(MO.getReg()); | |||
5251 | } | |||
5252 | } | |||
5253 | } | |||
5254 | ||||
5255 | // To insert the loop we need to split the block. Move everything after this | |||
5256 | // point to a new block, and insert a new empty block between the two. | |||
5257 | MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); | |||
5258 | MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); | |||
5259 | MachineFunction::iterator MBBI(MBB); | |||
5260 | ++MBBI; | |||
5261 | ||||
5262 | MF.insert(MBBI, LoopBB); | |||
5263 | MF.insert(MBBI, RemainderBB); | |||
5264 | ||||
5265 | LoopBB->addSuccessor(LoopBB); | |||
5266 | LoopBB->addSuccessor(RemainderBB); | |||
5267 | ||||
5268 |   // Move the instructions from Begin through MI into LoopBB, and the | |||
5269 |   // remainder of the block to RemainderBB. | |||
5270 | RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); | |||
5271 | RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); | |||
5272 | LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); | |||
5273 | ||||
5274 | MBB.addSuccessor(LoopBB); | |||
5275 | ||||
5276 | // Update dominators. We know that MBB immediately dominates LoopBB, that | |||
5277 | // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately | |||
5278 | // dominates all of the successors transferred to it from MBB that MBB used | |||
5279 | // to properly dominate. | |||
5280 | if (MDT) { | |||
5281 | MDT->addNewBlock(LoopBB, &MBB); | |||
5282 | MDT->addNewBlock(RemainderBB, LoopBB); | |||
5283 | for (auto &Succ : RemainderBB->successors()) { | |||
5284 | if (MDT->properlyDominates(&MBB, Succ)) { | |||
5285 | MDT->changeImmediateDominator(Succ, RemainderBB); | |||
5286 | } | |||
5287 | } | |||
5288 | } | |||
5289 | ||||
5290 | emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); | |||
5291 | ||||
5292 | // Restore the EXEC mask | |||
5293 | MachineBasicBlock::iterator First = RemainderBB->begin(); | |||
5294 | BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); | |||
5295 | return LoopBB; | |||
5296 | } | |||
5297 | ||||
5298 | // Extract pointer from Rsrc and return a zero-value Rsrc replacement. | |||
5299 | static std::tuple<unsigned, unsigned> | |||
5300 | extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { | |||
5301 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5302 | MachineFunction &MF = *MBB.getParent(); | |||
5303 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5304 | ||||
5305 | // Extract the ptr from the resource descriptor. | |||
5306 | unsigned RsrcPtr = | |||
5307 | TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, | |||
5308 | AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); | |||
5309 | ||||
5310 | // Create an empty resource descriptor | |||
5311 | Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
5312 | Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5313 | Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5314 | Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); | |||
5315 | uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); | |||
5316 | ||||
5317 | // Zero64 = 0 | |||
5318 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) | |||
5319 | .addImm(0); | |||
5320 | ||||
5321 | // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} | |||
5322 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) | |||
5323 | .addImm(RsrcDataFormat & 0xFFFFFFFF); | |||
5324 | ||||
5325 | // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} | |||
5326 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) | |||
5327 | .addImm(RsrcDataFormat >> 32); | |||
5328 | ||||
5329 | // NewSRsrc = {Zero64, SRsrcFormat} | |||
5330 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) | |||
5331 | .addReg(Zero64) | |||
5332 | .addImm(AMDGPU::sub0_sub1) | |||
5333 | .addReg(SRsrcFormatLo) | |||
5334 | .addImm(AMDGPU::sub2) | |||
5335 | .addReg(SRsrcFormatHi) | |||
5336 | .addImm(AMDGPU::sub3); | |||
5337 | ||||
5338 | return std::make_tuple(RsrcPtr, NewSRsrc); | |||
5339 | } | |||
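
// Editor's sketch (illustrative, not part of this file): the replacement
// descriptor built above has its two pointer dwords zeroed and carries the
// default data format in dwords 2 and 3, split into 32-bit halves.  A
// self-contained model of that layout (names are hypothetical):
//
//   #include <array>
//   #include <cstdint>
//
//   static std::array<uint32_t, 4> makeZeroBaseRsrc(uint64_t RsrcDataFormat) {
//     return {0u,                                                 // sub0
//             0u,                                                 // sub1
//             static_cast<uint32_t>(RsrcDataFormat & 0xFFFFFFFF), // sub2
//             static_cast<uint32_t>(RsrcDataFormat >> 32)};       // sub3
//   }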
5340 | ||||
5341 | MachineBasicBlock * | |||
5342 | SIInstrInfo::legalizeOperands(MachineInstr &MI, | |||
5343 | MachineDominatorTree *MDT) const { | |||
5344 | MachineFunction &MF = *MI.getParent()->getParent(); | |||
5345 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5346 | MachineBasicBlock *CreatedBB = nullptr; | |||
5347 | ||||
5348 | // Legalize VOP2 | |||
5349 | if (isVOP2(MI) || isVOPC(MI)) { | |||
5350 | legalizeOperandsVOP2(MRI, MI); | |||
5351 | return CreatedBB; | |||
5352 | } | |||
5353 | ||||
5354 | // Legalize VOP3 | |||
5355 | if (isVOP3(MI)) { | |||
5356 | legalizeOperandsVOP3(MRI, MI); | |||
5357 | return CreatedBB; | |||
5358 | } | |||
5359 | ||||
5360 | // Legalize SMRD | |||
5361 | if (isSMRD(MI)) { | |||
5362 | legalizeOperandsSMRD(MRI, MI); | |||
5363 | return CreatedBB; | |||
5364 | } | |||
5365 | ||||
5366 | // Legalize FLAT | |||
5367 | if (isFLAT(MI)) { | |||
5368 | legalizeOperandsFLAT(MRI, MI); | |||
5369 | return CreatedBB; | |||
5370 | } | |||
5371 | ||||
5372 | // Legalize REG_SEQUENCE and PHI | |||
5373 |   // The register class of the operands must be the same type as the register | |||
5374 |   // class of the output. | |||
5375 | if (MI.getOpcode() == AMDGPU::PHI) { | |||
5376 | const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; | |||
5377 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { | |||
5378 | if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) | |||
5379 | continue; | |||
5380 | const TargetRegisterClass *OpRC = | |||
5381 | MRI.getRegClass(MI.getOperand(i).getReg()); | |||
5382 | if (RI.hasVectorRegisters(OpRC)) { | |||
5383 | VRC = OpRC; | |||
5384 | } else { | |||
5385 | SRC = OpRC; | |||
5386 | } | |||
5387 | } | |||
5388 | ||||
5389 |     // If any of the operands are VGPR registers, then they all must be VGPRs; | |||
5390 |     // otherwise we will create illegal VGPR->SGPR copies when legalizing | |||
5391 | // them. | |||
5392 | if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { | |||
5393 | if (!VRC) { | |||
5394 |         assert(SRC); | |||
5395 | if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { | |||
5396 | VRC = &AMDGPU::VReg_1RegClass; | |||
5397 | } else | |||
5398 | VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) | |||
5399 | ? RI.getEquivalentAGPRClass(SRC) | |||
5400 | : RI.getEquivalentVGPRClass(SRC); | |||
5401 | } else { | |||
5402 | VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) | |||
5403 | ? RI.getEquivalentAGPRClass(VRC) | |||
5404 | : RI.getEquivalentVGPRClass(VRC); | |||
5405 | } | |||
5406 | RC = VRC; | |||
5407 | } else { | |||
5408 | RC = SRC; | |||
5409 | } | |||
5410 | ||||
5411 | // Update all the operands so they have the same type. | |||
5412 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5413 | MachineOperand &Op = MI.getOperand(I); | |||
5414 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5415 | continue; | |||
5416 | ||||
5417 | // MI is a PHI instruction. | |||
5418 | MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); | |||
5419 | MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); | |||
5420 | ||||
5421 | // Avoid creating no-op copies with the same src and dst reg class. These | |||
5422 | // confuse some of the machine passes. | |||
5423 | legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); | |||
5424 | } | |||
5425 | } | |||
5426 | ||||
5427 | // REG_SEQUENCE doesn't really require operand legalization, but if one has a | |||
5428 | // VGPR dest type and SGPR sources, insert copies so all operands are | |||
5429 | // VGPRs. This seems to help operand folding / the register coalescer. | |||
5430 | if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { | |||
5431 | MachineBasicBlock *MBB = MI.getParent(); | |||
5432 | const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); | |||
5433 | if (RI.hasVGPRs(DstRC)) { | |||
5434 | // Update all the operands so they are VGPR register classes. These may | |||
5435 | // not be the same register class because REG_SEQUENCE supports mixing | |||
5436 | // subregister index types e.g. sub0_sub1 + sub2 + sub3 | |||
5437 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5438 | MachineOperand &Op = MI.getOperand(I); | |||
5439 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5440 | continue; | |||
5441 | ||||
5442 | const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); | |||
5443 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); | |||
5444 | if (VRC == OpRC) | |||
5445 | continue; | |||
5446 | ||||
5447 | legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); | |||
5448 | Op.setIsKill(); | |||
5449 | } | |||
5450 | } | |||
5451 | ||||
5452 | return CreatedBB; | |||
5453 | } | |||
5454 | ||||
5455 | // Legalize INSERT_SUBREG | |||
5456 | // src0 must have the same register class as dst | |||
5457 | if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { | |||
5458 | Register Dst = MI.getOperand(0).getReg(); | |||
5459 | Register Src0 = MI.getOperand(1).getReg(); | |||
5460 | const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); | |||
5461 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); | |||
5462 | if (DstRC != Src0RC) { | |||
5463 | MachineBasicBlock *MBB = MI.getParent(); | |||
5464 | MachineOperand &Op = MI.getOperand(1); | |||
5465 | legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); | |||
5466 | } | |||
5467 | return CreatedBB; | |||
5468 | } | |||
5469 | ||||
5470 | // Legalize SI_INIT_M0 | |||
5471 | if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { | |||
5472 | MachineOperand &Src = MI.getOperand(0); | |||
5473 | if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) | |||
5474 | Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); | |||
5475 | return CreatedBB; | |||
5476 | } | |||
5477 | ||||
5478 | // Legalize MIMG and MUBUF/MTBUF for shaders. | |||
5479 | // | |||
5480 | // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via | |||
5481 | // scratch memory access. In both cases, the legalization never involves | |||
5482 | // conversion to the addr64 form. | |||
5483 | if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && | |||
5484 | (isMUBUF(MI) || isMTBUF(MI)))) { | |||
5485 | MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); | |||
5486 | if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) | |||
5487 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); | |||
5488 | ||||
5489 | MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); | |||
5490 | if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) | |||
5491 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); | |||
5492 | ||||
5493 | return CreatedBB; | |||
5494 | } | |||
5495 | ||||
5496 | // Legalize SI_CALL | |||
5497 | if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { | |||
5498 | MachineOperand *Dest = &MI.getOperand(0); | |||
5499 | if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { | |||
5500 |       // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the | |||
5501 |       // following copies; we also need to move copies from and to physical | |||
5502 | // registers into the loop block. | |||
5503 | unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); | |||
5504 | unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); | |||
5505 | ||||
5506 | // Also move the copies to physical registers into the loop block | |||
5507 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5508 | MachineBasicBlock::iterator Start(&MI); | |||
5509 | while (Start->getOpcode() != FrameSetupOpcode) | |||
5510 | --Start; | |||
5511 | MachineBasicBlock::iterator End(&MI); | |||
5512 | while (End->getOpcode() != FrameDestroyOpcode) | |||
5513 | ++End; | |||
5514 | // Also include following copies of the return value | |||
5515 | ++End; | |||
5516 | while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && | |||
5517 | MI.definesRegister(End->getOperand(1).getReg())) | |||
5518 | ++End; | |||
5519 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End); | |||
5520 | } | |||
5521 | } | |||
5522 | ||||
5523 | // Legalize MUBUF* instructions. | |||
5524 | int RsrcIdx = | |||
5525 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); | |||
5526 | if (RsrcIdx != -1) { | |||
5527 | // We have an MUBUF instruction | |||
5528 | MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); | |||
5529 | unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; | |||
5530 | if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), | |||
5531 | RI.getRegClass(RsrcRC))) { | |||
5532 | // The operands are legal. | |||
5533 |       // FIXME: We may need to legalize operands besides srsrc. | |||
5534 | return CreatedBB; | |||
5535 | } | |||
5536 | ||||
5537 | // Legalize a VGPR Rsrc. | |||
5538 | // | |||
5539 | // If the instruction is _ADDR64, we can avoid a waterfall by extracting | |||
5540 | // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using | |||
5541 | // a zero-value SRsrc. | |||
5542 | // | |||
5543 | // If the instruction is _OFFSET (both idxen and offen disabled), and we | |||
5544 | // support ADDR64 instructions, we can convert to ADDR64 and do the same as | |||
5545 | // above. | |||
5546 | // | |||
5547 | // Otherwise we are on non-ADDR64 hardware, and/or we have | |||
5548 | // idxen/offen/bothen and we fall back to a waterfall loop. | |||
5549 | ||||
5550 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5551 | ||||
5552 | MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); | |||
5553 | if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { | |||
5554 | // This is already an ADDR64 instruction so we need to add the pointer | |||
5555 | // extracted from the resource descriptor to the current value of VAddr. | |||
5556 | Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5557 | Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5558 | Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
5559 | ||||
5560 | const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5561 | Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); | |||
5562 | Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); | |||
5563 | ||||
5564 | unsigned RsrcPtr, NewSRsrc; | |||
5565 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); | |||
5566 | ||||
5567 | // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 | |||
5568 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5569 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) | |||
5570 | .addDef(CondReg0) | |||
5571 | .addReg(RsrcPtr, 0, AMDGPU::sub0) | |||
5572 | .addReg(VAddr->getReg(), 0, AMDGPU::sub0) | |||
5573 | .addImm(0); | |||
5574 | ||||
5575 | // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 | |||
5576 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) | |||
5577 | .addDef(CondReg1, RegState::Dead) | |||
5578 | .addReg(RsrcPtr, 0, AMDGPU::sub1) | |||
5579 | .addReg(VAddr->getReg(), 0, AMDGPU::sub1) | |||
5580 | .addReg(CondReg0, RegState::Kill) | |||
5581 | .addImm(0); | |||
5582 | ||||
5583 | // NewVaddr = {NewVaddrHi, NewVaddrLo} | |||
5584 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) | |||
5585 | .addReg(NewVAddrLo) | |||
5586 | .addImm(AMDGPU::sub0) | |||
5587 | .addReg(NewVAddrHi) | |||
5588 | .addImm(AMDGPU::sub1); | |||
5589 | ||||
5590 | VAddr->setReg(NewVAddr); | |||
5591 | Rsrc->setReg(NewSRsrc); | |||
5592 | } else if (!VAddr && ST.hasAddr64()) { | |||
5593 |       // This instruction is the _OFFSET variant, so we need to convert it to | |||
5594 | // ADDR64. | |||
5595 |       assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && | |||
5596 |              "FIXME: Need to emit flat atomics here"); | |||
5597 | ||||
5598 | unsigned RsrcPtr, NewSRsrc; | |||
5599 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); | |||
5600 | ||||
5601 | Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
5602 | MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); | |||
5603 | MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
5604 | MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); | |||
5605 | unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); | |||
5606 | ||||
5607 |       // Atomics with return have an additional tied operand and are | |||
5608 | // missing some of the special bits. | |||
5609 | MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); | |||
5610 | MachineInstr *Addr64; | |||
5611 | ||||
5612 | if (!VDataIn) { | |||
5613 | // Regular buffer load / store. | |||
5614 | MachineInstrBuilder MIB = | |||
5615 | BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) | |||
5616 | .add(*VData) | |||
5617 | .addReg(NewVAddr) | |||
5618 | .addReg(NewSRsrc) | |||
5619 | .add(*SOffset) | |||
5620 | .add(*Offset); | |||
5621 | ||||
5622 | // Atomics do not have this operand. | |||
5623 | if (const MachineOperand *GLC = | |||
5624 | getNamedOperand(MI, AMDGPU::OpName::glc)) { | |||
5625 | MIB.addImm(GLC->getImm()); | |||
5626 | } | |||
5627 | if (const MachineOperand *DLC = | |||
5628 | getNamedOperand(MI, AMDGPU::OpName::dlc)) { | |||
5629 | MIB.addImm(DLC->getImm()); | |||
5630 | } | |||
5631 | if (const MachineOperand *SCCB = | |||
5632 | getNamedOperand(MI, AMDGPU::OpName::sccb)) { | |||
5633 | MIB.addImm(SCCB->getImm()); | |||
5634 | } | |||
5635 | ||||
5636 | MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); | |||
5637 | ||||
5638 | if (const MachineOperand *TFE = | |||
5639 | getNamedOperand(MI, AMDGPU::OpName::tfe)) { | |||
5640 | MIB.addImm(TFE->getImm()); | |||
5641 | } | |||
5642 | ||||
5643 | MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); | |||
5644 | ||||
5645 | MIB.cloneMemRefs(MI); | |||
5646 | Addr64 = MIB; | |||
5647 | } else { | |||
5648 | // Atomics with return. | |||
5649 | Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) | |||
5650 | .add(*VData) | |||
5651 | .add(*VDataIn) | |||
5652 | .addReg(NewVAddr) | |||
5653 | .addReg(NewSRsrc) | |||
5654 | .add(*SOffset) | |||
5655 | .add(*Offset) | |||
5656 | .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) | |||
5657 | .cloneMemRefs(MI); | |||
5658 | } | |||
5659 | ||||
5660 | MI.removeFromParent(); | |||
5661 | ||||
5662 |       // NewVaddr = {RsrcPtr:sub0, RsrcPtr:sub1} | |||
5663 | BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), | |||
5664 | NewVAddr) | |||
5665 | .addReg(RsrcPtr, 0, AMDGPU::sub0) | |||
5666 | .addImm(AMDGPU::sub0) | |||
5667 | .addReg(RsrcPtr, 0, AMDGPU::sub1) | |||
5668 | .addImm(AMDGPU::sub1); | |||
5669 | } else { | |||
5670 | // This is another variant; legalize Rsrc with waterfall loop from VGPRs | |||
5671 | // to SGPRs. | |||
5672 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); | |||
5673 | return CreatedBB; | |||
5674 | } | |||
5675 | } | |||
5676 | return CreatedBB; | |||
5677 | } | |||
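
// Editor's sketch (illustrative, not part of this file): the ADDR64 path
// above adds the 64-bit base pointer from the descriptor to VAddr using a
// V_ADD_CO_U32 / V_ADDC_U32 pair, i.e. a 32-bit add whose carry feeds the
// high-half add.  The same arithmetic in plain C++ (hypothetical names):
//
//   #include <cstdint>
//
//   static void add64ViaCarry(uint32_t ALo, uint32_t AHi, uint32_t BLo,
//                             uint32_t BHi, uint32_t &SumLo, uint32_t &SumHi) {
//     SumLo = ALo + BLo;                    // V_ADD_CO_U32 (defines carry)
//     uint32_t Carry = SumLo < ALo ? 1 : 0; // carry-out of the low add
//     SumHi = AHi + BHi + Carry;            // V_ADDC_U32 (consumes carry)
//   }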
5678 | ||||
5679 | MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, | |||
5680 | MachineDominatorTree *MDT) const { | |||
5681 | SetVectorType Worklist; | |||
5682 | Worklist.insert(&TopInst); | |||
5683 | MachineBasicBlock *CreatedBB = nullptr; | |||
5684 | MachineBasicBlock *CreatedBBTmp = nullptr; | |||
5685 | ||||
5686 | while (!Worklist.empty()) { | |||
5687 | MachineInstr &Inst = *Worklist.pop_back_val(); | |||
5688 | MachineBasicBlock *MBB = Inst.getParent(); | |||
5689 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
5690 | ||||
5691 | unsigned Opcode = Inst.getOpcode(); | |||
5692 | unsigned NewOpcode = getVALUOp(Inst); | |||
5693 | ||||
5694 | // Handle some special cases | |||
5695 | switch (Opcode) { | |||
5696 | default: | |||
5697 | break; | |||
5698 | case AMDGPU::S_ADD_U64_PSEUDO: | |||
5699 | case AMDGPU::S_SUB_U64_PSEUDO: | |||
5700 | splitScalar64BitAddSub(Worklist, Inst, MDT); | |||
5701 | Inst.eraseFromParent(); | |||
5702 | continue; | |||
5703 | case AMDGPU::S_ADD_I32: | |||
5704 | case AMDGPU::S_SUB_I32: { | |||
5705 | // FIXME: The u32 versions currently selected use the carry. | |||
5706 | bool Changed; | |||
5707 | std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); | |||
5708 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5709 | CreatedBB = CreatedBBTmp; | |||
5710 | if (Changed) | |||
5711 | continue; | |||
5712 | ||||
5713 | // Default handling | |||
5714 | break; | |||
5715 | } | |||
5716 | case AMDGPU::S_AND_B64: | |||
5717 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); | |||
5718 | Inst.eraseFromParent(); | |||
5719 | continue; | |||
5720 | ||||
5721 | case AMDGPU::S_OR_B64: | |||
5722 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); | |||
5723 | Inst.eraseFromParent(); | |||
5724 | continue; | |||
5725 | ||||
5726 | case AMDGPU::S_XOR_B64: | |||
5727 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); | |||
5728 | Inst.eraseFromParent(); | |||
5729 | continue; | |||
5730 | ||||
5731 | case AMDGPU::S_NAND_B64: | |||
5732 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); | |||
5733 | Inst.eraseFromParent(); | |||
5734 | continue; | |||
5735 | ||||
5736 | case AMDGPU::S_NOR_B64: | |||
5737 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); | |||
5738 | Inst.eraseFromParent(); | |||
5739 | continue; | |||
5740 | ||||
5741 | case AMDGPU::S_XNOR_B64: | |||
5742 | if (ST.hasDLInsts()) | |||
5743 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); | |||
5744 | else | |||
5745 | splitScalar64BitXnor(Worklist, Inst, MDT); | |||
5746 | Inst.eraseFromParent(); | |||
5747 | continue; | |||
5748 | ||||
5749 | case AMDGPU::S_ANDN2_B64: | |||
5750 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); | |||
5751 | Inst.eraseFromParent(); | |||
5752 | continue; | |||
5753 | ||||
5754 | case AMDGPU::S_ORN2_B64: | |||
5755 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); | |||
5756 | Inst.eraseFromParent(); | |||
5757 | continue; | |||
5758 | ||||
5759 | case AMDGPU::S_BREV_B64: | |||
5760 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); | |||
5761 | Inst.eraseFromParent(); | |||
5762 | continue; | |||
5763 | ||||
5764 | case AMDGPU::S_NOT_B64: | |||
5765 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); | |||
5766 | Inst.eraseFromParent(); | |||
5767 | continue; | |||
5768 | ||||
5769 | case AMDGPU::S_BCNT1_I32_B64: | |||
5770 | splitScalar64BitBCNT(Worklist, Inst); | |||
5771 | Inst.eraseFromParent(); | |||
5772 | continue; | |||
5773 | ||||
5774 | case AMDGPU::S_BFE_I64: | |||
5775 | splitScalar64BitBFE(Worklist, Inst); | |||
5776 | Inst.eraseFromParent(); | |||
5777 | continue; | |||
5778 | ||||
5779 | case AMDGPU::S_LSHL_B32: | |||
5780 | if (ST.hasOnlyRevVALUShifts()) { | |||
5781 | NewOpcode = AMDGPU::V_LSHLREV_B32_e64; | |||
5782 | swapOperands(Inst); | |||
5783 | } | |||
5784 | break; | |||
5785 | case AMDGPU::S_ASHR_I32: | |||
5786 | if (ST.hasOnlyRevVALUShifts()) { | |||
5787 | NewOpcode = AMDGPU::V_ASHRREV_I32_e64; | |||
5788 | swapOperands(Inst); | |||
5789 | } | |||
5790 | break; | |||
5791 | case AMDGPU::S_LSHR_B32: | |||
5792 | if (ST.hasOnlyRevVALUShifts()) { | |||
5793 | NewOpcode = AMDGPU::V_LSHRREV_B32_e64; | |||
5794 | swapOperands(Inst); | |||
5795 | } | |||
5796 | break; | |||
5797 | case AMDGPU::S_LSHL_B64: | |||
5798 | if (ST.hasOnlyRevVALUShifts()) { | |||
5799 | NewOpcode = AMDGPU::V_LSHLREV_B64_e64; | |||
5800 | swapOperands(Inst); | |||
5801 | } | |||
5802 | break; | |||
5803 | case AMDGPU::S_ASHR_I64: | |||
5804 | if (ST.hasOnlyRevVALUShifts()) { | |||
5805 | NewOpcode = AMDGPU::V_ASHRREV_I64_e64; | |||
5806 | swapOperands(Inst); | |||
5807 | } | |||
5808 | break; | |||
5809 | case AMDGPU::S_LSHR_B64: | |||
5810 | if (ST.hasOnlyRevVALUShifts()) { | |||
5811 | NewOpcode = AMDGPU::V_LSHRREV_B64_e64; | |||
5812 | swapOperands(Inst); | |||
5813 | } | |||
5814 | break; | |||
5815 | ||||
5816 | case AMDGPU::S_ABS_I32: | |||
5817 | lowerScalarAbs(Worklist, Inst); | |||
5818 | Inst.eraseFromParent(); | |||
5819 | continue; | |||
5820 | ||||
5821 | case AMDGPU::S_CBRANCH_SCC0: | |||
5822 | case AMDGPU::S_CBRANCH_SCC1: | |||
5823 | // Clear unused bits of vcc | |||
5824 | if (ST.isWave32()) | |||
5825 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32), | |||
5826 | AMDGPU::VCC_LO) | |||
5827 | .addReg(AMDGPU::EXEC_LO) | |||
5828 | .addReg(AMDGPU::VCC_LO); | |||
5829 | else | |||
5830 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), | |||
5831 | AMDGPU::VCC) | |||
5832 | .addReg(AMDGPU::EXEC) | |||
5833 | .addReg(AMDGPU::VCC); | |||
5834 | break; | |||
5835 | ||||
5836 | case AMDGPU::S_BFE_U64: | |||
5837 | case AMDGPU::S_BFM_B64: | |||
5838 |       llvm_unreachable("Moving this op to VALU not implemented"); | |||
5839 | ||||
5840 | case AMDGPU::S_PACK_LL_B32_B16: | |||
5841 | case AMDGPU::S_PACK_LH_B32_B16: | |||
5842 | case AMDGPU::S_PACK_HH_B32_B16: | |||
5843 | movePackToVALU(Worklist, MRI, Inst); | |||
5844 | Inst.eraseFromParent(); | |||
5845 | continue; | |||
5846 | ||||
5847 | case AMDGPU::S_XNOR_B32: | |||
5848 | lowerScalarXnor(Worklist, Inst); | |||
5849 | Inst.eraseFromParent(); | |||
5850 | continue; | |||
5851 | ||||
5852 | case AMDGPU::S_NAND_B32: | |||
5853 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); | |||
5854 | Inst.eraseFromParent(); | |||
5855 | continue; | |||
5856 | ||||
5857 | case AMDGPU::S_NOR_B32: | |||
5858 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); | |||
5859 | Inst.eraseFromParent(); | |||
5860 | continue; | |||
5861 | ||||
5862 | case AMDGPU::S_ANDN2_B32: | |||
5863 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); | |||
5864 | Inst.eraseFromParent(); | |||
5865 | continue; | |||
5866 | ||||
5867 | case AMDGPU::S_ORN2_B32: | |||
5868 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); | |||
5869 | Inst.eraseFromParent(); | |||
5870 | continue; | |||
5871 | ||||
5872 | // TODO: remove as soon as everything is ready | |||
5873 | // to replace VGPR to SGPR copy with V_READFIRSTLANEs. | |||
5874 | // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO | |||
5875 | // can only be selected from the uniform SDNode. | |||
5876 | case AMDGPU::S_ADD_CO_PSEUDO: | |||
5877 | case AMDGPU::S_SUB_CO_PSEUDO: { | |||
5878 | unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) | |||
5879 | ? AMDGPU::V_ADDC_U32_e64 | |||
5880 | : AMDGPU::V_SUBB_U32_e64; | |||
5881 | const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5882 | ||||
5883 | Register CarryInReg = Inst.getOperand(4).getReg(); | |||
5884 | if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { | |||
5885 | Register NewCarryReg = MRI.createVirtualRegister(CarryRC); | |||
5886 | BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) | |||
5887 | .addReg(CarryInReg); | |||
5888 | CarryInReg = NewCarryReg; // use the constrained copy as the carry-in below | |||
5889 | } | |||
5890 | Register CarryOutReg = Inst.getOperand(1).getReg(); | |||
5891 | ||||
5892 | Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( | |||
5893 | MRI.getRegClass(Inst.getOperand(0).getReg()))); | |||
5894 | MachineInstr *CarryOp = | |||
5895 | BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) | |||
5896 | .addReg(CarryOutReg, RegState::Define) | |||
5897 | .add(Inst.getOperand(2)) | |||
5898 | .add(Inst.getOperand(3)) | |||
5899 | .addReg(CarryInReg) | |||
5900 | .addImm(0); | |||
5901 | CreatedBBTmp = legalizeOperands(*CarryOp); | |||
5902 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5903 | CreatedBB = CreatedBBTmp; | |||
5904 | MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); | |||
5905 | addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); | |||
5906 | Inst.eraseFromParent(); | |||
5907 | } | |||
5908 | continue; | |||
5909 | case AMDGPU::S_UADDO_PSEUDO: | |||
5910 | case AMDGPU::S_USUBO_PSEUDO: { | |||
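// Roughly: %res, %ovf = S_UADDO_PSEUDO %a, %b becomes
//   %vres:vgpr_32, %ovf = V_ADD_CO_U32_e64 %a, %b, 0 (clamp)
// after which users of %res are rewritten to %vres and queued for the VALU
// worklist. (Register names here are illustrative only.)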
5911 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
5912 | MachineOperand &Dest0 = Inst.getOperand(0); | |||
5913 | MachineOperand &Dest1 = Inst.getOperand(1); | |||
5914 | MachineOperand &Src0 = Inst.getOperand(2); | |||
5915 | MachineOperand &Src1 = Inst.getOperand(3); | |||
5916 | ||||
5917 | unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) | |||
5918 | ? AMDGPU::V_ADD_CO_U32_e64 | |||
5919 | : AMDGPU::V_SUB_CO_U32_e64; | |||
5920 | const TargetRegisterClass *NewRC = | |||
5921 | RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); | |||
5922 | Register DestReg = MRI.createVirtualRegister(NewRC); | |||
5923 | MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) | |||
5924 | .addReg(Dest1.getReg(), RegState::Define) | |||
5925 | .add(Src0) | |||
5926 | .add(Src1) | |||
5927 | .addImm(0); // clamp bit | |||
5928 | ||||
5929 | CreatedBBTmp = legalizeOperands(*NewInstr, MDT); | |||
5930 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5931 | CreatedBB = CreatedBBTmp; | |||
5932 | ||||
5933 | MRI.replaceRegWith(Dest0.getReg(), DestReg); | |||
5934 | addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, | |||
5935 | Worklist); | |||
5936 | Inst.eraseFromParent(); | |||
5937 | } | |||
5938 | continue; | |||
5939 | ||||
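// S_CSELECT reads SCC, which has no VALU analogue; lowerSelect rebuilds the
// select around a vector-compatible condition instead.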
5940 | case AMDGPU::S_CSELECT_B32: | |||
5941 | case AMDGPU::S_CSELECT_B64: | |||
5942 | lowerSelect(Worklist, Inst, MDT); | |||
5943 | Inst.eraseFromParent(); | |||
5944 | continue; | |||
5945 | } | |||
5946 | ||||
5947 | if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
5948 | // We cannot move this instruction to the VALU, so we should try to | |||
5949 | // legalize its operands instead. | |||
5950 | CreatedBBTmp = legalizeOperands(Inst, MDT); | |||
5951 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5952 | CreatedBB = CreatedBBTmp; | |||
5953 | continue; | |||
5954 | } | |||
5955 | ||||
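// From here on the instruction has a direct VALU replacement: swap in the new
// descriptor, then patch up the operand differences between the SALU and VALU
// encodings before legalizing.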
5956 | // Use the new VALU Opcode. | |||
5957 | const MCInstrDesc &NewDesc = get(NewOpcode); | |||
5958 | Inst.setDesc(NewDesc); | |||
5959 | ||||
5960 | // Remove any references to SCC. Vector instructions can't read from it, | |||
5961 | // and we're just about to add the implicit use/defs of VCC, so we don't | |||
5962 | // want both. | |||
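// Walk the operands backwards so RemoveOperand() does not shift the indices
// of operands that have not been visited yet.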
5963 | for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { | |||
5964 | MachineOperand &Op = Inst.getOperand(i); | |||
5965 | if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { | |||
5966 | // Only propagate through a live (non-dead) def of SCC. | |||
5967 | if (Op.isDef() && !Op.isDead()) | |||
5968 | addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); | |||
5969 | Inst.RemoveOperand(i); | |||
5970 | } | |||
5971 | } | |||
5972 | ||||
5973 | if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { | |||
5974 | // We are converting these to a BFE, so we need to add the missing | |||
5975 | // operands for the size and offset. | |||
5976 | unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; | |||
5977 | Inst.addOperand(MachineOperand::CreateImm(0)); | |||
5978 | Inst.addOperand(MachineOperand::CreateImm(Size)); | |||
5979 | ||||
5980 | } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { | |||
5981 | // The VALU version adds the second operand to the result, so insert an | |||
5982 | // extra 0 operand. | |||
5983 | Inst.addOperand(MachineOperand::CreateImm(0)); | |||
5984 | } | |||
5985 | ||||
5986 | Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); | |||
5987 | fixImplicitOperands(Inst); | |||
5988 | ||||
5989 | if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { | |||
5990 | const MachineOperand &OffsetWidthOp = Inst.getOperand(2); | |||
5991 | // If we need to move this to VGPRs, we need to unpack the second operand | |||
5992 | // back into the 2 separate ones for bit offset and width. | |||
5993 | assert(OffsetWidthOp.isImm() && | |||
5994 | "Scalar BFE is only implemented for constant width and offset"); | |||
5995 | uint32_t Imm = OffsetWidthOp.getImm(); | |||
5996 | ||||
5997 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. | |||
5998 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. | |||
5999 | Inst.RemoveOperand(2); // Remove old immediate. | |||
6000 | Inst.addOperand(MachineOperand::CreateImm(Offset)); | |||
6001 | Inst.addOperand(MachineOperand::CreateImm(BitWidth)); | |||
6002 | } | |||
6003 | ||||
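// Finally, retarget the result: give the instruction a VGPR destination class,
// replace the old destination register, and queue its users so they get moved
// to the VALU as well.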
6004 | bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); | |||
6005 | unsigned NewDstReg = AMDGPU::NoRegister; | |||
6006 | if (HasDst) { | |||
6007 | Register DstReg = Inst.getOperand(0).getReg(); | |||
6008 | if (DstReg.isPhysical()) | |||
6009 | continue; | |||
6010 | ||||
6011 | // Update the destination register class. | |||
6012 | const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); | |||
6013 | if (!NewDstRC) | |||
6014 | continue; | |||
6015 | ||||
6016 | if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && | |||
6017 | NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { | |||
6018 | // Instead of creating a copy where src and dst are the same register | |||
6019 | // class, we just replace all uses of dst with src. These kinds of | |||
6020 | // copies interfere with the heuristics MachineSink uses to decide | |||
6021 | // whether or not to split a critical edge, since that pass assumes | |||
6022 | // copies will end up as machine instructions and will not be | |||
6023 | // eliminated. | |||
6024 | addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); | |||
6025 | MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); | |||
6026 | MRI.clearKillFlags(Inst.getOperand(1).getReg()); | |||
6027 | Inst.getOperand(0).setReg(DstReg); | |||
6028 | ||||
6029 | // Make sure we don't leave around a dead VGPR->SGPR copy. Normally | |||
6030 | // these are deleted later, but at -O0 it would leave a suspicious | |||
6031 | // looking illegal copy of an undef register. | |||
6032 | for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) | |||
6033 | Inst.RemoveOperand(I); | |||
6034 | Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); | |||
6035 | continue; | |||
6036 | } | |||
6037 | ||||
6038 | NewDstReg = MRI.createVirtualRegister(NewDstRC); | |||
6039 | MRI.replaceRegWith(DstReg, NewDstReg); | |||
6040 | } | |||
6041 | ||||
6042 | // Legalize the operands | |||
6043 | CreatedBBTmp = legalizeOperands(Inst, MDT); | |||
6044 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6045 | CreatedBB = CreatedBBTmp; | |||
6046 | ||||
6047 | if (HasDst) | |||
6048 | addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); | |||
6049 | } | |||
6050 | return CreatedBB; | |||
6051 | } | |||
6052 | ||||
6053 | // Add/sub require special handling to deal with carry outs. | |||
6054 | std::pair<bool, MachineBasicBlock *> | |||
6055 | SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, | |||
6056 | MachineDominatorTree *MDT) const { | |||
6057 | if (ST.hasAddNoCarry()) { | |||
6058 | // Assume there is no user of SCC, since we don't select this form when SCC | |||
6059 | // is used. And since SCC isn't used, it doesn't really matter whether the | |||
6060 | // i32 or u32 variant is used. | |||
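// The rewrite happens in place: drop the dead SCC def (operand 3), switch to
// the no-carry VALU opcode, append the clamp operand, and retarget the result
// to a fresh VGPR whose users are then queued for the VALU worklist.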
6061 | ||||
6062 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6063 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6064 | ||||
6065 | Register OldDstReg = Inst.getOperand(0).getReg(); | |||
6066 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6067 | ||||
6068 | unsigned Opc = Inst.getOpcode(); | |||
6069 | assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); | |||
6070 | ||||
6071 | unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? | |||
6072 | AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; | |||
6073 | ||||
6074 | assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); | |||
6075 | Inst.RemoveOperand(3); | |||
6076 | ||||
6077 | Inst.setDesc(get(NewOpc)); | |||
6078 | Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit | |||
6079 | Inst.addImplicitDefUseOperands(*MBB.getParent()); | |||
6080 | MRI.replaceRegWith(OldDstReg, ResultReg); | |||
6081 | MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT); | |||
6082 | ||||
6083 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6084 | return std::make_pair(true, NewBB); | |||
6085 | } | |||
6086 | ||||
6087 | return std::make_pair(false, nullptr); | |||
6088 | } | |||
6089 | ||||
6090 | void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, | |||
6091 | MachineDominatorTree *MDT) const { | |||
6092 | ||||
6093 | MachineBasicBlock &MBB = *Inst.getParent(); | |||