File: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Warning: line 1930, column 15: Called C++ object pointer is uninitialized
1 | //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | /// \file | |||
10 | /// SI Implementation of TargetInstrInfo. | |||
11 | // | |||
12 | //===----------------------------------------------------------------------===// | |||
13 | ||||
14 | #include "SIInstrInfo.h" | |||
15 | #include "AMDGPU.h" | |||
16 | #include "AMDGPUInstrInfo.h" | |||
17 | #include "GCNHazardRecognizer.h" | |||
18 | #include "GCNSubtarget.h" | |||
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
20 | #include "SIMachineFunctionInfo.h" | |||
21 | #include "llvm/Analysis/ValueTracking.h" | |||
22 | #include "llvm/CodeGen/LiveVariables.h" | |||
23 | #include "llvm/CodeGen/MachineDominators.h" | |||
24 | #include "llvm/CodeGen/RegisterScavenging.h" | |||
25 | #include "llvm/CodeGen/ScheduleDAG.h" | |||
26 | #include "llvm/IR/DiagnosticInfo.h" | |||
27 | #include "llvm/IR/IntrinsicsAMDGPU.h" | |||
28 | #include "llvm/Support/CommandLine.h" | |||
29 | #include "llvm/Target/TargetMachine.h" | |||
30 | ||||
31 | using namespace llvm; | |||
32 | ||||
33 | #define DEBUG_TYPE "si-instr-info" | |||
34 | ||||
35 | #define GET_INSTRINFO_CTOR_DTOR | |||
36 | #include "AMDGPUGenInstrInfo.inc" | |||
37 | ||||
38 | namespace llvm { | |||
39 | ||||
40 | class AAResults; | |||
41 | ||||
42 | namespace AMDGPU { | |||
43 | #define GET_D16ImageDimIntrinsics_IMPL | |||
44 | #define GET_ImageDimIntrinsicTable_IMPL | |||
45 | #define GET_RsrcIntrinsics_IMPL | |||
46 | #include "AMDGPUGenSearchableTables.inc" | |||
47 | } | |||
48 | } | |||
49 | ||||
50 | ||||
51 | // Must be at least 4 to be able to branch over minimum unconditional branch | |||
52 | // code. This is only for making it possible to write reasonably small tests for | |||
53 | // long branches. | |||
54 | static cl::opt<unsigned> | |||
55 | BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), | |||
56 | cl::desc("Restrict range of branch instructions (DEBUG)")); | |||
57 | ||||
58 | static cl::opt<bool> Fix16BitCopies( | |||
59 | "amdgpu-fix-16-bit-physreg-copies", | |||
60 | cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), | |||
61 | cl::init(true), | |||
62 | cl::ReallyHidden); | |||
63 | ||||
64 | SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) | |||
65 | : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), | |||
66 | RI(ST), ST(ST) { | |||
67 | SchedModel.init(&ST); | |||
68 | } | |||
69 | ||||
70 | //===----------------------------------------------------------------------===// | |||
71 | // TargetInstrInfo callbacks | |||
72 | //===----------------------------------------------------------------------===// | |||
73 | ||||
74 | static unsigned getNumOperandsNoGlue(SDNode *Node) { | |||
75 | unsigned N = Node->getNumOperands(); | |||
76 | while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) | |||
77 | --N; | |||
78 | return N; | |||
79 | } | |||
80 | ||||
81 | /// Returns true if both nodes have the same value for the given | |||
82 | /// operand \p Op, or if both nodes do not have this operand. | |||
83 | static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { | |||
84 | unsigned Opc0 = N0->getMachineOpcode(); | |||
85 | unsigned Opc1 = N1->getMachineOpcode(); | |||
86 | ||||
87 | int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); | |||
88 | int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); | |||
89 | ||||
90 | if (Op0Idx == -1 && Op1Idx == -1) | |||
91 | return true; | |||
92 | ||||
93 | ||||
94 | if ((Op0Idx == -1 && Op1Idx != -1) || | |||
95 | (Op1Idx == -1 && Op0Idx != -1)) | |||
96 | return false; | |||
97 | ||||
98 | // getNamedOperandIdx returns the index for the MachineInstr's operands, | |||
99 | // which includes the result as the first operand. We are indexing into the | |||
100 | // MachineSDNode's operands, so we need to skip the result operand to get | |||
101 | // the real index. | |||
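// (Illustrative note, not in the original source: a named operand at MachineInstr
// index 1 corresponds to MachineSDNode operand 0 once the single result def is
// skipped, which is what the decrements below account for.)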
102 | --Op0Idx; | |||
103 | --Op1Idx; | |||
104 | ||||
105 | return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); | |||
106 | } | |||
107 | ||||
108 | bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, | |||
109 | AAResults *AA) const { | |||
110 | // TODO: The generic check fails for VALU instructions that should be | |||
111 | // rematerializable due to implicit reads of exec. We really want all of the | |||
112 | // generic logic for this except for that one check. | |||
113 | switch (MI.getOpcode()) { | |||
114 | case AMDGPU::V_MOV_B32_e32: | |||
115 | case AMDGPU::V_MOV_B32_e64: | |||
116 | case AMDGPU::V_MOV_B64_PSEUDO: | |||
117 | case AMDGPU::V_ACCVGPR_READ_B32_e64: | |||
118 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
119 | // No implicit operands. | |||
120 | return MI.getNumOperands() == MI.getDesc().getNumOperands(); | |||
121 | default: | |||
122 | return false; | |||
123 | } | |||
124 | } | |||
125 | ||||
126 | bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, | |||
127 | int64_t &Offset0, | |||
128 | int64_t &Offset1) const { | |||
129 | if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) | |||
130 | return false; | |||
131 | ||||
132 | unsigned Opc0 = Load0->getMachineOpcode(); | |||
133 | unsigned Opc1 = Load1->getMachineOpcode(); | |||
134 | ||||
135 | // Make sure both are actually loads. | |||
136 | if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) | |||
137 | return false; | |||
138 | ||||
139 | if (isDS(Opc0) && isDS(Opc1)) { | |||
140 | ||||
141 | // FIXME: Handle this case: | |||
142 | if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) | |||
143 | return false; | |||
144 | ||||
145 | // Check base reg. | |||
146 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
147 | return false; | |||
148 | ||||
149 | // Skip read2 / write2 variants for simplicity. | |||
150 | // TODO: We should report true if the used offsets are adjacent (excluding | |||
151 | // st64 versions). | |||
152 | int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
153 | int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
154 | if (Offset0Idx == -1 || Offset1Idx == -1) | |||
155 | return false; | |||
156 | ||||
157 | // XXX - be careful of dataless loads | |||
158 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
159 | // include the output in the operand list, but SDNodes don't, we need to | |||
160 | // subtract one from the index. | |||
161 | Offset0Idx -= get(Opc0).NumDefs; | |||
162 | Offset1Idx -= get(Opc1).NumDefs; | |||
163 | Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); | |||
164 | Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); | |||
165 | return true; | |||
166 | } | |||
167 | ||||
168 | if (isSMRD(Opc0) && isSMRD(Opc1)) { | |||
169 | // Skip time and cache invalidation instructions. | |||
170 | if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || | |||
171 | AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) | |||
172 | return false; | |||
173 | ||||
174 | assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); | |||
175 | ||||
176 | // Check base reg. | |||
177 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
178 | return false; | |||
179 | ||||
180 | const ConstantSDNode *Load0Offset = | |||
181 | dyn_cast<ConstantSDNode>(Load0->getOperand(1)); | |||
182 | const ConstantSDNode *Load1Offset = | |||
183 | dyn_cast<ConstantSDNode>(Load1->getOperand(1)); | |||
184 | ||||
185 | if (!Load0Offset || !Load1Offset) | |||
186 | return false; | |||
187 | ||||
188 | Offset0 = Load0Offset->getZExtValue(); | |||
189 | Offset1 = Load1Offset->getZExtValue(); | |||
190 | return true; | |||
191 | } | |||
192 | ||||
193 | // MUBUF and MTBUF can access the same addresses. | |||
194 | if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { | |||
195 | ||||
196 | // MUBUF and MTBUF have vaddr at different indices. | |||
197 | if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || | |||
198 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || | |||
199 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) | |||
200 | return false; | |||
201 | ||||
202 | int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
203 | int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
204 | ||||
205 | if (OffIdx0 == -1 || OffIdx1 == -1) | |||
206 | return false; | |||
207 | ||||
208 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
209 | // include the output in the operand list, but SDNodes don't, we need to | |||
210 | // subtract one from the index. | |||
211 | OffIdx0 -= get(Opc0).NumDefs; | |||
212 | OffIdx1 -= get(Opc1).NumDefs; | |||
213 | ||||
214 | SDValue Off0 = Load0->getOperand(OffIdx0); | |||
215 | SDValue Off1 = Load1->getOperand(OffIdx1); | |||
216 | ||||
217 | // The offset might be a FrameIndexSDNode. | |||
218 | if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) | |||
219 | return false; | |||
220 | ||||
221 | Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); | |||
222 | Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); | |||
223 | return true; | |||
224 | } | |||
225 | ||||
226 | return false; | |||
227 | } | |||
228 | ||||
229 | static bool isStride64(unsigned Opc) { | |||
230 | switch (Opc) { | |||
231 | case AMDGPU::DS_READ2ST64_B32: | |||
232 | case AMDGPU::DS_READ2ST64_B64: | |||
233 | case AMDGPU::DS_WRITE2ST64_B32: | |||
234 | case AMDGPU::DS_WRITE2ST64_B64: | |||
235 | return true; | |||
236 | default: | |||
237 | return false; | |||
238 | } | |||
239 | } | |||
240 | ||||
241 | bool SIInstrInfo::getMemOperandsWithOffsetWidth( | |||
242 | const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, | |||
243 | int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, | |||
244 | const TargetRegisterInfo *TRI) const { | |||
245 | if (!LdSt.mayLoadOrStore()) | |||
246 | return false; | |||
247 | ||||
248 | unsigned Opc = LdSt.getOpcode(); | |||
249 | OffsetIsScalable = false; | |||
250 | const MachineOperand *BaseOp, *OffsetOp; | |||
251 | int DataOpIdx; | |||
252 | ||||
253 | if (isDS(LdSt)) { | |||
254 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); | |||
255 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
256 | if (OffsetOp) { | |||
257 | // Normal, single offset LDS instruction. | |||
258 | if (!BaseOp) { | |||
259 | // DS_CONSUME/DS_APPEND use M0 for the base address. | |||
260 | // TODO: find the implicit use operand for M0 and use that as BaseOp? | |||
261 | return false; | |||
262 | } | |||
263 | BaseOps.push_back(BaseOp); | |||
264 | Offset = OffsetOp->getImm(); | |||
265 | // Get appropriate operand, and compute width accordingly. | |||
266 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
267 | if (DataOpIdx == -1) | |||
268 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
269 | Width = getOpSize(LdSt, DataOpIdx); | |||
270 | } else { | |||
271 | // The 2 offset instructions use offset0 and offset1 instead. We can treat | |||
272 | // these as a load with a single offset if the 2 offsets are consecutive. | |||
273 | // We will use this for some partially aligned loads. | |||
274 | const MachineOperand *Offset0Op = | |||
275 | getNamedOperand(LdSt, AMDGPU::OpName::offset0); | |||
276 | const MachineOperand *Offset1Op = | |||
277 | getNamedOperand(LdSt, AMDGPU::OpName::offset1); | |||
278 | ||||
279 | unsigned Offset0 = Offset0Op->getImm(); | |||
280 | unsigned Offset1 = Offset1Op->getImm(); | |||
281 | if (Offset0 + 1 != Offset1) | |||
282 | return false; | |||
283 | ||||
284 | // Each of these offsets is in element sized units, so we need to convert | |||
285 | // to bytes of the individual reads. | |||
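// Worked example (added for illustration, not in the original source): a
// ds_read2_b32 with offset0 == 2 and offset1 == 3 has EltSize == 4, so it is
// reported as one 8-byte access at byte offset 8.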
286 | ||||
287 | unsigned EltSize; | |||
288 | if (LdSt.mayLoad()) | |||
289 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; | |||
290 | else { | |||
291 | assert(LdSt.mayStore()); | |||
292 | int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
293 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; | |||
294 | } | |||
295 | ||||
296 | if (isStride64(Opc)) | |||
297 | EltSize *= 64; | |||
298 | ||||
299 | BaseOps.push_back(BaseOp); | |||
300 | Offset = EltSize * Offset0; | |||
301 | // Get appropriate operand(s), and compute width accordingly. | |||
302 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
303 | if (DataOpIdx == -1) { | |||
304 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
305 | Width = getOpSize(LdSt, DataOpIdx); | |||
306 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); | |||
307 | Width += getOpSize(LdSt, DataOpIdx); | |||
308 | } else { | |||
309 | Width = getOpSize(LdSt, DataOpIdx); | |||
310 | } | |||
311 | } | |||
312 | return true; | |||
313 | } | |||
314 | ||||
315 | if (isMUBUF(LdSt) || isMTBUF(LdSt)) { | |||
316 | const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); | |||
317 | if (SOffset && SOffset->isReg()) { | |||
318 | // We can only handle this if it's a stack access, as any other resource | |||
319 | // would require reporting multiple base registers. | |||
320 | const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
321 | if (AddrReg && !AddrReg->isFI()) | |||
322 | return false; | |||
323 | ||||
324 | const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); | |||
325 | const SIMachineFunctionInfo *MFI | |||
326 | = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>(); | |||
327 | if (RSrc->getReg() != MFI->getScratchRSrcReg()) | |||
328 | return false; | |||
329 | ||||
330 | const MachineOperand *OffsetImm = | |||
331 | getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
332 | BaseOps.push_back(RSrc); | |||
333 | BaseOps.push_back(SOffset); | |||
334 | Offset = OffsetImm->getImm(); | |||
335 | } else { | |||
336 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); | |||
337 | if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL | |||
338 | return false; | |||
339 | BaseOps.push_back(BaseOp); | |||
340 | ||||
341 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
342 | if (BaseOp) | |||
343 | BaseOps.push_back(BaseOp); | |||
344 | ||||
345 | const MachineOperand *OffsetImm = | |||
346 | getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
347 | Offset = OffsetImm->getImm(); | |||
348 | if (SOffset) // soffset can be an inline immediate. | |||
349 | Offset += SOffset->getImm(); | |||
350 | } | |||
351 | // Get appropriate operand, and compute width accordingly. | |||
352 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
353 | if (DataOpIdx == -1) | |||
354 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
355 | Width = getOpSize(LdSt, DataOpIdx); | |||
356 | return true; | |||
357 | } | |||
358 | ||||
359 | if (isMIMG(LdSt)) { | |||
360 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); | |||
361 | BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); | |||
362 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); | |||
363 | if (VAddr0Idx >= 0) { | |||
364 | // GFX10 possible NSA encoding. | |||
365 | for (int I = VAddr0Idx; I < SRsrcIdx; ++I) | |||
366 | BaseOps.push_back(&LdSt.getOperand(I)); | |||
367 | } else { | |||
368 | BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); | |||
369 | } | |||
370 | Offset = 0; | |||
371 | // Get appropriate operand, and compute width accordingly. | |||
372 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
373 | Width = getOpSize(LdSt, DataOpIdx); | |||
374 | return true; | |||
375 | } | |||
376 | ||||
377 | if (isSMRD(LdSt)) { | |||
378 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); | |||
379 | if (!BaseOp) // e.g. S_MEMTIME | |||
380 | return false; | |||
381 | BaseOps.push_back(BaseOp); | |||
382 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
383 | Offset = OffsetOp ? OffsetOp->getImm() : 0; | |||
384 | // Get appropriate operand, and compute width accordingly. | |||
385 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); | |||
386 | Width = getOpSize(LdSt, DataOpIdx); | |||
387 | return true; | |||
388 | } | |||
389 | ||||
390 | if (isFLAT(LdSt)) { | |||
391 | // Instructions have either vaddr or saddr or both or none. | |||
392 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
393 | if (BaseOp) | |||
394 | BaseOps.push_back(BaseOp); | |||
395 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); | |||
396 | if (BaseOp) | |||
397 | BaseOps.push_back(BaseOp); | |||
398 | Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); | |||
399 | // Get appropriate operand, and compute width accordingly. | |||
400 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
401 | if (DataOpIdx == -1) | |||
402 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
403 | Width = getOpSize(LdSt, DataOpIdx); | |||
404 | return true; | |||
405 | } | |||
406 | ||||
407 | return false; | |||
408 | } | |||
409 | ||||
410 | static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, | |||
411 | ArrayRef<const MachineOperand *> BaseOps1, | |||
412 | const MachineInstr &MI2, | |||
413 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
414 | // Only examine the first "base" operand of each instruction, on the | |||
415 | // assumption that it represents the real base address of the memory access. | |||
416 | // Other operands are typically offsets or indices from this base address. | |||
417 | if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) | |||
418 | return true; | |||
419 | ||||
420 | if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) | |||
421 | return false; | |||
422 | ||||
423 | auto MO1 = *MI1.memoperands_begin(); | |||
424 | auto MO2 = *MI2.memoperands_begin(); | |||
425 | if (MO1->getAddrSpace() != MO2->getAddrSpace()) | |||
426 | return false; | |||
427 | ||||
428 | auto Base1 = MO1->getValue(); | |||
429 | auto Base2 = MO2->getValue(); | |||
430 | if (!Base1 || !Base2) | |||
431 | return false; | |||
432 | Base1 = getUnderlyingObject(Base1); | |||
433 | Base2 = getUnderlyingObject(Base2); | |||
434 | ||||
435 | if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) | |||
436 | return false; | |||
437 | ||||
438 | return Base1 == Base2; | |||
439 | } | |||
440 | ||||
441 | bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, | |||
442 | ArrayRef<const MachineOperand *> BaseOps2, | |||
443 | unsigned NumLoads, | |||
444 | unsigned NumBytes) const { | |||
445 | // If the mem ops (to be clustered) do not have the same base ptr, then they | |||
446 | // should not be clustered | |||
447 | if (!BaseOps1.empty() && !BaseOps2.empty()) { | |||
448 | const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); | |||
449 | const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); | |||
450 | if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) | |||
451 | return false; | |||
452 | } else if (!BaseOps1.empty() || !BaseOps2.empty()) { | |||
453 | // If only one base op is empty, they do not have the same base ptr | |||
454 | return false; | |||
455 | } | |||
456 | ||||
457 | // In order to avoid register pressure, on average, the number of DWORDs | |||
458 | // loaded together by all clustered mem ops should not exceed 8. This is an | |||
459 | // empirical value based on certain observations and performance related | |||
460 | // experiments. | |||
461 | // The good thing about this heuristic is that it avoids clustering of too many | |||
462 | // sub-word loads, and also avoids clustering of wide loads. Below is the | |||
463 | // brief summary of how the heuristic behaves for various `LoadSize`. | |||
464 | // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops | |||
465 | // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops | |||
466 | // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops | |||
467 | // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops | |||
468 | // (5) LoadSize >= 17: do not cluster | |||
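// Worked example (illustrative, not part of the original comment): four
// 6-byte loads give LoadSize = 24 / 4 = 6 and NumDWORDs = ((6 + 3) / 4) * 4 = 8,
// which is still clustered; four 12-byte loads give NumDWORDs = 12 and are not.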
469 | const unsigned LoadSize = NumBytes / NumLoads; | |||
470 | const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; | |||
471 | return NumDWORDs <= 8; | |||
472 | } | |||
473 | ||||
474 | // FIXME: This behaves strangely. If, for example, you have 32 load + stores, | |||
475 | // the first 16 loads will be interleaved with the stores, and the next 16 will | |||
476 | // be clustered as expected. It should really split into two batches of 16 stores. | |||
477 | // | |||
478 | // Loads are clustered until this returns false, rather than trying to schedule | |||
479 | // groups of stores. This also means we have to deal with saying different | |||
480 | // address space loads should be clustered, and ones which might cause bank | |||
481 | // conflicts. | |||
482 | // | |||
483 | // This might be deprecated so it might not be worth that much effort to fix. | |||
484 | bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, | |||
485 | int64_t Offset0, int64_t Offset1, | |||
486 | unsigned NumLoads) const { | |||
487 | assert(Offset1 > Offset0 && | |||
488 | "Second offset should be larger than first offset!"); | |||
489 | // If we have fewer than 16 loads in a row, and the offsets are within 64 | |||
490 | // bytes, then schedule together. | |||
491 | ||||
492 | // A cacheline is 64 bytes (for global memory). | |||
493 | return (NumLoads <= 16 && (Offset1 - Offset0) < 64); | |||
494 | } | |||
495 | ||||
496 | static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, | |||
497 | MachineBasicBlock::iterator MI, | |||
498 | const DebugLoc &DL, MCRegister DestReg, | |||
499 | MCRegister SrcReg, bool KillSrc, | |||
500 | const char *Msg = "illegal SGPR to VGPR copy") { | |||
501 | MachineFunction *MF = MBB.getParent(); | |||
502 | DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); | |||
503 | LLVMContext &C = MF->getFunction().getContext(); | |||
504 | C.diagnose(IllegalCopy); | |||
505 | ||||
506 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) | |||
507 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
508 | } | |||
509 | ||||
510 | /// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible | |||
511 | /// to directly copy, so an intermediate VGPR needs to be used. | |||
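/// A rough sketch of the expansion when no defining accvgpr_write can be reused
/// (illustrative only; register names are placeholders):
///   v_mov_b32_e32       vTmp, sSrc    ; v_accvgpr_read_b32 for AGPR sources
///   v_accvgpr_write_b32 aDst, vTmp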
512 | static void indirectCopyToAGPR(const SIInstrInfo &TII, | |||
513 | MachineBasicBlock &MBB, | |||
514 | MachineBasicBlock::iterator MI, | |||
515 | const DebugLoc &DL, MCRegister DestReg, | |||
516 | MCRegister SrcReg, bool KillSrc, | |||
517 | RegScavenger &RS, | |||
518 | Register ImpDefSuperReg = Register(), | |||
519 | Register ImpUseSuperReg = Register()) { | |||
520 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
521 | ||||
522 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || | |||
523 | AMDGPU::AGPR_32RegClass.contains(SrcReg)); | |||
524 | ||||
525 | // First try to find defining accvgpr_write to avoid temporary registers. | |||
526 | for (auto Def = MI, E = MBB.begin(); Def != E; ) { | |||
527 | --Def; | |||
528 | if (!Def->definesRegister(SrcReg, &RI)) | |||
529 | continue; | |||
530 | if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) | |||
531 | break; | |||
532 | ||||
533 | MachineOperand &DefOp = Def->getOperand(1); | |||
534 | assert(DefOp.isReg() || DefOp.isImm()); | |||
535 | ||||
536 | if (DefOp.isReg()) { | |||
537 | // Check that the register source operand is not clobbered before MI. | |||
538 | // Immediate operands are always safe to propagate. | |||
539 | bool SafeToPropagate = true; | |||
540 | for (auto I = Def; I != MI && SafeToPropagate; ++I) | |||
541 | if (I->modifiesRegister(DefOp.getReg(), &RI)) | |||
542 | SafeToPropagate = false; | |||
543 | ||||
544 | if (!SafeToPropagate) | |||
545 | break; | |||
546 | ||||
547 | DefOp.setIsKill(false); | |||
548 | } | |||
549 | ||||
550 | MachineInstrBuilder Builder = | |||
551 | BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
552 | .add(DefOp); | |||
553 | if (ImpDefSuperReg) | |||
554 | Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
555 | ||||
556 | if (ImpUseSuperReg) { | |||
557 | Builder.addReg(ImpUseSuperReg, | |||
558 | getKillRegState(KillSrc) | RegState::Implicit); | |||
559 | } | |||
560 | ||||
561 | return; | |||
562 | } | |||
563 | ||||
564 | RS.enterBasicBlock(MBB); | |||
565 | RS.forward(MI); | |||
566 | ||||
567 | // Ideally we want to have three registers for a long reg_sequence copy | |||
568 | // to hide 2 waitstates between v_mov_b32 and accvgpr_write. | |||
569 | unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, | |||
570 | *MBB.getParent()); | |||
571 | ||||
572 | // Registers in the sequence are allocated contiguously so we can just | |||
573 | // use register number to pick one of three round-robin temps. | |||
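// (Illustrative note, not in the original source: consecutive destination AGPRs
// therefore cycle through up to three distinct VGPR temporaries.)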
574 | unsigned RegNo = DestReg % 3; | |||
575 | Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
576 | if (!Tmp) | |||
577 | report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); | |||
578 | RS.setRegUsed(Tmp); | |||
579 | // Only loop through if there are any free registers left, otherwise | |||
580 | // the scavenger may report a fatal error when there is no emergency spill | |||
581 | // slot, or emit a spill using the slot. | |||
582 | while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { | |||
583 | Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
584 | if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) | |||
585 | break; | |||
586 | Tmp = Tmp2; | |||
587 | RS.setRegUsed(Tmp); | |||
588 | } | |||
589 | ||||
590 | // Insert copy to temporary VGPR. | |||
591 | unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; | |||
592 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { | |||
593 | TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
594 | } else { | |||
595 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); | |||
596 | } | |||
597 | ||||
598 | MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) | |||
599 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
600 | if (ImpUseSuperReg) { | |||
601 | UseBuilder.addReg(ImpUseSuperReg, | |||
602 | getKillRegState(KillSrc) | RegState::Implicit); | |||
603 | } | |||
604 | ||||
605 | MachineInstrBuilder DefBuilder | |||
606 | = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
607 | .addReg(Tmp, RegState::Kill); | |||
608 | ||||
609 | if (ImpDefSuperReg) | |||
610 | DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
611 | } | |||
612 | ||||
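/// Illustrative sketch (not in the original source): an aligned 128-bit SGPR
/// copy such as s[4:7] <- s[0:3] is split by expandSGPRCopy into two s_mov_b64
/// instructions, while unaligned pieces fall back to s_mov_b32.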
613 | static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, | |||
614 | MachineBasicBlock::iterator MI, const DebugLoc &DL, | |||
615 | MCRegister DestReg, MCRegister SrcReg, bool KillSrc, | |||
616 | const TargetRegisterClass *RC, bool Forward) { | |||
617 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
618 | ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4); | |||
619 | MachineBasicBlock::iterator I = MI; | |||
620 | MachineInstr *FirstMI = nullptr, *LastMI = nullptr; | |||
621 | ||||
622 | for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { | |||
623 | int16_t SubIdx = BaseIndices[Idx]; | |||
624 | Register Reg = RI.getSubReg(DestReg, SubIdx); | |||
625 | unsigned Opcode = AMDGPU::S_MOV_B32; | |||
626 | ||||
627 | // Is SGPR aligned? If so try to combine with next. | |||
628 | Register Src = RI.getSubReg(SrcReg, SubIdx); | |||
629 | bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; | |||
630 | bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; | |||
631 | if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { | |||
632 | // Can use SGPR64 copy | |||
633 | unsigned Channel = RI.getChannelFromSubReg(SubIdx); | |||
634 | SubIdx = RI.getSubRegFromChannel(Channel, 2); | |||
635 | Opcode = AMDGPU::S_MOV_B64; | |||
636 | Idx++; | |||
637 | } | |||
638 | ||||
639 | LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
640 | .addReg(RI.getSubReg(SrcReg, SubIdx)) | |||
641 | .addReg(SrcReg, RegState::Implicit); | |||
642 | ||||
643 | if (!FirstMI) | |||
644 | FirstMI = LastMI; | |||
645 | ||||
646 | if (!Forward) | |||
647 | I--; | |||
648 | } | |||
649 | ||||
650 | assert(FirstMI && LastMI); | |||
651 | if (!Forward) | |||
652 | std::swap(FirstMI, LastMI); | |||
653 | ||||
654 | FirstMI->addOperand( | |||
655 | MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); | |||
656 | ||||
657 | if (KillSrc) | |||
658 | LastMI->addRegisterKilled(SrcReg, &RI); | |||
659 | } | |||
660 | ||||
661 | void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, | |||
662 | MachineBasicBlock::iterator MI, | |||
663 | const DebugLoc &DL, MCRegister DestReg, | |||
664 | MCRegister SrcReg, bool KillSrc) const { | |||
665 | const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); | |||
666 | ||||
667 | // FIXME: This is hack to resolve copies between 16 bit and 32 bit | |||
668 | // registers until all patterns are fixed. | |||
669 | if (Fix16BitCopies && | |||
670 | ((RI.getRegSizeInBits(*RC) == 16) ^ | |||
671 | (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { | |||
672 | MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; | |||
673 | MCRegister Super = RI.get32BitRegister(RegToFix); | |||
674 | assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); | |||
675 | RegToFix = Super; | |||
676 | ||||
677 | if (DestReg == SrcReg) { | |||
678 | // Insert empty bundle since ExpandPostRA expects an instruction here. | |||
679 | BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); | |||
680 | return; | |||
681 | } | |||
682 | ||||
683 | RC = RI.getPhysRegClass(DestReg); | |||
684 | } | |||
685 | ||||
686 | if (RC == &AMDGPU::VGPR_32RegClass) { | |||
687 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || | |||
688 | AMDGPU::SReg_32RegClass.contains(SrcReg) || | |||
689 | AMDGPU::AGPR_32RegClass.contains(SrcReg)); | |||
690 | unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? | |||
691 | AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32; | |||
692 | BuildMI(MBB, MI, DL, get(Opc), DestReg) | |||
693 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
694 | return; | |||
695 | } | |||
696 | ||||
697 | if (RC == &AMDGPU::SReg_32_XM0RegClass || | |||
698 | RC == &AMDGPU::SReg_32RegClass) { | |||
699 | if (SrcReg == AMDGPU::SCC) { | |||
700 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) | |||
701 | .addImm(1) | |||
702 | .addImm(0); | |||
703 | return; | |||
704 | } | |||
705 | ||||
706 | if (DestReg == AMDGPU::VCC_LO) { | |||
707 | if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
708 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) | |||
709 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
710 | } else { | |||
711 | // FIXME: Hack until VReg_1 removed. | |||
712 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); | |||
713 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
714 | .addImm(0) | |||
715 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
716 | } | |||
717 | ||||
718 | return; | |||
719 | } | |||
720 | ||||
721 | if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
722 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
723 | return; | |||
724 | } | |||
725 | ||||
726 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
727 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
728 | return; | |||
729 | } | |||
730 | ||||
731 | if (RC == &AMDGPU::SReg_64RegClass) { | |||
732 | if (SrcReg == AMDGPU::SCC) { | |||
733 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) | |||
734 | .addImm(1) | |||
735 | .addImm(0); | |||
736 | return; | |||
737 | } | |||
738 | ||||
739 | if (DestReg == AMDGPU::VCC) { | |||
740 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
741 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) | |||
742 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
743 | } else { | |||
744 | // FIXME: Hack until VReg_1 removed. | |||
745 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); | |||
746 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
747 | .addImm(0) | |||
748 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
749 | } | |||
750 | ||||
751 | return; | |||
752 | } | |||
753 | ||||
754 | if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
755 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
756 | return; | |||
757 | } | |||
758 | ||||
759 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
760 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
761 | return; | |||
762 | } | |||
763 | ||||
764 | if (DestReg == AMDGPU::SCC) { | |||
765 | // Copying 64-bit or 32-bit sources to SCC barely makes sense, | |||
766 | // but SelectionDAG emits such copies for i1 sources. | |||
767 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
768 | // This copy can only be produced by patterns | |||
769 | // with explicit SCC, which are known to be enabled | |||
770 | // only for subtargets with S_CMP_LG_U64 present. | |||
771 | assert(ST.hasScalarCompareEq64()); | |||
772 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64)) | |||
773 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
774 | .addImm(0); | |||
775 | } else { | |||
776 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); | |||
777 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) | |||
778 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
779 | .addImm(0); | |||
780 | } | |||
781 | ||||
782 | return; | |||
783 | } | |||
784 | ||||
785 | ||||
786 | if (RC == &AMDGPU::AGPR_32RegClass) { | |||
787 | if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { | |||
788 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
789 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
790 | return; | |||
791 | } | |||
792 | ||||
793 | // FIXME: Pass should maintain scavenger to avoid scan through the block on | |||
794 | // every AGPR spill. | |||
795 | RegScavenger RS; | |||
796 | indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS); | |||
797 | return; | |||
798 | } | |||
799 | ||||
800 | if (RI.getRegSizeInBits(*RC) == 16) { | |||
801 | assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
802 | AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || | |||
803 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
804 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); | |||
805 | ||||
806 | bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); | |||
807 | bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); | |||
808 | bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
809 | bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
810 | bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || | |||
811 | AMDGPU::SReg_LO16RegClass.contains(DestReg) || | |||
812 | AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
813 | bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
814 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
815 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
816 | MCRegister NewDestReg = RI.get32BitRegister(DestReg); | |||
817 | MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); | |||
818 | ||||
819 | if (IsSGPRDst) { | |||
820 | if (!IsSGPRSrc) { | |||
821 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
822 | return; | |||
823 | } | |||
824 | ||||
825 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) | |||
826 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
827 | return; | |||
828 | } | |||
829 | ||||
830 | if (IsAGPRDst || IsAGPRSrc) { | |||
831 | if (!DstLow || !SrcLow) { | |||
832 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
833 | "Cannot use hi16 subreg with an AGPR!"); | |||
834 | } | |||
835 | ||||
836 | copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); | |||
837 | return; | |||
838 | } | |||
839 | ||||
840 | if (IsSGPRSrc && !ST.hasSDWAScalar()) { | |||
841 | if (!DstLow || !SrcLow) { | |||
842 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
843 | "Cannot use hi16 subreg on VI!"); | |||
844 | } | |||
845 | ||||
846 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) | |||
847 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
848 | return; | |||
849 | } | |||
850 | ||||
851 | auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) | |||
852 | .addImm(0) // src0_modifiers | |||
853 | .addReg(NewSrcReg) | |||
854 | .addImm(0) // clamp | |||
855 | .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
856 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
857 | .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) | |||
858 | .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
859 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
860 | .addReg(NewDestReg, RegState::Implicit | RegState::Undef); | |||
861 | // First implicit operand is $exec. | |||
862 | MIB->tieOperands(0, MIB->getNumOperands() - 1); | |||
863 | return; | |||
864 | } | |||
865 | ||||
866 | const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); | |||
867 | if (RI.isSGPRClass(RC)) { | |||
868 | if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { | |||
869 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
870 | return; | |||
871 | } | |||
872 | expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward); | |||
873 | return; | |||
874 | } | |||
875 | ||||
876 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
877 | if (RI.hasAGPRs(RC)) { | |||
878 | Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ? | |||
879 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; | |||
880 | } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { | |||
881 | Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
882 | } | |||
883 | ||||
884 | // For the cases where we need an intermediate instruction/temporary register | |||
885 | // (destination is an AGPR), we need a scavenger. | |||
886 | // | |||
887 | // FIXME: The pass should maintain this for us so we don't have to re-scan the | |||
888 | // whole block for every handled copy. | |||
889 | std::unique_ptr<RegScavenger> RS; | |||
890 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) | |||
891 | RS.reset(new RegScavenger()); | |||
892 | ||||
893 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, 4); | |||
894 | ||||
895 | // If there is an overlap, we can't kill the super-register on the last | |||
896 | // instruction, since it will also kill the components made live by this def. | |||
897 | const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); | |||
898 | ||||
899 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
900 | unsigned SubIdx; | |||
901 | if (Forward) | |||
902 | SubIdx = SubIndices[Idx]; | |||
903 | else | |||
904 | SubIdx = SubIndices[SubIndices.size() - Idx - 1]; | |||
905 | ||||
906 | bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; | |||
907 | ||||
908 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
909 | Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register(); | |||
910 | Register ImpUseSuper = SrcReg; | |||
911 | indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), | |||
912 | RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, | |||
913 | ImpDefSuper, ImpUseSuper); | |||
914 | } else { | |||
915 | MachineInstrBuilder Builder = | |||
916 | BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
917 | .addReg(RI.getSubReg(SrcReg, SubIdx)); | |||
918 | if (Idx == 0) | |||
919 | Builder.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
920 | ||||
921 | Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
922 | } | |||
923 | } | |||
924 | } | |||
925 | ||||
926 | int SIInstrInfo::commuteOpcode(unsigned Opcode) const { | |||
927 | int NewOpc; | |||
928 | ||||
929 | // Try to map original to commuted opcode | |||
930 | NewOpc = AMDGPU::getCommuteRev(Opcode); | |||
931 | if (NewOpc != -1) | |||
932 | // Check if the commuted (REV) opcode exists on the target. | |||
933 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
934 | ||||
935 | // Try to map commuted to original opcode | |||
936 | NewOpc = AMDGPU::getCommuteOrig(Opcode); | |||
937 | if (NewOpc != -1) | |||
938 | // Check if the original (non-REV) opcode exists on the target. | |||
939 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
940 | ||||
941 | return Opcode; | |||
942 | } | |||
943 | ||||
944 | void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, | |||
945 | MachineBasicBlock::iterator MI, | |||
946 | const DebugLoc &DL, unsigned DestReg, | |||
947 | int64_t Value) const { | |||
948 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
949 | const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); | |||
950 | if (RegClass == &AMDGPU::SReg_32RegClass || | |||
951 | RegClass == &AMDGPU::SGPR_32RegClass || | |||
952 | RegClass == &AMDGPU::SReg_32_XM0RegClass || | |||
953 | RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { | |||
954 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
955 | .addImm(Value); | |||
956 | return; | |||
957 | } | |||
958 | ||||
959 | if (RegClass == &AMDGPU::SReg_64RegClass || | |||
960 | RegClass == &AMDGPU::SGPR_64RegClass || | |||
961 | RegClass == &AMDGPU::SReg_64_XEXECRegClass) { | |||
962 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
963 | .addImm(Value); | |||
964 | return; | |||
965 | } | |||
966 | ||||
967 | if (RegClass == &AMDGPU::VGPR_32RegClass) { | |||
968 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) | |||
969 | .addImm(Value); | |||
970 | return; | |||
971 | } | |||
972 | if (RegClass == &AMDGPU::VReg_64RegClass) { | |||
973 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) | |||
974 | .addImm(Value); | |||
975 | return; | |||
976 | } | |||
977 | ||||
978 | unsigned EltSize = 4; | |||
979 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
980 | if (RI.isSGPRClass(RegClass)) { | |||
981 | if (RI.getRegSizeInBits(*RegClass) > 32) { | |||
982 | Opcode = AMDGPU::S_MOV_B64; | |||
983 | EltSize = 8; | |||
984 | } else { | |||
985 | Opcode = AMDGPU::S_MOV_B32; | |||
986 | EltSize = 4; | |||
987 | } | |||
988 | } | |||
989 | ||||
990 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); | |||
991 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
992 | int64_t IdxValue = Idx == 0 ? Value : 0; | |||
993 | ||||
994 | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, | |||
995 | get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); | |||
996 | Builder.addImm(IdxValue); | |||
997 | } | |||
998 | } | |||
999 | ||||
1000 | const TargetRegisterClass * | |||
1001 | SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { | |||
1002 | return &AMDGPU::VGPR_32RegClass; | |||
1003 | } | |||
1004 | ||||
1005 | void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, | |||
1006 | MachineBasicBlock::iterator I, | |||
1007 | const DebugLoc &DL, Register DstReg, | |||
1008 | ArrayRef<MachineOperand> Cond, | |||
1009 | Register TrueReg, | |||
1010 | Register FalseReg) const { | |||
1011 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
1012 | const TargetRegisterClass *BoolXExecRC = | |||
1013 | RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
1014 | assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && | |||
1015 | "Not a VGPR32 reg"); | |||
1016 | ||||
1017 | if (Cond.size() == 1) { | |||
1018 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1019 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1020 | .add(Cond[0]); | |||
1021 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1022 | .addImm(0) | |||
1023 | .addReg(FalseReg) | |||
1024 | .addImm(0) | |||
1025 | .addReg(TrueReg) | |||
1026 | .addReg(SReg); | |||
1027 | } else if (Cond.size() == 2) { | |||
1028 | assert(Cond[0].isImm() && "Cond[0] is not an immediate"); | |||
1029 | switch (Cond[0].getImm()) { | |||
1030 | case SIInstrInfo::SCC_TRUE: { | |||
1031 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1032 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1033 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1034 | .addImm(1) | |||
1035 | .addImm(0); | |||
1036 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1037 | .addImm(0) | |||
1038 | .addReg(FalseReg) | |||
1039 | .addImm(0) | |||
1040 | .addReg(TrueReg) | |||
1041 | .addReg(SReg); | |||
1042 | break; | |||
1043 | } | |||
1044 | case SIInstrInfo::SCC_FALSE: { | |||
1045 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1046 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1047 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1048 | .addImm(0) | |||
1049 | .addImm(1); | |||
1050 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1051 | .addImm(0) | |||
1052 | .addReg(FalseReg) | |||
1053 | .addImm(0) | |||
1054 | .addReg(TrueReg) | |||
1055 | .addReg(SReg); | |||
1056 | break; | |||
1057 | } | |||
1058 | case SIInstrInfo::VCCNZ: { | |||
1059 | MachineOperand RegOp = Cond[1]; | |||
1060 | RegOp.setImplicit(false); | |||
1061 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1062 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1063 | .add(RegOp); | |||
1064 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1065 | .addImm(0) | |||
1066 | .addReg(FalseReg) | |||
1067 | .addImm(0) | |||
1068 | .addReg(TrueReg) | |||
1069 | .addReg(SReg); | |||
1070 | break; | |||
1071 | } | |||
1072 | case SIInstrInfo::VCCZ: { | |||
1073 | MachineOperand RegOp = Cond[1]; | |||
1074 | RegOp.setImplicit(false); | |||
1075 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1076 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1077 | .add(RegOp); | |||
1078 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1079 | .addImm(0) | |||
1080 | .addReg(TrueReg) | |||
1081 | .addImm(0) | |||
1082 | .addReg(FalseReg) | |||
1083 | .addReg(SReg); | |||
1084 | break; | |||
1085 | } | |||
1086 | case SIInstrInfo::EXECNZ: { | |||
1087 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1088 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1089 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1090 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1091 | .addImm(0); | |||
1092 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1093 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1094 | .addImm(1) | |||
1095 | .addImm(0); | |||
1096 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1097 | .addImm(0) | |||
1098 | .addReg(FalseReg) | |||
1099 | .addImm(0) | |||
1100 | .addReg(TrueReg) | |||
1101 | .addReg(SReg); | |||
1102 | break; | |||
1103 | } | |||
1104 | case SIInstrInfo::EXECZ: { | |||
1105 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1106 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1107 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1108 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1109 | .addImm(0); | |||
1110 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1111 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1112 | .addImm(0) | |||
1113 | .addImm(1); | |||
1114 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1115 | .addImm(0) | |||
1116 | .addReg(FalseReg) | |||
1117 | .addImm(0) | |||
1118 | .addReg(TrueReg) | |||
1119 | .addReg(SReg); | |||
1120 | llvm_unreachable("Unhandled branch predicate EXECZ")::llvm::llvm_unreachable_internal("Unhandled branch predicate EXECZ" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1120); | |||
1121 | break; | |||
1122 | } | |||
1123 | default: | |||
1124 | llvm_unreachable("invalid branch predicate")::llvm::llvm_unreachable_internal("invalid branch predicate", "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1124); | |||
1125 | } | |||
1126 | } else { | |||
1127 | llvm_unreachable("Can only handle Cond size 1 or 2")::llvm::llvm_unreachable_internal("Can only handle Cond size 1 or 2" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1127); | |||
1128 | } | |||
1129 | } | |||
1130 | ||||
1131 | Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, | |||
1132 | MachineBasicBlock::iterator I, | |||
1133 | const DebugLoc &DL, | |||
1134 | Register SrcReg, int Value) const { | |||
1135 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1136 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1137 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) | |||
1138 | .addImm(Value) | |||
1139 | .addReg(SrcReg); | |||
1140 | ||||
1141 | return Reg; | |||
1142 | } | |||
1143 | ||||
1144 | Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, | |||
1145 | MachineBasicBlock::iterator I, | |||
1146 | const DebugLoc &DL, | |||
1147 | Register SrcReg, int Value) const { | |||
1148 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1149 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1150 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) | |||
1151 | .addImm(Value) | |||
1152 | .addReg(SrcReg); | |||
1153 | ||||
1154 | return Reg; | |||
1155 | } | |||
1156 | ||||
1157 | unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { | |||
1158 | ||||
1159 | if (RI.hasAGPRs(DstRC)) | |||
1160 | return AMDGPU::COPY; | |||
1161 | if (RI.getRegSizeInBits(*DstRC) == 32) { | |||
1162 | return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; | |||
1163 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { | |||
1164 | return AMDGPU::S_MOV_B64; | |||
1165 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { | |||
1166 | return AMDGPU::V_MOV_B64_PSEUDO; | |||
1167 | } | |||
1168 | return AMDGPU::COPY; | |||
1169 | } | |||
1170 | ||||
1171 | const MCInstrDesc & | |||
1172 | SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, | |||
1173 | bool IsIndirectSrc) const { | |||
1174 | if (IsIndirectSrc) { | |||
1175 | if (VecSize <= 32) // 4 bytes | |||
1176 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); | |||
1177 | if (VecSize <= 64) // 8 bytes | |||
1178 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); | |||
1179 | if (VecSize <= 96) // 12 bytes | |||
1180 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); | |||
1181 | if (VecSize <= 128) // 16 bytes | |||
1182 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); | |||
1183 | if (VecSize <= 160) // 20 bytes | |||
1184 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); | |||
1185 | if (VecSize <= 256) // 32 bytes | |||
1186 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); | |||
1187 | if (VecSize <= 512) // 64 bytes | |||
1188 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); | |||
1189 | if (VecSize <= 1024) // 128 bytes | |||
1190 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); | |||
1191 | ||||
1192 | llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegReadGPRIDX pseudos" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1192); | |||
1193 | } | |||
1194 | ||||
1195 | if (VecSize <= 32) // 4 bytes | |||
1196 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); | |||
1197 | if (VecSize <= 64) // 8 bytes | |||
1198 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); | |||
1199 | if (VecSize <= 96) // 12 bytes | |||
1200 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); | |||
1201 | if (VecSize <= 128) // 16 bytes | |||
1202 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); | |||
1203 | if (VecSize <= 160) // 20 bytes | |||
1204 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); | |||
1205 | if (VecSize <= 256) // 32 bytes | |||
1206 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); | |||
1207 | if (VecSize <= 512) // 64 bytes | |||
1208 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); | |||
1209 | if (VecSize <= 1024) // 128 bytes | |||
1210 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); | |||
1211 | ||||
1212 | llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWriteGPRIDX pseudos" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1212); | |||
1213 | } | |||
1214 | ||||
1215 | static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { | |||
1216 | if (VecSize <= 32) // 4 bytes | |||
1217 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1218 | if (VecSize <= 64) // 8 bytes | |||
1219 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1220 | if (VecSize <= 96) // 12 bytes | |||
1221 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1222 | if (VecSize <= 128) // 16 bytes | |||
1223 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1224 | if (VecSize <= 160) // 20 bytes | |||
1225 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1226 | if (VecSize <= 256) // 32 bytes | |||
1227 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1228 | if (VecSize <= 512) // 64 bytes | |||
1229 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1230 | if (VecSize <= 1024) // 128 bytes | |||
1231 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1232 | ||||
1233 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1233); | |||
1234 | } | |||
1235 | ||||
1236 | static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { | |||
1237 | if (VecSize <= 32) // 4 bytes | |||
1238 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1239 | if (VecSize <= 64) // 8 bytes | |||
1240 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1241 | if (VecSize <= 96) // 12 bytes | |||
1242 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1243 | if (VecSize <= 128) // 16 bytes | |||
1244 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1245 | if (VecSize <= 160) // 20 bytes | |||
1246 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1247 | if (VecSize <= 256) // 32 bytes | |||
1248 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1249 | if (VecSize <= 512) // 64 bytes | |||
1250 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1251 | if (VecSize <= 1024) // 128 bytes | |||
1252 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1253 | ||||
1254 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1254); | |||
1255 | } | |||
1256 | ||||
1257 | static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { | |||
1258 | if (VecSize <= 64) // 8 bytes | |||
1259 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; | |||
1260 | if (VecSize <= 128) // 16 bytes | |||
1261 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; | |||
1262 | if (VecSize <= 256) // 32 bytes | |||
1263 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; | |||
1264 | if (VecSize <= 512) // 64 bytes | |||
1265 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; | |||
1266 | if (VecSize <= 1024) // 128 bytes | |||
1267 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; | |||
1268 | ||||
1269 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1269); | |||
1270 | } | |||
1271 | ||||
1272 | const MCInstrDesc & | |||
1273 | SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, | |||
1274 | bool IsSGPR) const { | |||
1275 | if (IsSGPR) { | |||
1276 | switch (EltSize) { | |||
1277 | case 32: | |||
1278 | return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); | |||
1279 | case 64: | |||
1280 | return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); | |||
1281 | default: | |||
1282 | llvm_unreachable("invalid reg indexing elt size")::llvm::llvm_unreachable_internal("invalid reg indexing elt size" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1282); | |||
1283 | } | |||
1284 | } | |||
1285 | ||||
1286 | assert(EltSize == 32 && "invalid reg indexing elt size"); | |||
1287 | return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); | |||
1288 | } | |||
1289 | ||||
1290 | static unsigned getSGPRSpillSaveOpcode(unsigned Size) { | |||
1291 | switch (Size) { | |||
1292 | case 4: | |||
1293 | return AMDGPU::SI_SPILL_S32_SAVE; | |||
1294 | case 8: | |||
1295 | return AMDGPU::SI_SPILL_S64_SAVE; | |||
1296 | case 12: | |||
1297 | return AMDGPU::SI_SPILL_S96_SAVE; | |||
1298 | case 16: | |||
1299 | return AMDGPU::SI_SPILL_S128_SAVE; | |||
1300 | case 20: | |||
1301 | return AMDGPU::SI_SPILL_S160_SAVE; | |||
1302 | case 24: | |||
1303 | return AMDGPU::SI_SPILL_S192_SAVE; | |||
1304 | case 32: | |||
1305 | return AMDGPU::SI_SPILL_S256_SAVE; | |||
1306 | case 64: | |||
1307 | return AMDGPU::SI_SPILL_S512_SAVE; | |||
1308 | case 128: | |||
1309 | return AMDGPU::SI_SPILL_S1024_SAVE; | |||
1310 | default: | |||
1311 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1311); | |||
1312 | } | |||
1313 | } | |||
1314 | ||||
1315 | static unsigned getVGPRSpillSaveOpcode(unsigned Size) { | |||
1316 | switch (Size) { | |||
1317 | case 4: | |||
1318 | return AMDGPU::SI_SPILL_V32_SAVE; | |||
1319 | case 8: | |||
1320 | return AMDGPU::SI_SPILL_V64_SAVE; | |||
1321 | case 12: | |||
1322 | return AMDGPU::SI_SPILL_V96_SAVE; | |||
1323 | case 16: | |||
1324 | return AMDGPU::SI_SPILL_V128_SAVE; | |||
1325 | case 20: | |||
1326 | return AMDGPU::SI_SPILL_V160_SAVE; | |||
1327 | case 24: | |||
1328 | return AMDGPU::SI_SPILL_V192_SAVE; | |||
1329 | case 32: | |||
1330 | return AMDGPU::SI_SPILL_V256_SAVE; | |||
1331 | case 64: | |||
1332 | return AMDGPU::SI_SPILL_V512_SAVE; | |||
1333 | case 128: | |||
1334 | return AMDGPU::SI_SPILL_V1024_SAVE; | |||
1335 | default: | |||
1336 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1336); | |||
1337 | } | |||
1338 | } | |||
1339 | ||||
1340 | static unsigned getAGPRSpillSaveOpcode(unsigned Size) { | |||
1341 | switch (Size) { | |||
1342 | case 4: | |||
1343 | return AMDGPU::SI_SPILL_A32_SAVE; | |||
1344 | case 8: | |||
1345 | return AMDGPU::SI_SPILL_A64_SAVE; | |||
1346 | case 12: | |||
1347 | return AMDGPU::SI_SPILL_A96_SAVE; | |||
1348 | case 16: | |||
1349 | return AMDGPU::SI_SPILL_A128_SAVE; | |||
1350 | case 20: | |||
1351 | return AMDGPU::SI_SPILL_A160_SAVE; | |||
1352 | case 24: | |||
1353 | return AMDGPU::SI_SPILL_A192_SAVE; | |||
1354 | case 32: | |||
1355 | return AMDGPU::SI_SPILL_A256_SAVE; | |||
1356 | case 64: | |||
1357 | return AMDGPU::SI_SPILL_A512_SAVE; | |||
1358 | case 128: | |||
1359 | return AMDGPU::SI_SPILL_A1024_SAVE; | |||
1360 | default: | |||
1361 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1361); | |||
1362 | } | |||
1363 | } | |||
1364 | ||||
1365 | void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, | |||
1366 | MachineBasicBlock::iterator MI, | |||
1367 | Register SrcReg, bool isKill, | |||
1368 | int FrameIndex, | |||
1369 | const TargetRegisterClass *RC, | |||
1370 | const TargetRegisterInfo *TRI) const { | |||
1371 | MachineFunction *MF = MBB.getParent(); | |||
1372 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1373 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1374 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1375 | ||||
1376 | MachinePointerInfo PtrInfo | |||
1377 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1378 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1379 | PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), | |||
1380 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1381 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1382 | ||||
1383 | if (RI.isSGPRClass(RC)) { | |||
1384 | MFI->setHasSpilledSGPRs(); | |||
1385 | assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); | |||
1386 | assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && | |||
1387 | SrcReg != AMDGPU::EXEC && "exec should not be spilled"); | |||
1388 | ||||
1389 | // We are only allowed to create one new instruction when spilling | |||
1390 | // registers, so we need to use a pseudo instruction for spilling SGPRs. | |||
1391 | const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); | |||
1392 | ||||
1393 | // The SGPR spill/restore instructions only work on numbered SGPRs, so we need | |||
1394 | // to make sure we are using the correct register class. | |||
1395 | if (SrcReg.isVirtual() && SpillSize == 4) { | |||
1396 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1397 | MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1398 | } | |||
1399 | ||||
1400 | BuildMI(MBB, MI, DL, OpDesc) | |||
1401 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1402 | .addFrameIndex(FrameIndex) // addr | |||
1403 | .addMemOperand(MMO) | |||
1404 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1405 | ||||
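| // When SGPR-to-VGPR spilling is enabled, tag the slot as an SGPR spill so it | |||
| // can later be lowered to VGPR lane writes instead of real stack accesses. | |||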
1406 | if (RI.spillSGPRToVGPR()) | |||
1407 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1408 | return; | |||
1409 | } | |||
1410 | ||||
1411 | unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize) | |||
1412 | : getVGPRSpillSaveOpcode(SpillSize); | |||
1413 | MFI->setHasSpilledVGPRs(); | |||
1414 | ||||
1415 | BuildMI(MBB, MI, DL, get(Opcode)) | |||
1416 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1417 | .addFrameIndex(FrameIndex) // addr | |||
1418 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1419 | .addImm(0) // offset | |||
1420 | .addMemOperand(MMO); | |||
1421 | } | |||
1422 | ||||
1423 | static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { | |||
1424 | switch (Size) { | |||
1425 | case 4: | |||
1426 | return AMDGPU::SI_SPILL_S32_RESTORE; | |||
1427 | case 8: | |||
1428 | return AMDGPU::SI_SPILL_S64_RESTORE; | |||
1429 | case 12: | |||
1430 | return AMDGPU::SI_SPILL_S96_RESTORE; | |||
1431 | case 16: | |||
1432 | return AMDGPU::SI_SPILL_S128_RESTORE; | |||
1433 | case 20: | |||
1434 | return AMDGPU::SI_SPILL_S160_RESTORE; | |||
1435 | case 24: | |||
1436 | return AMDGPU::SI_SPILL_S192_RESTORE; | |||
1437 | case 32: | |||
1438 | return AMDGPU::SI_SPILL_S256_RESTORE; | |||
1439 | case 64: | |||
1440 | return AMDGPU::SI_SPILL_S512_RESTORE; | |||
1441 | case 128: | |||
1442 | return AMDGPU::SI_SPILL_S1024_RESTORE; | |||
1443 | default: | |||
1444 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1444); | |||
1445 | } | |||
1446 | } | |||
1447 | ||||
1448 | static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { | |||
1449 | switch (Size) { | |||
1450 | case 4: | |||
1451 | return AMDGPU::SI_SPILL_V32_RESTORE; | |||
1452 | case 8: | |||
1453 | return AMDGPU::SI_SPILL_V64_RESTORE; | |||
1454 | case 12: | |||
1455 | return AMDGPU::SI_SPILL_V96_RESTORE; | |||
1456 | case 16: | |||
1457 | return AMDGPU::SI_SPILL_V128_RESTORE; | |||
1458 | case 20: | |||
1459 | return AMDGPU::SI_SPILL_V160_RESTORE; | |||
1460 | case 24: | |||
1461 | return AMDGPU::SI_SPILL_V192_RESTORE; | |||
1462 | case 32: | |||
1463 | return AMDGPU::SI_SPILL_V256_RESTORE; | |||
1464 | case 64: | |||
1465 | return AMDGPU::SI_SPILL_V512_RESTORE; | |||
1466 | case 128: | |||
1467 | return AMDGPU::SI_SPILL_V1024_RESTORE; | |||
1468 | default: | |||
1469 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1469); | |||
1470 | } | |||
1471 | } | |||
1472 | ||||
1473 | static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { | |||
1474 | switch (Size) { | |||
1475 | case 4: | |||
1476 | return AMDGPU::SI_SPILL_A32_RESTORE; | |||
1477 | case 8: | |||
1478 | return AMDGPU::SI_SPILL_A64_RESTORE; | |||
1479 | case 12: | |||
1480 | return AMDGPU::SI_SPILL_A96_RESTORE; | |||
1481 | case 16: | |||
1482 | return AMDGPU::SI_SPILL_A128_RESTORE; | |||
1483 | case 20: | |||
1484 | return AMDGPU::SI_SPILL_A160_RESTORE; | |||
1485 | case 24: | |||
1486 | return AMDGPU::SI_SPILL_A192_RESTORE; | |||
1487 | case 32: | |||
1488 | return AMDGPU::SI_SPILL_A256_RESTORE; | |||
1489 | case 64: | |||
1490 | return AMDGPU::SI_SPILL_A512_RESTORE; | |||
1491 | case 128: | |||
1492 | return AMDGPU::SI_SPILL_A1024_RESTORE; | |||
1493 | default: | |||
1494 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1494); | |||
1495 | } | |||
1496 | } | |||
1497 | ||||
1498 | void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, | |||
1499 | MachineBasicBlock::iterator MI, | |||
1500 | Register DestReg, int FrameIndex, | |||
1501 | const TargetRegisterClass *RC, | |||
1502 | const TargetRegisterInfo *TRI) const { | |||
1503 | MachineFunction *MF = MBB.getParent(); | |||
1504 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1505 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1506 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1507 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1508 | ||||
1509 | MachinePointerInfo PtrInfo | |||
1510 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1511 | ||||
1512 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1513 | PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), | |||
1514 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1515 | ||||
1516 | if (RI.isSGPRClass(RC)) { | |||
1517 | MFI->setHasSpilledSGPRs(); | |||
1518 | assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); | |||
1519 | assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && | |||
1520 | DestReg != AMDGPU::EXEC && "exec should not be spilled"); | |||
1521 | ||||
1522 | // FIXME: Maybe this should not include a memoperand because it will be | |||
1523 | // lowered to non-memory instructions. | |||
1524 | const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); | |||
1525 | if (DestReg.isVirtual() && SpillSize == 4) { | |||
1526 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1527 | MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1528 | } | |||
1529 | ||||
1530 | if (RI.spillSGPRToVGPR()) | |||
1531 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1532 | BuildMI(MBB, MI, DL, OpDesc, DestReg) | |||
1533 | .addFrameIndex(FrameIndex) // addr | |||
1534 | .addMemOperand(MMO) | |||
1535 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1536 | ||||
1537 | return; | |||
1538 | } | |||
1539 | ||||
1540 | unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) | |||
1541 | : getVGPRSpillRestoreOpcode(SpillSize); | |||
1542 | BuildMI(MBB, MI, DL, get(Opcode), DestReg) | |||
1543 | .addFrameIndex(FrameIndex) // vaddr | |||
1544 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1545 | .addImm(0) // offset | |||
1546 | .addMemOperand(MMO); | |||
1547 | } | |||
1548 | ||||
1549 | void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, | |||
1550 | MachineBasicBlock::iterator MI) const { | |||
1551 | insertNoops(MBB, MI, 1); | |||
1552 | } | |||
1553 | ||||
1554 | void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, | |||
1555 | MachineBasicBlock::iterator MI, | |||
1556 | unsigned Quantity) const { | |||
1557 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
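| // Each s_nop encodes (imm + 1) wait states and the immediate is capped at 7 | |||
| // here, so larger requests are split into chunks of at most eight. | |||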
1558 | while (Quantity > 0) { | |||
1559 | unsigned Arg = std::min(Quantity, 8u); | |||
1560 | Quantity -= Arg; | |||
1561 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); | |||
1562 | } | |||
1563 | } | |||
1564 | ||||
1565 | void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { | |||
1566 | auto MF = MBB.getParent(); | |||
1567 | SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); | |||
1568 | ||||
1569 | assert(Info->isEntryFunction()); | |||
1570 | ||||
1571 | if (MBB.succ_empty()) { | |||
1572 | bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); | |||
1573 | if (HasNoTerminator) { | |||
1574 | if (Info->returnsVoid()) { | |||
1575 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); | |||
1576 | } else { | |||
1577 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); | |||
1578 | } | |||
1579 | } | |||
1580 | } | |||
1581 | } | |||
1582 | ||||
1583 | unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { | |||
1584 | switch (MI.getOpcode()) { | |||
1585 | default: return 1; // FIXME: Do wait states equal cycles? | |||
1586 | ||||
1587 | case AMDGPU::S_NOP: | |||
1588 | return MI.getOperand(0).getImm() + 1; | |||
1589 | } | |||
1590 | } | |||
1591 | ||||
1592 | bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { | |||
1593 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
1594 | MachineBasicBlock &MBB = *MI.getParent(); | |||
1595 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1596 | switch (MI.getOpcode()) { | |||
1597 | default: return TargetInstrInfo::expandPostRAPseudo(MI); | |||
1598 | case AMDGPU::S_MOV_B64_term: | |||
1599 | // This is only a terminator to get the correct spill code placement during | |||
1600 | // register allocation. | |||
1601 | MI.setDesc(get(AMDGPU::S_MOV_B64)); | |||
1602 | break; | |||
1603 | ||||
1604 | case AMDGPU::S_MOV_B32_term: | |||
1605 | // This is only a terminator to get the correct spill code placement during | |||
1606 | // register allocation. | |||
1607 | MI.setDesc(get(AMDGPU::S_MOV_B32)); | |||
1608 | break; | |||
1609 | ||||
1610 | case AMDGPU::S_XOR_B64_term: | |||
1611 | // This is only a terminator to get the correct spill code placement during | |||
1612 | // register allocation. | |||
1613 | MI.setDesc(get(AMDGPU::S_XOR_B64)); | |||
1614 | break; | |||
1615 | ||||
1616 | case AMDGPU::S_XOR_B32_term: | |||
1617 | // This is only a terminator to get the correct spill code placement during | |||
1618 | // register allocation. | |||
1619 | MI.setDesc(get(AMDGPU::S_XOR_B32)); | |||
1620 | break; | |||
1621 | case AMDGPU::S_OR_B64_term: | |||
1622 | // This is only a terminator to get the correct spill code placement during | |||
1623 | // register allocation. | |||
1624 | MI.setDesc(get(AMDGPU::S_OR_B64)); | |||
1625 | break; | |||
1626 | case AMDGPU::S_OR_B32_term: | |||
1627 | // This is only a terminator to get the correct spill code placement during | |||
1628 | // register allocation. | |||
1629 | MI.setDesc(get(AMDGPU::S_OR_B32)); | |||
1630 | break; | |||
1631 | ||||
1632 | case AMDGPU::S_ANDN2_B64_term: | |||
1633 | // This is only a terminator to get the correct spill code placement during | |||
1634 | // register allocation. | |||
1635 | MI.setDesc(get(AMDGPU::S_ANDN2_B64)); | |||
1636 | break; | |||
1637 | ||||
1638 | case AMDGPU::S_ANDN2_B32_term: | |||
1639 | // This is only a terminator to get the correct spill code placement during | |||
1640 | // register allocation. | |||
1641 | MI.setDesc(get(AMDGPU::S_ANDN2_B32)); | |||
1642 | break; | |||
1643 | ||||
1644 | case AMDGPU::S_AND_B64_term: | |||
1645 | // This is only a terminator to get the correct spill code placement during | |||
1646 | // register allocation. | |||
1647 | MI.setDesc(get(AMDGPU::S_AND_B64)); | |||
1648 | break; | |||
1649 | ||||
1650 | case AMDGPU::S_AND_B32_term: | |||
1651 | // This is only a terminator to get the correct spill code placement during | |||
1652 | // register allocation. | |||
1653 | MI.setDesc(get(AMDGPU::S_AND_B32)); | |||
1654 | break; | |||
1655 | ||||
1656 | case AMDGPU::V_MOV_B64_PSEUDO: { | |||
1657 | Register Dst = MI.getOperand(0).getReg(); | |||
1658 | Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); | |||
1659 | Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); | |||
1660 | ||||
1661 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
1662 | // FIXME: Will this work for 64-bit floating point immediates? | |||
1663 | assert(!SrcOp.isFPImm()); | |||
1664 | if (SrcOp.isImm()) { | |||
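| // Split the 64-bit immediate into two 32-bit moves; e.g. 0x1122334455667788 | |||
| // becomes lo = 0x55667788 into sub0 and hi = 0x11223344 into sub1. | |||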
1665 | APInt Imm(64, SrcOp.getImm()); | |||
1666 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1667 | .addImm(Imm.getLoBits(32).getZExtValue()) | |||
1668 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1669 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1670 | .addImm(Imm.getHiBits(32).getZExtValue()) | |||
1671 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1672 | } else { | |||
1673 | assert(SrcOp.isReg()); | |||
1674 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1675 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) | |||
1676 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1677 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1678 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) | |||
1679 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1680 | } | |||
1681 | MI.eraseFromParent(); | |||
1682 | break; | |||
1683 | } | |||
1684 | case AMDGPU::V_MOV_B64_DPP_PSEUDO: { | |||
1685 | expandMovDPP64(MI); | |||
1686 | break; | |||
1687 | } | |||
1688 | case AMDGPU::V_SET_INACTIVE_B32: { | |||
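| // Flip exec so the previously inactive lanes become writable, move the value | |||
| // into them, then flip exec back to restore the original active mask. | |||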
1689 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
1690 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1691 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
1692 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
1693 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) | |||
1694 | .add(MI.getOperand(2)); | |||
1695 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
1696 | .addReg(Exec); | |||
1697 | MI.eraseFromParent(); | |||
1698 | break; | |||
1699 | } | |||
1700 | case AMDGPU::V_SET_INACTIVE_B64: { | |||
1701 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
1702 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1703 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
1704 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
1705 | MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), | |||
1706 | MI.getOperand(0).getReg()) | |||
1707 | .add(MI.getOperand(2)); | |||
1708 | expandPostRAPseudo(*Copy); | |||
1709 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
1710 | .addReg(Exec); | |||
1711 | MI.eraseFromParent(); | |||
1712 | break; | |||
1713 | } | |||
1714 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
1715 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
1716 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
1717 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
1718 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
1719 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
1720 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
1721 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
1722 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
1723 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
1724 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
1725 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
1726 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
1727 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
1728 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
1729 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
1730 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: | |||
1731 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: | |||
1732 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: | |||
1733 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: | |||
1734 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { | |||
1735 | const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); | |||
1736 | ||||
1737 | unsigned Opc; | |||
1738 | if (RI.hasVGPRs(EltRC)) { | |||
1739 | Opc = AMDGPU::V_MOVRELD_B32_e32; | |||
1740 | } else { | |||
1741 | Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 | |||
1742 | : AMDGPU::S_MOVRELD_B32; | |||
1743 | } | |||
1744 | ||||
1745 | const MCInstrDesc &OpDesc = get(Opc); | |||
1746 | Register VecReg = MI.getOperand(0).getReg(); | |||
1747 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1748 | unsigned SubReg = MI.getOperand(3).getImm(); | |||
1749 | assert(VecReg == MI.getOperand(1).getReg()); | |||
1750 | ||||
1751 | MachineInstrBuilder MIB = | |||
1752 | BuildMI(MBB, MI, DL, OpDesc) | |||
1753 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1754 | .add(MI.getOperand(2)) | |||
1755 | .addReg(VecReg, RegState::ImplicitDefine) | |||
1756 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
1757 | ||||
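| // The whole vector register is added as an implicit def tied to an implicit | |||
| // use, so the lanes not written by the movrel are treated as preserved. | |||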
1758 | const int ImpDefIdx = | |||
1759 | OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); | |||
1760 | const int ImpUseIdx = ImpDefIdx + 1; | |||
1761 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
1762 | MI.eraseFromParent(); | |||
1763 | break; | |||
1764 | } | |||
1765 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: | |||
1766 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: | |||
1767 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: | |||
1768 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: | |||
1769 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: | |||
1770 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: | |||
1771 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: | |||
1772 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { | |||
1773 | assert(ST.useVGPRIndexMode()); | |||
1774 | Register VecReg = MI.getOperand(0).getReg(); | |||
1775 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1776 | Register Idx = MI.getOperand(3).getReg(); | |||
1777 | Register SubReg = MI.getOperand(4).getImm(); | |||
1778 | ||||
1779 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
1780 | .addReg(Idx) | |||
1781 | .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); | |||
1782 | SetOn->getOperand(3).setIsUndef(); | |||
1783 | ||||
1784 | const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect); | |||
1785 | MachineInstrBuilder MIB = | |||
1786 | BuildMI(MBB, MI, DL, OpDesc) | |||
1787 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1788 | .add(MI.getOperand(2)) | |||
1789 | .addReg(VecReg, RegState::ImplicitDefine) | |||
1790 | .addReg(VecReg, | |||
1791 | RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
1792 | ||||
1793 | const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); | |||
1794 | const int ImpUseIdx = ImpDefIdx + 1; | |||
1795 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
1796 | ||||
1797 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
1798 | ||||
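| // Bundle the gpr_idx_on / indirect move / gpr_idx_off sequence so nothing can | |||
| // be scheduled in between while the index mode is active. | |||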
1799 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
1800 | ||||
1801 | MI.eraseFromParent(); | |||
1802 | break; | |||
1803 | } | |||
1804 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: | |||
1805 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: | |||
1806 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: | |||
1807 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: | |||
1808 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: | |||
1809 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: | |||
1810 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: | |||
1811 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { | |||
1812 | assert(ST.useVGPRIndexMode()); | |||
1813 | Register Dst = MI.getOperand(0).getReg(); | |||
1814 | Register VecReg = MI.getOperand(1).getReg(); | |||
1815 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1816 | Register Idx = MI.getOperand(2).getReg(); | |||
1817 | Register SubReg = MI.getOperand(3).getImm(); | |||
1818 | ||||
1819 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
1820 | .addReg(Idx) | |||
1821 | .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); | |||
1822 | SetOn->getOperand(3).setIsUndef(); | |||
1823 | ||||
1824 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32)) | |||
1825 | .addDef(Dst) | |||
1826 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1827 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)) | |||
1828 | .addReg(AMDGPU::M0, RegState::Implicit); | |||
1829 | ||||
1830 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
1831 | ||||
1832 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
1833 | ||||
1834 | MI.eraseFromParent(); | |||
1835 | break; | |||
1836 | } | |||
1837 | case AMDGPU::SI_PC_ADD_REL_OFFSET: { | |||
1838 | MachineFunction &MF = *MBB.getParent(); | |||
1839 | Register Reg = MI.getOperand(0).getReg(); | |||
1840 | Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); | |||
1841 | Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); | |||
1842 | ||||
1843 | // Create a bundle so these instructions won't be re-ordered by the | |||
1844 | // post-RA scheduler. | |||
1845 | MIBundleBuilder Bundler(MBB, MI); | |||
1846 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); | |||
1847 | ||||
1848 | // Add 32-bit offset from this instruction to the start of the | |||
1849 | // constant data. | |||
1850 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) | |||
1851 | .addReg(RegLo) | |||
1852 | .add(MI.getOperand(1))); | |||
1853 | ||||
1854 | MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) | |||
1855 | .addReg(RegHi); | |||
1856 | MIB.add(MI.getOperand(2)); | |||
1857 | ||||
1858 | Bundler.append(MIB); | |||
1859 | finalizeBundle(MBB, Bundler.begin()); | |||
1860 | ||||
1861 | MI.eraseFromParent(); | |||
1862 | break; | |||
1863 | } | |||
1864 | case AMDGPU::ENTER_WWM: { | |||
1865 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
1866 | // WWM is entered. | |||
1867 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1868 | : AMDGPU::S_OR_SAVEEXEC_B64)); | |||
1869 | break; | |||
1870 | } | |||
1871 | case AMDGPU::EXIT_WWM: { | |||
1872 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
1873 | // WWM is exited. | |||
1874 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); | |||
1875 | break; | |||
1876 | } | |||
1877 | } | |||
1878 | return true; | |||
1879 | } | |||
1880 | ||||
1881 | std::pair<MachineInstr*, MachineInstr*> | |||
1882 | SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { | |||
1883 | assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); | |||
1884 | ||||
1885 | MachineBasicBlock &MBB = *MI.getParent(); | |||
1886 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1887 | MachineFunction *MF = MBB.getParent(); | |||
1888 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1889 | Register Dst = MI.getOperand(0).getReg(); | |||
1890 | unsigned Part = 0; | |||
1891 | MachineInstr *Split[2] = {nullptr, nullptr}; // Both entries are filled by the loop below. | |||
1892 | ||||
1893 | ||||
1894 | for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { | |||
1895 | auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); | |||
1896 | if (Dst.isPhysical()) { | |||
1897 | MovDPP.addDef(RI.getSubReg(Dst, Sub)); | |||
1898 | } else { | |||
1899 | assert(MRI.isSSA()); | |||
1900 | auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
1901 | MovDPP.addDef(Tmp); | |||
1902 | } | |||
1903 | ||||
1904 | for (unsigned I = 1; I <= 2; ++I) { // old and src operands. | |||
1905 | const MachineOperand &SrcOp = MI.getOperand(I); | |||
1906 | assert(!SrcOp.isFPImm()); | |||
1907 | if (SrcOp.isImm()) { | |||
1908 | APInt Imm(64, SrcOp.getImm()); | |||
1909 | Imm.ashrInPlace(Part * 32); | |||
1910 | MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); | |||
1911 | } else { | |||
1912 | assert(SrcOp.isReg()); | |||
1913 | Register Src = SrcOp.getReg(); | |||
1914 | if (Src.isPhysical()) | |||
1915 | MovDPP.addReg(RI.getSubReg(Src, Sub)); | |||
1916 | else | |||
1917 | MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); | |||
1918 | } | |||
1919 | } | |||
1920 | ||||
1921 | for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) | |||
1922 | MovDPP.addImm(MI.getOperand(I).getImm()); | |||
1923 | ||||
1924 | Split[Part] = MovDPP; | |||
1925 | ++Part; | |||
1926 | } | |||
1927 | ||||
1928 | if (Dst.isVirtual()) | |||
1929 | BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) | |||
1930 | .addReg(Split[0]->getOperand(0).getReg()) | |||
1931 | .addImm(AMDGPU::sub0) | |||
1932 | .addReg(Split[1]->getOperand(0).getReg()) | |||
1933 | .addImm(AMDGPU::sub1); | |||
1934 | ||||
1935 | MI.eraseFromParent(); | |||
1936 | return std::make_pair(Split[0], Split[1]); | |||
1937 | } | |||
1938 | ||||
1939 | bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, | |||
1940 | MachineOperand &Src0, | |||
1941 | unsigned Src0OpName, | |||
1942 | MachineOperand &Src1, | |||
1943 | unsigned Src1OpName) const { | |||
1944 | MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); | |||
1945 | if (!Src0Mods) | |||
1946 | return false; | |||
1947 | ||||
1948 | MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); | |||
1949 | assert(Src1Mods && | |||
1950 | "All commutable instructions have both src0 and src1 modifiers"); | |||
1951 | ||||
1952 | int Src0ModsVal = Src0Mods->getImm(); | |||
1953 | int Src1ModsVal = Src1Mods->getImm(); | |||
1954 | ||||
1955 | Src1Mods->setImm(Src0ModsVal); | |||
1956 | Src0Mods->setImm(Src1ModsVal); | |||
1957 | return true; | |||
1958 | } | |||
1959 | ||||
1960 | static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, | |||
1961 | MachineOperand &RegOp, | |||
1962 | MachineOperand &NonRegOp) { | |||
1963 | Register Reg = RegOp.getReg(); | |||
1964 | unsigned SubReg = RegOp.getSubReg(); | |||
1965 | bool IsKill = RegOp.isKill(); | |||
1966 | bool IsDead = RegOp.isDead(); | |||
1967 | bool IsUndef = RegOp.isUndef(); | |||
1968 | bool IsDebug = RegOp.isDebug(); | |||
1969 | ||||
1970 | if (NonRegOp.isImm()) | |||
1971 | RegOp.ChangeToImmediate(NonRegOp.getImm()); | |||
1972 | else if (NonRegOp.isFI()) | |||
1973 | RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); | |||
1974 | else if (NonRegOp.isGlobal()) { | |||
1975 | RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), | |||
1976 | NonRegOp.getTargetFlags()); | |||
1977 | } else | |||
1978 | return nullptr; | |||
1979 | ||||
1980 | // Make sure we don't reinterpret a subreg index in the target flags. | |||
1981 | RegOp.setTargetFlags(NonRegOp.getTargetFlags()); | |||
1982 | ||||
1983 | NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); | |||
1984 | NonRegOp.setSubReg(SubReg); | |||
1985 | ||||
1986 | return &MI; | |||
1987 | } | |||
1988 | ||||
1989 | MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, | |||
1990 | unsigned Src0Idx, | |||
1991 | unsigned Src1Idx) const { | |||
1992 | assert(!NewMI && "this should never be used"); | |||
1993 | ||||
1994 | unsigned Opc = MI.getOpcode(); | |||
1995 | int CommutedOpcode = commuteOpcode(Opc); | |||
1996 | if (CommutedOpcode == -1) | |||
1997 | return nullptr; | |||
1998 | ||||
1999 | assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == | |||
2000 | static_cast<int>(Src0Idx) && | |||
2001 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == | |||
2002 | static_cast<int>(Src1Idx) && | |||
2003 | "inconsistency with findCommutedOpIndices"); | |||
2004 | ||||
2005 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
2006 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
2007 | ||||
2008 | MachineInstr *CommutedMI = nullptr; | |||
2009 | if (Src0.isReg() && Src1.isReg()) { | |||
2010 | if (isOperandLegal(MI, Src1Idx, &Src0)) { | |||
2011 | // Be sure to copy the source modifiers to the right place. | |||
2012 | CommutedMI | |||
2013 | = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); | |||
2014 | } | |||
2015 | ||||
2016 | } else if (Src0.isReg() && !Src1.isReg()) { | |||
2017 | // src0 should always be able to support any operand type, so no need to | |||
2018 | // check operand legality. | |||
2019 | CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); | |||
2020 | } else if (!Src0.isReg() && Src1.isReg()) { | |||
2021 | if (isOperandLegal(MI, Src1Idx, &Src0)) | |||
2022 | CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); | |||
2023 | } else { | |||
2024 | // FIXME: Found two non-register operands to commute. This does happen. | |||
2025 | return nullptr; | |||
2026 | } | |||
2027 | ||||
2028 | if (CommutedMI) { | |||
2029 | swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, | |||
2030 | Src1, AMDGPU::OpName::src1_modifiers); | |||
2031 | ||||
2032 | CommutedMI->setDesc(get(CommutedOpcode)); | |||
2033 | } | |||
2034 | ||||
2035 | return CommutedMI; | |||
2036 | } | |||
2037 | ||||
2038 | // This needs to be implemented because the source modifiers may be inserted | |||
2039 | // between the true commutable operands, and the base | |||
2040 | // TargetInstrInfo::commuteInstruction uses it. | |||
2041 | bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, | |||
2042 | unsigned &SrcOpIdx0, | |||
2043 | unsigned &SrcOpIdx1) const { | |||
2044 | return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); | |||
2045 | } | |||
2046 | ||||
2047 | bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, | |||
2048 | unsigned &SrcOpIdx1) const { | |||
2049 | if (!Desc.isCommutable()) | |||
2050 | return false; | |||
2051 | ||||
2052 | unsigned Opc = Desc.getOpcode(); | |||
2053 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
2054 | if (Src0Idx == -1) | |||
2055 | return false; | |||
2056 | ||||
2057 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
2058 | if (Src1Idx == -1) | |||
2059 | return false; | |||
2060 | ||||
2061 | return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); | |||
2062 | } | |||
2063 | ||||
2064 | bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, | |||
2065 | int64_t BrOffset) const { | |||
2066 | // BranchRelaxation should never have to check s_setpc_b64 because its dest | |||
2067 | // block is unanalyzable. | |||
2068 | assert(BranchOp != AMDGPU::S_SETPC_B64); | |||
2069 | ||||
2070 | // Convert to dwords. | |||
2071 | BrOffset /= 4; | |||
2072 | ||||
2073 | // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is | |||
2074 | // from the next instruction. | |||
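| // For example, with the default 16 offset bits the dword offset must fit in | |||
| // a signed 16-bit field, i.e. roughly +/-128 KiB of code from the branch. | |||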
2075 | BrOffset -= 1; | |||
2076 | ||||
2077 | return isIntN(BranchOffsetBits, BrOffset); | |||
2078 | } | |||
2079 | ||||
2080 | MachineBasicBlock *SIInstrInfo::getBranchDestBlock( | |||
2081 | const MachineInstr &MI) const { | |||
2082 | if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { | |||
2083 | // This would be a difficult analysis to perform, but it can always be legal so | |||
2084 | // there's no need to analyze it. | |||
2085 | return nullptr; | |||
2086 | } | |||
2087 | ||||
2088 | return MI.getOperand(0).getMBB(); | |||
2089 | } | |||
2090 | ||||
2091 | unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, | |||
2092 | MachineBasicBlock &DestBB, | |||
2093 | const DebugLoc &DL, | |||
2094 | int64_t BrOffset, | |||
2095 | RegScavenger *RS) const { | |||
2096 | assert(RS && "RegScavenger required for long branching"); | |||
2097 | assert(MBB.empty() && | |||
2098 | "new block should be inserted for expanding unconditional branch"); | |||
2099 | assert(MBB.pred_size() == 1); | |||
2100 | ||||
2101 | MachineFunction *MF = MBB.getParent(); | |||
2102 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2103 | ||||
2104 | // FIXME: Virtual register workaround for RegScavenger not working with empty | |||
2105 | // blocks. | |||
2106 | Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
2107 | ||||
2108 | auto I = MBB.end(); | |||
2109 | ||||
2110 | // We need to compute the offset relative to the instruction immediately after | |||
2111 | // s_getpc_b64. Insert pc arithmetic code before last terminator. | |||
2112 | MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); | |||
2113 | ||||
2114 | // TODO: Handle > 32-bit block address. | |||
2115 | if (BrOffset >= 0) { | |||
2116 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) | |||
2117 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) | |||
2118 | .addReg(PCReg, 0, AMDGPU::sub0) | |||
2119 | .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD); | |||
2120 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) | |||
2121 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) | |||
2122 | .addReg(PCReg, 0, AMDGPU::sub1) | |||
2123 | .addImm(0); | |||
2124 | } else { | |||
2125 | // Backwards branch. | |||
2126 | BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) | |||
2127 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) | |||
2128 | .addReg(PCReg, 0, AMDGPU::sub0) | |||
2129 | .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD); | |||
2130 | BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) | |||
2131 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) | |||
2132 | .addReg(PCReg, 0, AMDGPU::sub1) | |||
2133 | .addImm(0); | |||
2134 | } | |||
2135 | ||||
2136 | // Insert the indirect branch after the other terminator. | |||
2137 | BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) | |||
2138 | .addReg(PCReg); | |||
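// The expansion built above is, roughly, in pseudo-assembly:
//   s_getpc_b64 s[N:N+1]
//   s_add_u32   sN, sN, dest_bb      (s_sub_u32 for a backwards branch)
//   s_addc_u32  sN+1, sN+1, 0        (s_subb_u32 for a backwards branch)
//   s_setpc_b64 s[N:N+1]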
2139 | ||||
2140 | // FIXME: If spilling is necessary, this will fail because this scavenger has | |||
2141 | // no emergency stack slots. It is non-trivial to spill in this situation, | |||
2142 | // because the restore code needs to be specially placed after the | |||
2143 | // jump. BranchRelaxation then needs to be made aware of the newly inserted | |||
2144 | // block. | |||
2145 | // | |||
2146 | // If a spill is needed for the pc register pair, we need to insert a spill | |||
2147 | // restore block right before the destination block, and insert a short branch | |||
2148 | // into the old destination block's fallthrough predecessor. | |||
2149 | // e.g.: | |||
2150 | // | |||
2151 | // s_cbranch_scc0 skip_long_branch: | |||
2152 | // | |||
2153 | // long_branch_bb: | |||
2154 | // spill s[8:9] | |||
2155 | // s_getpc_b64 s[8:9] | |||
2156 | // s_add_u32 s8, s8, restore_bb | |||
2157 | // s_addc_u32 s9, s9, 0 | |||
2158 | // s_setpc_b64 s[8:9] | |||
2159 | // | |||
2160 | // skip_long_branch: | |||
2161 | // foo; | |||
2162 | // | |||
2163 | // ..... | |||
2164 | // | |||
2165 | // dest_bb_fallthrough_predecessor: | |||
2166 | // bar; | |||
2167 | // s_branch dest_bb | |||
2168 | // | |||
2169 | // restore_bb: | |||
2170 | // restore s[8:9] | |||
2171 | // fallthrough dest_bb | |||
2172 | // | |||
2173 | // dest_bb: | |||
2174 | // buzz; | |||
2175 | ||||
2176 | RS->enterBasicBlockEnd(MBB); | |||
2177 | Register Scav = RS->scavengeRegisterBackwards( | |||
2178 | AMDGPU::SReg_64RegClass, | |||
2179 | MachineBasicBlock::iterator(GetPC), false, 0); | |||
2180 | MRI.replaceRegWith(PCReg, Scav); | |||
2181 | MRI.clearVirtRegs(); | |||
2182 | RS->setRegUsed(Scav); | |||
2183 | ||||
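// Size accounting, which is presumably what the return below reflects:
// s_getpc_b64 (4 bytes), the s_add_u32/s_sub_u32 carrying a 32-bit literal for
// the block address (8 bytes), the s_addc_u32/s_subb_u32 with an inline 0
// (4 bytes), and s_setpc_b64 (4 bytes).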
2184 | return 4 + 8 + 4 + 4; | |||
2185 | } | |||
2186 | ||||
2187 | unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { | |||
2188 | switch (Cond) { | |||
2189 | case SIInstrInfo::SCC_TRUE: | |||
2190 | return AMDGPU::S_CBRANCH_SCC1; | |||
2191 | case SIInstrInfo::SCC_FALSE: | |||
2192 | return AMDGPU::S_CBRANCH_SCC0; | |||
2193 | case SIInstrInfo::VCCNZ: | |||
2194 | return AMDGPU::S_CBRANCH_VCCNZ; | |||
2195 | case SIInstrInfo::VCCZ: | |||
2196 | return AMDGPU::S_CBRANCH_VCCZ; | |||
2197 | case SIInstrInfo::EXECNZ: | |||
2198 | return AMDGPU::S_CBRANCH_EXECNZ; | |||
2199 | case SIInstrInfo::EXECZ: | |||
2200 | return AMDGPU::S_CBRANCH_EXECZ; | |||
2201 | default: | |||
2202 | llvm_unreachable("invalid branch predicate"); | |||
2203 | } | |||
2204 | } | |||
2205 | ||||
2206 | SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { | |||
2207 | switch (Opcode) { | |||
2208 | case AMDGPU::S_CBRANCH_SCC0: | |||
2209 | return SCC_FALSE; | |||
2210 | case AMDGPU::S_CBRANCH_SCC1: | |||
2211 | return SCC_TRUE; | |||
2212 | case AMDGPU::S_CBRANCH_VCCNZ: | |||
2213 | return VCCNZ; | |||
2214 | case AMDGPU::S_CBRANCH_VCCZ: | |||
2215 | return VCCZ; | |||
2216 | case AMDGPU::S_CBRANCH_EXECNZ: | |||
2217 | return EXECNZ; | |||
2218 | case AMDGPU::S_CBRANCH_EXECZ: | |||
2219 | return EXECZ; | |||
2220 | default: | |||
2221 | return INVALID_BR; | |||
2222 | } | |||
2223 | } | |||
2224 | ||||
2225 | bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, | |||
2226 | MachineBasicBlock::iterator I, | |||
2227 | MachineBasicBlock *&TBB, | |||
2228 | MachineBasicBlock *&FBB, | |||
2229 | SmallVectorImpl<MachineOperand> &Cond, | |||
2230 | bool AllowModify) const { | |||
2231 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2232 | // Unconditional Branch | |||
2233 | TBB = I->getOperand(0).getMBB(); | |||
2234 | return false; | |||
2235 | } | |||
2236 | ||||
2237 | MachineBasicBlock *CondBB = nullptr; | |||
2238 | ||||
2239 | if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
2240 | CondBB = I->getOperand(1).getMBB(); | |||
2241 | Cond.push_back(I->getOperand(0)); | |||
2242 | } else { | |||
2243 | BranchPredicate Pred = getBranchPredicate(I->getOpcode()); | |||
2244 | if (Pred == INVALID_BR) | |||
2245 | return true; | |||
2246 | ||||
2247 | CondBB = I->getOperand(0).getMBB(); | |||
2248 | Cond.push_back(MachineOperand::CreateImm(Pred)); | |||
2249 | Cond.push_back(I->getOperand(1)); // Save the branch register. | |||
2250 | } | |||
2251 | ++I; | |||
2252 | ||||
2253 | if (I == MBB.end()) { | |||
2254 | // Conditional branch followed by fall-through. | |||
2255 | TBB = CondBB; | |||
2256 | return false; | |||
2257 | } | |||
2258 | ||||
2259 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2260 | TBB = CondBB; | |||
2261 | FBB = I->getOperand(0).getMBB(); | |||
2262 | return false; | |||
2263 | } | |||
2264 | ||||
2265 | return true; | |||
2266 | } | |||
2267 | ||||
2268 | bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, | |||
2269 | MachineBasicBlock *&FBB, | |||
2270 | SmallVectorImpl<MachineOperand> &Cond, | |||
2271 | bool AllowModify) const { | |||
2272 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); | |||
2273 | auto E = MBB.end(); | |||
2274 | if (I == E) | |||
2275 | return false; | |||
2276 | ||||
2277 | // Skip over the instructions that are artificial terminators for special | |||
2278 | // exec management. | |||
2279 | while (I != E && !I->isBranch() && !I->isReturn() && | |||
2280 | I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { | |||
2281 | switch (I->getOpcode()) { | |||
2282 | case AMDGPU::SI_MASK_BRANCH: | |||
2283 | case AMDGPU::S_MOV_B64_term: | |||
2284 | case AMDGPU::S_XOR_B64_term: | |||
2285 | case AMDGPU::S_OR_B64_term: | |||
2286 | case AMDGPU::S_ANDN2_B64_term: | |||
2287 | case AMDGPU::S_AND_B64_term: | |||
2288 | case AMDGPU::S_MOV_B32_term: | |||
2289 | case AMDGPU::S_XOR_B32_term: | |||
2290 | case AMDGPU::S_OR_B32_term: | |||
2291 | case AMDGPU::S_ANDN2_B32_term: | |||
2292 | case AMDGPU::S_AND_B32_term: | |||
2293 | break; | |||
2294 | case AMDGPU::SI_IF: | |||
2295 | case AMDGPU::SI_ELSE: | |||
2296 | case AMDGPU::SI_KILL_I1_TERMINATOR: | |||
2297 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: | |||
2298 | // FIXME: It's messy that these need to be considered here at all. | |||
2299 | return true; | |||
2300 | default: | |||
2301 | llvm_unreachable("unexpected non-branch terminator inst"); | |||
2302 | } | |||
2303 | ||||
2304 | ++I; | |||
2305 | } | |||
2306 | ||||
2307 | if (I == E) | |||
2308 | return false; | |||
2309 | ||||
2310 | if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) | |||
2311 | return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); | |||
2312 | ||||
2313 | ++I; | |||
2314 | ||||
2315 | // TODO: Should be able to treat as fallthrough? | |||
2316 | if (I == MBB.end()) | |||
2317 | return true; | |||
2318 | ||||
2319 | if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) | |||
2320 | return true; | |||
2321 | ||||
2322 | MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); | |||
2323 | ||||
2324 | // Specifically handle the case where the conditional branch is to the same | |||
2325 | // destination as the mask branch. e.g. | |||
2326 | // | |||
2327 | // si_mask_branch BB8 | |||
2328 | // s_cbranch_execz BB8 | |||
2329 | // s_cbranch BB9 | |||
2330 | // | |||
2331 | // This is required to understand divergent loops which may need the branches | |||
2332 | // to be relaxed. | |||
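// The check below therefore only accepts an EXECZ/EXECNZ conditional branch
// whose target matches the mask branch's destination; any other combination
// is reported as unanalyzable.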
2333 | if (TBB != MaskBrDest || Cond.empty()) | |||
2334 | return true; | |||
2335 | ||||
2336 | auto Pred = Cond[0].getImm(); | |||
2337 | return (Pred != EXECZ && Pred != EXECNZ); | |||
2338 | } | |||
2339 | ||||
2340 | unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, | |||
2341 | int *BytesRemoved) const { | |||
2342 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); | |||
2343 | ||||
2344 | unsigned Count = 0; | |||
2345 | unsigned RemovedSize = 0; | |||
2346 | while (I != MBB.end()) { | |||
2347 | MachineBasicBlock::iterator Next = std::next(I); | |||
2348 | if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { | |||
2349 | I = Next; | |||
2350 | continue; | |||
2351 | } | |||
2352 | ||||
2353 | RemovedSize += getInstSizeInBytes(*I); | |||
2354 | I->eraseFromParent(); | |||
2355 | ++Count; | |||
2356 | I = Next; | |||
2357 | } | |||
2358 | ||||
2359 | if (BytesRemoved) | |||
2360 | *BytesRemoved = RemovedSize; | |||
2361 | ||||
2362 | return Count; | |||
2363 | } | |||
2364 | ||||
2365 | // Copy the flags onto the implicit condition register operand. | |||
2366 | static void preserveCondRegFlags(MachineOperand &CondReg, | |||
2367 | const MachineOperand &OrigCond) { | |||
2368 | CondReg.setIsUndef(OrigCond.isUndef()); | |||
2369 | CondReg.setIsKill(OrigCond.isKill()); | |||
2370 | } | |||
2371 | ||||
2372 | unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, | |||
2373 | MachineBasicBlock *TBB, | |||
2374 | MachineBasicBlock *FBB, | |||
2375 | ArrayRef<MachineOperand> Cond, | |||
2376 | const DebugLoc &DL, | |||
2377 | int *BytesAdded) const { | |||
2378 | if (!FBB && Cond.empty()) { | |||
2379 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2380 | .addMBB(TBB); | |||
2381 | if (BytesAdded) | |||
2382 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2383 | return 1; | |||
2384 | } | |||
2385 | ||||
2386 | if (Cond.size() == 1 && Cond[0].isReg()) { | |||
2387 | BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) | |||
2388 | .add(Cond[0]) | |||
2389 | .addMBB(TBB); | |||
2390 | return 1; | |||
2391 | } | |||
2392 | ||||
2393 | assert(TBB && Cond[0].isImm()); | |||
2394 | ||||
2395 | unsigned Opcode | |||
2396 | = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); | |||
2397 | ||||
2398 | if (!FBB) { | |||
2399 | ||||
2400 | MachineInstr *CondBr = | |||
2401 | BuildMI(&MBB, DL, get(Opcode)) | |||
2402 | .addMBB(TBB); | |||
2403 | ||||
2404 | // Copy the flags onto the implicit condition register operand. | |||
2405 | preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); | |||
2406 | fixImplicitOperands(*CondBr); | |||
2407 | ||||
2408 | if (BytesAdded) | |||
2409 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2410 | return 1; | |||
2411 | } | |||
2412 | ||||
2413 | assert(TBB && FBB); | |||
2414 | ||||
2415 | MachineInstr *CondBr = | |||
2416 | BuildMI(&MBB, DL, get(Opcode)) | |||
2417 | .addMBB(TBB); | |||
2418 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2419 | .addMBB(FBB); | |||
2420 | ||||
2421 | MachineOperand &CondReg = CondBr->getOperand(1); | |||
2422 | preserveCondRegFlags(CondReg, Cond[1]); | |||
2424 | ||||
2425 | if (BytesAdded) | |||
2426 | *BytesAdded = ST.hasOffset3fBug() ? 16 : 8; | |||
2427 | ||||
2428 | return 2; | |||
2429 | } | |||
2430 | ||||
2431 | bool SIInstrInfo::reverseBranchCondition( | |||
2432 | SmallVectorImpl<MachineOperand> &Cond) const { | |||
2433 | if (Cond.size() != 2) { | |||
2434 | return true; | |||
2435 | } | |||
2436 | ||||
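// The BranchPredicate values appear to be defined so that a predicate and its
// inverse are plus/minus the same constant (e.g. SCC_TRUE and SCC_FALSE);
// both this negation and the one in insertSelect rely on that pairing.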
2437 | if (Cond[0].isImm()) { | |||
2438 | Cond[0].setImm(-Cond[0].getImm()); | |||
2439 | return false; | |||
2440 | } | |||
2441 | ||||
2442 | return true; | |||
2443 | } | |||
2444 | ||||
2445 | bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, | |||
2446 | ArrayRef<MachineOperand> Cond, | |||
2447 | Register DstReg, Register TrueReg, | |||
2448 | Register FalseReg, int &CondCycles, | |||
2449 | int &TrueCycles, int &FalseCycles) const { | |||
2450 | switch (Cond[0].getImm()) { | |||
2451 | case VCCNZ: | |||
2452 | case VCCZ: { | |||
2453 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2454 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2455 | if (MRI.getRegClass(FalseReg) != RC) | |||
2456 | return false; | |||
2457 | ||||
2458 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2459 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2460 | ||||
2461 | // Limit to equal cost for branch vs. N v_cndmask_b32s. | |||
2462 | return RI.hasVGPRs(RC) && NumInsts <= 6; | |||
2463 | } | |||
2464 | case SCC_TRUE: | |||
2465 | case SCC_FALSE: { | |||
2466 | // FIXME: We could insert for VGPRs if we could replace the original compare | |||
2467 | // with a vector one. | |||
2468 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2469 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2470 | if (MRI.getRegClass(FalseReg) != RC) | |||
2471 | return false; | |||
2472 | ||||
2473 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2474 | ||||
2475 | // Widths that are a multiple of 64 bits can use s_cselect_b64. | |||
2476 | if (NumInsts % 2 == 0) | |||
2477 | NumInsts /= 2; | |||
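// For example, a 128-bit value is 4 dwords, which folds to 2 s_cselect_b64.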
2478 | ||||
2479 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2480 | return RI.isSGPRClass(RC); | |||
2481 | } | |||
2482 | default: | |||
2483 | return false; | |||
2484 | } | |||
2485 | } | |||
2486 | ||||
2487 | void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, | |||
2488 | MachineBasicBlock::iterator I, const DebugLoc &DL, | |||
2489 | Register DstReg, ArrayRef<MachineOperand> Cond, | |||
2490 | Register TrueReg, Register FalseReg) const { | |||
2491 | BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); | |||
2492 | if (Pred == VCCZ || Pred == SCC_FALSE) { | |||
2493 | Pred = static_cast<BranchPredicate>(-Pred); | |||
2494 | std::swap(TrueReg, FalseReg); | |||
2495 | } | |||
2496 | ||||
2497 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2498 | const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); | |||
2499 | unsigned DstSize = RI.getRegSizeInBits(*DstRC); | |||
2500 | ||||
2501 | if (DstSize == 32) { | |||
2502 | MachineInstr *Select; | |||
2503 | if (Pred == SCC_TRUE) { | |||
2504 | Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) | |||
2505 | .addReg(TrueReg) | |||
2506 | .addReg(FalseReg); | |||
2507 | } else { | |||
2508 | // Instruction's operands are backwards from what is expected. | |||
2509 | Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) | |||
2510 | .addReg(FalseReg) | |||
2511 | .addReg(TrueReg); | |||
2512 | } | |||
2513 | ||||
2514 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2515 | return; | |||
2516 | } | |||
2517 | ||||
2518 | if (DstSize == 64 && Pred == SCC_TRUE) { | |||
2519 | MachineInstr *Select = | |||
2520 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) | |||
2521 | .addReg(TrueReg) | |||
2522 | .addReg(FalseReg); | |||
2523 | ||||
2524 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2525 | return; | |||
2526 | } | |||
2527 | ||||
2528 | static const int16_t Sub0_15[] = { | |||
2529 | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, | |||
2530 | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, | |||
2531 | AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, | |||
2532 | AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, | |||
2533 | }; | |||
2534 | ||||
2535 | static const int16_t Sub0_15_64[] = { | |||
2536 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, | |||
2537 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, | |||
2538 | AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, | |||
2539 | AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, | |||
2540 | }; | |||
2541 | ||||
2542 | unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; | |||
2543 | const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; | |||
2544 | const int16_t *SubIndices = Sub0_15; | |||
2545 | int NElts = DstSize / 32; | |||
2546 | ||||
2547 | // 64-bit select is only available for SALU. | |||
2548 | // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. | |||
2549 | if (Pred == SCC_TRUE) { | |||
2550 | if (NElts % 2) { | |||
2551 | SelOp = AMDGPU::S_CSELECT_B32; | |||
2552 | EltRC = &AMDGPU::SGPR_32RegClass; | |||
2553 | } else { | |||
2554 | SelOp = AMDGPU::S_CSELECT_B64; | |||
2555 | EltRC = &AMDGPU::SGPR_64RegClass; | |||
2556 | SubIndices = Sub0_15_64; | |||
2557 | NElts /= 2; | |||
2558 | } | |||
2559 | } | |||
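// For example, a 64-bit VGPR select expands into two V_CNDMASK_B32 on sub0
// and sub1, reassembled by the REG_SEQUENCE below, while a 128-bit SGPR
// select on SCC uses two S_CSELECT_B64 on sub0_sub1 and sub2_sub3.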
2560 | ||||
2561 | MachineInstrBuilder MIB = BuildMI( | |||
2562 | MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); | |||
2563 | ||||
2564 | I = MIB->getIterator(); | |||
2565 | ||||
2566 | SmallVector<Register, 8> Regs; | |||
2567 | for (int Idx = 0; Idx != NElts; ++Idx) { | |||
2568 | Register DstElt = MRI.createVirtualRegister(EltRC); | |||
2569 | Regs.push_back(DstElt); | |||
2570 | ||||
2571 | unsigned SubIdx = SubIndices[Idx]; | |||
2572 | ||||
2573 | MachineInstr *Select; | |||
2574 | if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { | |||
2575 | Select = | |||
2576 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2577 | .addReg(FalseReg, 0, SubIdx) | |||
2578 | .addReg(TrueReg, 0, SubIdx); | |||
2579 | } else { | |||
2580 | Select = | |||
2581 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2582 | .addReg(TrueReg, 0, SubIdx) | |||
2583 | .addReg(FalseReg, 0, SubIdx); | |||
2584 | } | |||
2585 | ||||
2586 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2587 | fixImplicitOperands(*Select); | |||
2588 | ||||
2589 | MIB.addReg(DstElt) | |||
2590 | .addImm(SubIdx); | |||
2591 | } | |||
2592 | } | |||
2593 | ||||
2594 | bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { | |||
2595 | switch (MI.getOpcode()) { | |||
2596 | case AMDGPU::V_MOV_B32_e32: | |||
2597 | case AMDGPU::V_MOV_B32_e64: | |||
2598 | case AMDGPU::V_MOV_B64_PSEUDO: { | |||
2599 | // If there are additional implicit register operands, this may be used for | |||
2600 | // register indexing so the source register operand isn't simply copied. | |||
2601 | unsigned NumOps = MI.getDesc().getNumOperands() + | |||
2602 | MI.getDesc().getNumImplicitUses(); | |||
2603 | ||||
2604 | return MI.getNumOperands() == NumOps; | |||
2605 | } | |||
2606 | case AMDGPU::S_MOV_B32: | |||
2607 | case AMDGPU::S_MOV_B64: | |||
2608 | case AMDGPU::COPY: | |||
2609 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
2610 | case AMDGPU::V_ACCVGPR_READ_B32_e64: | |||
2611 | return true; | |||
2612 | default: | |||
2613 | return false; | |||
2614 | } | |||
2615 | } | |||
2616 | ||||
2617 | unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( | |||
2618 | unsigned Kind) const { | |||
2619 | switch(Kind) { | |||
2620 | case PseudoSourceValue::Stack: | |||
2621 | case PseudoSourceValue::FixedStack: | |||
2622 | return AMDGPUAS::PRIVATE_ADDRESS; | |||
2623 | case PseudoSourceValue::ConstantPool: | |||
2624 | case PseudoSourceValue::GOT: | |||
2625 | case PseudoSourceValue::JumpTable: | |||
2626 | case PseudoSourceValue::GlobalValueCallEntry: | |||
2627 | case PseudoSourceValue::ExternalSymbolCallEntry: | |||
2628 | case PseudoSourceValue::TargetCustom: | |||
2629 | return AMDGPUAS::CONSTANT_ADDRESS; | |||
2630 | } | |||
2631 | return AMDGPUAS::FLAT_ADDRESS; | |||
2632 | } | |||
2633 | ||||
2634 | static void removeModOperands(MachineInstr &MI) { | |||
2635 | unsigned Opc = MI.getOpcode(); | |||
2636 | int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2637 | AMDGPU::OpName::src0_modifiers); | |||
2638 | int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2639 | AMDGPU::OpName::src1_modifiers); | |||
2640 | int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2641 | AMDGPU::OpName::src2_modifiers); | |||
2642 | ||||
2643 | MI.RemoveOperand(Src2ModIdx); | |||
2644 | MI.RemoveOperand(Src1ModIdx); | |||
2645 | MI.RemoveOperand(Src0ModIdx); | |||
2646 | } | |||
2647 | ||||
2648 | bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, | |||
2649 | Register Reg, MachineRegisterInfo *MRI) const { | |||
2650 | if (!MRI->hasOneNonDBGUse(Reg)) | |||
2651 | return false; | |||
2652 | ||||
2653 | switch (DefMI.getOpcode()) { | |||
2654 | default: | |||
2655 | return false; | |||
2656 | case AMDGPU::S_MOV_B64: | |||
2657 | // TODO: We could fold 64-bit immediates, but this gets complicated | |||
2658 | // when there are sub-registers. | |||
2659 | return false; | |||
2660 | ||||
2661 | case AMDGPU::V_MOV_B32_e32: | |||
2662 | case AMDGPU::S_MOV_B32: | |||
2663 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
2664 | break; | |||
2665 | } | |||
2666 | ||||
2667 | const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); | |||
2668 | assert(ImmOp); | |||
2669 | // FIXME: We could handle FrameIndex values here. | |||
2670 | if (!ImmOp->isImm()) | |||
2671 | return false; | |||
2672 | ||||
2673 | unsigned Opc = UseMI.getOpcode(); | |||
2674 | if (Opc == AMDGPU::COPY) { | |||
2675 | Register DstReg = UseMI.getOperand(0).getReg(); | |||
2676 | bool Is16Bit = getOpSize(UseMI, 0) == 2; | |||
2677 | bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); | |||
2678 | unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; | |||
2679 | APInt Imm(32, ImmOp->getImm()); | |||
2680 | ||||
2681 | if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) | |||
2682 | Imm = Imm.ashr(16); | |||
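// The copy read the high half of the 32-bit source (hi16 subreg), so shift
// the immediate down so the folded move materializes the intended 16 bits.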
2683 | ||||
2684 | if (RI.isAGPR(*MRI, DstReg)) { | |||
2685 | if (!isInlineConstant(Imm)) | |||
2686 | return false; | |||
2687 | NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; | |||
2688 | } | |||
2689 | ||||
2690 | if (Is16Bit) { | |||
2691 | if (isVGPRCopy) | |||
2692 | return false; // Do not clobber vgpr_hi16 | |||
2693 | ||||
2694 | if (DstReg.isVirtual() && | |||
2695 | UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) | |||
2696 | return false; | |||
2697 | ||||
2698 | UseMI.getOperand(0).setSubReg(0); | |||
2699 | if (DstReg.isPhysical()) { | |||
2700 | DstReg = RI.get32BitRegister(DstReg); | |||
2701 | UseMI.getOperand(0).setReg(DstReg); | |||
2702 | } | |||
2703 | assert(UseMI.getOperand(1).getReg().isVirtual()); | |||
2704 | } | |||
2705 | ||||
2706 | UseMI.setDesc(get(NewOpc)); | |||
2707 | UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); | |||
2708 | UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); | |||
2709 | return true; | |||
2710 | } | |||
2711 | ||||
2712 | if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
2713 | Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
2714 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2715 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) { | |||
2716 | // Don't fold if we are using source or output modifiers. The new VOP2 | |||
2717 | // instructions don't have them. | |||
2718 | if (hasAnyModifiersSet(UseMI)) | |||
2719 | return false; | |||
2720 | ||||
2721 | // If this is a free constant, there's no reason to do this. | |||
2722 | // TODO: We could fold this here instead of letting SIFoldOperands do it | |||
2723 | // later. | |||
2724 | MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); | |||
2725 | ||||
2726 | // Any src operand can be used for the legality check. | |||
2727 | if (isInlineConstant(UseMI, *Src0, *ImmOp)) | |||
2728 | return false; | |||
2729 | ||||
2730 | bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
2731 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; | |||
2732 | bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2733 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64; | |||
2734 | MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); | |||
2735 | MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); | |||
2736 | ||||
2737 | // Multiplied part is the constant: Use v_madmk_{f16, f32}. | |||
2738 | // We should only expect these to be on src0 due to canonicalizations. | |||
2739 | if (Src0->isReg() && Src0->getReg() == Reg) { | |||
2740 | if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) | |||
2741 | return false; | |||
2742 | ||||
2743 | if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) | |||
2744 | return false; | |||
2745 | ||||
2746 | unsigned NewOpc = | |||
2747 | IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) | |||
2748 | : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); | |||
2749 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
2750 | return false; | |||
2751 | ||||
2752 | // We need to swap operands 0 and 1 since madmk constant is at operand 1. | |||
2753 | ||||
2754 | const int64_t Imm = ImmOp->getImm(); | |||
2755 | ||||
2756 | // FIXME: This would be a lot easier if we could return a new instruction | |||
2757 | // instead of having to modify in place. | |||
2758 | ||||
2759 | // Remove these first since they are at the end. | |||
2760 | UseMI.RemoveOperand( | |||
2761 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); | |||
2762 | UseMI.RemoveOperand( | |||
2763 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); | |||
2764 | ||||
2765 | Register Src1Reg = Src1->getReg(); | |||
2766 | unsigned Src1SubReg = Src1->getSubReg(); | |||
2767 | Src0->setReg(Src1Reg); | |||
2768 | Src0->setSubReg(Src1SubReg); | |||
2769 | Src0->setIsKill(Src1->isKill()); | |||
2770 | ||||
2771 | if (Opc == AMDGPU::V_MAC_F32_e64 || | |||
2772 | Opc == AMDGPU::V_MAC_F16_e64 || | |||
2773 | Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2774 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
2775 | UseMI.untieRegOperand( | |||
2776 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
2777 | ||||
2778 | Src1->ChangeToImmediate(Imm); | |||
2779 | ||||
2780 | removeModOperands(UseMI); | |||
2781 | UseMI.setDesc(get(NewOpc)); | |||
2782 | ||||
2783 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); | |||
2784 | if (DeleteDef) | |||
2785 | DefMI.eraseFromParent(); | |||
2786 | ||||
2787 | return true; | |||
2788 | } | |||
2789 | ||||
2790 | // Added part is the constant: Use v_madak_{f16, f32}. | |||
2791 | if (Src2->isReg() && Src2->getReg() == Reg) { | |||
2792 | // Not allowed to use constant bus for another operand. | |||
2793 | // We can however allow an inline immediate as src0. | |||
2794 | bool Src0Inlined = false; | |||
2795 | if (Src0->isReg()) { | |||
2796 | // Try to inline constant if possible. | |||
2797 | // If the def is a move of an immediate and this is its only use, | |||
2798 | // folding it here saves a VGPR. | |||
2799 | MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); | |||
2800 | if (Def && Def->isMoveImmediate() && | |||
2801 | isInlineConstant(Def->getOperand(1)) && | |||
2802 | MRI->hasOneUse(Src0->getReg())) { | |||
2803 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
2804 | Src0Inlined = true; | |||
2805 | } else if ((Src0->getReg().isPhysical() && | |||
2806 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
2807 | RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || | |||
2808 | (Src0->getReg().isVirtual() && | |||
2809 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
2810 | RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) | |||
2811 | return false; | |||
2812 | // VGPR is okay as Src0 - fallthrough | |||
2813 | } | |||
2814 | ||||
2815 | if (Src1->isReg() && !Src0Inlined) { | |||
2816 | // We still have one slot for an inlinable constant - try to fill it. | |||
2817 | MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); | |||
2818 | if (Def && Def->isMoveImmediate() && | |||
2819 | isInlineConstant(Def->getOperand(1)) && | |||
2820 | MRI->hasOneUse(Src1->getReg()) && | |||
2821 | commuteInstruction(UseMI)) { | |||
2822 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
2823 | } else if ((Src1->getReg().isPhysical() && | |||
2824 | RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || | |||
2825 | (Src1->getReg().isVirtual() && | |||
2826 | RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) | |||
2827 | return false; | |||
2828 | // VGPR is okay as Src1 - fallthrough | |||
2829 | } | |||
2830 | ||||
2831 | unsigned NewOpc = | |||
2832 | IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) | |||
2833 | : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); | |||
2834 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
2835 | return false; | |||
2836 | ||||
2837 | const int64_t Imm = ImmOp->getImm(); | |||
2838 | ||||
2839 | // FIXME: This would be a lot easier if we could return a new instruction | |||
2840 | // instead of having to modify in place. | |||
2841 | ||||
2842 | // Remove these first since they are at the end. | |||
2843 | UseMI.RemoveOperand( | |||
2844 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); | |||
2845 | UseMI.RemoveOperand( | |||
2846 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); | |||
2847 | ||||
2848 | if (Opc == AMDGPU::V_MAC_F32_e64 || | |||
2849 | Opc == AMDGPU::V_MAC_F16_e64 || | |||
2850 | Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2851 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
2852 | UseMI.untieRegOperand( | |||
2853 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
2854 | ||||
2855 | // ChangeToImmediate adds Src2 back to the instruction. | |||
2856 | Src2->ChangeToImmediate(Imm); | |||
2857 | ||||
2858 | // These come before src2. | |||
2859 | removeModOperands(UseMI); | |||
2860 | UseMI.setDesc(get(NewOpc)); | |||
2861 | // UseMI might have been commuted, leaving an SGPR as src1. If so, using | |||
2862 | // both an inline constant and an SGPR would violate the constant bus | |||
2863 | // restriction, so relegalize the operands. | |||
2864 | legalizeOperands(UseMI); | |||
2865 | ||||
2866 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); | |||
2867 | if (DeleteDef) | |||
2868 | DefMI.eraseFromParent(); | |||
2869 | ||||
2870 | return true; | |||
2871 | } | |||
2872 | } | |||
2873 | ||||
2874 | return false; | |||
2875 | } | |||
2876 | ||||
2877 | static bool | |||
2878 | memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, | |||
2879 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
2880 | if (BaseOps1.size() != BaseOps2.size()) | |||
2881 | return false; | |||
2882 | for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { | |||
2883 | if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) | |||
2884 | return false; | |||
2885 | } | |||
2886 | return true; | |||
2887 | } | |||
2888 | ||||
2889 | static bool offsetsDoNotOverlap(int WidthA, int OffsetA, | |||
2890 | int WidthB, int OffsetB) { | |||
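// Two accesses are disjoint when the lower one ends at or before the higher
// one starts; e.g. 4 bytes at offset 0 and 4 bytes at offset 4 do not overlap
// because 0 + 4 <= 4.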
2891 | int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; | |||
2892 | int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; | |||
2893 | int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; | |||
2894 | return LowOffset + LowWidth <= HighOffset; | |||
2895 | } | |||
2896 | ||||
2897 | bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, | |||
2898 | const MachineInstr &MIb) const { | |||
2899 | SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; | |||
2900 | int64_t Offset0, Offset1; | |||
2901 | unsigned Dummy0, Dummy1; | |||
2902 | bool Offset0IsScalable, Offset1IsScalable; | |||
2903 | if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, | |||
2904 | Dummy0, &RI) || | |||
2905 | !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, | |||
2906 | Dummy1, &RI)) | |||
2907 | return false; | |||
2908 | ||||
2909 | if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) | |||
2910 | return false; | |||
2911 | ||||
2912 | if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { | |||
2913 | // FIXME: Handle ds_read2 / ds_write2. | |||
2914 | return false; | |||
2915 | } | |||
2916 | unsigned Width0 = MIa.memoperands().front()->getSize(); | |||
2917 | unsigned Width1 = MIb.memoperands().front()->getSize(); | |||
2918 | return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); | |||
2919 | } | |||
2920 | ||||
2921 | bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, | |||
2922 | const MachineInstr &MIb) const { | |||
2923 | assert(MIa.mayLoadOrStore() && | |||
2924 | "MIa must load from or modify a memory location"); | |||
2925 | assert(MIb.mayLoadOrStore() && | |||
2926 | "MIb must load from or modify a memory location"); | |||
2927 | ||||
2928 | if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) | |||
2929 | return false; | |||
2930 | ||||
2931 | // XXX - Can we relax this between address spaces? | |||
2932 | if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) | |||
2933 | return false; | |||
2934 | ||||
2935 | // TODO: Should we check the address space from the MachineMemOperand? That | |||
2936 | // would allow us to distinguish objects we know don't alias based on the | |||
2937 | // underlying address space, even if it was lowered to a different one, | |||
2938 | // e.g. private accesses lowered to use MUBUF instructions on a scratch | |||
2939 | // buffer. | |||
2940 | if (isDS(MIa)) { | |||
2941 | if (isDS(MIb)) | |||
2942 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
2943 | ||||
2944 | return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); | |||
2945 | } | |||
2946 | ||||
2947 | if (isMUBUF(MIa) || isMTBUF(MIa)) { | |||
2948 | if (isMUBUF(MIb) || isMTBUF(MIb)) | |||
2949 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
2950 | ||||
2951 | return !isFLAT(MIb) && !isSMRD(MIb); | |||
2952 | } | |||
2953 | ||||
2954 | if (isSMRD(MIa)) { | |||
2955 | if (isSMRD(MIb)) | |||
2956 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
2957 | ||||
2958 | return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); | |||
2959 | } | |||
2960 | ||||
2961 | if (isFLAT(MIa)) { | |||
2962 | if (isFLAT(MIb)) | |||
2963 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
2964 | ||||
2965 | return false; | |||
2966 | } | |||
2967 | ||||
2968 | return false; | |||
2969 | } | |||
2970 | ||||
2971 | static int64_t getFoldableImm(const MachineOperand* MO) { | |||
2972 | if (!MO->isReg()) | |||
2973 | return 0; // Not a register operand, so there is nothing to fold. | |||
2974 | const MachineFunction *MF = MO->getParent()->getParent()->getParent(); | |||
2975 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2976 | auto Def = MRI.getUniqueVRegDef(MO->getReg()); | |||
2977 | if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && | |||
2978 | Def->getOperand(1).isImm()) | |||
2979 | return Def->getOperand(1).getImm(); | |||
2980 | return 0; // No foldable immediate found. | |||
2981 | } | |||
2982 | ||||
2983 | static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, | |||
2984 | MachineInstr &NewMI) { | |||
2985 | if (LV) { | |||
2986 | unsigned NumOps = MI.getNumOperands(); | |||
2987 | for (unsigned I = 1; I < NumOps; ++I) { | |||
2988 | MachineOperand &Op = MI.getOperand(I); | |||
2989 | if (Op.isReg() && Op.isKill()) | |||
2990 | LV->replaceKillInstruction(Op.getReg(), MI, NewMI); | |||
2991 | } | |||
2992 | } | |||
2993 | } | |||
2994 | ||||
2995 | MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, | |||
2996 | MachineInstr &MI, | |||
2997 | LiveVariables *LV) const { | |||
2998 | unsigned Opc = MI.getOpcode(); | |||
2999 | bool IsF16 = false; | |||
3000 | bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3001 | Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; | |||
3002 | ||||
3003 | switch (Opc) { | |||
3004 | default: | |||
3005 | return nullptr; | |||
3006 | case AMDGPU::V_MAC_F16_e64: | |||
3007 | case AMDGPU::V_FMAC_F16_e64: | |||
3008 | IsF16 = true; | |||
3009 | LLVM_FALLTHROUGH; | |||
3010 | case AMDGPU::V_MAC_F32_e64: | |||
3011 | case AMDGPU::V_FMAC_F32_e64: | |||
3012 | break; | |||
3013 | case AMDGPU::V_MAC_F16_e32: | |||
3014 | case AMDGPU::V_FMAC_F16_e32: | |||
3015 | IsF16 = true; | |||
3016 | LLVM_FALLTHROUGH; | |||
3017 | case AMDGPU::V_MAC_F32_e32: | |||
3018 | case AMDGPU::V_FMAC_F32_e32: { | |||
3019 | int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3020 | AMDGPU::OpName::src0); | |||
3021 | const MachineOperand *Src0 = &MI.getOperand(Src0Idx); | |||
3022 | if (!Src0->isReg() && !Src0->isImm()) | |||
3023 | return nullptr; | |||
3024 | ||||
3025 | if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) | |||
3026 | return nullptr; | |||
3027 | ||||
3028 | break; | |||
3029 | } | |||
3030 | } | |||
3031 | ||||
3032 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
3033 | const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); | |||
3034 | const MachineOperand *Src0Mods = | |||
3035 | getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); | |||
3036 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3037 | const MachineOperand *Src1Mods = | |||
3038 | getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); | |||
3039 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3040 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
3041 | const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3042 | MachineInstrBuilder MIB; | |||
3043 | ||||
3044 | if (!Src0Mods && !Src1Mods && !Clamp && !Omod && | |||
3045 | // If we have an SGPR input, we will violate the constant bus restriction. | |||
3046 | (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || | |||
3047 | !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { | |||
3048 | if (auto Imm = getFoldableImm(Src2)) { | |||
3049 | unsigned NewOpc = | |||
3050 | IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) | |||
3051 | : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); | |||
3052 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3053 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3054 | .add(*Dst) | |||
3055 | .add(*Src0) | |||
3056 | .add(*Src1) | |||
3057 | .addImm(Imm); | |||
3058 | updateLiveVariables(LV, MI, *MIB); | |||
3059 | return MIB; | |||
3060 | } | |||
3061 | } | |||
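// src2 (the addend) was not a foldable immediate. Try the *MK forms next,
// where the literal instead replaces one of the multiplied operands
// (madmk: d = v0 * K + v1, versus madak: d = v0 * v1 + K).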
3062 | unsigned NewOpc = IsFMA | |||
3063 | ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) | |||
3064 | : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); | |||
3065 | if (auto Imm = getFoldableImm(Src1)) { | |||
3066 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3067 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3068 | .add(*Dst) | |||
3069 | .add(*Src0) | |||
3070 | .addImm(Imm) | |||
3071 | .add(*Src2); | |||
3072 | updateLiveVariables(LV, MI, *MIB); | |||
3073 | return MIB; | |||
3074 | } | |||
3075 | } | |||
3076 | if (auto Imm = getFoldableImm(Src0)) { | |||
3077 | if (pseudoToMCOpcode(NewOpc) != -1 && | |||
3078 | isOperandLegal( | |||
3079 | MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), | |||
3080 | Src1)) { | |||
3081 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3082 | .add(*Dst) | |||
3083 | .add(*Src1) | |||
3084 | .addImm(Imm) | |||
3085 | .add(*Src2); | |||
3086 | updateLiveVariables(LV, MI, *MIB); | |||
3087 | return MIB; | |||
3088 | } | |||
3089 | } | |||
3090 | } | |||
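// No immediate could be folded into a two-operand *AK/*MK form; fall back to
// the full VOP3 mad/fma below, which can carry the source modifiers, clamp,
// and omod.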
3091 | ||||
3092 | unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 : AMDGPU::V_FMA_F32_e64) | |||
3093 | : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); | |||
3094 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3095 | return nullptr; | |||
3096 | ||||
3097 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3098 | .add(*Dst) | |||
3099 | .addImm(Src0Mods ? Src0Mods->getImm() : 0) | |||
3100 | .add(*Src0) | |||
3101 | .addImm(Src1Mods ? Src1Mods->getImm() : 0) | |||
3102 | .add(*Src1) | |||
3103 | .addImm(0) // Src mods | |||
3104 | .add(*Src2) | |||
3105 | .addImm(Clamp ? Clamp->getImm() : 0) | |||
3106 | .addImm(Omod ? Omod->getImm() : 0); | |||
3107 | updateLiveVariables(LV, MI, *MIB); | |||
3108 | return MIB; | |||
3109 | } | |||
3110 | ||||
3111 | // It's not generally safe to move VALU instructions across these since it will | |||
3112 | // start using the register as a base index rather than directly. | |||
3113 | // XXX - Why isn't hasSideEffects sufficient for these? | |||
3114 | static bool changesVGPRIndexingMode(const MachineInstr &MI) { | |||
3115 | switch (MI.getOpcode()) { | |||
3116 | case AMDGPU::S_SET_GPR_IDX_ON: | |||
3117 | case AMDGPU::S_SET_GPR_IDX_MODE: | |||
3118 | case AMDGPU::S_SET_GPR_IDX_OFF: | |||
3119 | return true; | |||
3120 | default: | |||
3121 | return false; | |||
3122 | } | |||
3123 | } | |||
3124 | ||||
3125 | bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, | |||
3126 | const MachineBasicBlock *MBB, | |||
3127 | const MachineFunction &MF) const { | |||
3128 | // Skipping the check for SP writes in the base implementation. The reason it | |||
3129 | // was added was apparently due to compile time concerns. | |||
3130 | // | |||
3131 | // TODO: Do we really want this barrier? It triggers unnecessary hazard nops | |||
3132 | // but is probably avoidable. | |||
3133 | ||||
3134 | // Copied from base implementation. | |||
3135 | // Terminators and labels can't be scheduled around. | |||
3136 | if (MI.isTerminator() || MI.isPosition()) | |||
3137 | return true; | |||
3138 | ||||
3139 | // INLINEASM_BR can jump to another block | |||
3140 | if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) | |||
3141 | return true; | |||
3142 | ||||
3143 | // Target-independent instructions do not have an implicit-use of EXEC, even | |||
3144 | // when they operate on VGPRs. Treating EXEC modifications as scheduling | |||
3145 | // boundaries prevents incorrect movements of such instructions. | |||
3146 | return MI.modifiesRegister(AMDGPU::EXEC, &RI) || | |||
3147 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || | |||
3148 | MI.getOpcode() == AMDGPU::S_SETREG_B32 || | |||
3149 | changesVGPRIndexingMode(MI); | |||
3150 | } | |||
3151 | ||||
3152 | bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { | |||
3153 | return Opcode == AMDGPU::DS_ORDERED_COUNT || | |||
3154 | Opcode == AMDGPU::DS_GWS_INIT || | |||
3155 | Opcode == AMDGPU::DS_GWS_SEMA_V || | |||
3156 | Opcode == AMDGPU::DS_GWS_SEMA_BR || | |||
3157 | Opcode == AMDGPU::DS_GWS_SEMA_P || | |||
3158 | Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || | |||
3159 | Opcode == AMDGPU::DS_GWS_BARRIER; | |||
3160 | } | |||
3161 | ||||
3162 | bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { | |||
3163 | // Skip the full operand and register alias search that modifiesRegister | |||
3164 | // does. Only a handful of instructions touch MODE, it is only ever an | |||
3165 | // implicit def, and it doesn't alias any other registers. | |||
3166 | if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { | |||
3167 | for (; ImpDef && *ImpDef; ++ImpDef) { | |||
3168 | if (*ImpDef == AMDGPU::MODE) | |||
3169 | return true; | |||
3170 | } | |||
3171 | } | |||
3172 | ||||
3173 | return false; | |||
3174 | } | |||
3175 | ||||
3176 | bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { | |||
3177 | unsigned Opcode = MI.getOpcode(); | |||
3178 | ||||
3179 | if (MI.mayStore() && isSMRD(MI)) | |||
3180 | return true; // scalar store or atomic | |||
3181 | ||||
3182 | // This will terminate the function when other lanes may need to continue. | |||
3183 | if (MI.isReturn()) | |||
3184 | return true; | |||
3185 | ||||
3186 | // These instructions cause shader I/O that may cause hardware lockups | |||
3187 | // when executed with an empty EXEC mask. | |||
3188 | // | |||
3189 | // Note: exp with VM = DONE = 0 is automatically skipped by hardware when | |||
3190 | // EXEC = 0, but checking for that case here seems not worth it | |||
3191 | // given the typical code patterns. | |||
3192 | if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || | |||
3193 | isEXP(Opcode) || | |||
3194 | Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || | |||
3195 | Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) | |||
3196 | return true; | |||
3197 | ||||
3198 | if (MI.isCall() || MI.isInlineAsm()) | |||
3199 | return true; // conservative assumption | |||
3200 | ||||
3201 | // A mode change is a scalar operation that influences vector instructions. | |||
3202 | if (modifiesModeRegister(MI)) | |||
3203 | return true; | |||
3204 | ||||
3205 | // These are like SALU instructions in terms of effects, so it's questionable | |||
3206 | // whether we should return true for those. | |||
3207 | // | |||
3208 | // However, executing them with EXEC = 0 causes them to operate on undefined | |||
3209 | // data, which we avoid by returning true here. | |||
3210 | if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || | |||
3211 | Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) | |||
3212 | return true; | |||
3213 | ||||
3214 | return false; | |||
3215 | } | |||
3216 | ||||
3217 | bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, | |||
3218 | const MachineInstr &MI) const { | |||
3219 | if (MI.isMetaInstruction()) | |||
3220 | return false; | |||
3221 | ||||
3222 | // This won't read exec if this is an SGPR->SGPR copy. | |||
3223 | if (MI.isCopyLike()) { | |||
3224 | if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) | |||
3225 | return true; | |||
3226 | ||||
3227 | // Make sure this isn't copying exec as a normal operand | |||
3228 | return MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3229 | } | |||
3230 | ||||
3231 | // Make a conservative assumption about the callee. | |||
3232 | if (MI.isCall()) | |||
3233 | return true; | |||
3234 | ||||
3235 | // Be conservative with any unhandled generic opcodes. | |||
3236 | if (!isTargetSpecificOpcode(MI.getOpcode())) | |||
3237 | return true; | |||
3238 | ||||
3239 | return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3240 | } | |||
3241 | ||||
3242 | bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { | |||
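// Inline constants are operands the hardware can encode for free: roughly,
// small integers (-16..64) and a handful of FP values (0.0, +/-0.5, +/-1.0,
// +/-2.0, +/-4.0, plus 1/(2*pi) on subtargets that support it).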
3243 | switch (Imm.getBitWidth()) { | |||
3244 | case 1: // This likely will be a condition code mask. | |||
3245 | return true; | |||
3246 | ||||
3247 | case 32: | |||
3248 | return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), | |||
3249 | ST.hasInv2PiInlineImm()); | |||
3250 | case 64: | |||
3251 | return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), | |||
3252 | ST.hasInv2PiInlineImm()); | |||
3253 | case 16: | |||
3254 | return ST.has16BitInsts() && | |||
3255 | AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), | |||
3256 | ST.hasInv2PiInlineImm()); | |||
3257 | default: | |||
3258 | llvm_unreachable("invalid bitwidth"); | |||
3259 | } | |||
3260 | } | |||
3261 | ||||
3262 | bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, | |||
3263 | uint8_t OperandType) const { | |||
3264 | if (!MO.isImm() || | |||
3265 | OperandType < AMDGPU::OPERAND_SRC_FIRST || | |||
3266 | OperandType > AMDGPU::OPERAND_SRC_LAST) | |||
3267 | return false; | |||
3268 | ||||
3269 | // MachineOperand provides no way to tell the true operand size, since it only | |||
3270 | // records a 64-bit value. We need to know the size to determine if a 32-bit | |||
3271 | // floating point immediate bit pattern is legal for an integer immediate. It | |||
3272 | // would be for any 32-bit integer operand, but would not be for a 64-bit one. | |||
3273 | ||||
3274 | int64_t Imm = MO.getImm(); | |||
3275 | switch (OperandType) { | |||
3276 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
3277 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
3278 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
3279 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
3280 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
3281 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { | |||
3282 | int32_t Trunc = static_cast<int32_t>(Imm); | |||
3283 | return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); | |||
3284 | } | |||
3285 | case AMDGPU::OPERAND_REG_IMM_INT64: | |||
3286 | case AMDGPU::OPERAND_REG_IMM_FP64: | |||
3287 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
3288 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
3289 | return AMDGPU::isInlinableLiteral64(MO.getImm(), | |||
3290 | ST.hasInv2PiInlineImm()); | |||
3291 | case AMDGPU::OPERAND_REG_IMM_INT16: | |||
3292 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
3293 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
3294 | // We would expect inline immediates not to be concerned with an integer/fp | |||
3295 | // distinction. However, in the case of 16-bit integer operations, the | |||
3296 | // "floating point" values appear not to work. The hardware seems to read the | |||
3297 | // low 16 bits of the 32-bit immediate encodings, which happens to always | |||
3298 | // work for the integer values. | |||
3299 | // | |||
3300 | // See llvm bugzilla 46302. | |||
3301 | // | |||
3302 | // TODO: Theoretically we could use op-sel to use the high bits of the | |||
3303 | // 32-bit FP values. | |||
3304 | return AMDGPU::isInlinableIntLiteral(Imm); | |||
3305 | case AMDGPU::OPERAND_REG_IMM_V2INT16: | |||
3306 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: | |||
3307 | case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: | |||
3308 | // This suffers the same problem as the scalar 16-bit cases. | |||
3309 | return AMDGPU::isInlinableIntLiteralV216(Imm); | |||
3310 | case AMDGPU::OPERAND_REG_IMM_FP16: | |||
3311 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
3312 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { | |||
3313 | if (isInt<16>(Imm) || isUInt<16>(Imm)) { | |||
3314 | // A few special case instructions have 16-bit operands on subtargets | |||
3315 | // where 16-bit instructions are not legal. | |||
3316 | // TODO: Do the 32-bit immediates work? We shouldn't really need to handle | |||
3317 | // constants in these cases | |||
3318 | int16_t Trunc = static_cast<int16_t>(Imm); | |||
3319 | return ST.has16BitInsts() && | |||
3320 | AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); | |||
3321 | } | |||
3322 | ||||
3323 | return false; | |||
3324 | } | |||
3325 | case AMDGPU::OPERAND_REG_IMM_V2FP16: | |||
3326 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: | |||
3327 | case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { | |||
3328 | uint32_t Trunc = static_cast<uint32_t>(Imm); | |||
3329 | return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); | |||
3330 | } | |||
3331 | default: | |||
3332 | llvm_unreachable("invalid bitwidth"); | |||
3333 | } | |||
3334 | } | |||
3335 | ||||
3336 | bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, | |||
3337 | const MCOperandInfo &OpInfo) const { | |||
3338 | switch (MO.getType()) { | |||
3339 | case MachineOperand::MO_Register: | |||
3340 | return false; | |||
3341 | case MachineOperand::MO_Immediate: | |||
3342 | return !isInlineConstant(MO, OpInfo); | |||
3343 | case MachineOperand::MO_FrameIndex: | |||
3344 | case MachineOperand::MO_MachineBasicBlock: | |||
3345 | case MachineOperand::MO_ExternalSymbol: | |||
3346 | case MachineOperand::MO_GlobalAddress: | |||
3347 | case MachineOperand::MO_MCSymbol: | |||
3348 | return true; | |||
3349 | default: | |||
3350 | llvm_unreachable("unexpected operand type"); | |||
3351 | } | |||
3352 | } | |||
3353 | ||||
3354 | static bool compareMachineOp(const MachineOperand &Op0, | |||
3355 | const MachineOperand &Op1) { | |||
3356 | if (Op0.getType() != Op1.getType()) | |||
3357 | return false; | |||
3358 | ||||
3359 | switch (Op0.getType()) { | |||
3360 | case MachineOperand::MO_Register: | |||
3361 | return Op0.getReg() == Op1.getReg(); | |||
3362 | case MachineOperand::MO_Immediate: | |||
3363 | return Op0.getImm() == Op1.getImm(); | |||
3364 | default: | |||
3365 | llvm_unreachable("Didn't expect to be comparing these operand types"); | |||
3366 | } | |||
3367 | } | |||
3368 | ||||
3369 | bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, | |||
3370 | const MachineOperand &MO) const { | |||
3371 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
3372 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; | |||
3373 | ||||
3374 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); | |||
3375 | ||||
3376 | if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) | |||
3377 | return true; | |||
3378 | ||||
3379 | if (OpInfo.RegClass < 0) | |||
3380 | return false; | |||
3381 | ||||
3382 | if (MO.isImm() && isInlineConstant(MO, OpInfo)) { | |||
3383 | if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && | |||
3384 | OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3385 | AMDGPU::OpName::src2)) | |||
3386 | return false; | |||
3387 | return RI.opCanUseInlineConstant(OpInfo.OperandType); | |||
3388 | } | |||
3389 | ||||
3390 | if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) | |||
3391 | return false; | |||
3392 | ||||
3393 | if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) | |||
3394 | return true; | |||
3395 | ||||
3396 | return ST.hasVOP3Literal(); | |||
3397 | } | |||
3398 | ||||
3399 | bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { | |||
3400 | int Op32 = AMDGPU::getVOPe32(Opcode); | |||
3401 | if (Op32 == -1) | |||
3402 | return false; | |||
3403 | ||||
3404 | return pseudoToMCOpcode(Op32) != -1; | |||
3405 | } | |||
3406 | ||||
3407 | bool SIInstrInfo::hasModifiers(unsigned Opcode) const { | |||
3408 | // The src0_modifier operand is present on all instructions | |||
3409 | // that have modifiers. | |||
3410 | ||||
3411 | return AMDGPU::getNamedOperandIdx(Opcode, | |||
3412 | AMDGPU::OpName::src0_modifiers) != -1; | |||
3413 | } | |||
3414 | ||||
3415 | bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, | |||
3416 | unsigned OpName) const { | |||
3417 | const MachineOperand *Mods = getNamedOperand(MI, OpName); | |||
3418 | return Mods && Mods->getImm(); | |||
3419 | } | |||
3420 | ||||
3421 | bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { | |||
3422 | return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || | |||
3423 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || | |||
3424 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || | |||
3425 | hasModifiersSet(MI, AMDGPU::OpName::clamp) || | |||
3426 | hasModifiersSet(MI, AMDGPU::OpName::omod); | |||
3427 | } | |||
3428 | ||||
3429 | bool SIInstrInfo::canShrink(const MachineInstr &MI, | |||
3430 | const MachineRegisterInfo &MRI) const { | |||
3431 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3432 | // Can't shrink instruction with three operands. | |||
3433 | // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add | |||
3434 | // a special case for it. It can only be shrunk if the third operand | |||
3435 | // is vcc, and src0_modifiers and src1_modifiers are not set. | |||
3436 | // We should handle this the same way we handle vopc, by adding | |||
3437 | // a register allocation hint pre-regalloc and then do the shrinking | |||
3438 | // post-regalloc. | |||
3439 | if (Src2) { | |||
3440 | switch (MI.getOpcode()) { | |||
3441 | default: return false; | |||
3442 | ||||
3443 | case AMDGPU::V_ADDC_U32_e64: | |||
3444 | case AMDGPU::V_SUBB_U32_e64: | |||
3445 | case AMDGPU::V_SUBBREV_U32_e64: { | |||
3446 | const MachineOperand *Src1 | |||
3447 | = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3448 | if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) | |||
3449 | return false; | |||
3450 | // Additional verification is needed for sdst/src2. | |||
3451 | return true; | |||
3452 | } | |||
3453 | case AMDGPU::V_MAC_F32_e64: | |||
3454 | case AMDGPU::V_MAC_F16_e64: | |||
3455 | case AMDGPU::V_FMAC_F32_e64: | |||
3456 | case AMDGPU::V_FMAC_F16_e64: | |||
3457 | if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || | |||
3458 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) | |||
3459 | return false; | |||
3460 | break; | |||
3461 | ||||
3462 | case AMDGPU::V_CNDMASK_B32_e64: | |||
3463 | break; | |||
3464 | } | |||
3465 | } | |||
3466 | ||||
3467 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3468 | if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || | |||
3469 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) | |||
3470 | return false; | |||
3471 | ||||
3472 | // We don't need to check src0; all input types are legal, so just make sure | |||
3473 | // src0 isn't using any modifiers. | |||
3474 | if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) | |||
3475 | return false; | |||
3476 | ||||
3477 | // Can it be shrunk to a valid 32 bit opcode? | |||
3478 | if (!hasVALU32BitEncoding(MI.getOpcode())) | |||
3479 | return false; | |||
3480 | ||||
3481 | // Check output modifiers | |||
3482 | return !hasModifiersSet(MI, AMDGPU::OpName::omod) && | |||
3483 | !hasModifiersSet(MI, AMDGPU::OpName::clamp); | |||
3484 | } | |||
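| // A minimal usage sketch, assuming a caller that already holds TII and MRI | |||
| // (similar in spirit to the shrink-instructions pass): | |||
| //   if (TII->canShrink(MI, MRI)) { | |||
| //     int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); | |||
| //     MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32); | |||
| //     (void)Inst32; // the original MI is left in place for the caller to erase | |||
| //   } | |||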
3485 | ||||
3486 | // Set VCC operand with all flags from \p Orig, except for setting it as | |||
3487 | // implicit. | |||
3488 | static void copyFlagsToImplicitVCC(MachineInstr &MI, | |||
3489 | const MachineOperand &Orig) { | |||
3490 | ||||
3491 | for (MachineOperand &Use : MI.implicit_operands()) { | |||
3492 | if (Use.isUse() && | |||
3493 | (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { | |||
3494 | Use.setIsUndef(Orig.isUndef()); | |||
3495 | Use.setIsKill(Orig.isKill()); | |||
3496 | return; | |||
3497 | } | |||
3498 | } | |||
3499 | } | |||
3500 | ||||
3501 | MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, | |||
3502 | unsigned Op32) const { | |||
3503 | MachineBasicBlock *MBB = MI.getParent(); | |||
3504 | MachineInstrBuilder Inst32 = | |||
3505 | BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) | |||
3506 | .setMIFlags(MI.getFlags()); | |||
3507 | ||||
3508 | // Add the dst operand if the 32-bit encoding also has an explicit $vdst. | |||
3509 | // For VOPC instructions, this is replaced by an implicit def of vcc. | |||
3510 | int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); | |||
3511 | if (Op32DstIdx != -1) { | |||
3512 | // dst | |||
3513 | Inst32.add(MI.getOperand(0)); | |||
3514 | } else { | |||
3515 | assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || | |||
3516 | (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && | |||
3517 | "Unexpected case"); | |||
3518 | } | |||
3519 | ||||
3520 | Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); | |||
3521 | ||||
3522 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3523 | if (Src1) | |||
3524 | Inst32.add(*Src1); | |||
3525 | ||||
3526 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3527 | ||||
3528 | if (Src2) { | |||
3529 | int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); | |||
3530 | if (Op32Src2Idx != -1) { | |||
3531 | Inst32.add(*Src2); | |||
3532 | } else { | |||
3533 | // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is | |||
3534 | // replaced with an implicit read of vcc or vcc_lo. The implicit read | |||
3535 | // of vcc was already added during the initial BuildMI, but we | |||
3536 | // 1) may need to change vcc to vcc_lo to preserve the original register | |||
3537 | // 2) have to preserve the original flags. | |||
3538 | fixImplicitOperands(*Inst32); | |||
3539 | copyFlagsToImplicitVCC(*Inst32, *Src2); | |||
3540 | } | |||
3541 | } | |||
3542 | ||||
3543 | return Inst32; | |||
3544 | } | |||
3545 | ||||
3546 | bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, | |||
3547 | const MachineOperand &MO, | |||
3548 | const MCOperandInfo &OpInfo) const { | |||
3549 | // Literal constants use the constant bus. | |||
3550 | //if (isLiteralConstantLike(MO, OpInfo)) | |||
3551 | // return true; | |||
3552 | if (MO.isImm()) | |||
3553 | return !isInlineConstant(MO, OpInfo); | |||
3554 | ||||
3555 | if (!MO.isReg()) | |||
3556 | return true; // Misc other operands like FrameIndex | |||
3557 | ||||
3558 | if (!MO.isUse()) | |||
3559 | return false; | |||
3560 | ||||
3561 | if (MO.getReg().isVirtual()) | |||
3562 | return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); | |||
3563 | ||||
3564 | // Null is free | |||
3565 | if (MO.getReg() == AMDGPU::SGPR_NULL) | |||
3566 | return false; | |||
3567 | ||||
3568 | // SGPRs use the constant bus | |||
3569 | if (MO.isImplicit()) { | |||
3570 | return MO.getReg() == AMDGPU::M0 || | |||
3571 | MO.getReg() == AMDGPU::VCC || | |||
3572 | MO.getReg() == AMDGPU::VCC_LO; | |||
3573 | } else { | |||
3574 | return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || | |||
3575 | AMDGPU::SReg_64RegClass.contains(MO.getReg()); | |||
3576 | } | |||
3577 | } | |||
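| // Intuition: scalar registers, literals and a few special registers (M0, VCC) | |||
| // all reach the VALU over the single scalar "constant" bus, so e.g. | |||
| // "V_ADD_F32_e64 v0, s0, s1" needs two distinct bus reads and is illegal | |||
| // before GFX10 (limit 1, see ST.getConstantBusLimit()), while the same add | |||
| // with one of its sources in a VGPR is fine. | |||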
3578 | ||||
3579 | static Register findImplicitSGPRRead(const MachineInstr &MI) { | |||
3580 | for (const MachineOperand &MO : MI.implicit_operands()) { | |||
3581 | // We only care about reads. | |||
3582 | if (MO.isDef()) | |||
3583 | continue; | |||
3584 | ||||
3585 | switch (MO.getReg()) { | |||
3586 | case AMDGPU::VCC: | |||
3587 | case AMDGPU::VCC_LO: | |||
3588 | case AMDGPU::VCC_HI: | |||
3589 | case AMDGPU::M0: | |||
3590 | case AMDGPU::FLAT_SCR: | |||
3591 | return MO.getReg(); | |||
3592 | ||||
3593 | default: | |||
3594 | break; | |||
3595 | } | |||
3596 | } | |||
3597 | ||||
3598 | return AMDGPU::NoRegister; | |||
3599 | } | |||
3600 | ||||
3601 | static bool shouldReadExec(const MachineInstr &MI) { | |||
3602 | if (SIInstrInfo::isVALU(MI)) { | |||
3603 | switch (MI.getOpcode()) { | |||
3604 | case AMDGPU::V_READLANE_B32: | |||
3605 | case AMDGPU::V_WRITELANE_B32: | |||
3606 | return false; | |||
3607 | } | |||
3608 | ||||
3609 | return true; | |||
3610 | } | |||
3611 | ||||
3612 | if (MI.isPreISelOpcode() || | |||
3613 | SIInstrInfo::isGenericOpcode(MI.getOpcode()) || | |||
3614 | SIInstrInfo::isSALU(MI) || | |||
3615 | SIInstrInfo::isSMRD(MI)) | |||
3616 | return false; | |||
3617 | ||||
3618 | return true; | |||
3619 | } | |||
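| // V_READLANE/V_WRITELANE move data for one explicitly selected lane and | |||
| // execute regardless of the EXEC mask, which is why they are excluded above; | |||
| // SALU and SMRD instructions never read EXEC in the first place. | |||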
3620 | ||||
3621 | static bool isSubRegOf(const SIRegisterInfo &TRI, | |||
3622 | const MachineOperand &SuperVec, | |||
3623 | const MachineOperand &SubReg) { | |||
3624 | if (SubReg.getReg().isPhysical()) | |||
3625 | return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); | |||
3626 | ||||
3627 | return SubReg.getSubReg() != AMDGPU::NoSubRegister && | |||
3628 | SubReg.getReg() == SuperVec.getReg(); | |||
3629 | } | |||
3630 | ||||
3631 | bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, | |||
3632 | StringRef &ErrInfo) const { | |||
3633 | uint16_t Opcode = MI.getOpcode(); | |||
3634 | if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) | |||
3635 | return true; | |||
3636 | ||||
3637 | const MachineFunction *MF = MI.getParent()->getParent(); | |||
3638 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
3639 | ||||
3640 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); | |||
3641 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); | |||
3642 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); | |||
3643 | ||||
3644 | // Make sure the number of operands is correct. | |||
3645 | const MCInstrDesc &Desc = get(Opcode); | |||
3646 | if (!Desc.isVariadic() && | |||
3647 | Desc.getNumOperands() != MI.getNumExplicitOperands()) { | |||
3648 | ErrInfo = "Instruction has wrong number of operands."; | |||
3649 | return false; | |||
3650 | } | |||
3651 | ||||
3652 | if (MI.isInlineAsm()) { | |||
3653 | // Verify register classes for inlineasm constraints. | |||
3654 | for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); | |||
3655 | I != E; ++I) { | |||
3656 | const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); | |||
3657 | if (!RC) | |||
3658 | continue; | |||
3659 | ||||
3660 | const MachineOperand &Op = MI.getOperand(I); | |||
3661 | if (!Op.isReg()) | |||
3662 | continue; | |||
3663 | ||||
3664 | Register Reg = Op.getReg(); | |||
3665 | if (!Reg.isVirtual() && !RC->contains(Reg)) { | |||
3666 | ErrInfo = "inlineasm operand has incorrect register class."; | |||
3667 | return false; | |||
3668 | } | |||
3669 | } | |||
3670 | ||||
3671 | return true; | |||
3672 | } | |||
3673 | ||||
3674 | if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { | |||
3675 | ErrInfo = "missing memory operand from MIMG instruction."; | |||
3676 | return false; | |||
3677 | } | |||
3678 | ||||
3679 | // Make sure the register classes are correct. | |||
3680 | for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { | |||
3681 | if (MI.getOperand(i).isFPImm()) { | |||
3682 | ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " | |||
3683 | "all fp values to integers."; | |||
3684 | return false; | |||
3685 | } | |||
3686 | ||||
3687 | int RegClass = Desc.OpInfo[i].RegClass; | |||
3688 | ||||
3689 | switch (Desc.OpInfo[i].OperandType) { | |||
3690 | case MCOI::OPERAND_REGISTER: | |||
3691 | if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { | |||
3692 | ErrInfo = "Illegal immediate value for operand."; | |||
3693 | return false; | |||
3694 | } | |||
3695 | break; | |||
3696 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
3697 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
3698 | break; | |||
3699 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
3700 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
3701 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
3702 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
3703 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
3704 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
3705 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
3706 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: | |||
3707 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
3708 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { | |||
3709 | const MachineOperand &MO = MI.getOperand(i); | |||
3710 | if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { | |||
3711 | ErrInfo = "Illegal immediate value for operand."; | |||
3712 | return false; | |||
3713 | } | |||
3714 | break; | |||
3715 | } | |||
3716 | case MCOI::OPERAND_IMMEDIATE: | |||
3717 | case AMDGPU::OPERAND_KIMM32: | |||
3718 | // Check if this operand is an immediate. | |||
3719 | // FrameIndex operands will be replaced by immediates, so they are | |||
3720 | // allowed. | |||
3721 | if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { | |||
3722 | ErrInfo = "Expected immediate, but got non-immediate"; | |||
3723 | return false; | |||
3724 | } | |||
3725 | LLVM_FALLTHROUGH; | |||
3726 | default: | |||
3727 | continue; | |||
3728 | } | |||
3729 | ||||
3730 | if (!MI.getOperand(i).isReg()) | |||
3731 | continue; | |||
3732 | ||||
3733 | if (RegClass != -1) { | |||
3734 | Register Reg = MI.getOperand(i).getReg(); | |||
3735 | if (Reg == AMDGPU::NoRegister || Reg.isVirtual()) | |||
3736 | continue; | |||
3737 | ||||
3738 | const TargetRegisterClass *RC = RI.getRegClass(RegClass); | |||
3739 | if (!RC->contains(Reg)) { | |||
3740 | ErrInfo = "Operand has incorrect register class."; | |||
3741 | return false; | |||
3742 | } | |||
3743 | } | |||
3744 | } | |||
3745 | ||||
3746 | // Verify SDWA | |||
3747 | if (isSDWA(MI)) { | |||
3748 | if (!ST.hasSDWA()) { | |||
3749 | ErrInfo = "SDWA is not supported on this target"; | |||
3750 | return false; | |||
3751 | } | |||
3752 | ||||
3753 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
3754 | ||||
3755 | const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; | |||
3756 | ||||
3757 | for (int OpIdx : OpIndices) { | |||
3758 | if (OpIdx == -1) | |||
3759 | continue; | |||
3760 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
3761 | ||||
3762 | if (!ST.hasSDWAScalar()) { | |||
3763 | // Only VGPRs on VI | |||
3764 | if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { | |||
3765 | ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; | |||
3766 | return false; | |||
3767 | } | |||
3768 | } else { | |||
3769 | // No immediates on GFX9 | |||
3770 | if (!MO.isReg()) { | |||
3771 | ErrInfo = | |||
3772 | "Only reg allowed as operands in SDWA instructions on GFX9+"; | |||
3773 | return false; | |||
3774 | } | |||
3775 | } | |||
3776 | } | |||
3777 | ||||
3778 | if (!ST.hasSDWAOmod()) { | |||
3779 | // No omod allowed on VI | |||
3780 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3781 | if (OMod != nullptr && | |||
3782 | (!OMod->isImm() || OMod->getImm() != 0)) { | |||
3783 | ErrInfo = "OMod not allowed in SDWA instructions on VI"; | |||
3784 | return false; | |||
3785 | } | |||
3786 | } | |||
3787 | ||||
3788 | uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); | |||
3789 | if (isVOPC(BasicOpcode)) { | |||
3790 | if (!ST.hasSDWASdst() && DstIdx != -1) { | |||
3791 | // Only vcc allowed as dst on VI for VOPC | |||
3792 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
3793 | if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { | |||
3794 | ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; | |||
3795 | return false; | |||
3796 | } | |||
3797 | } else if (!ST.hasSDWAOutModsVOPC()) { | |||
3798 | // No clamp allowed on GFX9 for VOPC | |||
3799 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
3800 | if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { | |||
3801 | ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; | |||
3802 | return false; | |||
3803 | } | |||
3804 | ||||
3805 | // No omod allowed on GFX9 for VOPC | |||
3806 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3807 | if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { | |||
3808 | ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; | |||
3809 | return false; | |||
3810 | } | |||
3811 | } | |||
3812 | } | |||
3813 | ||||
3814 | const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
3815 | if (DstUnused && DstUnused->isImm() && | |||
3816 | DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { | |||
3817 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
3818 | if (!Dst.isReg() || !Dst.isTied()) { | |||
3819 | ErrInfo = "Dst register should have tied register"; | |||
3820 | return false; | |||
3821 | } | |||
3822 | ||||
3823 | const MachineOperand &TiedMO = | |||
3824 | MI.getOperand(MI.findTiedOperandIdx(DstIdx)); | |||
3825 | if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { | |||
3826 | ErrInfo = | |||
3827 | "Dst register should be tied to implicit use of preserved register"; | |||
3828 | return false; | |||
3829 | } else if (TiedMO.getReg().isPhysical() && | |||
3830 | Dst.getReg() != TiedMO.getReg()) { | |||
3831 | ErrInfo = "Dst register should use same physical register as preserved"; | |||
3832 | return false; | |||
3833 | } | |||
3834 | } | |||
3835 | } | |||
3836 | ||||
3837 | // Verify MIMG | |||
3838 | if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { | |||
3839 | // Ensure that the return type used is large enough for all the options | |||
3840 | // being used. TFE/LWE require an extra result register. | |||
3841 | const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); | |||
3842 | if (DMask) { | |||
3843 | uint64_t DMaskImm = DMask->getImm(); | |||
3844 | uint32_t RegCount = | |||
3845 | isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); | |||
3846 | const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); | |||
3847 | const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); | |||
3848 | const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); | |||
3849 | ||||
3850 | // Adjust for packed 16 bit values | |||
3851 | if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) | |||
3852 | RegCount >>= 1; | |||
3853 | ||||
3854 | // Adjust if using LWE or TFE | |||
3855 | if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) | |||
3856 | RegCount += 1; | |||
3857 | ||||
3858 | const uint32_t DstIdx = | |||
3859 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); | |||
3860 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
3861 | if (Dst.isReg()) { | |||
3862 | const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); | |||
3863 | uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; | |||
3864 | if (RegCount > DstSize) { | |||
3865 | ErrInfo = "MIMG instruction returns too many registers for dst " | |||
3866 | "register class"; | |||
3867 | return false; | |||
3868 | } | |||
3869 | } | |||
3870 | } | |||
3871 | } | |||
3872 | ||||
3873 | // Verify VOP*. Ignore multiple sgpr operands on writelane. | |||
3874 | if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 | |||
3875 | && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { | |||
3876 | // Only look at the true operands. Only a real operand can use the constant | |||
3877 | // bus, and we don't want to check pseudo-operands like the source modifier | |||
3878 | // flags. | |||
3879 | const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; | |||
3880 | ||||
3881 | unsigned ConstantBusCount = 0; | |||
3882 | unsigned LiteralCount = 0; | |||
3883 | ||||
3884 | if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) | |||
3885 | ++ConstantBusCount; | |||
3886 | ||||
3887 | SmallVector<Register, 2> SGPRsUsed; | |||
3888 | Register SGPRUsed; | |||
3889 | ||||
3890 | for (int OpIdx : OpIndices) { | |||
3891 | if (OpIdx == -1) | |||
3892 | break; | |||
3893 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
3894 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { | |||
3895 | if (MO.isReg()) { | |||
3896 | SGPRUsed = MO.getReg(); | |||
3897 | if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) { | |||
3898 | return SGPRUsed != SGPR; | |||
3899 | })) { | |||
3900 | ++ConstantBusCount; | |||
3901 | SGPRsUsed.push_back(SGPRUsed); | |||
3902 | } | |||
3903 | } else { | |||
3904 | ++ConstantBusCount; | |||
3905 | ++LiteralCount; | |||
3906 | } | |||
3907 | } | |||
3908 | } | |||
3909 | ||||
3910 | SGPRUsed = findImplicitSGPRRead(MI); | |||
3911 | if (SGPRUsed != AMDGPU::NoRegister) { | |||
3912 | // Implicit uses may safely overlap true operands | |||
3913 | if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { | |||
3914 | return !RI.regsOverlap(SGPRUsed, SGPR); | |||
3915 | })) { | |||
3916 | ++ConstantBusCount; | |||
3917 | SGPRsUsed.push_back(SGPRUsed); | |||
3918 | } | |||
3919 | } | |||
3920 | ||||
3921 | // v_writelane_b32 is an exception from constant bus restriction: | |||
3922 | // vsrc0 can be an sgpr, const or m0, and the lane select an sgpr, m0 or inline-const | |||
3923 | if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && | |||
3924 | Opcode != AMDGPU::V_WRITELANE_B32) { | |||
3925 | ErrInfo = "VOP* instruction violates constant bus restriction"; | |||
3926 | return false; | |||
3927 | } | |||
3928 | ||||
3929 | if (isVOP3(MI) && LiteralCount) { | |||
3930 | if (!ST.hasVOP3Literal()) { | |||
3931 | ErrInfo = "VOP3 instruction uses literal"; | |||
3932 | return false; | |||
3933 | } | |||
3934 | if (LiteralCount > 1) { | |||
3935 | ErrInfo = "VOP3 instruction uses more than one literal"; | |||
3936 | return false; | |||
3937 | } | |||
3938 | } | |||
3939 | } | |||
3940 | ||||
3941 | // Special case for writelane - this can break the multiple constant bus rule, | |||
3942 | // but still can't use more than one SGPR register | |||
3943 | if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { | |||
3944 | unsigned SGPRCount = 0; | |||
3945 | Register SGPRUsed = AMDGPU::NoRegister; | |||
3946 | ||||
3947 | for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { | |||
3948 | if (OpIdx == -1) | |||
3949 | break; | |||
3950 | ||||
3951 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
3952 | ||||
3953 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { | |||
3954 | if (MO.isReg() && MO.getReg() != AMDGPU::M0) { | |||
3955 | if (MO.getReg() != SGPRUsed) | |||
3956 | ++SGPRCount; | |||
3957 | SGPRUsed = MO.getReg(); | |||
3958 | } | |||
3959 | } | |||
3960 | if (SGPRCount > ST.getConstantBusLimit(Opcode)) { | |||
3961 | ErrInfo = "WRITELANE instruction violates constant bus restriction"; | |||
3962 | return false; | |||
3963 | } | |||
3964 | } | |||
3965 | } | |||
3966 | ||||
3967 | // Verify misc. restrictions on specific instructions. | |||
3968 | if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || | |||
3969 | Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { | |||
3970 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
3971 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
3972 | const MachineOperand &Src2 = MI.getOperand(Src2Idx); | |||
3973 | if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { | |||
3974 | if (!compareMachineOp(Src0, Src1) && | |||
3975 | !compareMachineOp(Src0, Src2)) { | |||
3976 | ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; | |||
3977 | return false; | |||
3978 | } | |||
3979 | } | |||
3980 | if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & | |||
3981 | SISrcMods::ABS) || | |||
3982 | (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & | |||
3983 | SISrcMods::ABS) || | |||
3984 | (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & | |||
3985 | SISrcMods::ABS)) { | |||
3986 | ErrInfo = "ABS not allowed in VOP3B instructions"; | |||
3987 | return false; | |||
3988 | } | |||
3989 | } | |||
3990 | ||||
3991 | if (isSOP2(MI) || isSOPC(MI)) { | |||
3992 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
3993 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
3994 | unsigned Immediates = 0; | |||
3995 | ||||
3996 | if (!Src0.isReg() && | |||
3997 | !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) | |||
3998 | Immediates++; | |||
3999 | if (!Src1.isReg() && | |||
4000 | !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) | |||
4001 | Immediates++; | |||
4002 | ||||
4003 | if (Immediates > 1) { | |||
4004 | ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; | |||
4005 | return false; | |||
4006 | } | |||
4007 | } | |||
4008 | ||||
4009 | if (isSOPK(MI)) { | |||
4010 | auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); | |||
4011 | if (Desc.isBranch()) { | |||
4012 | if (!Op->isMBB()) { | |||
4013 | ErrInfo = "invalid branch target for SOPK instruction"; | |||
4014 | return false; | |||
4015 | } | |||
4016 | } else { | |||
4017 | uint64_t Imm = Op->getImm(); | |||
4018 | if (sopkIsZext(MI)) { | |||
4019 | if (!isUInt<16>(Imm)) { | |||
4020 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4021 | return false; | |||
4022 | } | |||
4023 | } else { | |||
4024 | if (!isInt<16>(Imm)) { | |||
4025 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4026 | return false; | |||
4027 | } | |||
4028 | } | |||
4029 | } | |||
4030 | } | |||
4031 | ||||
4032 | if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || | |||
4033 | Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || | |||
4034 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4035 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { | |||
4036 | const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4037 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; | |||
4038 | ||||
4039 | const unsigned StaticNumOps = Desc.getNumOperands() + | |||
4040 | Desc.getNumImplicitUses(); | |||
4041 | const unsigned NumImplicitOps = IsDst ? 2 : 1; | |||
4042 | ||||
4043 | // Allow additional implicit operands. This allows a fixup done by the post | |||
4044 | // RA scheduler where the main implicit operand is killed and implicit-defs | |||
4045 | // are added for sub-registers that remain live after this instruction. | |||
4046 | if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { | |||
4047 | ErrInfo = "missing implicit register operands"; | |||
4048 | return false; | |||
4049 | } | |||
4050 | ||||
4051 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4052 | if (IsDst) { | |||
4053 | if (!Dst->isUse()) { | |||
4054 | ErrInfo = "v_movreld_b32 vdst should be a use operand"; | |||
4055 | return false; | |||
4056 | } | |||
4057 | ||||
4058 | unsigned UseOpIdx; | |||
4059 | if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || | |||
4060 | UseOpIdx != StaticNumOps + 1) { | |||
4061 | ErrInfo = "movrel implicit operands should be tied"; | |||
4062 | return false; | |||
4063 | } | |||
4064 | } | |||
4065 | ||||
4066 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4067 | const MachineOperand &ImpUse | |||
4068 | = MI.getOperand(StaticNumOps + NumImplicitOps - 1); | |||
4069 | if (!ImpUse.isReg() || !ImpUse.isUse() || | |||
4070 | !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { | |||
4071 | ErrInfo = "src0 should be subreg of implicit vector use"; | |||
4072 | return false; | |||
4073 | } | |||
4074 | } | |||
4075 | ||||
4076 | // Make sure we aren't losing exec uses in the td files. This mostly requires | |||
4077 | // being careful when using let Uses to try to add other use registers. | |||
4078 | if (shouldReadExec(MI)) { | |||
4079 | if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { | |||
4080 | ErrInfo = "VALU instruction does not implicitly read exec mask"; | |||
4081 | return false; | |||
4082 | } | |||
4083 | } | |||
4084 | ||||
4085 | if (isSMRD(MI)) { | |||
4086 | if (MI.mayStore()) { | |||
4087 | // The register offset form of scalar stores may only use m0 as the | |||
4088 | // soffset register. | |||
4089 | const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); | |||
4090 | if (Soff && Soff->getReg() != AMDGPU::M0) { | |||
4091 | ErrInfo = "scalar stores must use m0 as offset register"; | |||
4092 | return false; | |||
4093 | } | |||
4094 | } | |||
4095 | } | |||
4096 | ||||
4097 | if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { | |||
4098 | const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
4099 | if (Offset->getImm() != 0) { | |||
4100 | ErrInfo = "subtarget does not support offsets in flat instructions"; | |||
4101 | return false; | |||
4102 | } | |||
4103 | } | |||
4104 | ||||
4105 | if (isMIMG(MI)) { | |||
4106 | const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); | |||
4107 | if (DimOp) { | |||
4108 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, | |||
4109 | AMDGPU::OpName::vaddr0); | |||
4110 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); | |||
4111 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); | |||
4112 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | |||
4113 | AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); | |||
4114 | const AMDGPU::MIMGDimInfo *Dim = | |||
4115 | AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); | |||
4116 | ||||
4117 | if (!Dim) { | |||
4118 | ErrInfo = "dim is out of range"; | |||
4119 | return false; | |||
4120 | } | |||
4121 | ||||
4122 | bool IsA16 = false; | |||
4123 | if (ST.hasR128A16()) { | |||
4124 | const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); | |||
4125 | IsA16 = R128A16->getImm() != 0; | |||
4126 | } else if (ST.hasGFX10A16()) { | |||
4127 | const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); | |||
4128 | IsA16 = A16->getImm() != 0; | |||
4129 | } | |||
4130 | ||||
4131 | bool PackDerivatives = IsA16 || BaseOpcode->G16; | |||
4132 | bool IsNSA = SRsrcIdx - VAddr0Idx > 1; | |||
4133 | ||||
4134 | unsigned AddrWords = BaseOpcode->NumExtraArgs; | |||
4135 | unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + | |||
4136 | (BaseOpcode->LodOrClampOrMip ? 1 : 0); | |||
4137 | if (IsA16) | |||
4138 | AddrWords += (AddrComponents + 1) / 2; | |||
4139 | else | |||
4140 | AddrWords += AddrComponents; | |||
4141 | ||||
4142 | if (BaseOpcode->Gradients) { | |||
4143 | if (PackDerivatives) | |||
4144 | // There are two gradients per coordinate, we pack them separately. | |||
4145 | // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) | |||
4146 | AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; | |||
4147 | else | |||
4148 | AddrWords += Dim->NumGradients; | |||
4149 | } | |||
4150 | ||||
4151 | unsigned VAddrWords; | |||
4152 | if (IsNSA) { | |||
4153 | VAddrWords = SRsrcIdx - VAddr0Idx; | |||
4154 | } else { | |||
4155 | const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); | |||
4156 | VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; | |||
4157 | if (AddrWords > 8) | |||
4158 | AddrWords = 16; | |||
4159 | else if (AddrWords > 4) | |||
4160 | AddrWords = 8; | |||
4161 | else if (AddrWords == 4) | |||
4162 | AddrWords = 4; | |||
4163 | else if (AddrWords == 3) | |||
4164 | AddrWords = 3; | |||
4165 | } | |||
4166 | ||||
4167 | if (VAddrWords != AddrWords) { | |||
4168 | LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords | |||
4169 | << " but got " << VAddrWords << "\n"); | |||
4170 | ErrInfo = "bad vaddr size"; | |||
4171 | return false; | |||
4172 | } | |||
4173 | } | |||
4174 | } | |||
4175 | ||||
4176 | const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); | |||
4177 | if (DppCt) { | |||
4178 | using namespace AMDGPU::DPP; | |||
4179 | ||||
4180 | unsigned DC = DppCt->getImm(); | |||
4181 | if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || | |||
4182 | DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || | |||
4183 | (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || | |||
4184 | (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || | |||
4185 | (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || | |||
4186 | (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || | |||
4187 | (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { | |||
4188 | ErrInfo = "Invalid dpp_ctrl value"; | |||
4189 | return false; | |||
4190 | } | |||
4191 | if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && | |||
4192 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4193 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4194 | "wavefront shifts are not supported on GFX10+"; | |||
4195 | return false; | |||
4196 | } | |||
4197 | if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && | |||
4198 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4199 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4200 | "broadcasts are not supported on GFX10+"; | |||
4201 | return false; | |||
4202 | } | |||
4203 | if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && | |||
4204 | ST.getGeneration() < AMDGPUSubtarget::GFX10) { | |||
4205 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4206 | "row_share and row_xmask are not supported before GFX10"; | |||
4207 | return false; | |||
4208 | } | |||
4209 | } | |||
4210 | ||||
4211 | return true; | |||
4212 | } | |||
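| // This hook is driven by the machine verifier (e.g. under | |||
| // -verify-machineinstrs); when it returns false, the ErrInfo string set above | |||
| // is what gets reported alongside the offending instruction. | |||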
4213 | ||||
4214 | unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { | |||
4215 | switch (MI.getOpcode()) { | |||
4216 | default: return AMDGPU::INSTRUCTION_LIST_END; | |||
4217 | case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; | |||
4218 | case AMDGPU::COPY: return AMDGPU::COPY; | |||
4219 | case AMDGPU::PHI: return AMDGPU::PHI; | |||
4220 | case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; | |||
4221 | case AMDGPU::WQM: return AMDGPU::WQM; | |||
4222 | case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; | |||
4223 | case AMDGPU::WWM: return AMDGPU::WWM; | |||
4224 | case AMDGPU::S_MOV_B32: { | |||
4225 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4226 | return MI.getOperand(1).isReg() || | |||
4227 | RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? | |||
4228 | AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; | |||
4229 | } | |||
4230 | case AMDGPU::S_ADD_I32: | |||
4231 | return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; | |||
4232 | case AMDGPU::S_ADDC_U32: | |||
4233 | return AMDGPU::V_ADDC_U32_e32; | |||
4234 | case AMDGPU::S_SUB_I32: | |||
4235 | return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; | |||
4236 | // FIXME: These are not consistently handled, and selected when the carry is | |||
4237 | // used. | |||
4238 | case AMDGPU::S_ADD_U32: | |||
4239 | return AMDGPU::V_ADD_CO_U32_e32; | |||
4240 | case AMDGPU::S_SUB_U32: | |||
4241 | return AMDGPU::V_SUB_CO_U32_e32; | |||
4242 | case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; | |||
4243 | case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; | |||
4244 | case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; | |||
4245 | case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; | |||
4246 | case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; | |||
4247 | case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; | |||
4248 | case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; | |||
4249 | case AMDGPU::S_XNOR_B32: | |||
4250 | return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; | |||
4251 | case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; | |||
4252 | case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; | |||
4253 | case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; | |||
4254 | case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; | |||
4255 | case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; | |||
4256 | case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; | |||
4257 | case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; | |||
4258 | case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; | |||
4259 | case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; | |||
4260 | case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; | |||
4261 | case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; | |||
4262 | case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; | |||
4263 | case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; | |||
4264 | case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; | |||
4265 | case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; | |||
4266 | case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; | |||
4267 | case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; | |||
4268 | case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; | |||
4269 | case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; | |||
4270 | case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; | |||
4271 | case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; | |||
4272 | case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; | |||
4273 | case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; | |||
4274 | case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; | |||
4275 | case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; | |||
4276 | case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; | |||
4277 | case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; | |||
4278 | case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; | |||
4279 | case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; | |||
4280 | case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; | |||
4281 | case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; | |||
4282 | case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; | |||
4283 | case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; | |||
4284 | case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; | |||
4285 | case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; | |||
4286 | case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; | |||
4287 | case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; | |||
4288 | case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; | |||
4289 | } | |||
4290 | llvm_unreachable( | |||
4291 | "Unexpected scalar opcode without corresponding vector one!"); | |||
4292 | } | |||
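| // Callers treat INSTRUCTION_LIST_END as "no single VALU equivalent"; such | |||
| // opcodes are either rejected or lowered specially (for example the 64-bit | |||
| // scalar bit operations are split into two 32-bit VALU halves) rather than | |||
| // replaced 1:1. | |||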
4293 | ||||
4294 | const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, | |||
4295 | unsigned OpNo) const { | |||
4296 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4297 | const MCInstrDesc &Desc = get(MI.getOpcode()); | |||
4298 | if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || | |||
4299 | Desc.OpInfo[OpNo].RegClass == -1) { | |||
4300 | Register Reg = MI.getOperand(OpNo).getReg(); | |||
4301 | ||||
4302 | if (Reg.isVirtual()) | |||
4303 | return MRI.getRegClass(Reg); | |||
4304 | return RI.getPhysRegClass(Reg); | |||
4305 | } | |||
4306 | ||||
4307 | unsigned RCID = Desc.OpInfo[OpNo].RegClass; | |||
4308 | return RI.getRegClass(RCID); | |||
4309 | } | |||
4310 | ||||
4311 | void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { | |||
4312 | MachineBasicBlock::iterator I = MI; | |||
4313 | MachineBasicBlock *MBB = MI.getParent(); | |||
4314 | MachineOperand &MO = MI.getOperand(OpIdx); | |||
4315 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
4316 | unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; | |||
4317 | const TargetRegisterClass *RC = RI.getRegClass(RCID); | |||
4318 | unsigned Size = RI.getRegSizeInBits(*RC); | |||
4319 | unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; | |||
4320 | if (MO.isReg()) | |||
4321 | Opcode = AMDGPU::COPY; | |||
4322 | else if (RI.isSGPRClass(RC)) | |||
4323 | Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; | |||
4324 | ||||
4325 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); | |||
4326 | if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) | |||
4327 | VRC = &AMDGPU::VReg_64RegClass; | |||
4328 | else | |||
4329 | VRC = &AMDGPU::VGPR_32RegClass; | |||
4330 | ||||
4331 | Register Reg = MRI.createVirtualRegister(VRC); | |||
4332 | DebugLoc DL = MBB->findDebugLoc(I); | |||
4333 | BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); | |||
4334 | MO.ChangeToRegister(Reg, false); | |||
4335 | } | |||
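| // Net effect (sketch): the offending operand is materialized into a fresh | |||
| // register with a V_MOV_B32 / V_MOV_B64_PSEUDO / S_MOV / COPY, and the | |||
| // instruction is rewritten to read that register instead of the original | |||
| // immediate or illegal register. | |||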
4336 | ||||
4337 | unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, | |||
4338 | MachineRegisterInfo &MRI, | |||
4339 | MachineOperand &SuperReg, | |||
4340 | const TargetRegisterClass *SuperRC, | |||
4341 | unsigned SubIdx, | |||
4342 | const TargetRegisterClass *SubRC) | |||
4343 | const { | |||
4344 | MachineBasicBlock *MBB = MI->getParent(); | |||
4345 | DebugLoc DL = MI->getDebugLoc(); | |||
4346 | Register SubReg = MRI.createVirtualRegister(SubRC); | |||
4347 | ||||
4348 | if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { | |||
4349 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
4350 | .addReg(SuperReg.getReg(), 0, SubIdx); | |||
4351 | return SubReg; | |||
4352 | } | |||
4353 | ||||
4354 | // Just in case the super register is itself a sub-register, copy it to a new | |||
4355 | // value so we don't need to worry about merging its subreg index with the | |||
4356 | // SubIdx passed to this function. The register coalescer should be able to | |||
4357 | // eliminate this extra copy. | |||
4358 | Register NewSuperReg = MRI.createVirtualRegister(SuperRC); | |||
4359 | ||||
4360 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) | |||
4361 | .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); | |||
4362 | ||||
4363 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
4364 | .addReg(NewSuperReg, 0, SubIdx); | |||
4365 | ||||
4366 | return SubReg; | |||
4367 | } | |||
4368 | ||||
4369 | MachineOperand SIInstrInfo::buildExtractSubRegOrImm( | |||
4370 | MachineBasicBlock::iterator MII, | |||
4371 | MachineRegisterInfo &MRI, | |||
4372 | MachineOperand &Op, | |||
4373 | const TargetRegisterClass *SuperRC, | |||
4374 | unsigned SubIdx, | |||
4375 | const TargetRegisterClass *SubRC) const { | |||
4376 | if (Op.isImm()) { | |||
4377 | if (SubIdx == AMDGPU::sub0) | |||
4378 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); | |||
4379 | if (SubIdx == AMDGPU::sub1) | |||
4380 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); | |||
4381 | ||||
4382 | llvm_unreachable("Unhandled register index for immediate")::llvm::llvm_unreachable_internal("Unhandled register index for immediate" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4382); | |||
4383 | } | |||
4384 | ||||
4385 | unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, | |||
4386 | SubIdx, SubRC); | |||
4387 | return MachineOperand::CreateReg(SubReg, false); | |||
4388 | } | |||
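| // This is what the 64-bit splitting helpers rely on: sub0 yields the low | |||
| // dword and sub1 the high dword of a 64-bit operand, whether it was a | |||
| // register or an immediate. | |||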
4389 | ||||
4390 | // Change the order of operands from (0, 1, 2) to (0, 2, 1) | |||
4391 | void SIInstrInfo::swapOperands(MachineInstr &Inst) const { | |||
4392 | assert(Inst.getNumExplicitOperands() == 3); | |||
4393 | MachineOperand Op1 = Inst.getOperand(1); | |||
4394 | Inst.RemoveOperand(1); | |||
4395 | Inst.addOperand(Op1); | |||
4396 | } | |||
4397 | ||||
4398 | bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, | |||
4399 | const MCOperandInfo &OpInfo, | |||
4400 | const MachineOperand &MO) const { | |||
4401 | if (!MO.isReg()) | |||
4402 | return false; | |||
4403 | ||||
4404 | Register Reg = MO.getReg(); | |||
4405 | ||||
4406 | const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); | |||
4407 | if (Reg.isPhysical()) | |||
4408 | return DRC->contains(Reg); | |||
4409 | ||||
4410 | const TargetRegisterClass *RC = MRI.getRegClass(Reg); | |||
4411 | ||||
4412 | if (MO.getSubReg()) { | |||
4413 | const MachineFunction *MF = MO.getParent()->getParent()->getParent(); | |||
4414 | const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); | |||
4415 | if (!SuperRC) | |||
4416 | return false; | |||
4417 | ||||
4418 | DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); | |||
4419 | if (!DRC) | |||
4420 | return false; | |||
4421 | } | |||
4422 | return RC->hasSuperClassEq(DRC); | |||
4423 | } | |||
4424 | ||||
4425 | bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, | |||
4426 | const MCOperandInfo &OpInfo, | |||
4427 | const MachineOperand &MO) const { | |||
4428 | if (MO.isReg()) | |||
4429 | return isLegalRegOperand(MRI, OpInfo, MO); | |||
4430 | ||||
4431 | // Handle non-register types that are treated like immediates. | |||
4432 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); | |||
4433 | return true; | |||
4434 | } | |||
4435 | ||||
4436 | bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, | |||
4437 | const MachineOperand *MO) const { | |||
4438 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
4439 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
4440 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
4441 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; | |||
4442 | const TargetRegisterClass *DefinedRC = | |||
4443 | OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; | |||
4444 | if (!MO) | |||
4445 | MO = &MI.getOperand(OpIdx); | |||
4446 | ||||
4447 | int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); | |||
4448 | int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
4449 | if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { | |||
4450 | if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) | |||
4451 | return false; | |||
4452 | ||||
4453 | SmallDenseSet<RegSubRegPair> SGPRsUsed; | |||
4454 | if (MO->isReg()) | |||
4455 | SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); | |||
4456 | ||||
4457 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { | |||
4458 | if (i == OpIdx) | |||
4459 | continue; | |||
4460 | const MachineOperand &Op = MI.getOperand(i); | |||
4461 | if (Op.isReg()) { | |||
4462 | RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); | |||
4463 | if (!SGPRsUsed.count(SGPR) && | |||
4464 | usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { | |||
4465 | if (--ConstantBusLimit <= 0) | |||
4466 | return false; | |||
4467 | SGPRsUsed.insert(SGPR); | |||
4468 | } | |||
4469 | } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { | |||
4470 | if (--ConstantBusLimit <= 0) | |||
4471 | return false; | |||
4472 | } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && | |||
4473 | isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { | |||
4474 | if (!VOP3LiteralLimit--) | |||
4475 | return false; | |||
4476 | if (--ConstantBusLimit <= 0) | |||
4477 | return false; | |||
4478 | } | |||
4479 | } | |||
4480 | } | |||
4481 | ||||
4482 | if (MO->isReg()) { | |||
4483 | assert(DefinedRC); | |||
4484 | return isLegalRegOperand(MRI, OpInfo, *MO); | |||
4485 | } | |||
4486 | ||||
4487 | // Handle non-register types that are treated like immediates. | |||
4488 | assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); | |||
4489 | ||||
4490 | if (!DefinedRC) { | |||
4491 | // This operand expects an immediate. | |||
4492 | return true; | |||
4493 | } | |||
4494 | ||||
4495 | return isImmOperandLegal(MI, OpIdx, *MO); | |||
4496 | } | |||
4497 | ||||
4498 | void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, | |||
4499 | MachineInstr &MI) const { | |||
4500 | unsigned Opc = MI.getOpcode(); | |||
4501 | const MCInstrDesc &InstrDesc = get(Opc); | |||
4502 | ||||
4503 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
4504 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4505 | ||||
4506 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
4507 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4508 | ||||
4509 | // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 | |||
4510 | // we need to only have one constant bus use before GFX10. | |||
4511 | bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; | |||
4512 | if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && | |||
4513 | Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || | |||
4514 | isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) | |||
4515 | legalizeOpWithMove(MI, Src0Idx); | |||
4516 | ||||
4517 | // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for | |||
4518 | // both the value to write (src0) and lane select (src1). Fix up non-SGPR | |||
4519 | // src0/src1 with V_READFIRSTLANE. | |||
4520 | if (Opc == AMDGPU::V_WRITELANE_B32) { | |||
4521 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4522 | if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { | |||
4523 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4524 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4525 | .add(Src0); | |||
4526 | Src0.ChangeToRegister(Reg, false); | |||
4527 | } | |||
4528 | if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { | |||
4529 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4530 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4531 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4532 | .add(Src1); | |||
4533 | Src1.ChangeToRegister(Reg, false); | |||
4534 | } | |||
4535 | return; | |||
4536 | } | |||
4537 | ||||
4538 | // No VOP2 instructions support AGPRs. | |||
4539 | if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) | |||
4540 | legalizeOpWithMove(MI, Src0Idx); | |||
4541 | ||||
4542 | if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) | |||
4543 | legalizeOpWithMove(MI, Src1Idx); | |||
4544 | ||||
4545 | // VOP2 src0 operands support all operand types, so we don't need to check | |||
4546 | // their legality. If src1 is already legal, we don't need to do anything. | |||
4547 | if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) | |||
4548 | return; | |||
4549 | ||||
4550 | // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for | |||
4551 | // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane | |||
4552 | // select is uniform. | |||
4553 | if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && | |||
4554 | RI.isVGPR(MRI, Src1.getReg())) { | |||
4555 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4556 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4557 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4558 | .add(Src1); | |||
4559 | Src1.ChangeToRegister(Reg, false); | |||
4560 | return; | |||
4561 | } | |||
4562 | ||||
4563 | // We do not use commuteInstruction here because it is too aggressive and will | |||
4564 | // commute if it is possible. We only want to commute here if it improves | |||
4565 | // legality. This can be called a fairly large number of times so don't waste | |||
4566 | // compile time pointlessly swapping and checking legality again. | |||
4567 | if (HasImplicitSGPR || !MI.isCommutable()) { | |||
4568 | legalizeOpWithMove(MI, Src1Idx); | |||
4569 | return; | |||
4570 | } | |||
4571 | ||||
4572 | // If src0 can be used as src1, commuting will make the operands legal. | |||
4573 | // Otherwise we have to give up and insert a move. | |||
4574 | // | |||
4575 | // TODO: Other immediate-like operand kinds could be commuted if there was a | |||
4576 | // MachineOperand::ChangeTo* for them. | |||
4577 | if ((!Src1.isImm() && !Src1.isReg()) || | |||
4578 | !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { | |||
4579 | legalizeOpWithMove(MI, Src1Idx); | |||
4580 | return; | |||
4581 | } | |||
4582 | ||||
4583 | int CommutedOpc = commuteOpcode(MI); | |||
4584 | if (CommutedOpc == -1) { | |||
4585 | legalizeOpWithMove(MI, Src1Idx); | |||
4586 | return; | |||
4587 | } | |||
4588 | ||||
4589 | MI.setDesc(get(CommutedOpc)); | |||
4590 | ||||
4591 | Register Src0Reg = Src0.getReg(); | |||
4592 | unsigned Src0SubReg = Src0.getSubReg(); | |||
4593 | bool Src0Kill = Src0.isKill(); | |||
4594 | ||||
4595 | if (Src1.isImm()) | |||
4596 | Src0.ChangeToImmediate(Src1.getImm()); | |||
4597 | else if (Src1.isReg()) { | |||
4598 | Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); | |||
4599 | Src0.setSubReg(Src1.getSubReg()); | |||
4600 | } else | |||
4601 | llvm_unreachable("Should only have register or immediate operands")::llvm::llvm_unreachable_internal("Should only have register or immediate operands" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4601); | |||
4602 | ||||
4603 | Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); | |||
4604 | Src1.setSubReg(Src0SubReg); | |||
4605 | fixImplicitOperands(MI); | |||
4606 | } | |||
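| // Example of what "legal" means here (sketch): in the VOP2 encoding only src0 | |||
| // may come from an SGPR or a constant and src1 must be a VGPR, so an add with | |||
| // its SGPR in src0 is left alone, while one with the SGPR in src1 is either | |||
| // commuted (when the opcode allows it) or has src1 copied into a VGPR first. | |||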
4607 | ||||
4608 | // Legalize VOP3 operands. All operand types are supported for any operand, | |||
4609 | // but only one literal constant is allowed, and only starting from GFX10. | |||
4610 | void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, | |||
4611 | MachineInstr &MI) const { | |||
4612 | unsigned Opc = MI.getOpcode(); | |||
4613 | ||||
4614 | int VOP3Idx[3] = { | |||
4615 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), | |||
4616 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), | |||
4617 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) | |||
4618 | }; | |||
4619 | ||||
4620 | if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || | |||
4621 | Opc == AMDGPU::V_PERMLANEX16_B32_e64) { | |||
4622 | // src1 and src2 must be scalar | |||
4623 | MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); | |||
4624 | MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); | |||
4625 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4626 | if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { | |||
4627 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4628 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4629 | .add(Src1); | |||
4630 | Src1.ChangeToRegister(Reg, false); | |||
4631 | } | |||
4632 | if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { | |||
4633 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4634 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4635 | .add(Src2); | |||
4636 | Src2.ChangeToRegister(Reg, false); | |||
4637 | } | |||
4638 | } | |||
4639 | ||||
4640 | // Find the one SGPR operand we are allowed to use. | |||
4641 | int ConstantBusLimit = ST.getConstantBusLimit(Opc); | |||
4642 | int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
4643 | SmallDenseSet<unsigned> SGPRsUsed; | |||
4644 | Register SGPRReg = findUsedSGPR(MI, VOP3Idx); | |||
4645 | if (SGPRReg != AMDGPU::NoRegister) { | |||
4646 | SGPRsUsed.insert(SGPRReg); | |||
4647 | --ConstantBusLimit; | |||
4648 | } | |||
4649 | ||||
4650 | for (unsigned i = 0; i < 3; ++i) { | |||
4651 | int Idx = VOP3Idx[i]; | |||
4652 | if (Idx == -1) | |||
4653 | break; | |||
4654 | MachineOperand &MO = MI.getOperand(Idx); | |||
4655 | ||||
4656 | if (!MO.isReg()) { | |||
4657 | if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) | |||
4658 | continue; | |||
4659 | ||||
4660 | if (LiteralLimit > 0 && ConstantBusLimit > 0) { | |||
4661 | --LiteralLimit; | |||
4662 | --ConstantBusLimit; | |||
4663 | continue; | |||
4664 | } | |||
4665 | ||||
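| // No budget left for another literal; rewrite the constant operand with a move. | |||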
4666 | --LiteralLimit; | |||
4667 | --ConstantBusLimit; | |||
4668 | legalizeOpWithMove(MI, Idx); | |||
4669 | continue; | |||
4670 | } | |||
4671 | ||||
4672 | if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && | |||
4673 | !isOperandLegal(MI, Idx, &MO)) { | |||
4674 | legalizeOpWithMove(MI, Idx); | |||
4675 | continue; | |||
4676 | } | |||
4677 | ||||
4678 | if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) | |||
4679 | continue; // VGPRs are legal | |||
4680 | ||||
4681 | // We can use one SGPR in each VOP3 instruction prior to GFX10 | |||
4682 | // and two starting from GFX10. | |||
4683 | if (SGPRsUsed.count(MO.getReg())) | |||
4684 | continue; | |||
4685 | if (ConstantBusLimit > 0) { | |||
4686 | SGPRsUsed.insert(MO.getReg()); | |||
4687 | --ConstantBusLimit; | |||
4688 | continue; | |||
4689 | } | |||
4690 | ||||
4691 | // If we make it this far, then the operand is not legal and we must | |||
4692 | // legalize it. | |||
4693 | legalizeOpWithMove(MI, Idx); | |||
4694 | } | |||
4695 | } | |||
4696 | ||||
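| // Copy SrcReg (a VGPR or AGPR) into a new SGPR of the equivalent class by | |||
| // reading each 32-bit piece with V_READFIRSTLANE_B32 and reassembling the | |||
| // pieces with a REG_SEQUENCE. | |||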
4697 | Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, | |||
4698 | MachineRegisterInfo &MRI) const { | |||
4699 | const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); | |||
4700 | const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); | |||
4701 | Register DstReg = MRI.createVirtualRegister(SRC); | |||
4702 | unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; | |||
4703 | ||||
4704 | if (RI.hasAGPRs(VRC)) { | |||
4705 | VRC = RI.getEquivalentVGPRClass(VRC); | |||
4706 | Register NewSrcReg = MRI.createVirtualRegister(VRC); | |||
4707 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
4708 | get(TargetOpcode::COPY), NewSrcReg) | |||
4709 | .addReg(SrcReg); | |||
4710 | SrcReg = NewSrcReg; | |||
4711 | } | |||
4712 | ||||
4713 | if (SubRegs == 1) { | |||
4714 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
4715 | get(AMDGPU::V_READFIRSTLANE_B32), DstReg) | |||
4716 | .addReg(SrcReg); | |||
4717 | return DstReg; | |||
4718 | } | |||
4719 | ||||
4720 | SmallVector<unsigned, 8> SRegs; | |||
4721 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
4722 | Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
4723 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
4724 | get(AMDGPU::V_READFIRSTLANE_B32), SGPR) | |||
4725 | .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); | |||
4726 | SRegs.push_back(SGPR); | |||
4727 | } | |||
4728 | ||||
4729 | MachineInstrBuilder MIB = | |||
4730 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
4731 | get(AMDGPU::REG_SEQUENCE), DstReg); | |||
4732 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
4733 | MIB.addReg(SRegs[i]); | |||
4734 | MIB.addImm(RI.getSubRegFromChannel(i)); | |||
4735 | } | |||
4736 | return DstReg; | |||
4737 | } | |||
4738 | ||||
4739 | void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, | |||
4740 | MachineInstr &MI) const { | |||
4741 | ||||
4742 | // If the pointer is stored in VGPRs, then we need to move it to | |||
4743 | // SGPRs using v_readfirstlane. This is safe because we only select | |||
4744 | // loads with uniform pointers to SMRD instructions, so we know the | |||
4745 | // pointer value is uniform. | |||
4746 | MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); | |||
4747 | if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { | |||
4748 | Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); | |||
4749 | SBase->setReg(SGPR); | |||
4750 | } | |||
4751 | MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); | |||
4752 | if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { | |||
4753 | Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); | |||
4754 | SOff->setReg(SGPR); | |||
4755 | } | |||
4756 | } | |||
4757 | ||||
4758 | // FIXME: Remove this when SelectionDAG is obsoleted. | |||
4759 | void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, | |||
4760 | MachineInstr &MI) const { | |||
4761 | if (!isSegmentSpecificFLAT(MI)) | |||
4762 | return; | |||
4763 | ||||
4764 | // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence | |||
4765 | // thinks they are uniform, so a readfirstlane should be valid. | |||
4766 | MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); | |||
4767 | if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) | |||
4768 | return; | |||
4769 | ||||
4770 | Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); | |||
4771 | SAddr->setReg(ToSGPR); | |||
4772 | } | |||
4773 | ||||
4774 | void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, | |||
4775 | MachineBasicBlock::iterator I, | |||
4776 | const TargetRegisterClass *DstRC, | |||
4777 | MachineOperand &Op, | |||
4778 | MachineRegisterInfo &MRI, | |||
4779 | const DebugLoc &DL) const { | |||
4780 | Register OpReg = Op.getReg(); | |||
4781 | unsigned OpSubReg = Op.getSubReg(); | |||
4782 | ||||
4783 | const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( | |||
4784 | RI.getRegClassForReg(MRI, OpReg), OpSubReg); | |||
4785 | ||||
4786 | // Check if operand is already the correct register class. | |||
4787 | if (DstRC == OpRC) | |||
4788 | return; | |||
4789 | ||||
4790 | Register DstReg = MRI.createVirtualRegister(DstRC); | |||
4791 | MachineInstr *Copy = | |||
4792 | BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); | |||
4793 | ||||
4794 | Op.setReg(DstReg); | |||
4795 | Op.setSubReg(0); | |||
4796 | ||||
4797 | MachineInstr *Def = MRI.getVRegDef(OpReg); | |||
4798 | if (!Def) | |||
4799 | return; | |||
4800 | ||||
4801 | // Try to eliminate the copy if it is copying an immediate value. | |||
4802 | if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) | |||
4803 | FoldImmediate(*Copy, *Def, OpReg, &MRI); | |||
4804 | ||||
4805 | bool ImpDef = Def->isImplicitDef(); | |||
4806 | while (!ImpDef && Def && Def->isCopy()) { | |||
4807 | if (Def->getOperand(1).getReg().isPhysical()) | |||
4808 | break; | |||
4809 | Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); | |||
4810 | ImpDef = Def && Def->isImplicitDef(); | |||
4811 | } | |||
4812 | if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && | |||
4813 | !ImpDef) | |||
4814 | Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); | |||
4815 | } | |||
4816 | ||||
4817 | // Emit the actual waterfall loop, executing the wrapped instruction for each | |||
4818 | // unique value of \p Rsrc across all lanes. In the best case we execute 1 | |||
4819 | // iteration, in the worst case we execute 64 (once per lane). | |||
4820 | static void | |||
4821 | emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, | |||
4822 | MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, | |||
4823 | const DebugLoc &DL, MachineOperand &Rsrc) { | |||
4824 | MachineFunction &MF = *OrigBB.getParent(); | |||
4825 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
4826 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
4827 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
4828 | unsigned SaveExecOpc = | |||
4829 | ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; | |||
4830 | unsigned XorTermOpc = | |||
4831 | ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; | |||
4832 | unsigned AndOpc = | |||
4833 | ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; | |||
4834 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
4835 | ||||
4836 | MachineBasicBlock::iterator I = LoopBB.begin(); | |||
4837 | ||||
4838 | SmallVector<Register, 8> ReadlanePieces; | |||
4839 | Register CondReg = AMDGPU::NoRegister; | |||
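| // CondReg accumulates (via AND) the per-64-bit-chunk comparisons of the | |||
| // readfirstlane results against the original VGPR Rsrc. | |||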
4840 | ||||
4841 | Register VRsrc = Rsrc.getReg(); | |||
4842 | unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); | |||
4843 | ||||
4844 | unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI); | |||
4845 | unsigned NumSubRegs = RegSize / 32; | |||
4846 | assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); | |||
4847 | ||||
4848 | for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { | |||
4849 | ||||
4850 | Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
4851 | Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
4852 | ||||
4853 | // Read the next variant <- also loop target. | |||
4854 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) | |||
4855 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx)); | |||
4856 | ||||
4857 | // Read the next variant <- also loop target. | |||
4858 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) | |||
4859 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1)); | |||
4860 | ||||
4861 | ReadlanePieces.push_back(CurRegLo); | |||
4862 | ReadlanePieces.push_back(CurRegHi); | |||
4863 | ||||
4864 | // Comparison is to be done as 64-bit. | |||
4865 | Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); | |||
4866 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) | |||
4867 | .addReg(CurRegLo) | |||
4868 | .addImm(AMDGPU::sub0) | |||
4869 | .addReg(CurRegHi) | |||
4870 | .addImm(AMDGPU::sub1); | |||
4871 | ||||
4872 | Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); | |||
4873 | auto Cmp = | |||
4874 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) | |||
4875 | .addReg(CurReg); | |||
4876 | if (NumSubRegs <= 2) | |||
4877 | Cmp.addReg(VRsrc); | |||
4878 | else | |||
4879 | Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); | |||
4880 | ||||
4881 | // Combine the comparison results with AND. | |||
4882 | if (CondReg == AMDGPU::NoRegister) // First. | |||
4883 | CondReg = NewCondReg; | |||
4884 | else { // If not the first, we create an AND. | |||
4885 | Register AndReg = MRI.createVirtualRegister(BoolXExecRC); | |||
4886 | BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) | |||
4887 | .addReg(CondReg) | |||
4888 | .addReg(NewCondReg); | |||
4889 | CondReg = AndReg; | |||
4890 | } | |||
4891 | } // End for loop. | |||
4892 | ||||
4893 | auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc)); | |||
4894 | Register SRsrc = MRI.createVirtualRegister(SRsrcRC); | |||
4895 | ||||
4896 | // Build scalar Rsrc. | |||
4897 | auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc); | |||
4898 | unsigned Channel = 0; | |||
4899 | for (Register Piece : ReadlanePieces) { | |||
4900 | Merge.addReg(Piece) | |||
4901 | .addImm(TRI->getSubRegFromChannel(Channel++)); | |||
4902 | } | |||
4903 | ||||
4904 | // Update Rsrc operand to use the SGPR Rsrc. | |||
4905 | Rsrc.setReg(SRsrc); | |||
4906 | Rsrc.setIsKill(true); | |||
4907 | ||||
4908 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
4909 | MRI.setSimpleHint(SaveExec, CondReg); | |||
4910 | ||||
4911 | // Update EXEC to matching lanes, saving original to SaveExec. | |||
4912 | BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) | |||
4913 | .addReg(CondReg, RegState::Kill); | |||
4914 | ||||
4915 | // The original instruction is here; we insert the terminators after it. | |||
4916 | I = LoopBB.end(); | |||
4917 | ||||
4918 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. | |||
4919 | BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) | |||
4920 | .addReg(Exec) | |||
4921 | .addReg(SaveExec); | |||
4922 | ||||
4923 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); | |||
4924 | } | |||
4925 | ||||
4926 | // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register | |||
4927 | // with SGPRs by iterating over all unique values across all lanes. | |||
4928 | // Returns the loop basic block that now contains \p MI. | |||
4929 | static MachineBasicBlock * | |||
4930 | loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, | |||
4931 | MachineOperand &Rsrc, MachineDominatorTree *MDT, | |||
4932 | MachineBasicBlock::iterator Begin = nullptr, | |||
4933 | MachineBasicBlock::iterator End = nullptr) { | |||
4934 | MachineBasicBlock &MBB = *MI.getParent(); | |||
4935 | MachineFunction &MF = *MBB.getParent(); | |||
4936 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
4937 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
4938 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
4939 | if (!Begin.isValid()) | |||
4940 | Begin = &MI; | |||
4941 | if (!End.isValid()) { | |||
4942 | End = &MI; | |||
4943 | ++End; | |||
4944 | } | |||
4945 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4946 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
4947 | unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
4948 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
4949 | ||||
4950 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
4951 | ||||
4952 | // Save the EXEC mask | |||
4953 | BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); | |||
4954 | ||||
4955 | // Killed uses in the instruction we are waterfalling around will be | |||
4956 | // incorrect due to the added control-flow. | |||
4957 | MachineBasicBlock::iterator AfterMI = MI; | |||
4958 | ++AfterMI; | |||
4959 | for (auto I = Begin; I != AfterMI; I++) { | |||
4960 | for (auto &MO : I->uses()) { | |||
4961 | if (MO.isReg() && MO.isUse()) { | |||
4962 | MRI.clearKillFlags(MO.getReg()); | |||
4963 | } | |||
4964 | } | |||
4965 | } | |||
4966 | ||||
4967 | // To insert the loop we need to split the block. Move everything after this | |||
4968 | // point to a new block, and insert a new empty block between the two. | |||
4969 | MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); | |||
4970 | MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); | |||
4971 | MachineFunction::iterator MBBI(MBB); | |||
4972 | ++MBBI; | |||
4973 | ||||
4974 | MF.insert(MBBI, LoopBB); | |||
4975 | MF.insert(MBBI, RemainderBB); | |||
4976 | ||||
4977 | LoopBB->addSuccessor(LoopBB); | |||
4978 | LoopBB->addSuccessor(RemainderBB); | |||
4979 | ||||
4980 | // Move the instructions from Begin to MI into the LoopBB, and the remainder | |||
4981 | // of the block to RemainderBB. | |||
4982 | RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); | |||
4983 | RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); | |||
4984 | LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); | |||
4985 | ||||
4986 | MBB.addSuccessor(LoopBB); | |||
4987 | ||||
4988 | // Update dominators. We know that MBB immediately dominates LoopBB, that | |||
4989 | // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately | |||
4990 | // dominates all of the successors transferred to it from MBB that MBB used | |||
4991 | // to properly dominate. | |||
4992 | if (MDT) { | |||
4993 | MDT->addNewBlock(LoopBB, &MBB); | |||
4994 | MDT->addNewBlock(RemainderBB, LoopBB); | |||
4995 | for (auto &Succ : RemainderBB->successors()) { | |||
4996 | if (MDT->properlyDominates(&MBB, Succ)) { | |||
4997 | MDT->changeImmediateDominator(Succ, RemainderBB); | |||
4998 | } | |||
4999 | } | |||
5000 | } | |||
5001 | ||||
5002 | emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); | |||
5003 | ||||
5004 | // Restore the EXEC mask | |||
5005 | MachineBasicBlock::iterator First = RemainderBB->begin(); | |||
5006 | BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); | |||
5007 | return LoopBB; | |||
5008 | } | |||
5009 | ||||
5010 | // Extract pointer from Rsrc and return a zero-value Rsrc replacement. | |||
5011 | static std::tuple<unsigned, unsigned> | |||
5012 | extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { | |||
5013 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5014 | MachineFunction &MF = *MBB.getParent(); | |||
5015 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5016 | ||||
5017 | // Extract the ptr from the resource descriptor. | |||
5018 | unsigned RsrcPtr = | |||
5019 | TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, | |||
5020 | AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); | |||
5021 | ||||
5022 | // Create an empty resource descriptor | |||
5023 | Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
5024 | Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5025 | Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5026 | Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); | |||
5027 | uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); | |||
5028 | ||||
5029 | // Zero64 = 0 | |||
5030 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) | |||
5031 | .addImm(0); | |||
5032 | ||||
5033 | // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} | |||
5034 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) | |||
5035 | .addImm(RsrcDataFormat & 0xFFFFFFFF); | |||
5036 | ||||
5037 | // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} | |||
5038 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) | |||
5039 | .addImm(RsrcDataFormat >> 32); | |||
5040 | ||||
5041 | // NewSRsrc = {Zero64, SRsrcFormat} | |||
5042 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) | |||
5043 | .addReg(Zero64) | |||
5044 | .addImm(AMDGPU::sub0_sub1) | |||
5045 | .addReg(SRsrcFormatLo) | |||
5046 | .addImm(AMDGPU::sub2) | |||
5047 | .addReg(SRsrcFormatHi) | |||
5048 | .addImm(AMDGPU::sub3); | |||
5049 | ||||
5050 | return std::make_tuple(RsrcPtr, NewSRsrc); | |||
5051 | } | |||
5052 | ||||
5053 | MachineBasicBlock * | |||
5054 | SIInstrInfo::legalizeOperands(MachineInstr &MI, | |||
5055 | MachineDominatorTree *MDT) const { | |||
5056 | MachineFunction &MF = *MI.getParent()->getParent(); | |||
5057 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5058 | MachineBasicBlock *CreatedBB = nullptr; | |||
5059 | ||||
5060 | // Legalize VOP2 | |||
5061 | if (isVOP2(MI) || isVOPC(MI)) { | |||
5062 | legalizeOperandsVOP2(MRI, MI); | |||
5063 | return CreatedBB; | |||
5064 | } | |||
5065 | ||||
5066 | // Legalize VOP3 | |||
5067 | if (isVOP3(MI)) { | |||
5068 | legalizeOperandsVOP3(MRI, MI); | |||
5069 | return CreatedBB; | |||
5070 | } | |||
5071 | ||||
5072 | // Legalize SMRD | |||
5073 | if (isSMRD(MI)) { | |||
5074 | legalizeOperandsSMRD(MRI, MI); | |||
5075 | return CreatedBB; | |||
5076 | } | |||
5077 | ||||
5078 | // Legalize FLAT | |||
5079 | if (isFLAT(MI)) { | |||
5080 | legalizeOperandsFLAT(MRI, MI); | |||
5081 | return CreatedBB; | |||
5082 | } | |||
5083 | ||||
5084 | // Legalize REG_SEQUENCE and PHI | |||
5085 | // The register class of the operands must be the same type as the register | |||
5086 | // class of the output. | |||
5087 | if (MI.getOpcode() == AMDGPU::PHI) { | |||
5088 | const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; | |||
5089 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { | |||
5090 | if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) | |||
5091 | continue; | |||
5092 | const TargetRegisterClass *OpRC = | |||
5093 | MRI.getRegClass(MI.getOperand(i).getReg()); | |||
5094 | if (RI.hasVectorRegisters(OpRC)) { | |||
5095 | VRC = OpRC; | |||
5096 | } else { | |||
5097 | SRC = OpRC; | |||
5098 | } | |||
5099 | } | |||
5100 | ||||
5101 | // If any of the operands are VGPR registers, then they all must be VGPRs, | |||
5102 | // otherwise we will create illegal VGPR->SGPR copies when legalizing | |||
5103 | // them. | |||
5104 | if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { | |||
5105 | if (!VRC) { | |||
5106 | assert(SRC); | |||
5107 | if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { | |||
5108 | VRC = &AMDGPU::VReg_1RegClass; | |||
5109 | } else | |||
5110 | VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) | |||
5111 | ? RI.getEquivalentAGPRClass(SRC) | |||
5112 | : RI.getEquivalentVGPRClass(SRC); | |||
5113 | } else { | |||
5114 | VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) | |||
5115 | ? RI.getEquivalentAGPRClass(VRC) | |||
5116 | : RI.getEquivalentVGPRClass(VRC); | |||
5117 | } | |||
5118 | RC = VRC; | |||
5119 | } else { | |||
5120 | RC = SRC; | |||
5121 | } | |||
5122 | ||||
5123 | // Update all the operands so they have the same type. | |||
5124 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5125 | MachineOperand &Op = MI.getOperand(I); | |||
5126 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5127 | continue; | |||
5128 | ||||
5129 | // MI is a PHI instruction. | |||
5130 | MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); | |||
5131 | MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); | |||
5132 | ||||
5133 | // Avoid creating no-op copies with the same src and dst reg class. These | |||
5134 | // confuse some of the machine passes. | |||
5135 | legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); | |||
5136 | } | |||
5137 | } | |||
5138 | ||||
5139 | // REG_SEQUENCE doesn't really require operand legalization, but if one has a | |||
5140 | // VGPR dest type and SGPR sources, insert copies so all operands are | |||
5141 | // VGPRs. This seems to help operand folding / the register coalescer. | |||
5142 | if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { | |||
5143 | MachineBasicBlock *MBB = MI.getParent(); | |||
5144 | const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); | |||
5145 | if (RI.hasVGPRs(DstRC)) { | |||
5146 | // Update all the operands so they are VGPR register classes. These may | |||
5147 | // not be the same register class because REG_SEQUENCE supports mixing | |||
5148 | // subregister index types e.g. sub0_sub1 + sub2 + sub3 | |||
5149 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5150 | MachineOperand &Op = MI.getOperand(I); | |||
5151 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5152 | continue; | |||
5153 | ||||
5154 | const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); | |||
5155 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); | |||
5156 | if (VRC == OpRC) | |||
5157 | continue; | |||
5158 | ||||
5159 | legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); | |||
5160 | Op.setIsKill(); | |||
5161 | } | |||
5162 | } | |||
5163 | ||||
5164 | return CreatedBB; | |||
5165 | } | |||
5166 | ||||
5167 | // Legalize INSERT_SUBREG | |||
5168 | // src0 must have the same register class as dst | |||
5169 | if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { | |||
5170 | Register Dst = MI.getOperand(0).getReg(); | |||
5171 | Register Src0 = MI.getOperand(1).getReg(); | |||
5172 | const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); | |||
5173 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); | |||
5174 | if (DstRC != Src0RC) { | |||
5175 | MachineBasicBlock *MBB = MI.getParent(); | |||
5176 | MachineOperand &Op = MI.getOperand(1); | |||
5177 | legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); | |||
5178 | } | |||
5179 | return CreatedBB; | |||
5180 | } | |||
5181 | ||||
5182 | // Legalize SI_INIT_M0 | |||
5183 | if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { | |||
5184 | MachineOperand &Src = MI.getOperand(0); | |||
5185 | if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) | |||
5186 | Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); | |||
5187 | return CreatedBB; | |||
5188 | } | |||
5189 | ||||
5190 | // Legalize MIMG and MUBUF/MTBUF for shaders. | |||
5191 | // | |||
5192 | // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via | |||
5193 | // scratch memory access. In both cases, the legalization never involves | |||
5194 | // conversion to the addr64 form. | |||
5195 | if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && | |||
5196 | (isMUBUF(MI) || isMTBUF(MI)))) { | |||
5197 | MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); | |||
5198 | if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) | |||
5199 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); | |||
5200 | ||||
5201 | MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); | |||
5202 | if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) | |||
5203 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); | |||
5204 | ||||
5205 | return CreatedBB; | |||
5206 | } | |||
5207 | ||||
5208 | // Legalize SI_CALL | |||
5209 | if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { | |||
5210 | MachineOperand *Dest = &MI.getOperand(0); | |||
5211 | if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { | |||
5212 | // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as | |||
5213 | // the following copies, into the loop block; copies from and to physical | |||
5214 | // registers also need to be moved into the loop block. | |||
5215 | unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); | |||
5216 | unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); | |||
5217 | ||||
5218 | // Also move the copies to physical registers into the loop block | |||
5219 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5220 | MachineBasicBlock::iterator Start(&MI); | |||
5221 | while (Start->getOpcode() != FrameSetupOpcode) | |||
5222 | --Start; | |||
5223 | MachineBasicBlock::iterator End(&MI); | |||
5224 | while (End->getOpcode() != FrameDestroyOpcode) | |||
5225 | ++End; | |||
5226 | // Also include following copies of the return value | |||
5227 | ++End; | |||
5228 | while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && | |||
5229 | MI.definesRegister(End->getOperand(1).getReg())) | |||
5230 | ++End; | |||
5231 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End); | |||
5232 | } | |||
5233 | } | |||
5234 | ||||
5235 | // Legalize MUBUF* instructions. | |||
5236 | int RsrcIdx = | |||
5237 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); | |||
5238 | if (RsrcIdx != -1) { | |||
5239 | // We have an MUBUF instruction | |||
5240 | MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); | |||
5241 | unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; | |||
5242 | if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), | |||
5243 | RI.getRegClass(RsrcRC))) { | |||
5244 | // The operands are legal. | |||
5245 | // FIXME: We may need to legalize operands besides srsrc. | |||
5246 | return CreatedBB; | |||
5247 | } | |||
5248 | ||||
5249 | // Legalize a VGPR Rsrc. | |||
5250 | // | |||
5251 | // If the instruction is _ADDR64, we can avoid a waterfall by extracting | |||
5252 | // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using | |||
5253 | // a zero-value SRsrc. | |||
5254 | // | |||
5255 | // If the instruction is _OFFSET (both idxen and offen disabled), and we | |||
5256 | // support ADDR64 instructions, we can convert to ADDR64 and do the same as | |||
5257 | // above. | |||
5258 | // | |||
5259 | // Otherwise we are on non-ADDR64 hardware, and/or we have | |||
5260 | // idxen/offen/bothen and we fall back to a waterfall loop. | |||
5261 | ||||
5262 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5263 | ||||
5264 | MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); | |||
5265 | if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { | |||
5266 | // This is already an ADDR64 instruction so we need to add the pointer | |||
5267 | // extracted from the resource descriptor to the current value of VAddr. | |||
5268 | Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5269 | Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5270 | Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
5271 | ||||
5272 | const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5273 | Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); | |||
5274 | Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); | |||
5275 | ||||
5276 | unsigned RsrcPtr, NewSRsrc; | |||
5277 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); | |||
5278 | ||||
5279 | // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 | |||
5280 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5281 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) | |||
5282 | .addDef(CondReg0) | |||
5283 | .addReg(RsrcPtr, 0, AMDGPU::sub0) | |||
5284 | .addReg(VAddr->getReg(), 0, AMDGPU::sub0) | |||
5285 | .addImm(0); | |||
5286 | ||||
5287 | // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 | |||
5288 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) | |||
5289 | .addDef(CondReg1, RegState::Dead) | |||
5290 | .addReg(RsrcPtr, 0, AMDGPU::sub1) | |||
5291 | .addReg(VAddr->getReg(), 0, AMDGPU::sub1) | |||
5292 | .addReg(CondReg0, RegState::Kill) | |||
5293 | .addImm(0); | |||
5294 | ||||
5295 | // NewVaddr = {NewVaddrHi, NewVaddrLo} | |||
5296 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) | |||
5297 | .addReg(NewVAddrLo) | |||
5298 | .addImm(AMDGPU::sub0) | |||
5299 | .addReg(NewVAddrHi) | |||
5300 | .addImm(AMDGPU::sub1); | |||
5301 | ||||
5302 | VAddr->setReg(NewVAddr); | |||
5303 | Rsrc->setReg(NewSRsrc); | |||
5304 | } else if (!VAddr && ST.hasAddr64()) { | |||
5305 | // This instruction is the _OFFSET variant, so we need to convert it to | |||
5306 | // ADDR64. | |||
5307 | assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && | |||
5308 | "FIXME: Need to emit flat atomics here"); | |||
5309 | ||||
5310 | unsigned RsrcPtr, NewSRsrc; | |||
5311 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); | |||
5312 | ||||
5313 | Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
5314 | MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); | |||
5315 | MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
5316 | MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); | |||
5317 | unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); | |||
5318 | ||||
5319 | // Atomics with return have an additional tied operand and are | |||
5320 | // missing some of the special bits. | |||
5321 | MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); | |||
5322 | MachineInstr *Addr64; | |||
5323 | ||||
5324 | if (!VDataIn) { | |||
5325 | // Regular buffer load / store. | |||
5326 | MachineInstrBuilder MIB = | |||
5327 | BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) | |||
5328 | .add(*VData) | |||
5329 | .addReg(NewVAddr) | |||
5330 | .addReg(NewSRsrc) | |||
5331 | .add(*SOffset) | |||
5332 | .add(*Offset); | |||
5333 | ||||
5334 | // Atomics do not have this operand. | |||
5335 | if (const MachineOperand *GLC = | |||
5336 | getNamedOperand(MI, AMDGPU::OpName::glc)) { | |||
5337 | MIB.addImm(GLC->getImm()); | |||
5338 | } | |||
5339 | if (const MachineOperand *DLC = | |||
5340 | getNamedOperand(MI, AMDGPU::OpName::dlc)) { | |||
5341 | MIB.addImm(DLC->getImm()); | |||
5342 | } | |||
5343 | ||||
5344 | MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); | |||
5345 | ||||
5346 | if (const MachineOperand *TFE = | |||
5347 | getNamedOperand(MI, AMDGPU::OpName::tfe)) { | |||
5348 | MIB.addImm(TFE->getImm()); | |||
5349 | } | |||
5350 | ||||
5351 | MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); | |||
5352 | ||||
5353 | MIB.cloneMemRefs(MI); | |||
5354 | Addr64 = MIB; | |||
5355 | } else { | |||
5356 | // Atomics with return. | |||
5357 | Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) | |||
5358 | .add(*VData) | |||
5359 | .add(*VDataIn) | |||
5360 | .addReg(NewVAddr) | |||
5361 | .addReg(NewSRsrc) | |||
5362 | .add(*SOffset) | |||
5363 | .add(*Offset) | |||
5364 | .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) | |||
5365 | .cloneMemRefs(MI); | |||
5366 | } | |||
5367 | ||||
5368 | MI.removeFromParent(); | |||
5369 | ||||
5370 | // NewVaddr = {NewVaddrHi, NewVaddrLo} | |||
5371 | BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), | |||
5372 | NewVAddr) | |||
5373 | .addReg(RsrcPtr, 0, AMDGPU::sub0) | |||
5374 | .addImm(AMDGPU::sub0) | |||
5375 | .addReg(RsrcPtr, 0, AMDGPU::sub1) | |||
5376 | .addImm(AMDGPU::sub1); | |||
5377 | } else { | |||
5378 | // This is another variant; legalize Rsrc with waterfall loop from VGPRs | |||
5379 | // to SGPRs. | |||
5380 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); | |||
5381 | return CreatedBB; | |||
5382 | } | |||
5383 | } | |||
5384 | return CreatedBB; | |||
5385 | } | |||
5386 | ||||
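| // Move TopInst to the VALU, then iteratively move any users whose operands | |||
| // become vector registers as a result. Returns a newly created block | |||
| // containing TopInst if legalization had to insert one (e.g. a waterfall | |||
| // loop), otherwise null. | |||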
5387 | MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, | |||
5388 | MachineDominatorTree *MDT) const { | |||
5389 | SetVectorType Worklist; | |||
5390 | Worklist.insert(&TopInst); | |||
5391 | MachineBasicBlock *CreatedBB = nullptr; | |||
5392 | MachineBasicBlock *CreatedBBTmp = nullptr; | |||
5393 | ||||
5394 | while (!Worklist.empty()) { | |||
5395 | MachineInstr &Inst = *Worklist.pop_back_val(); | |||
5396 | MachineBasicBlock *MBB = Inst.getParent(); | |||
5397 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
5398 | ||||
5399 | unsigned Opcode = Inst.getOpcode(); | |||
5400 | unsigned NewOpcode = getVALUOp(Inst); | |||
5401 | ||||
5402 | // Handle some special cases | |||
5403 | switch (Opcode) { | |||
5404 | default: | |||
5405 | break; | |||
5406 | case AMDGPU::S_ADD_U64_PSEUDO: | |||
5407 | case AMDGPU::S_SUB_U64_PSEUDO: | |||
5408 | splitScalar64BitAddSub(Worklist, Inst, MDT); | |||
5409 | Inst.eraseFromParent(); | |||
5410 | continue; | |||
5411 | case AMDGPU::S_ADD_I32: | |||
5412 | case AMDGPU::S_SUB_I32: { | |||
5413 | // FIXME: The u32 versions currently selected use the carry. | |||
5414 | bool Changed; | |||
5415 | std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); | |||
5416 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5417 | CreatedBB = CreatedBBTmp; | |||
5418 | if (Changed) | |||
5419 | continue; | |||
5420 | ||||
5421 | // Default handling | |||
5422 | break; | |||
5423 | } | |||
5424 | case AMDGPU::S_AND_B64: | |||
5425 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); | |||
5426 | Inst.eraseFromParent(); | |||
5427 | continue; | |||
5428 | ||||
5429 | case AMDGPU::S_OR_B64: | |||
5430 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); | |||
5431 | Inst.eraseFromParent(); | |||
5432 | continue; | |||
5433 | ||||
5434 | case AMDGPU::S_XOR_B64: | |||
5435 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); | |||
5436 | Inst.eraseFromParent(); | |||
5437 | continue; | |||
5438 | ||||
5439 | case AMDGPU::S_NAND_B64: | |||
5440 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); | |||
5441 | Inst.eraseFromParent(); | |||
5442 | continue; | |||
5443 | ||||
5444 | case AMDGPU::S_NOR_B64: | |||
5445 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); | |||
5446 | Inst.eraseFromParent(); | |||
5447 | continue; | |||
5448 | ||||
5449 | case AMDGPU::S_XNOR_B64: | |||
5450 | if (ST.hasDLInsts()) | |||
5451 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); | |||
5452 | else | |||
5453 | splitScalar64BitXnor(Worklist, Inst, MDT); | |||
5454 | Inst.eraseFromParent(); | |||
5455 | continue; | |||
5456 | ||||
5457 | case AMDGPU::S_ANDN2_B64: | |||
5458 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); | |||
5459 | Inst.eraseFromParent(); | |||
5460 | continue; | |||
5461 | ||||
5462 | case AMDGPU::S_ORN2_B64: | |||
5463 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); | |||
5464 | Inst.eraseFromParent(); | |||
5465 | continue; | |||
5466 | ||||
5467 | case AMDGPU::S_NOT_B64: | |||
5468 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); | |||
5469 | Inst.eraseFromParent(); | |||
5470 | continue; | |||
5471 | ||||
5472 | case AMDGPU::S_BCNT1_I32_B64: | |||
5473 | splitScalar64BitBCNT(Worklist, Inst); | |||
5474 | Inst.eraseFromParent(); | |||
5475 | continue; | |||
5476 | ||||
5477 | case AMDGPU::S_BFE_I64: | |||
5478 | splitScalar64BitBFE(Worklist, Inst); | |||
5479 | Inst.eraseFromParent(); | |||
5480 | continue; | |||
5481 | ||||
5482 | case AMDGPU::S_LSHL_B32: | |||
5483 | if (ST.hasOnlyRevVALUShifts()) { | |||
5484 | NewOpcode = AMDGPU::V_LSHLREV_B32_e64; | |||
5485 | swapOperands(Inst); | |||
5486 | } | |||
5487 | break; | |||
5488 | case AMDGPU::S_ASHR_I32: | |||
5489 | if (ST.hasOnlyRevVALUShifts()) { | |||
5490 | NewOpcode = AMDGPU::V_ASHRREV_I32_e64; | |||
5491 | swapOperands(Inst); | |||
5492 | } | |||
5493 | break; | |||
5494 | case AMDGPU::S_LSHR_B32: | |||
5495 | if (ST.hasOnlyRevVALUShifts()) { | |||
5496 | NewOpcode = AMDGPU::V_LSHRREV_B32_e64; | |||
5497 | swapOperands(Inst); | |||
5498 | } | |||
5499 | break; | |||
5500 | case AMDGPU::S_LSHL_B64: | |||
5501 | if (ST.hasOnlyRevVALUShifts()) { | |||
5502 | NewOpcode = AMDGPU::V_LSHLREV_B64_e64; | |||
5503 | swapOperands(Inst); | |||
5504 | } | |||
5505 | break; | |||
5506 | case AMDGPU::S_ASHR_I64: | |||
5507 | if (ST.hasOnlyRevVALUShifts()) { | |||
5508 | NewOpcode = AMDGPU::V_ASHRREV_I64_e64; | |||
5509 | swapOperands(Inst); | |||
5510 | } | |||
5511 | break; | |||
5512 | case AMDGPU::S_LSHR_B64: | |||
5513 | if (ST.hasOnlyRevVALUShifts()) { | |||
5514 | NewOpcode = AMDGPU::V_LSHRREV_B64_e64; | |||
5515 | swapOperands(Inst); | |||
5516 | } | |||
5517 | break; | |||
5518 | ||||
5519 | case AMDGPU::S_ABS_I32: | |||
5520 | lowerScalarAbs(Worklist, Inst); | |||
5521 | Inst.eraseFromParent(); | |||
5522 | continue; | |||
5523 | ||||
5524 | case AMDGPU::S_CBRANCH_SCC0: | |||
5525 | case AMDGPU::S_CBRANCH_SCC1: | |||
5526 | // Clear unused bits of vcc | |||
5527 | if (ST.isWave32()) | |||
5528 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32), | |||
5529 | AMDGPU::VCC_LO) | |||
5530 | .addReg(AMDGPU::EXEC_LO) | |||
5531 | .addReg(AMDGPU::VCC_LO); | |||
5532 | else | |||
5533 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), | |||
5534 | AMDGPU::VCC) | |||
5535 | .addReg(AMDGPU::EXEC) | |||
5536 | .addReg(AMDGPU::VCC); | |||
5537 | break; | |||
5538 | ||||
5539 | case AMDGPU::S_BFE_U64: | |||
5540 | case AMDGPU::S_BFM_B64: | |||
5541 | llvm_unreachable("Moving this op to VALU not implemented")::llvm::llvm_unreachable_internal("Moving this op to VALU not implemented" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 5541); | |||
5542 | ||||
5543 | case AMDGPU::S_PACK_LL_B32_B16: | |||
5544 | case AMDGPU::S_PACK_LH_B32_B16: | |||
5545 | case AMDGPU::S_PACK_HH_B32_B16: | |||
5546 | movePackToVALU(Worklist, MRI, Inst); | |||
5547 | Inst.eraseFromParent(); | |||
5548 | continue; | |||
5549 | ||||
5550 | case AMDGPU::S_XNOR_B32: | |||
5551 | lowerScalarXnor(Worklist, Inst); | |||
5552 | Inst.eraseFromParent(); | |||
5553 | continue; | |||
5554 | ||||
5555 | case AMDGPU::S_NAND_B32: | |||
5556 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); | |||
5557 | Inst.eraseFromParent(); | |||
5558 | continue; | |||
5559 | ||||
5560 | case AMDGPU::S_NOR_B32: | |||
5561 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); | |||
5562 | Inst.eraseFromParent(); | |||
5563 | continue; | |||
5564 | ||||
5565 | case AMDGPU::S_ANDN2_B32: | |||
5566 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); | |||
5567 | Inst.eraseFromParent(); | |||
5568 | continue; | |||
5569 | ||||
5570 | case AMDGPU::S_ORN2_B32: | |||
5571 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); | |||
5572 | Inst.eraseFromParent(); | |||
5573 | continue; | |||
5574 | ||||
5575 | // TODO: remove as soon as everything is ready | |||
5576 | // to replace VGPR to SGPR copy with V_READFIRSTLANEs. | |||
5577 | // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO | |||
5578 | // can only be selected from the uniform SDNode. | |||
5579 | case AMDGPU::S_ADD_CO_PSEUDO: | |||
5580 | case AMDGPU::S_SUB_CO_PSEUDO: { | |||
5581 | unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) | |||
5582 | ? AMDGPU::V_ADDC_U32_e64 | |||
5583 | : AMDGPU::V_SUBB_U32_e64; | |||
5584 | const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5585 | ||||
5586 | Register CarryInReg = Inst.getOperand(4).getReg(); | |||
5587 | if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { | |||
5588 | Register NewCarryReg = MRI.createVirtualRegister(CarryRC); | |||
5589 | BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) | |||
5590 | .addReg(CarryInReg); | |||
5591 | } | |||
5592 | ||||
5593 | Register CarryOutReg = Inst.getOperand(1).getReg(); | |||
5594 | ||||
5595 | Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( | |||
5596 | MRI.getRegClass(Inst.getOperand(0).getReg()))); | |||
5597 | MachineInstr *CarryOp = | |||
5598 | BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) | |||
5599 | .addReg(CarryOutReg, RegState::Define) | |||
5600 | .add(Inst.getOperand(2)) | |||
5601 | .add(Inst.getOperand(3)) | |||
5602 | .addReg(CarryInReg) | |||
5603 | .addImm(0); | |||
5604 | CreatedBBTmp = legalizeOperands(*CarryOp); | |||
5605 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5606 | CreatedBB = CreatedBBTmp; | |||
5607 | MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); | |||
5608 | addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); | |||
5609 | Inst.eraseFromParent(); | |||
5610 | } | |||
5611 | continue; | |||
5612 | case AMDGPU::S_UADDO_PSEUDO: | |||
5613 | case AMDGPU::S_USUBO_PSEUDO: { | |||
5614 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
5615 | MachineOperand &Dest0 = Inst.getOperand(0); | |||
5616 | MachineOperand &Dest1 = Inst.getOperand(1); | |||
5617 | MachineOperand &Src0 = Inst.getOperand(2); | |||
5618 | MachineOperand &Src1 = Inst.getOperand(3); | |||
5619 | ||||
5620 | unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) | |||
5621 | ? AMDGPU::V_ADD_CO_U32_e64 | |||
5622 | : AMDGPU::V_SUB_CO_U32_e64; | |||
5623 | const TargetRegisterClass *NewRC = | |||
5624 | RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); | |||
5625 | Register DestReg = MRI.createVirtualRegister(NewRC); | |||
5626 | MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) | |||
5627 | .addReg(Dest1.getReg(), RegState::Define) | |||
5628 | .add(Src0) | |||
5629 | .add(Src1) | |||
5630 | .addImm(0); // clamp bit | |||
5631 | ||||
5632 | CreatedBBTmp = legalizeOperands(*NewInstr, MDT); | |||
5633 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5634 | CreatedBB = CreatedBBTmp; | |||
5635 | ||||
5636 | MRI.replaceRegWith(Dest0.getReg(), DestReg); | |||
5637 | addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, | |||
5638 | Worklist); | |||
5639 | Inst.eraseFromParent(); | |||
5640 | } | |||
5641 | continue; | |||
5642 | ||||
5643 | case AMDGPU::S_CSELECT_B32: | |||
5644 | case AMDGPU::S_CSELECT_B64: | |||
5645 | lowerSelect(Worklist, Inst, MDT); | |||
5646 | Inst.eraseFromParent(); | |||
5647 | continue; | |||
5648 | } | |||
5649 | ||||
5650 | if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
5651 | // We cannot move this instruction to the VALU, so we should try to | |||
5652 | // legalize its operands instead. | |||
5653 | CreatedBBTmp = legalizeOperands(Inst, MDT); | |||
5654 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5655 | CreatedBB = CreatedBBTmp; | |||
5656 | continue; | |||
5657 | } | |||
5658 | ||||
5659 | // Use the new VALU Opcode. | |||
5660 | const MCInstrDesc &NewDesc = get(NewOpcode); | |||
5661 | Inst.setDesc(NewDesc); | |||
5662 | ||||
5663 | // Remove any references to SCC. Vector instructions can't read from it, and | |||
5664 | // we're just about to add the implicit use / defs of VCC, and we don't want | |||
5665 | // both. | |||
5666 | for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { | |||
5667 | MachineOperand &Op = Inst.getOperand(i); | |||
5668 | if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { | |||
5669 | // Only propagate through live-def of SCC. | |||
5670 | if (Op.isDef() && !Op.isDead()) | |||
5671 | addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); | |||
5672 | Inst.RemoveOperand(i); | |||
5673 | } | |||
5674 | } | |||
5675 | ||||
5676 | if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { | |||
5677 | // We are converting these to a BFE, so we need to add the missing | |||
5678 | // operands for the size and offset. | |||
5679 | unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; | |||
5680 | Inst.addOperand(MachineOperand::CreateImm(0)); | |||
5681 | Inst.addOperand(MachineOperand::CreateImm(Size)); | |||
5682 | ||||
5683 | } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { | |||
5684 | // The VALU version adds the second operand to the result, so insert an | |||
5685 | // extra 0 operand. | |||
5686 | Inst.addOperand(MachineOperand::CreateImm(0)); | |||
5687 | } | |||
5688 | ||||
5689 | Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); | |||
5690 | fixImplicitOperands(Inst); | |||
5691 | ||||
5692 | if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { | |||
5693 | const MachineOperand &OffsetWidthOp = Inst.getOperand(2); | |||
5694 | // If we need to move this to VGPRs, we need to unpack the second operand | |||
5695 | // back into the 2 separate ones for bit offset and width. | |||
5696 | assert(OffsetWidthOp.isImm() && | |||
5697 | "Scalar BFE is only implemented for constant width and offset"); | |||
5698 | uint32_t Imm = OffsetWidthOp.getImm(); | |||
5699 | ||||
5700 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. | |||
5701 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. | |||
5702 | Inst.RemoveOperand(2); // Remove old immediate. | |||
5703 | Inst.addOperand(MachineOperand::CreateImm(Offset)); | |||
5704 | Inst.addOperand(MachineOperand::CreateImm(BitWidth)); | |||
5705 | } | |||
5706 | ||||
5707 | bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); | |||
5708 | unsigned NewDstReg = AMDGPU::NoRegister; | |||
5709 | if (HasDst) { | |||
5710 | Register DstReg = Inst.getOperand(0).getReg(); | |||
5711 | if (DstReg.isPhysical()) | |||
5712 | continue; | |||
5713 | ||||
5714 | // Update the destination register class. | |||
5715 | const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); | |||
5716 | if (!NewDstRC) | |||
5717 | continue; | |||
5718 | ||||
5719 | if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && | |||
5720 | NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { | |||
5721 | // Instead of creating a copy where src and dst are the same register | |||
5722 | // class, we just replace all uses of dst with src. These kinds of | |||
5723 | // copies interfere with the heuristics MachineSink uses to decide | |||
5724 | // whether or not to split a critical edge, since the pass assumes | |||
5725 | // that copies will end up as machine instructions and not be | |||
5726 | // eliminated. | |||
5727 | addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); | |||
5728 | MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); | |||
5729 | MRI.clearKillFlags(Inst.getOperand(1).getReg()); | |||
5730 | Inst.getOperand(0).setReg(DstReg); | |||
5731 | ||||
5732 | // Make sure we don't leave around a dead VGPR->SGPR copy. Normally | |||
5733 | // these are deleted later, but at -O0 it would leave a suspicious | |||
5734 | // looking illegal copy of an undef register. | |||
5735 | for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) | |||
5736 | Inst.RemoveOperand(I); | |||
5737 | Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); | |||
5738 | continue; | |||
5739 | } | |||
5740 | ||||
5741 | NewDstReg = MRI.createVirtualRegister(NewDstRC); | |||
5742 | MRI.replaceRegWith(DstReg, NewDstReg); | |||
5743 | } | |||
5744 | ||||
5745 | // Legalize the operands | |||
5746 | CreatedBBTmp = legalizeOperands(Inst, MDT); | |||
5747 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5748 | CreatedBB = CreatedBBTmp; | |||
5749 | ||||
5750 | if (HasDst) | |||
5751 | addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); | |||
5752 | } | |||
5753 | return CreatedBB; | |||
5754 | } | |||
5755 | ||||
5756 | // Add/sub require special handling to deal with carry outs. | |||
5757 | std::pair<bool, MachineBasicBlock *> | |||
5758 | SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, | |||
5759 | MachineDominatorTree *MDT) const { | |||
5760 | if (ST.hasAddNoCarry()) { | |||
5761 | // Assume there is no user of scc since we don't select this in that case. | |||
5762 | // Since scc isn't used, it doesn't really matter if the i32 or u32 variant | |||
5763 | // is used. | |||
5764 | ||||
5765 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
5766 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
5767 | ||||
5768 | Register OldDstReg = Inst.getOperand(0).getReg(); | |||
5769 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5770 | ||||
5771 | unsigned Opc = Inst.getOpcode(); | |||
5772 | assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); | |||
5773 | ||||
5774 | unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? | |||
5775 | AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; | |||
5776 | ||||
5777 | assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); | |||
5778 | Inst.RemoveOperand(3); | |||
5779 | ||||
5780 | Inst.setDesc(get(NewOpc)); | |||
5781 | Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit | |||
5782 | Inst.addImplicitDefUseOperands(*MBB.getParent()); | |||
5783 | MRI.replaceRegWith(OldDstReg, ResultReg); | |||
5784 | MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT); | |||
5785 | ||||
5786 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
5787 | return std::make_pair(true, NewBB); | |||
5788 | } | |||
5789 | ||||
5790 | return std::make_pair(false, nullptr); | |||
5791 | } | |||
5792 | ||||
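| // Lower S_CSELECT_B32/B64 to a V_CNDMASK_B32_e64 whose condition is SCC (or | |||
| // the register whose copy defined SCC). | |||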
5793 | void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, | |||
5794 | MachineDominatorTree *MDT) const { | |||
5795 | ||||
5796 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
5797 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
5798 | MachineBasicBlock::iterator MII = Inst; | |||
5799 | DebugLoc DL = Inst.getDebugLoc(); | |||
5800 | ||||
5801 | MachineOperand &Dest = Inst.getOperand(0); | |||
5802 | MachineOperand &Src0 = Inst.getOperand(1); | |||
5803 | MachineOperand &Src1 = Inst.getOperand(2); | |||
5804 | MachineOperand &Cond = Inst.getOperand(3); | |||
5805 | ||||
5806 | Register SCCSource = Cond.getReg(); | |||
5807 | // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead. | |||
5808 | if (!Cond.isUndef()) { | |||
5809 | for (MachineInstr &CandI : | |||
5810 | make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), | |||
5811 | Inst.getParent()->rend())) { | |||
5812 | if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != | |||
5813 | -1) { | |||
5814 | if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { | |||
5815 | SCCSource = CandI.getOperand(1).getReg(); | |||
5816 | } | |||
5817 | break; | |||
5818 | } | |||
5819 | } | |||
5820 | } | |||
5821 | ||||
5822 | // If this is a trivial select where the condition is effectively not SCC | |||
5823 | // (SCCSource is a source of copy to SCC), then the select is semantically | |||
5824 | // equivalent to copying SCCSource. Hence, there is no need to create | |||
5825 | // V_CNDMASK, we can just use that and bail out. | |||
5826 | if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) && | |||
5827 | Src1.isImm() && (Src1.getImm() == 0)) { | |||
5828 | MRI.replaceRegWith(Dest.getReg(), SCCSource); | |||
5829 | return; | |||
5830 | } | |||
5831 | ||||
5832 | const TargetRegisterClass *TC = ST.getWavefrontSize() == 64 | |||
5833 | ? &AMDGPU::SReg_64_XEXECRegClass | |||
5834 | : &AMDGPU::SReg_32_XM0_XEXECRegClass; | |||
5835 | Register CopySCC = MRI.createVirtualRegister(TC); | |||
5836 | ||||
5837 | if (SCCSource == AMDGPU::SCC) { | |||
5838 | // Insert a trivial select instead of creating a copy, because a copy from | |||
5839 | // SCC would semantically mean just copying a single bit, but we may need | |||
5840 | // the result to be a vector condition mask that needs preserving. | |||
5841 | unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64 | |||
5842 | : AMDGPU::S_CSELECT_B32; | |||
5843 | auto NewSelect = | |||
5844 | BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); | |||
5845 | NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); | |||
5846 | } else { | |||
5847 | BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource); | |||
5848 | } | |||
5849 | ||||
5850 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5851 | ||||
5852 | auto UpdatedInst = | |||
5853 | BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) | |||
5854 | .addImm(0) | |||
5855 | .add(Src1) // False | |||
5856 | .addImm(0) | |||
5857 | .add(Src0) // True | |||
5858 | .addReg(CopySCC); | |||
5859 | ||||
5860 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
5861 | legalizeOperands(*UpdatedInst, MDT); | |||
5862 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
5863 | } | |||
5864 | ||||
5865 | void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, | |||
5866 | MachineInstr &Inst) const { | |||
5867 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
5868 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
5869 | MachineBasicBlock::iterator MII = Inst; | |||
5870 | DebugLoc DL = Inst.getDebugLoc(); | |||
5871 | ||||
5872 | MachineOperand &Dest = Inst.getOperand(0); | |||
5873 | MachineOperand &Src = Inst.getOperand(1); | |||
5874 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5875 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5876 | ||||
5877 | unsigned SubOp = ST.hasAddNoCarry() ? | |||
5878 | AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; | |||
5879 | ||||
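// Lower the scalar abs as max(x, 0 - x): negate the source into TmpReg, then
// take the signed max of the source and its negation.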
5880 | BuildMI(MBB, MII, DL, get(SubOp), TmpReg) | |||
5881 | .addImm(0) | |||
5882 | .addReg(Src.getReg()); | |||
5883 | ||||
5884 | BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) | |||
5885 | .addReg(Src.getReg()) | |||
5886 | .addReg(TmpReg); | |||
5887 | ||||
5888 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
5889 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
5890 | } | |||
5891 | ||||
5892 | void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, | |||
5893 | MachineInstr &Inst) const { | |||
5894 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
5895 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
5896 | MachineBasicBlock::iterator MII = Inst; | |||
5897 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
5898 | ||||
5899 | MachineOperand &Dest = Inst.getOperand(0); | |||
5900 | MachineOperand &Src0 = Inst.getOperand(1); | |||
5901 | MachineOperand &Src1 = Inst.getOperand(2); | |||
5902 | ||||
5903 | if (ST.hasDLInsts()) { | |||
5904 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5905 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); | |||
5906 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); | |||
5907 | ||||
5908 | BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) | |||
5909 | .add(Src0) | |||
5910 | .add(Src1); | |||
5911 | ||||
5912 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
5913 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
5914 | } else { | |||
5915 | // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can | |||
5916 | // invert either source and then perform the XOR. If either source is a | |||
5917 | // scalar register, then we can leave the inversion on the scalar unit to | |||
5918 | // achieve a better distribution of scalar and vector instructions. | |||
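// For single bits, e.g. x = 1, y = 0: !(1 ^ 0) = 0 and (!1) ^ 0 = 0, so
// inverting one operand before the XOR reproduces the XNOR result.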
5919 | bool Src0IsSGPR = Src0.isReg() && | |||
5920 | RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); | |||
5921 | bool Src1IsSGPR = Src1.isReg() && | |||
5922 | RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); | |||
5923 | MachineInstr *Xor; | |||
5924 | Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
5925 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
5926 | ||||
5927 | // Build a pair of scalar instructions and add them to the work list. | |||
5928 | // The next iteration over the work list will lower these to the vector | |||
5929 | // unit as necessary. | |||
5930 | if (Src0IsSGPR) { | |||
5931 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); | |||
5932 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) | |||
5933 | .addReg(Temp) | |||
5934 | .add(Src1); | |||
5935 | } else if (Src1IsSGPR) { | |||
5936 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); | |||
5937 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) | |||
5938 | .add(Src0) | |||
5939 | .addReg(Temp); | |||
5940 | } else { | |||
5941 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) | |||
5942 | .add(Src0) | |||
5943 | .add(Src1); | |||
5944 | MachineInstr *Not = | |||
5945 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); | |||
5946 | Worklist.insert(Not); | |||
5947 | } | |||
5948 | ||||
5949 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
5950 | ||||
5951 | Worklist.insert(Xor); | |||
5952 | ||||
5953 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
5954 | } | |||
5955 | } | |||
5956 | ||||
5957 | void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, | |||
5958 | MachineInstr &Inst, | |||
5959 | unsigned Opcode) const { | |||
5960 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
5961 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
5962 | MachineBasicBlock::iterator MII = Inst; | |||
5963 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
5964 | ||||
5965 | MachineOperand &Dest = Inst.getOperand(0); | |||
5966 | MachineOperand &Src0 = Inst.getOperand(1); | |||
5967 | MachineOperand &Src1 = Inst.getOperand(2); | |||
5968 | ||||
5969 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
5970 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
5971 | ||||
5972 | MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) | |||
5973 | .add(Src0) | |||
5974 | .add(Src1); | |||
5975 | ||||
5976 | MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) | |||
5977 | .addReg(Interm); | |||
5978 | ||||
5979 | Worklist.insert(&Op); | |||
5980 | Worklist.insert(&Not); | |||
5981 | ||||
5982 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
5983 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
5984 | } | |||
5985 | ||||
5986 | void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, | |||
5987 | MachineInstr &Inst, | |||
5988 | unsigned Opcode) const { | |||
5989 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
5990 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
5991 | MachineBasicBlock::iterator MII = Inst; | |||
5992 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
5993 | ||||
5994 | MachineOperand &Dest = Inst.getOperand(0); | |||
5995 | MachineOperand &Src0 = Inst.getOperand(1); | |||
5996 | MachineOperand &Src1 = Inst.getOperand(2); | |||
5997 | ||||
5998 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5999 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
6000 | ||||
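// The "N2" opcodes compute (src0 op ~src1), so expand them as a scalar
// S_NOT_B32 of src1 into Interm followed by the plain binary op.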
6001 | MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) | |||
6002 | .add(Src1); | |||
6003 | ||||
6004 | MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) | |||
6005 | .add(Src0) | |||
6006 | .addReg(Interm); | |||
6007 | ||||
6008 | Worklist.insert(&Not); | |||
6009 | Worklist.insert(&Op); | |||
6010 | ||||
6011 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6012 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6013 | } | |||
6014 | ||||
6015 | void SIInstrInfo::splitScalar64BitUnaryOp( | |||
6016 | SetVectorType &Worklist, MachineInstr &Inst, | |||
6017 | unsigned Opcode) const { | |||
6018 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6019 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6020 | ||||
6021 | MachineOperand &Dest = Inst.getOperand(0); | |||
6022 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6023 | DebugLoc DL = Inst.getDebugLoc(); | |||
6024 | ||||
6025 | MachineBasicBlock::iterator MII = Inst; | |||
6026 | ||||
6027 | const MCInstrDesc &InstDesc = get(Opcode); | |||
6028 | const TargetRegisterClass *Src0RC = Src0.isReg() ? | |||
6029 | MRI.getRegClass(Src0.getReg()) : | |||
6030 | &AMDGPU::SGPR_32RegClass; | |||
6031 | ||||
6032 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6033 | ||||
6034 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6035 | AMDGPU::sub0, Src0SubRC); | |||
6036 | ||||
6037 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6038 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); | |||
6039 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); | |||
6040 | ||||
6041 | Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); | |||
6042 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); | |||
6043 | ||||
6044 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6045 | AMDGPU::sub1, Src0SubRC); | |||
6046 | ||||
6047 | Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); | |||
6048 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); | |||
6049 | ||||
6050 | Register FullDestReg = MRI.createVirtualRegister(NewDestRC); | |||
6051 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6052 | .addReg(DestSub0) | |||
6053 | .addImm(AMDGPU::sub0) | |||
6054 | .addReg(DestSub1) | |||
6055 | .addImm(AMDGPU::sub1); | |||
6056 | ||||
6057 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6058 | ||||
6059 | Worklist.insert(&LoHalf); | |||
6060 | Worklist.insert(&HiHalf); | |||
6061 | ||||
6062 | // We don't need to legalizeOperands here because for a single operand, src0 | |||
6063 | // will support any kind of input. | |||
6064 | ||||
6065 | // Move all users of this moved value. | |||
6066 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6067 | } | |||
6068 | ||||
6069 | void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, | |||
6070 | MachineInstr &Inst, | |||
6071 | MachineDominatorTree *MDT) const { | |||
6072 | bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); | |||
6073 | ||||
6074 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6075 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6076 | const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
6077 | ||||
6078 | Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6079 | Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6080 | Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6081 | ||||
6082 | Register CarryReg = MRI.createVirtualRegister(CarryRC); | |||
6083 | Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); | |||
6084 | ||||
6085 | MachineOperand &Dest = Inst.getOperand(0); | |||
6086 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6087 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6088 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6089 | MachineBasicBlock::iterator MII = Inst; | |||
6090 | ||||
6091 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); | |||
6092 | const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); | |||
6093 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6094 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); | |||
6095 | ||||
6096 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6097 | AMDGPU::sub0, Src0SubRC); | |||
6098 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6099 | AMDGPU::sub0, Src1SubRC); | |||
6100 | ||||
6101 | ||||
6102 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6103 | AMDGPU::sub1, Src0SubRC); | |||
6104 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6105 | AMDGPU::sub1, Src1SubRC); | |||
6106 | ||||
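// Perform the 64-bit add/sub as two 32-bit halves: the low half defines
// CarryReg, the high half consumes it, and a REG_SEQUENCE recombines the two
// dwords into the 64-bit result.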
6107 | unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; | |||
6108 | MachineInstr *LoHalf = | |||
6109 | BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) | |||
6110 | .addReg(CarryReg, RegState::Define) | |||
6111 | .add(SrcReg0Sub0) | |||
6112 | .add(SrcReg1Sub0) | |||
6113 | .addImm(0); // clamp bit | |||
6114 | ||||
6115 | unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; | |||
6116 | MachineInstr *HiHalf = | |||
6117 | BuildMI(MBB, MII, DL, get(HiOpc), DestSub1) | |||
6118 | .addReg(DeadCarryReg, RegState::Define | RegState::Dead) | |||
6119 | .add(SrcReg0Sub1) | |||
6120 | .add(SrcReg1Sub1) | |||
6121 | .addReg(CarryReg, RegState::Kill) | |||
6122 | .addImm(0); // clamp bit | |||
6123 | ||||
6124 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6125 | .addReg(DestSub0) | |||
6126 | .addImm(AMDGPU::sub0) | |||
6127 | .addReg(DestSub1) | |||
6128 | .addImm(AMDGPU::sub1); | |||
6129 | ||||
6130 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6131 | ||||
6132 | // Try to legalize the operands in case we need to swap the order to keep it | |||
6133 | // valid. | |||
6134 | legalizeOperands(*LoHalf, MDT); | |||
6135 | legalizeOperands(*HiHalf, MDT); | |||
6136 | ||||
6137 | // Move all users of this moved value. | |||
6138 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6139 | } | |||
6140 | ||||
6141 | void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, | |||
6142 | MachineInstr &Inst, unsigned Opcode, | |||
6143 | MachineDominatorTree *MDT) const { | |||
6144 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6145 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6146 | ||||
6147 | MachineOperand &Dest = Inst.getOperand(0); | |||
6148 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6149 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6150 | DebugLoc DL = Inst.getDebugLoc(); | |||
6151 | ||||
6152 | MachineBasicBlock::iterator MII = Inst; | |||
6153 | ||||
6154 | const MCInstrDesc &InstDesc = get(Opcode); | |||
6155 | const TargetRegisterClass *Src0RC = Src0.isReg() ? | |||
6156 | MRI.getRegClass(Src0.getReg()) : | |||
6157 | &AMDGPU::SGPR_32RegClass; | |||
6158 | ||||
6159 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6160 | const TargetRegisterClass *Src1RC = Src1.isReg() ? | |||
6161 | MRI.getRegClass(Src1.getReg()) : | |||
6162 | &AMDGPU::SGPR_32RegClass; | |||
6163 | ||||
6164 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); | |||
6165 | ||||
6166 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6167 | AMDGPU::sub0, Src0SubRC); | |||
6168 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6169 | AMDGPU::sub0, Src1SubRC); | |||
6170 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6171 | AMDGPU::sub1, Src0SubRC); | |||
6172 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6173 | AMDGPU::sub1, Src1SubRC); | |||
6174 | ||||
6175 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6176 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); | |||
6177 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); | |||
6178 | ||||
6179 | Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); | |||
6180 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) | |||
6181 | .add(SrcReg0Sub0) | |||
6182 | .add(SrcReg1Sub0); | |||
6183 | ||||
6184 | Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); | |||
6185 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) | |||
6186 | .add(SrcReg0Sub1) | |||
6187 | .add(SrcReg1Sub1); | |||
6188 | ||||
6189 | Register FullDestReg = MRI.createVirtualRegister(NewDestRC); | |||
6190 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6191 | .addReg(DestSub0) | |||
6192 | .addImm(AMDGPU::sub0) | |||
6193 | .addReg(DestSub1) | |||
6194 | .addImm(AMDGPU::sub1); | |||
6195 | ||||
6196 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6197 | ||||
6198 | Worklist.insert(&LoHalf); | |||
6199 | Worklist.insert(&HiHalf); | |||
6200 | ||||
6201 | // Move all users of this moved value. | |||
6202 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6203 | } | |||
6204 | ||||
6205 | void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, | |||
6206 | MachineInstr &Inst, | |||
6207 | MachineDominatorTree *MDT) const { | |||
6208 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6209 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6210 | ||||
6211 | MachineOperand &Dest = Inst.getOperand(0); | |||
6212 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6213 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6214 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6215 | ||||
6216 | MachineBasicBlock::iterator MII = Inst; | |||
6217 | ||||
6218 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6219 | ||||
6220 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
6221 | ||||
6222 | MachineOperand* Op0; | |||
6223 | MachineOperand* Op1; | |||
6224 | ||||
6225 | if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { | |||
6226 | Op0 = &Src0; | |||
6227 | Op1 = &Src1; | |||
6228 | } else { | |||
6229 | Op0 = &Src1; | |||
6230 | Op1 = &Src0; | |||
6231 | } | |||
6232 | ||||
6233 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) | |||
6234 | .add(*Op0); | |||
6235 | ||||
6236 | Register NewDest = MRI.createVirtualRegister(DestRC); | |||
6237 | ||||
6238 | MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) | |||
6239 | .addReg(Interm) | |||
6240 | .add(*Op1); | |||
6241 | ||||
6242 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6243 | ||||
6244 | Worklist.insert(&Xor); | |||
6245 | } | |||
6246 | ||||
6247 | void SIInstrInfo::splitScalar64BitBCNT( | |||
6248 | SetVectorType &Worklist, MachineInstr &Inst) const { | |||
6249 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6250 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6251 | ||||
6252 | MachineBasicBlock::iterator MII = Inst; | |||
6253 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6254 | ||||
6255 | MachineOperand &Dest = Inst.getOperand(0); | |||
6256 | MachineOperand &Src = Inst.getOperand(1); | |||
6257 | ||||
6258 | const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); | |||
6259 | const TargetRegisterClass *SrcRC = Src.isReg() ? | |||
6260 | MRI.getRegClass(Src.getReg()) : | |||
6261 | &AMDGPU::SGPR_32RegClass; | |||
6262 | ||||
6263 | Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6264 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6265 | ||||
6266 | const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); | |||
6267 | ||||
6268 | MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, | |||
6269 | AMDGPU::sub0, SrcSubRC); | |||
6270 | MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, | |||
6271 | AMDGPU::sub1, SrcSubRC); | |||
6272 | ||||
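// V_BCNT_U32_B32 computes popcount(src0) + src1, so count the low dword with
// an addend of 0 and then add the high dword's count on top of MidReg.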
6273 | BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); | |||
6274 | ||||
6275 | BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); | |||
6276 | ||||
6277 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6278 | ||||
6279 | // We don't need to legalize operands here. src0 for either instruction can be | |||
6280 | // an SGPR, and the second input is unused or determined here. | |||
6281 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6282 | } | |||
6283 | ||||
6284 | void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, | |||
6285 | MachineInstr &Inst) const { | |||
6286 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6287 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6288 | MachineBasicBlock::iterator MII = Inst; | |||
6289 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6290 | ||||
6291 | MachineOperand &Dest = Inst.getOperand(0); | |||
6292 | uint32_t Imm = Inst.getOperand(2).getImm(); | |||
6293 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. | |||
6294 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. | |||
6295 | ||||
6296 | (void) Offset; | |||
6297 | ||||
6298 | // Only sext_inreg cases handled. | |||
6299 | assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && | |||
6300 | Offset == 0 && "Not implemented"); | |||
6301 | ||||
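// For widths below 32, sign-extend within the low dword with V_BFE_I32 and
// derive the high dword by shifting the sign bit down; for a full 32-bit
// width only the high dword needs the sign replication.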
6302 | if (BitWidth < 32) { | |||
6303 | Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6304 | Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6305 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6306 | ||||
6307 | BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo) | |||
6308 | .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) | |||
6309 | .addImm(0) | |||
6310 | .addImm(BitWidth); | |||
6311 | ||||
6312 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) | |||
6313 | .addImm(31) | |||
6314 | .addReg(MidRegLo); | |||
6315 | ||||
6316 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) | |||
6317 | .addReg(MidRegLo) | |||
6318 | .addImm(AMDGPU::sub0) | |||
6319 | .addReg(MidRegHi) | |||
6320 | .addImm(AMDGPU::sub1); | |||
6321 | ||||
6322 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6323 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6324 | return; | |||
6325 | } | |||
6326 | ||||
6327 | MachineOperand &Src = Inst.getOperand(1); | |||
6328 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6329 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6330 | ||||
6331 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) | |||
6332 | .addImm(31) | |||
6333 | .addReg(Src.getReg(), 0, AMDGPU::sub0); | |||
6334 | ||||
6335 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) | |||
6336 | .addReg(Src.getReg(), 0, AMDGPU::sub0) | |||
6337 | .addImm(AMDGPU::sub0) | |||
6338 | .addReg(TmpReg) | |||
6339 | .addImm(AMDGPU::sub1); | |||
6340 | ||||
6341 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6342 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6343 | } | |||
6344 | ||||
6345 | void SIInstrInfo::addUsersToMoveToVALUWorklist( | |||
6346 | Register DstReg, | |||
6347 | MachineRegisterInfo &MRI, | |||
6348 | SetVectorType &Worklist) const { | |||
6349 | for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), | |||
6350 | E = MRI.use_end(); I != E;) { | |||
6351 | MachineInstr &UseMI = *I->getParent(); | |||
6352 | ||||
6353 | unsigned OpNo = 0; | |||
6354 | ||||
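// Copy-like instructions (COPY, PHI, REG_SEQUENCE, ...) are judged by their
// destination class (operand 0); for any other user, inspect the class of the
// operand that actually reads DstReg.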
6355 | switch (UseMI.getOpcode()) { | |||
6356 | case AMDGPU::COPY: | |||
6357 | case AMDGPU::WQM: | |||
6358 | case AMDGPU::SOFT_WQM: | |||
6359 | case AMDGPU::WWM: | |||
6360 | case AMDGPU::REG_SEQUENCE: | |||
6361 | case AMDGPU::PHI: | |||
6362 | case AMDGPU::INSERT_SUBREG: | |||
6363 | break; | |||
6364 | default: | |||
6365 | OpNo = I.getOperandNo(); | |||
6366 | break; | |||
6367 | } | |||
6368 | ||||
6369 | if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { | |||
6370 | Worklist.insert(&UseMI); | |||
6371 | ||||
6372 | do { | |||
6373 | ++I; | |||
6374 | } while (I != E && I->getParent() == &UseMI); | |||
6375 | } else { | |||
6376 | ++I; | |||
6377 | } | |||
6378 | } | |||
6379 | } | |||
6380 | ||||
6381 | void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, | |||
6382 | MachineRegisterInfo &MRI, | |||
6383 | MachineInstr &Inst) const { | |||
6384 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6385 | MachineBasicBlock *MBB = Inst.getParent(); | |||
6386 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6387 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6388 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6389 | ||||
6390 | switch (Inst.getOpcode()) { | |||
6391 | case AMDGPU::S_PACK_LL_B32_B16: { | |||
6392 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6393 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6394 | ||||
6395 | // FIXME: Can do a lot better if we know the high bits of src0 or src1 are | |||
6396 | // 0. | |||
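// The packed result is (src1 << 16) | (src0 & 0xffff): mask the low half of
// src0 into TmpReg, then shift src1 into the high half and OR it in.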
6397 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
6398 | .addImm(0xffff); | |||
6399 | ||||
6400 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) | |||
6401 | .addReg(ImmReg, RegState::Kill) | |||
6402 | .add(Src0); | |||
6403 | ||||
6404 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) | |||
6405 | .add(Src1) | |||
6406 | .addImm(16) | |||
6407 | .addReg(TmpReg, RegState::Kill); | |||
6408 | break; | |||
6409 | } | |||
6410 | case AMDGPU::S_PACK_LH_B32_B16: { | |||
6411 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6412 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
6413 | .addImm(0xffff); | |||
6414 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg) | |||
6415 | .addReg(ImmReg, RegState::Kill) | |||
6416 | .add(Src0) | |||
6417 | .add(Src1); | |||
6418 | break; | |||
6419 | } | |||
6420 | case AMDGPU::S_PACK_HH_B32_B16: { | |||
6421 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6422 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6423 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) | |||
6424 | .addImm(16) | |||
6425 | .add(Src0); | |||
6426 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
6427 | .addImm(0xffff0000); | |||
6428 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg) | |||
6429 | .add(Src1) | |||
6430 | .addReg(ImmReg, RegState::Kill) | |||
6431 | .addReg(TmpReg, RegState::Kill); | |||
6432 | break; | |||
6433 | } | |||
6434 | default: | |||
6435 | llvm_unreachable("unhandled s_pack_* instruction"); | |||
6436 | } | |||
6437 | ||||
6438 | MachineOperand &Dest = Inst.getOperand(0); | |||
6439 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6440 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6441 | } | |||
6442 | ||||
6443 | void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, | |||
6444 | MachineInstr &SCCDefInst, | |||
6445 | SetVectorType &Worklist) const { | |||
6446 | bool SCCUsedImplicitly = false; | |||
6447 | ||||
6448 | // Ensure that def inst defines SCC, which is still live. | |||
6449 | assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && | |||
6450 | !Op.isDead() && Op.getParent() == &SCCDefInst); | |||
6451 | SmallVector<MachineInstr *, 4> CopyToDelete; | |||
6452 | // This assumes that all the users of SCC are in the same block | |||
6453 | // as the SCC def. | |||
6454 | for (MachineInstr &MI : // Skip the def inst itself. | |||
6455 | make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), | |||
6456 | SCCDefInst.getParent()->end())) { | |||
6457 | // Check if SCC is used first. | |||
6458 | if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) { | |||
6459 | if (MI.isCopy()) { | |||
6460 | MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
6461 | Register DestReg = MI.getOperand(0).getReg(); | |||
6462 | ||||
6463 | for (auto &User : MRI.use_nodbg_instructions(DestReg)) { | |||
6464 | if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) || | |||
6465 | (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) { | |||
6466 | User.getOperand(4).setReg(RI.getVCC()); | |||
6467 | Worklist.insert(&User); | |||
6468 | } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) { | |||
6469 | User.getOperand(5).setReg(RI.getVCC()); | |||
6470 | // No need to add to Worklist. | |||
6471 | } | |||
6472 | } | |||
6473 | CopyToDelete.push_back(&MI); | |||
6474 | } else { | |||
6475 | if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 || | |||
6476 | MI.getOpcode() == AMDGPU::S_CSELECT_B64) { | |||
6477 | // This is an implicit use of SCC and it is really expected by | |||
6478 | // the SCC users to handle. | |||
6479 | // We cannot preserve the edge to the user so add the explicit | |||
6480 | // copy: SCC = COPY VCC. | |||
6481 | // The copy will be cleaned up during the processing of the user | |||
6482 | // in lowerSelect. | |||
6483 | SCCUsedImplicitly = true; | |||
6484 | } | |||
6485 | ||||
6486 | Worklist.insert(&MI); | |||
6487 | } | |||
6488 | } | |||
6489 | // Exit if we find another SCC def. | |||
6490 | if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) | |||
6491 | break; | |||
6492 | } | |||
6493 | for (auto &Copy : CopyToDelete) | |||
6494 | Copy->eraseFromParent(); | |||
6495 | ||||
6496 | if (SCCUsedImplicitly) { | |||
6497 | BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()), | |||
6498 | SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC) | |||
6499 | .addReg(RI.getVCC()); | |||
6500 | } | |||
6501 | } | |||
6502 | ||||
6503 | const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( | |||
6504 | const MachineInstr &Inst) const { | |||
6505 | const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); | |||
6506 | ||||
6507 | switch (Inst.getOpcode()) { | |||
6508 | // For target instructions, getOpRegClass just returns the virtual register | |||
6509 | // class associated with the operand, so we need to find an equivalent VGPR | |||
6510 | // register class in order to move the instruction to the VALU. | |||
6511 | case AMDGPU::COPY: | |||
6512 | case AMDGPU::PHI: | |||
6513 | case AMDGPU::REG_SEQUENCE: | |||
6514 | case AMDGPU::INSERT_SUBREG: | |||
6515 | case AMDGPU::WQM: | |||
6516 | case AMDGPU::SOFT_WQM: | |||
6517 | case AMDGPU::WWM: { | |||
6518 | const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); | |||
6519 | if (RI.hasAGPRs(SrcRC)) { | |||
6520 | if (RI.hasAGPRs(NewDstRC)) | |||
6521 | return nullptr; | |||
6522 | ||||
6523 | switch (Inst.getOpcode()) { | |||
6524 | case AMDGPU::PHI: | |||
6525 | case AMDGPU::REG_SEQUENCE: | |||
6526 | case AMDGPU::INSERT_SUBREG: | |||
6527 | NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); | |||
6528 | break; | |||
6529 | default: | |||
6530 | NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); | |||
6531 | } | |||
6532 | ||||
6533 | if (!NewDstRC) | |||
6534 | return nullptr; | |||
6535 | } else { | |||
6536 | if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) | |||
6537 | return nullptr; | |||
6538 | ||||
6539 | NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); | |||
6540 | if (!NewDstRC) | |||
6541 | return nullptr; | |||
6542 | } | |||
6543 | ||||
6544 | return NewDstRC; | |||
6545 | } | |||
6546 | default: | |||
6547 | return NewDstRC; | |||
6548 | } | |||
6549 | } | |||
6550 | ||||
6551 | // Find the one SGPR operand we are allowed to use. | |||
6552 | Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, | |||
6553 | int OpIndices[3]) const { | |||
6554 | const MCInstrDesc &Desc = MI.getDesc(); | |||
6555 | ||||
6556 | // Find the one SGPR operand we are allowed to use. | |||
6557 | // | |||
6558 | // First we need to consider the instruction's operand requirements before | |||
6559 | // legalizing. Some operands are required to be SGPRs, such as implicit uses | |||
6560 | // of VCC, but we are still bound by the constant bus requirement to only use | |||
6561 | // one. | |||
6562 | // | |||
6563 | // If the operand's class is an SGPR, we can never move it. | |||
6564 | ||||
6565 | Register SGPRReg = findImplicitSGPRRead(MI); | |||
6566 | if (SGPRReg != AMDGPU::NoRegister) | |||
6567 | return SGPRReg; | |||
6568 | ||||
6569 | Register UsedSGPRs[3] = { AMDGPU::NoRegister }; | |||
6570 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
6571 | ||||
6572 | for (unsigned i = 0; i < 3; ++i) { | |||
6573 | int Idx = OpIndices[i]; | |||
6574 | if (Idx == -1) | |||
6575 | break; | |||
6576 | ||||
6577 | const MachineOperand &MO = MI.getOperand(Idx); | |||
6578 | if (!MO.isReg()) | |||
6579 | continue; | |||
6580 | ||||
6581 | // Is this operand statically required to be an SGPR based on the operand | |||
6582 | // constraints? | |||
6583 | const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); | |||
6584 | bool IsRequiredSGPR = RI.isSGPRClass(OpRC); | |||
6585 | if (IsRequiredSGPR) | |||
6586 | return MO.getReg(); | |||
6587 | ||||
6588 | // If this could be a VGPR or an SGPR, check the dynamic register class. | |||
6589 | Register Reg = MO.getReg(); | |||
6590 | const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); | |||
6591 | if (RI.isSGPRClass(RegRC)) | |||
6592 | UsedSGPRs[i] = Reg; | |||
6593 | } | |||
6594 | ||||
6595 | // We don't have a required SGPR operand, so we have a bit more freedom in | |||
6596 | // selecting operands to move. | |||
6597 | ||||
6598 | // Try to select the most used SGPR. If an SGPR is equal to one of the | |||
6599 | // others, we choose that. | |||
6600 | // | |||
6601 | // e.g. | |||
6602 | // V_FMA_F32 v0, s0, s0, s0 -> No moves | |||
6603 | // V_FMA_F32 v0, s0, s1, s0 -> Move s1 | |||
6604 | ||||
6605 | // TODO: If some of the operands are 64-bit SGPRs and some 32, we should | |||
6606 | // prefer those. | |||
6607 | ||||
6608 | if (UsedSGPRs[0] != AMDGPU::NoRegister) { | |||
6609 | if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) | |||
6610 | SGPRReg = UsedSGPRs[0]; | |||
6611 | } | |||
6612 | ||||
6613 | if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { | |||
6614 | if (UsedSGPRs[1] == UsedSGPRs[2]) | |||
6615 | SGPRReg = UsedSGPRs[1]; | |||
6616 | } | |||
6617 | ||||
6618 | return SGPRReg; | |||
6619 | } | |||
6620 | ||||
6621 | MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, | |||
6622 | unsigned OperandName) const { | |||
6623 | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); | |||
6624 | if (Idx == -1) | |||
6625 | return nullptr; | |||
6626 | ||||
6627 | return &MI.getOperand(Idx); | |||
6628 | } | |||
6629 | ||||
6630 | uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { | |||
6631 | if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
6632 | return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) | | |||
6633 | (1ULL << 56) | // RESOURCE_LEVEL = 1 | |||
6634 | (3ULL << 60); // OOB_SELECT = 3 | |||
6635 | } | |||
6636 | ||||
6637 | uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; | |||
6638 | if (ST.isAmdHsaOS()) { | |||
6639 | // Set ATC = 1. GFX9 doesn't have this bit. | |||
6640 | if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) | |||
6641 | RsrcDataFormat |= (1ULL << 56); | |||
6642 | ||||
6643 | // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. | |||
6644 | // BTW, it disables TC L2 and therefore decreases performance. | |||
6645 | if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) | |||
6646 | RsrcDataFormat |= (2ULL << 59); | |||
6647 | } | |||
6648 | ||||
6649 | return RsrcDataFormat; | |||
6650 | } | |||
6651 | ||||
6652 | uint64_t SIInstrInfo::getScratchRsrcWords23() const { | |||
6653 | uint64_t Rsrc23 = getDefaultRsrcDataFormat() | | |||
6654 | AMDGPU::RSRC_TID_ENABLE | | |||
6655 | 0xffffffff; // Size; | |||
6656 | ||||
6657 | // GFX9 doesn't have ELEMENT_SIZE. | |||
6658 | if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { | |||
6659 | uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1; | |||
6660 | Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; | |||
6661 | } | |||
6662 | ||||
6663 | // IndexStride = 64 / 32. | |||
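// The encoded index-stride field selects the stride in threads: 2 for 32 and
// 3 for 64, matching the wavefront size.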
6664 | uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; | |||
6665 | Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; | |||
6666 | ||||
6667 | // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. | |||
6668 | // Clear them unless we want a huge stride. | |||
6669 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && | |||
6670 | ST.getGeneration() <= AMDGPUSubtarget::GFX9) | |||
6671 | Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; | |||
6672 | ||||
6673 | return Rsrc23; | |||
6674 | } | |||
6675 | ||||
6676 | bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { | |||
6677 | unsigned Opc = MI.getOpcode(); | |||
6678 | ||||
6679 | return isSMRD(Opc); | |||
6680 | } | |||
6681 | ||||
6682 | bool SIInstrInfo::isHighLatencyDef(int Opc) const { | |||
6683 | return get(Opc).mayLoad() && | |||
6684 | (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); | |||
6685 | } | |||
6686 | ||||
6687 | unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, | |||
6688 | int &FrameIndex) const { | |||
6689 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); | |||
6690 | if (!Addr || !Addr->isFI()) | |||
6691 | return AMDGPU::NoRegister; | |||
6692 | ||||
6693 | assert(!MI.memoperands_empty() && | |||
6694 | (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); | |||
6695 | ||||
6696 | FrameIndex = Addr->getIndex(); | |||
6697 | return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); | |||
6698 | } | |||
6699 | ||||
6700 | unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, | |||
6701 | int &FrameIndex) const { | |||
6702 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); | |||
6703 | assert(Addr && Addr->isFI()); | |||
6704 | FrameIndex = Addr->getIndex(); | |||
6705 | return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); | |||
6706 | } | |||
6707 | ||||
6708 | unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, | |||
6709 | int &FrameIndex) const { | |||
6710 | if (!MI.mayLoad()) | |||
6711 | return AMDGPU::NoRegister; | |||
6712 | ||||
6713 | if (isMUBUF(MI) || isVGPRSpill(MI)) | |||
6714 | return isStackAccess(MI, FrameIndex); | |||
6715 | ||||
6716 | if (isSGPRSpill(MI)) | |||
6717 | return isSGPRStackAccess(MI, FrameIndex); | |||
6718 | ||||
6719 | return AMDGPU::NoRegister; | |||
6720 | } | |||
6721 | ||||
6722 | unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, | |||
6723 | int &FrameIndex) const { | |||
6724 | if (!MI.mayStore()) | |||
6725 | return AMDGPU::NoRegister; | |||
6726 | ||||
6727 | if (isMUBUF(MI) || isVGPRSpill(MI)) | |||
6728 | return isStackAccess(MI, FrameIndex); | |||
6729 | ||||
6730 | if (isSGPRSpill(MI)) | |||
6731 | return isSGPRStackAccess(MI, FrameIndex); | |||
6732 | ||||
6733 | return AMDGPU::NoRegister; | |||
6734 | } | |||
6735 | ||||
6736 | unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { | |||
6737 | unsigned Size = 0; | |||
6738 | MachineBasicBlock::const_instr_iterator I = MI.getIterator(); | |||
6739 | MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); | |||
6740 | while (++I != E && I->isInsideBundle()) { | |||
6741 | assert(!I->isBundle() && "No nested bundle!"); | |||
6742 | Size += getInstSizeInBytes(*I); | |||
6743 | } | |||
6744 | ||||
6745 | return Size; | |||
6746 | } | |||
6747 | ||||
6748 | unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { | |||
6749 | unsigned Opc = MI.getOpcode(); | |||
6750 | const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); | |||
6751 | unsigned DescSize = Desc.getSize(); | |||
6752 | ||||
6753 | // If we have a definitive size, we can use it. Otherwise we need to inspect | |||
6754 | // the operands to know the size. | |||
6755 | if (isFixedSize(MI)) { | |||
6756 | unsigned Size = DescSize; | |||
6757 | ||||
6758 | // If we hit the buggy offset, an extra nop will be inserted in MC so | |||
6759 | // estimate the worst case. | |||
6760 | if (MI.isBranch() && ST.hasOffset3fBug()) | |||
6761 | Size += 4; | |||
6762 | ||||
6763 | return Size; | |||
6764 | } | |||
6765 | ||||
6766 | // 4-byte instructions may have a 32-bit literal encoded after them. Check | |||
6767 | // operands that could ever be literals. | |||
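// A trailing literal adds 4 bytes; a VOP3 encoding is 8 bytes, so a VOP3 with
// a literal is reported as 12 bytes below.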
6768 | if (isVALU(MI) || isSALU(MI)) { | |||
6769 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
6770 | if (Src0Idx == -1) | |||
6771 | return DescSize; // No operands. | |||
6772 | ||||
6773 | if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) | |||
6774 | return isVOP3(MI) ? 12 : (DescSize + 4); | |||
6775 | ||||
6776 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
6777 | if (Src1Idx == -1) | |||
6778 | return DescSize; | |||
6779 | ||||
6780 | if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) | |||
6781 | return isVOP3(MI) ? 12 : (DescSize + 4); | |||
6782 | ||||
6783 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); | |||
6784 | if (Src2Idx == -1) | |||
6785 | return DescSize; | |||
6786 | ||||
6787 | if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) | |||
6788 | return isVOP3(MI) ? 12 : (DescSize + 4); | |||
6789 | ||||
6790 | return DescSize; | |||
6791 | } | |||
6792 | ||||
6793 | // Check whether we have extra NSA words. | |||
6794 | if (isMIMG(MI)) { | |||
6795 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); | |||
6796 | if (VAddr0Idx < 0) | |||
6797 | return 8; | |||
6798 | ||||
6799 | int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); | |||
6800 | return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); | |||
6801 | } | |||
6802 | ||||
6803 | switch (Opc) { | |||
6804 | case TargetOpcode::IMPLICIT_DEF: | |||
6805 | case TargetOpcode::KILL: | |||
6806 | case TargetOpcode::DBG_VALUE: | |||
6807 | case TargetOpcode::EH_LABEL: | |||
6808 | return 0; | |||
6809 | case TargetOpcode::BUNDLE: | |||
6810 | return getInstBundleSize(MI); | |||
6811 | case TargetOpcode::INLINEASM: | |||
6812 | case TargetOpcode::INLINEASM_BR: { | |||
6813 | const MachineFunction *MF = MI.getParent()->getParent(); | |||
6814 | const char *AsmStr = MI.getOperand(0).getSymbolName(); | |||
6815 | return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); | |||
6816 | } | |||
6817 | default: | |||
6818 | return DescSize; | |||
6819 | } | |||
6820 | } | |||
6821 | ||||
6822 | bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { | |||
6823 | if (!isFLAT(MI)) | |||
6824 | return false; | |||
6825 | ||||
6826 | if (MI.memoperands_empty()) | |||
6827 | return true; | |||
6828 | ||||
6829 | for (const MachineMemOperand *MMO : MI.memoperands()) { | |||
6830 | if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) | |||
6831 | return true; | |||
6832 | } | |||
6833 | return false; | |||
6834 | } | |||
6835 | ||||
6836 | bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { | |||
6837 | return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; | |||
6838 | } | |||
6839 | ||||
6840 | void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, | |||
6841 | MachineBasicBlock *IfEnd) const { | |||
6842 | MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); | |||
6843 | assert(TI != IfEntry->end()); | |||
6844 | ||||
6845 | MachineInstr *Branch = &(*TI); | |||
6846 | MachineFunction *MF = IfEntry->getParent(); | |||
6847 | MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); | |||
6848 | ||||
6849 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
6850 | Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
6851 | MachineInstr *SIIF = | |||
6852 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) | |||
6853 | .add(Branch->getOperand(0)) | |||
6854 | .add(Branch->getOperand(1)); | |||
6855 | MachineInstr *SIEND = | |||
6856 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) | |||
6857 | .addReg(DstReg); | |||
6858 | ||||
6859 | IfEntry->erase(TI); | |||
6860 | IfEntry->insert(IfEntry->end(), SIIF); | |||
6861 | IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); | |||
6862 | } | |||
6863 | } | |||
6864 | ||||
6865 | void SIInstrInfo::convertNonUniformLoopRegion( | |||
6866 | MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { | |||
6867 | MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); | |||
6868 | // We expect 2 terminators, one conditional and one unconditional. | |||
6869 | assert(TI != LoopEnd->end()); | |||
6870 | ||||
6871 | MachineInstr *Branch = &(*TI); | |||
6872 | MachineFunction *MF = LoopEnd->getParent(); | |||
6873 | MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); | |||
6874 | ||||
6875 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
6876 | ||||
6877 | Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
6878 | Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
6879 | MachineInstrBuilder HeaderPHIBuilder = | |||
6880 | BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); | |||
6881 | for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), | |||
6882 | E = LoopEntry->pred_end(); | |||
6883 | PI != E; ++PI) { | |||
6884 | if (*PI == LoopEnd) { | |||
6885 | HeaderPHIBuilder.addReg(BackEdgeReg); | |||
6886 | } else { | |||
6887 | MachineBasicBlock *PMBB = *PI; | |||
6888 | Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
6889 | materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), | |||
6890 | ZeroReg, 0); | |||
6891 | HeaderPHIBuilder.addReg(ZeroReg); | |||
6892 | } | |||
6893 | HeaderPHIBuilder.addMBB(*PI); | |||
6894 | } | |||
6895 | MachineInstr *HeaderPhi = HeaderPHIBuilder; | |||
6896 | MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), | |||
6897 | get(AMDGPU::SI_IF_BREAK), BackEdgeReg) | |||
6898 | .addReg(DstReg) | |||
6899 | .add(Branch->getOperand(0)); | |||
6900 | MachineInstr *SILOOP = | |||
6901 | BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) | |||
6902 | .addReg(BackEdgeReg) | |||
6903 | .addMBB(LoopEntry); | |||
6904 | ||||
6905 | LoopEntry->insert(LoopEntry->begin(), HeaderPhi); | |||
6906 | LoopEnd->erase(TI); | |||
6907 | LoopEnd->insert(LoopEnd->end(), SIIFBREAK); | |||
6908 | LoopEnd->insert(LoopEnd->end(), SILOOP); | |||
6909 | } | |||
6910 | } | |||
6911 | ||||
6912 | ArrayRef<std::pair<int, const char *>> | |||
6913 | SIInstrInfo::getSerializableTargetIndices() const { | |||
6914 | static const std::pair<int, const char *> TargetIndices[] = { | |||
6915 | {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, | |||
6916 | {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, | |||
6917 | {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, | |||
6918 | {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, | |||
6919 | {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; | |||
6920 | return makeArrayRef(TargetIndices); | |||
6921 | } | |||
6922 | ||||
6923 | /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The | |||
6924 | /// post-RA version of misched uses CreateTargetMIHazardRecognizer. | |||
6925 | ScheduleHazardRecognizer * | |||
6926 | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, | |||
6927 | const ScheduleDAG *DAG) const { | |||
6928 | return new GCNHazardRecognizer(DAG->MF); | |||
6929 | } | |||
6930 | ||||
6931 | /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer | |||
6932 | /// pass. | |||
6933 | ScheduleHazardRecognizer * | |||
6934 | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { | |||
6935 | return new GCNHazardRecognizer(MF); | |||
6936 | } | |||
6937 | ||||
6938 | std::pair<unsigned, unsigned> | |||
6939 | SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { | |||
6940 | return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); | |||
6941 | } | |||
6942 | ||||
6943 | ArrayRef<std::pair<unsigned, const char *>> | |||
6944 | SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { | |||
6945 | static const std::pair<unsigned, const char *> TargetFlags[] = { | |||
6946 | { MO_GOTPCREL, "amdgpu-gotprel" }, | |||
6947 | { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, | |||
6948 | { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, | |||
6949 | { MO_REL32_LO, "amdgpu-rel32-lo" }, | |||
6950 | { MO_REL32_HI, "amdgpu-rel32-hi" }, | |||
6951 | { MO_ABS32_LO, "amdgpu-abs32-lo" }, | |||
6952 | { MO_ABS32_HI, "amdgpu-abs32-hi" }, | |||
6953 | }; | |||
6954 | ||||
6955 | return makeArrayRef(TargetFlags); | |||
6956 | } | |||
6957 | ||||
6958 | bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { | |||
6959 | return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && | |||
6960 | MI.modifiesRegister(AMDGPU::EXEC, &RI); | |||
6961 | } | |||
6962 | ||||
6963 | MachineInstrBuilder | |||
6964 | SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, | |||
6965 | MachineBasicBlock::iterator I, | |||
6966 | const DebugLoc &DL, | |||
6967 | Register DestReg) const { | |||
6968 | if (ST.hasAddNoCarry()) | |||
6969 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); | |||
6970 | ||||
6971 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6972 | Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); | |||
6973 | MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); | |||
6974 | ||||
6975 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) | |||
6976 | .addReg(UnusedCarry, RegState::Define | RegState::Dead); | |||
6977 | } | |||
6978 | ||||
6979 | MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, | |||
6980 | MachineBasicBlock::iterator I, | |||
6981 | const DebugLoc &DL, | |||
6982 | Register DestReg, | |||
6983 | RegScavenger &RS) const { | |||
6984 | if (ST.hasAddNoCarry()) | |||
6985 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); | |||
6986 | ||||
6987 | // If available, prefer to use vcc. | |||
6988 | Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) | |||
6989 | ? Register(RI.getVCC()) | |||
6990 | : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); | |||
6991 | ||||
6992 | // TODO: Users need to deal with this. | |||
6993 | if (!UnusedCarry.isValid()) | |||
6994 | return MachineInstrBuilder(); | |||
6995 | ||||
6996 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) | |||
6997 | .addReg(UnusedCarry, RegState::Define | RegState::Dead); | |||
6998 | } | |||
6999 | ||||
7000 | bool SIInstrInfo::isKillTerminator(unsigned Opcode) { | |||
7001 | switch (Opcode) { | |||
7002 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: | |||
7003 | case AMDGPU::SI_KILL_I1_TERMINATOR: | |||
7004 | return true; | |||
7005 | default: | |||
7006 | return false; | |||
7007 | } | |||
7008 | } | |||
7009 | ||||
7010 | const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { | |||
7011 | switch (Opcode) { | |||
7012 | case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: | |||
7013 | return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); | |||
7014 | case AMDGPU::SI_KILL_I1_PSEUDO: | |||
7015 | return get(AMDGPU::SI_KILL_I1_TERMINATOR); | |||
7016 | default: | |||
7017 | llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); | |||
7018 | } | |||
7019 | } | |||
7020 | ||||
7021 | void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { | |||
7022 | if (!ST.isWave32()) | |||
7023 | return; | |||
7024 | ||||
7025 | for (auto &Op : MI.implicit_operands()) { | |||
7026 | if (Op.isReg() && Op.getReg() == AMDGPU::VCC) | |||
7027 | Op.setReg(AMDGPU::VCC_LO); | |||
7028 | } | |||
7029 | } | |||
7030 | ||||
7031 | bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { | |||
7032 | if (!isSMRD(MI)) | |||
7033 | return false; | |||
7034 | ||||
7035 | // Check that it is using a buffer resource. | |||
7036 | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); | |||
7037 | if (Idx == -1) // e.g. s_memtime | |||
7038 | return false; | |||
7039 | ||||
7040 | const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; | |||
7041 | return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); | |||
7042 | } | |||
7043 | ||||
7044 | bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, | |||
7045 | bool Signed) const { | |||
7046 | // TODO: Should 0 be special cased? | |||
7047 | if (!ST.hasFlatInstOffsets()) | |||
7048 | return false; | |||
7049 | ||||
7050 | if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS) | |||
7051 | return false; | |||
7052 | ||||
7053 | unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed); | |||
7054 | return Signed ? isIntN(N, Offset) : isUIntN(N, Offset); | |||
7055 | } | |||
7056 | ||||
7057 | std::pair<int64_t, int64_t> SIInstrInfo::splitFlatOffset(int64_t COffsetVal, | |||
7058 | unsigned AddrSpace, | |||
7059 | bool IsSigned) const { | |||
7060 | int64_t RemainderOffset = COffsetVal; | |||
7061 | int64_t ImmField = 0; | |||
7062 | const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, IsSigned); | |||
7063 | if (IsSigned) { | |||
7064 | // Use signed division by a power of two to truncate towards 0. | |||
7065 | int64_t D = 1LL << (NumBits - 1); | |||
7066 | RemainderOffset = (COffsetVal / D) * D; | |||
7067 | ImmField = COffsetVal - RemainderOffset; | |||
7068 | } else if (COffsetVal >= 0) { | |||
7069 | ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); | |||
7070 | RemainderOffset = COffsetVal - ImmField; | |||
7071 | } | |||
7072 | ||||
7073 | assert(isLegalFLATOffset(ImmField, AddrSpace, IsSigned)); | |||
7074 | assert(RemainderOffset + ImmField == COffsetVal); | |||
7075 | return {ImmField, RemainderOffset}; | |||
7076 | } | |||
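A worked example of the signed split above, using an assumed 12-bit immediate field: for COffsetVal = 4660, D = 1 << 11 = 2048, so RemainderOffset = (4660 / 2048) * 2048 = 4096 and ImmField = 564, which fits the signed field and sums back to the original offset. The standalone sketch below reproduces just that arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned NumBits = 12;                     // assumed field width
      const int64_t COffsetVal = 4660;
      const int64_t D = INT64_C(1) << (NumBits - 1);
      int64_t RemainderOffset = (COffsetVal / D) * D;  // truncates toward 0
      int64_t ImmField = COffsetVal - RemainderOffset;
      assert(RemainderOffset == 4096 && ImmField == 564);
      assert(RemainderOffset + ImmField == COffsetVal);
    }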
7077 | ||||
7078 | // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td | |||
7079 | enum SIEncodingFamily { | |||
7080 | SI = 0, | |||
7081 | VI = 1, | |||
7082 | SDWA = 2, | |||
7083 | SDWA9 = 3, | |||
7084 | GFX80 = 4, | |||
7085 | GFX9 = 5, | |||
7086 | GFX10 = 6, | |||
7087 | SDWA10 = 7 | |||
7088 | }; | |||
7089 | ||||
7090 | static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { | |||
7091 | switch (ST.getGeneration()) { | |||
7092 | default: | |||
7093 | break; | |||
7094 | case AMDGPUSubtarget::SOUTHERN_ISLANDS: | |||
7095 | case AMDGPUSubtarget::SEA_ISLANDS: | |||
7096 | return SIEncodingFamily::SI; | |||
7097 | case AMDGPUSubtarget::VOLCANIC_ISLANDS: | |||
7098 | case AMDGPUSubtarget::GFX9: | |||
7099 | return SIEncodingFamily::VI; | |||
7100 | case AMDGPUSubtarget::GFX10: | |||
7101 | return SIEncodingFamily::GFX10; | |||
7102 | } | |||
7103 | llvm_unreachable("Unknown subtarget generation!")::llvm::llvm_unreachable_internal("Unknown subtarget generation!" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7103); | |||
7104 | } | |||
7105 | ||||
7106 | bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { | |||
7107 | switch(MCOp) { | |||
7108 | // These opcodes use indirect register addressing so | |||
7109 | // they need special handling by codegen (currently missing). | |||
7110 | // Therefore it is too risky to allow these opcodes | |||
7111 | // to be selected by the DPP combiner or the SDWA peephole pass. | |||
7112 | case AMDGPU::V_MOVRELS_B32_dpp_gfx10: | |||
7113 | case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: | |||
7114 | case AMDGPU::V_MOVRELD_B32_dpp_gfx10: | |||
7115 | case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: | |||
7116 | case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: | |||
7117 | case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: | |||
7118 | case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: | |||
7119 | case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: | |||
7120 | return true; | |||
7121 | default: | |||
7122 | return false; | |||
7123 | } | |||
7124 | } | |||
7125 | ||||
7126 | int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { | |||
7127 | SIEncodingFamily Gen = subtargetEncodingFamily(ST); | |||
7128 | ||||
7129 | if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && | |||
7130 | ST.getGeneration() == AMDGPUSubtarget::GFX9) | |||
7131 | Gen = SIEncodingFamily::GFX9; | |||
7132 | ||||
7133 | // Adjust the encoding family to GFX80 for D16 buffer instructions when the | |||
7134 | // subtarget has UnpackedD16VMem feature. | |||
7135 | // TODO: remove this when we discard GFX80 encoding. | |||
7136 | if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) | |||
7137 | Gen = SIEncodingFamily::GFX80; | |||
7138 | ||||
7139 | if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { | |||
7140 | switch (ST.getGeneration()) { | |||
7141 | default: | |||
7142 | Gen = SIEncodingFamily::SDWA; | |||
7143 | break; | |||
7144 | case AMDGPUSubtarget::GFX9: | |||
7145 | Gen = SIEncodingFamily::SDWA9; | |||
7146 | break; | |||
7147 | case AMDGPUSubtarget::GFX10: | |||
7148 | Gen = SIEncodingFamily::SDWA10; | |||
7149 | break; | |||
7150 | } | |||
7151 | } | |||
7152 | ||||
7153 | int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); | |||
7154 | ||||
7155 | // -1 means that Opcode is already a native instruction. | |||
7156 | if (MCOp == -1) | |||
7157 | return Opcode; | |||
7158 | ||||
7159 | // (uint16_t)-1 means that Opcode is a pseudo instruction that has | |||
7160 | // no encoding in the given subtarget generation. | |||
7161 | if (MCOp == (uint16_t)-1) | |||
7162 | return -1; | |||
7163 | ||||
7164 | if (isAsmOnlyOpcode(MCOp)) | |||
7165 | return -1; | |||
7166 | ||||
7167 | return MCOp; | |||
7168 | } | |||
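The two sentinel comparisons above distinguish a table miss from a missing encoding: getMCOpcode yields -1 when the opcode is already native, while a (uint16_t)-1 table entry, widened to 0xFFFF, means the pseudo has no encoding in the chosen family. A standalone sketch of just that classification:

    #include <cassert>
    #include <cstdint>

    // 0 = keep the pseudo opcode, 1 = no encoding in this family, 2 = real MC opcode.
    static int classify(int MCOp) {
      if (MCOp == -1)
        return 0;
      if (MCOp == (uint16_t)-1)
        return 1;
      return 2;
    }

    int main() {
      assert(classify(-1) == 0);
      assert(classify(0xFFFF) == 1);
      assert(classify(42) == 2);
    }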
7169 | ||||
7170 | static | |||
7171 | TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { | |||
7172 | assert(RegOpnd.isReg()); | |||
7173 | return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : | |||
7174 | getRegSubRegPair(RegOpnd); | |||
7175 | } | |||
7176 | ||||
7177 | TargetInstrInfo::RegSubRegPair | |||
7178 | llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { | |||
7179 | assert(MI.isRegSequence()); | |||
7180 | for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) | |||
7181 | if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { | |||
7182 | auto &RegOp = MI.getOperand(1 + 2 * I); | |||
7183 | return getRegOrUndef(RegOp); | |||
7184 | } | |||
7185 | return TargetInstrInfo::RegSubRegPair(); | |||
7186 | } | |||
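For reference, the operand layout the loop above relies on, shown with an illustrative (not file-derived) REG_SEQUENCE: operand 0 is the def and the remaining operands come in value/subindex pairs, so operand 1 + 2*I is the I-th source register and operand 1 + 2*I + 1 its subregister index.

    %3:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, %2:vgpr_32, %subreg.sub1

Here a query for sub1 returns the pair {%2, no subregister}, while a subregister index that never appears yields an empty RegSubRegPair.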
7187 | ||||
7188 | // Try to find the definition of reg:subreg in subreg-manipulation pseudos. | |||
7189 | // Following a subreg of reg:subreg isn't supported. | |||
7190 | static bool followSubRegDef(MachineInstr &MI, | |||
7191 | TargetInstrInfo::RegSubRegPair &RSR) { | |||
7192 | if (!RSR.SubReg) | |||
7193 | return false; | |||
7194 | switch (MI.getOpcode()) { | |||
7195 | default: break; | |||
7196 | case AMDGPU::REG_SEQUENCE: | |||
7197 | RSR = getRegSequenceSubReg(MI, RSR.SubReg); | |||
7198 | return true; | |||
7199 | // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg. | |||
7200 | case AMDGPU::INSERT_SUBREG: | |||
7201 | if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) | |||
7202 | // inserted the subreg we're looking for | |||
7203 | RSR = getRegOrUndef(MI.getOperand(2)); | |||
7204 | else { // the subreg in the rest of the reg | |||
7205 | auto R1 = getRegOrUndef(MI.getOperand(1)); | |||
7206 | if (R1.SubReg) // subreg of subreg isn't supported | |||
7207 | return false; | |||
7208 | RSR.Reg = R1.Reg; | |||
7209 | } | |||
7210 | return true; | |||
7211 | } | |||
7212 | return false; | |||
7213 | } | |||
7214 | ||||
7215 | MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, | |||
7216 | MachineRegisterInfo &MRI) { | |||
7217 | assert(MRI.isSSA()); | |||
7218 | if (!P.Reg.isVirtual()) | |||
7219 | return nullptr; | |||
7220 | ||||
7221 | auto RSR = P; | |||
7222 | auto *DefInst = MRI.getVRegDef(RSR.Reg); | |||
7223 | while (auto *MI = DefInst) { | |||
7224 | DefInst = nullptr; | |||
7225 | switch (MI->getOpcode()) { | |||
7226 | case AMDGPU::COPY: | |||
7227 | case AMDGPU::V_MOV_B32_e32: { | |||
7228 | auto &Op1 = MI->getOperand(1); | |||
7229 | if (Op1.isReg() && Op1.getReg().isVirtual()) { | |||
7230 | if (Op1.isUndef()) | |||
7231 | return nullptr; | |||
7232 | RSR = getRegSubRegPair(Op1); | |||
7233 | DefInst = MRI.getVRegDef(RSR.Reg); | |||
7234 | } | |||
7235 | break; | |||
7236 | } | |||
7237 | default: | |||
7238 | if (followSubRegDef(*MI, RSR)) { | |||
7239 | if (!RSR.Reg) | |||
7240 | return nullptr; | |||
7241 | DefInst = MRI.getVRegDef(RSR.Reg); | |||
7242 | } | |||
7243 | } | |||
7244 | if (!DefInst) | |||
7245 | return MI; | |||
7246 | } | |||
7247 | return nullptr; | |||
7248 | } | |||
7249 | ||||
7250 | bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, | |||
7251 | Register VReg, | |||
7252 | const MachineInstr &DefMI, | |||
7253 | const MachineInstr &UseMI) { | |||
7254 | assert(MRI.isSSA() && "Must be run on SSA")((MRI.isSSA() && "Must be run on SSA") ? static_cast< void> (0) : __assert_fail ("MRI.isSSA() && \"Must be run on SSA\"" , "/build/llvm-toolchain-snapshot-13~++20210216111115+df22133a8a40/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7254, __PRETTY_FUNCTION__)); | |||
7255 | ||||
7256 | auto *TRI = MRI.getTargetRegisterInfo(); | |||
7257 | auto *DefBB = DefMI.getParent(); | |||
7258 | ||||
7259 | // Don't bother searching between blocks, although it is possible this block | |||
7260 | // doesn't modify exec. | |||
7261 | if (UseMI.getParent() != DefBB) | |||
7262 | return true; | |||
7263 | ||||
7264 | const int MaxInstScan = 20; | |||
7265 | int NumInst = 0; | |||
7266 | ||||
7267 | // Stop scan at the use. | |||
7268 | auto E = UseMI.getIterator(); | |||
7269 | for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { | |||
7270 | if (I->isDebugInstr()) | |||
7271 | continue; | |||
7272 | ||||
7273 | if (++NumInst > MaxInstScan) | |||
7274 | return true; | |||
7275 | ||||
7276 | if (I->modifiesRegister(AMDGPU::EXEC, TRI)) | |||
7277 | return true; | |||
7278 | } | |||
7279 | ||||
7280 | return false; | |||
7281 | } | |||
7282 | ||||
7283 | bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, | |||
7284 | Register VReg, | |||
7285 | const MachineInstr &DefMI) { | |||
7286 | assert(MRI.isSSA() && "Must be run on SSA"); | |||
7287 | ||||
7288 | auto *TRI = MRI.getTargetRegisterInfo(); | |||
7289 | auto *DefBB = DefMI.getParent(); | |||
7290 | ||||
7291 | const int MaxUseScan = 10; | |||
7292 | int NumUse = 0; | |||
7293 | ||||
7294 | for (auto &Use : MRI.use_nodbg_operands(VReg)) { | |||
7295 | auto &UseInst = *Use.getParent(); | |||
7296 | // Don't bother searching between blocks, although it is possible this block | |||
7297 | // doesn't modify exec. | |||
7298 | if (UseInst.getParent() != DefBB) | |||
7299 | return true; | |||
7300 | ||||
7301 | if (++NumUse > MaxUseScan) | |||
7302 | return true; | |||
7303 | } | |||
7304 | ||||
7305 | if (NumUse == 0) | |||
7306 | return false; | |||
7307 | ||||
7308 | const int MaxInstScan = 20; | |||
7309 | int NumInst = 0; | |||
7310 | ||||
7311 | // Stop scan when we have seen all the uses. | |||
7312 | for (auto I = std::next(DefMI.getIterator()); ; ++I) { | |||
7313 | assert(I != DefBB->end()); | |||
7314 | ||||
7315 | if (I->isDebugInstr()) | |||
7316 | continue; | |||
7317 | ||||
7318 | if (++NumInst > MaxInstScan) | |||
7319 | return true; | |||
7320 | ||||
7321 | for (const MachineOperand &Op : I->operands()) { | |||
7322 | // We don't check reg masks here as they're used only on calls: | |||
7323 | // 1. EXEC is only considered const within one BB | |||
7324 | // 2. Call should be a terminator instruction if present in a BB | |||
7325 | ||||
7326 | if (!Op.isReg()) | |||
7327 | continue; | |||
7328 | ||||
7329 | Register Reg = Op.getReg(); | |||
7330 | if (Op.isUse()) { | |||
7331 | if (Reg == VReg && --NumUse == 0) | |||
7332 | return false; | |||
7333 | } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC)) | |||
7334 | return true; | |||
7335 | } | |||
7336 | } | |||
7337 | } | |||
7338 | ||||
7339 | MachineInstr *SIInstrInfo::createPHIDestinationCopy( | |||
7340 | MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, | |||
7341 | const DebugLoc &DL, Register Src, Register Dst) const { | |||
7342 | auto Cur = MBB.begin(); | |||
7343 | if (Cur != MBB.end()) | |||
7344 | do { | |||
7345 | if (!Cur->isPHI() && Cur->readsRegister(Dst)) | |||
7346 | return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); | |||
7347 | ++Cur; | |||
7348 | } while (Cur != MBB.end() && Cur != LastPHIIt); | |||
7349 | ||||
7350 | return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, | |||
7351 | Dst); | |||
7352 | } | |||
7353 | ||||
7354 | MachineInstr *SIInstrInfo::createPHISourceCopy( | |||
7355 | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, | |||
7356 | const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { | |||
7357 | if (InsPt != MBB.end() && | |||
7358 | (InsPt->getOpcode() == AMDGPU::SI_IF || | |||
7359 | InsPt->getOpcode() == AMDGPU::SI_ELSE || | |||
7360 | InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && | |||
7361 | InsPt->definesRegister(Src)) { | |||
7362 | InsPt++; | |||
7363 | return BuildMI(MBB, InsPt, DL, | |||
7364 | get(ST.isWave32() ? AMDGPU::S_MOV_B32_term | |||
7365 | : AMDGPU::S_MOV_B64_term), | |||
7366 | Dst) | |||
7367 | .addReg(Src, 0, SrcSubReg) | |||
7368 | .addReg(AMDGPU::EXEC, RegState::Implicit); | |||
7369 | } | |||
7370 | return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, | |||
7371 | Dst); | |||
7372 | } | |||
7373 | ||||
7374 | bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } | |||
7375 | ||||
7376 | MachineInstr *SIInstrInfo::foldMemoryOperandImpl( | |||
7377 | MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, | |||
7378 | MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, | |||
7379 | VirtRegMap *VRM) const { | |||
7380 | // This is a bit of a hack (copied from AArch64). Consider this instruction: | |||
7381 | // | |||
7382 | // %0:sreg_32 = COPY $m0 | |||
7383 | // | |||
7384 | // We explicitly chose SReg_32 for the virtual register so such a copy might | |||
7385 | // be eliminated by RegisterCoalescer. However, that may not be possible, and | |||
7386 | // %0 may even spill. We can't spill $m0 normally (it would require copying to | |||
7387 | // a numbered SGPR anyway), and since it is in the SReg_32 register class, | |||
7388 | // TargetInstrInfo::foldMemoryOperand() is going to try. | |||
7389 | // A similar issue also exists with spilling and reloading $exec registers. | |||
7390 | // | |||
7391 | // To prevent that, constrain the %0 register class here. | |||
7392 | if (MI.isFullCopy()) { | |||
7393 | Register DstReg = MI.getOperand(0).getReg(); | |||
7394 | Register SrcReg = MI.getOperand(1).getReg(); | |||
7395 | if ((DstReg.isVirtual() || SrcReg.isVirtual()) && | |||
7396 | (DstReg.isVirtual() != SrcReg.isVirtual())) { | |||
7397 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
7398 | Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg; | |||
7399 | const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); | |||
7400 | if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { | |||
7401 | MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
7402 | return nullptr; | |||
7403 | } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { | |||
7404 | MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); | |||
7405 | return nullptr; | |||
7406 | } | |||
7407 | } | |||
7408 | } | |||
7409 | ||||
7410 | return nullptr; | |||
7411 | } | |||
7412 | ||||
7413 | unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, | |||
7414 | const MachineInstr &MI, | |||
7415 | unsigned *PredCost) const { | |||
7416 | if (MI.isBundle()) { | |||
7417 | MachineBasicBlock::const_instr_iterator I(MI.getIterator()); | |||
7418 | MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); | |||
7419 | unsigned Lat = 0, Count = 0; | |||
7420 | for (++I; I != E && I->isBundledWithPred(); ++I) { | |||
7421 | ++Count; | |||
7422 | Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); | |||
7423 | } | |||
7424 | return Lat + Count - 1; | |||
7425 | } | |||
7426 | ||||
7427 | return SchedModel.computeInstrLatency(&MI); | |||
7428 | } | |||
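A quick standalone check of the bundle formula above: with three bundled instructions whose per-instruction latencies are {4, 2, 2}, the bundle is modelled as max-latency plus count minus one, i.e. 4 + 3 - 1 = 6 cycles.

    #include <algorithm>
    #include <cassert>

    int main() {
      const unsigned Lats[] = {4, 2, 2};
      unsigned Lat = 0, Count = 0;
      for (unsigned L : Lats) {
        ++Count;
        Lat = std::max(Lat, L);
      }
      assert(Lat + Count - 1 == 6);
    }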
7429 | ||||
7430 | unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) { | |||
7431 | switch (MF.getFunction().getCallingConv()) { | |||
7432 | case CallingConv::AMDGPU_PS: | |||
7433 | return 1; | |||
7434 | case CallingConv::AMDGPU_VS: | |||
7435 | return 2; | |||
7436 | case CallingConv::AMDGPU_GS: | |||
7437 | return 3; | |||
7438 | case CallingConv::AMDGPU_HS: | |||
7439 | case CallingConv::AMDGPU_LS: | |||
7440 | case CallingConv::AMDGPU_ES: | |||
7441 | report_fatal_error("ds_ordered_count unsupported for this calling conv"); | |||
7442 | case CallingConv::AMDGPU_CS: | |||
7443 | case CallingConv::AMDGPU_KERNEL: | |||
7444 | case CallingConv::C: | |||
7445 | case CallingConv::Fast: | |||
7446 | default: | |||
7447 | // Assume other calling conventions are various compute callable functions. | |||
7448 | return 0; | |||
7449 | } | |||
7450 | } |
1 | //===-- llvm/CodeGen/Register.h ---------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_CODEGEN_REGISTER_H |
10 | #define LLVM_CODEGEN_REGISTER_H |
11 | |
12 | #include "llvm/MC/MCRegister.h" |
13 | #include <cassert> |
14 | |
15 | namespace llvm { |
16 | |
17 | /// Wrapper class representing virtual and physical registers. Should be passed |
18 | /// by value. |
19 | class Register { |
20 | unsigned Reg; |
21 | |
22 | public: |
23 | constexpr Register(unsigned Val = 0): Reg(Val) {} |
24 | constexpr Register(MCRegister Val): Reg(Val) {} |
25 | |
26 | // Register numbers can represent physical registers, virtual registers, and |
27 | // sometimes stack slots. The unsigned values are divided into these ranges: |
28 | // |
29 | // 0 Not a register, can be used as a sentinel. |
30 | // [1;2^30) Physical registers assigned by TableGen. |
31 | // [2^30;2^31) Stack slots. (Rarely used.) |
32 | // [2^31;2^32) Virtual registers assigned by MachineRegisterInfo. |
33 | // |
34 | // Further sentinels can be allocated from the small negative integers. |
35 | // DenseMapInfo<unsigned> uses -1u and -2u. |
36 | static_assert(std::numeric_limits<decltype(Reg)>::max() >= 0xFFFFFFFF, |
37 | "Reg isn't large enough to hold full range."); |
38 | |
39 | /// isStackSlot - Sometimes it is useful to be able to store a non-negative
40 | /// frame index in a variable that normally holds a register. isStackSlot() |
41 | /// returns true if Reg is in the range used for stack slots. |
42 | /// |
43 | /// FIXME: remove in favor of member. |
44 | static bool isStackSlot(unsigned Reg) { |
45 | return MCRegister::isStackSlot(Reg); |
46 | } |
47 | |
48 | /// Return true if this is a stack slot. |
49 | bool isStack() const { return MCRegister::isStackSlot(Reg); } |
50 | |
51 | /// Compute the frame index from a register value representing a stack slot. |
52 | static int stackSlot2Index(Register Reg) { |
53 | assert(Reg.isStack() && "Not a stack slot");
54 | return int(Reg - MCRegister::FirstStackSlot); |
55 | } |
56 | |
57 | /// Convert a non-negative frame index to a stack slot register value. |
58 | static Register index2StackSlot(int FI) { |
59 | assert(FI >= 0 && "Cannot hold a negative frame index.");
60 | return Register(FI + MCRegister::FirstStackSlot); |
61 | } |
62 | |
63 | /// Return true if the specified register number is in |
64 | /// the physical register namespace. |
65 | static bool isPhysicalRegister(unsigned Reg) { |
66 | return MCRegister::isPhysicalRegister(Reg); |
67 | } |
68 | |
69 | /// Return true if the specified register number is in |
70 | /// the virtual register namespace. |
71 | static bool isVirtualRegister(unsigned Reg) { |
72 | return Reg & MCRegister::VirtualRegFlag && !isStackSlot(Reg); |
73 | } |
74 | |
75 | /// Convert a virtual register number to a 0-based index. |
76 | /// The first virtual register in a function will get the index 0. |
77 | static unsigned virtReg2Index(Register Reg) { |
78 | assert(isVirtualRegister(Reg) && "Not a virtual register");
79 | return Reg & ~MCRegister::VirtualRegFlag; |
80 | } |
81 | |
82 | /// Convert a 0-based index to a virtual register number. |
83 | /// This is the inverse operation of VirtReg2IndexFunctor below. |
84 | static Register index2VirtReg(unsigned Index) { |
85 | assert(Index < (1u << 31) && "Index too large for virtual register range.");
86 | return Index | MCRegister::VirtualRegFlag; |
87 | } |
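A standalone illustration of the round trip between indices and virtual register numbers, assuming only the bit documented in the range table above (bit 31 as the virtual-register flag):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t VirtualRegFlag = 1u << 31;
      uint32_t Reg = 5u | VirtualRegFlag;       // index2VirtReg(5)
      assert(Reg == 0x80000005u);
      assert((Reg & ~VirtualRegFlag) == 5u);    // virtReg2Index(Reg)
    }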
88 | |
89 | /// Return true if the specified register number is in the virtual register |
90 | /// namespace. |
91 | bool isVirtual() const { |
92 | return isVirtualRegister(Reg); |
93 | } |
94 | |
95 | /// Return true if the specified register number is in the physical register |
96 | /// namespace. |
97 | bool isPhysical() const { |
98 | return isPhysicalRegister(Reg); |
99 | } |
100 | |
101 | /// Convert a virtual register number to a 0-based index. The first virtual |
102 | /// register in a function will get the index 0. |
103 | unsigned virtRegIndex() const { |
104 | return virtReg2Index(Reg); |
105 | } |
106 | |
107 | constexpr operator unsigned() const { |
108 | return Reg; |
109 | } |
110 | |
111 | unsigned id() const { return Reg; } |
112 | |
113 | operator MCRegister() const { |
114 | return MCRegister(Reg); |
115 | } |
116 | |
117 | /// Utility to check-convert this value to a MCRegister. The caller is |
118 | /// expected to have already validated that this Register is, indeed, |
119 | /// physical. |
120 | MCRegister asMCReg() const { |
121 | assert(Reg == MCRegister::NoRegister ||
122 |        MCRegister::isPhysicalRegister(Reg));
123 | return MCRegister(Reg); |
124 | } |
125 | |
126 | bool isValid() const { return Reg != MCRegister::NoRegister; } |
127 | |
128 | /// Comparisons between register objects |
129 | bool operator==(const Register &Other) const { return Reg == Other.Reg; } |
130 | bool operator!=(const Register &Other) const { return Reg != Other.Reg; } |
131 | bool operator==(const MCRegister &Other) const { return Reg == Other.id(); } |
132 | bool operator!=(const MCRegister &Other) const { return Reg != Other.id(); } |
133 | |
134 | /// Comparisons against register constants. E.g. |
135 | /// * R == AArch64::WZR |
136 | /// * R == 0 |
137 | /// * R == VirtRegMap::NO_PHYS_REG |
138 | bool operator==(unsigned Other) const { return Reg == Other; } |
139 | bool operator!=(unsigned Other) const { return Reg != Other; } |
140 | bool operator==(int Other) const { return Reg == unsigned(Other); } |
141 | bool operator!=(int Other) const { return Reg != unsigned(Other); } |
142 | // MSVC requires that we explicitly declare these two as well. |
143 | bool operator==(MCPhysReg Other) const { return Reg == unsigned(Other); } |
144 | bool operator!=(MCPhysReg Other) const { return Reg != unsigned(Other); } |
145 | }; |
146 | |
147 | // Provide DenseMapInfo for Register |
148 | template<> struct DenseMapInfo<Register> { |
149 | static inline unsigned getEmptyKey() { |
150 | return DenseMapInfo<unsigned>::getEmptyKey(); |
151 | } |
152 | static inline unsigned getTombstoneKey() { |
153 | return DenseMapInfo<unsigned>::getTombstoneKey(); |
154 | } |
155 | static unsigned getHashValue(const Register &Val) { |
156 | return DenseMapInfo<unsigned>::getHashValue(Val.id()); |
157 | } |
158 | static bool isEqual(const Register &LHS, const Register &RHS) { |
159 | return DenseMapInfo<unsigned>::isEqual(LHS.id(), RHS.id()); |
160 | } |
161 | }; |
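The specialization above is what lets Register be used directly as a DenseMap key; a minimal sketch of such a use, with an arbitrary value type (the helper name is an assumption, not LLVM API):

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/CodeGen/Register.h"

    static unsigned lookupOrZero(const llvm::DenseMap<llvm::Register, unsigned> &Map,
                                 llvm::Register Reg) {
      auto It = Map.find(Reg);
      return It == Map.end() ? 0u : It->second;
    }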
162 | |
163 | } |
164 | |
165 | #endif // LLVM_CODEGEN_REGISTER_H |