File: | build/source/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp |
Warning: | line 2339, column 15: Called C++ object pointer is uninitialized |
1 | //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | /// \file | |||
10 | /// SI Implementation of TargetInstrInfo. | |||
11 | // | |||
12 | //===----------------------------------------------------------------------===// | |||
13 | ||||
14 | #include "SIInstrInfo.h" | |||
15 | #include "AMDGPU.h" | |||
16 | #include "AMDGPUInstrInfo.h" | |||
17 | #include "GCNHazardRecognizer.h" | |||
18 | #include "GCNSubtarget.h" | |||
19 | #include "SIMachineFunctionInfo.h" | |||
20 | #include "llvm/Analysis/ValueTracking.h" | |||
21 | #include "llvm/CodeGen/LiveIntervals.h" | |||
22 | #include "llvm/CodeGen/LiveVariables.h" | |||
23 | #include "llvm/CodeGen/MachineDominators.h" | |||
24 | #include "llvm/CodeGen/MachineFrameInfo.h" | |||
25 | #include "llvm/CodeGen/MachineScheduler.h" | |||
26 | #include "llvm/CodeGen/RegisterScavenging.h" | |||
27 | #include "llvm/CodeGen/ScheduleDAG.h" | |||
28 | #include "llvm/IR/DiagnosticInfo.h" | |||
29 | #include "llvm/IR/IntrinsicsAMDGPU.h" | |||
30 | #include "llvm/MC/MCContext.h" | |||
31 | #include "llvm/Support/CommandLine.h" | |||
32 | #include "llvm/Target/TargetMachine.h" | |||
33 | ||||
34 | using namespace llvm; | |||
35 | ||||
36 | #define DEBUG_TYPE "si-instr-info" | |||
37 | ||||
38 | #define GET_INSTRINFO_CTOR_DTOR | |||
39 | #include "AMDGPUGenInstrInfo.inc" | |||
40 | ||||
41 | namespace llvm { | |||
42 | namespace AMDGPU { | |||
43 | #define GET_D16ImageDimIntrinsics_IMPL | |||
44 | #define GET_ImageDimIntrinsicTable_IMPL | |||
45 | #define GET_RsrcIntrinsics_IMPL | |||
46 | #include "AMDGPUGenSearchableTables.inc" | |||
47 | } | |||
48 | } | |||
49 | ||||
50 | ||||
51 | // Must be at least 4 to be able to branch over minimum unconditional branch | |||
52 | // code. This is only for making it possible to write reasonably small tests for | |||
53 | // long branches. | |||
54 | static cl::opt<unsigned> | |||
55 | BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), | |||
56 | cl::desc("Restrict range of branch instructions (DEBUG)")); | |||
57 | ||||
58 | static cl::opt<bool> Fix16BitCopies( | |||
59 | "amdgpu-fix-16-bit-physreg-copies", | |||
60 | cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), | |||
61 | cl::init(true), | |||
62 | cl::ReallyHidden); | |||
63 | ||||
64 | SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) | |||
65 | : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), | |||
66 | RI(ST), ST(ST) { | |||
67 | SchedModel.init(&ST); | |||
68 | } | |||
69 | ||||
70 | //===----------------------------------------------------------------------===// | |||
71 | // TargetInstrInfo callbacks | |||
72 | //===----------------------------------------------------------------------===// | |||
73 | ||||
74 | static unsigned getNumOperandsNoGlue(SDNode *Node) { | |||
75 | unsigned N = Node->getNumOperands(); | |||
76 | while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) | |||
77 | --N; | |||
78 | return N; | |||
79 | } | |||
80 | ||||
81 | /// Returns true if both nodes have the same value for the given | |||
82 | /// operand \p OpName, or if both nodes do not have this operand. | |||
83 | static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { | |||
84 | unsigned Opc0 = N0->getMachineOpcode(); | |||
85 | unsigned Opc1 = N1->getMachineOpcode(); | |||
86 | ||||
87 | int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); | |||
88 | int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); | |||
89 | ||||
90 | if (Op0Idx == -1 && Op1Idx == -1) | |||
91 | return true; | |||
92 | ||||
93 | ||||
94 | if ((Op0Idx == -1 && Op1Idx != -1) || | |||
95 | (Op1Idx == -1 && Op0Idx != -1)) | |||
96 | return false; | |||
97 | ||||
98 | // getNamedOperandIdx returns the index for the MachineInstr's operands, | |||
99 | // which includes the result as the first operand. We are indexing into the | |||
100 | // MachineSDNode's operands, so we need to skip the result operand to get | |||
101 | // the real index. | |||
102 | --Op0Idx; | |||
103 | --Op1Idx; | |||
104 | ||||
105 | return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); | |||
106 | } | |||
107 | ||||
108 | bool SIInstrInfo::isReallyTriviallyReMaterializable( | |||
109 | const MachineInstr &MI) const { | |||
110 | if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) { | |||
111 | // Normally a VALU use of exec would block rematerialization, but it is | |||
112 | // OK in this case to have an implicit exec read, since all VALU instructions do. | |||
113 | // We really want all of the generic logic for this except for that check. | |||
114 | ||||
115 | // Another potential implicit use is mode register. The core logic of | |||
116 | // the RA will not attempt rematerialization if mode is set anywhere | |||
117 | // in the function, otherwise it is safe since mode is not changed. | |||
118 | ||||
119 | // There is a difference from the generic method, which does not allow | |||
120 | // rematerialization if there are virtual register uses. We allow this, | |||
121 | // and therefore this method includes SOP instructions as well. | |||
122 | return !MI.hasImplicitDef() && | |||
123 | MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() && | |||
124 | !MI.mayRaiseFPException(); | |||
125 | } | |||
126 | ||||
127 | return false; | |||
128 | } | |||
129 | ||||
130 | // Returns true if the scalar result of a VALU instruction depends on exec. | |||
131 | static bool resultDependsOnExec(const MachineInstr &MI) { | |||
132 | // Ignore comparisons which are only used masked with exec. | |||
133 | // This allows some hoisting/sinking of VALU comparisons. | |||
134 | if (MI.isCompare()) { | |||
135 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
136 | Register DstReg = MI.getOperand(0).getReg(); | |||
137 | if (!DstReg.isVirtual()) | |||
138 | return true; | |||
139 | for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) { | |||
140 | switch (Use.getOpcode()) { | |||
141 | case AMDGPU::S_AND_SAVEEXEC_B32: | |||
142 | case AMDGPU::S_AND_SAVEEXEC_B64: | |||
143 | break; | |||
144 | case AMDGPU::S_AND_B32: | |||
145 | case AMDGPU::S_AND_B64: | |||
146 | if (!Use.readsRegister(AMDGPU::EXEC)) | |||
147 | return true; | |||
148 | break; | |||
149 | default: | |||
150 | return true; | |||
151 | } | |||
152 | } | |||
153 | return false; | |||
154 | } | |||
155 | ||||
156 | switch (MI.getOpcode()) { | |||
157 | default: | |||
158 | break; | |||
159 | case AMDGPU::V_READFIRSTLANE_B32: | |||
160 | return true; | |||
161 | } | |||
162 | ||||
163 | return false; | |||
164 | } | |||
165 | ||||
166 | bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { | |||
167 | // Any implicit use of exec by VALU is not a real register read. | |||
168 | return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && | |||
169 | isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()); | |||
170 | } | |||
171 | ||||
172 | bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, | |||
173 | int64_t &Offset0, | |||
174 | int64_t &Offset1) const { | |||
175 | if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) | |||
176 | return false; | |||
177 | ||||
178 | unsigned Opc0 = Load0->getMachineOpcode(); | |||
179 | unsigned Opc1 = Load1->getMachineOpcode(); | |||
180 | ||||
181 | // Make sure both are actually loads. | |||
182 | if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) | |||
183 | return false; | |||
184 | ||||
185 | if (isDS(Opc0) && isDS(Opc1)) { | |||
186 | ||||
187 | // FIXME: Handle this case: | |||
188 | if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) | |||
189 | return false; | |||
190 | ||||
191 | // Check base reg. | |||
192 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
193 | return false; | |||
194 | ||||
195 | // Skip read2 / write2 variants for simplicity. | |||
196 | // TODO: We should report true if the used offsets are adjacent (excluding | |||
197 | // st64 versions). | |||
198 | int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
199 | int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
200 | if (Offset0Idx == -1 || Offset1Idx == -1) | |||
201 | return false; | |||
202 | ||||
203 | // XXX - be careful of dataless loads | |||
204 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
205 | // include the output in the operand list, but SDNodes don't, we need to | |||
206 | // subtract the index by one. | |||
207 | Offset0Idx -= get(Opc0).NumDefs; | |||
208 | Offset1Idx -= get(Opc1).NumDefs; | |||
209 | Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); | |||
210 | Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); | |||
211 | return true; | |||
212 | } | |||
213 | ||||
214 | if (isSMRD(Opc0) && isSMRD(Opc1)) { | |||
215 | // Skip time and cache invalidation instructions. | |||
216 | if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) || | |||
217 | !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase)) | |||
218 | return false; | |||
219 | ||||
220 | unsigned NumOps = getNumOperandsNoGlue(Load0); | |||
221 | if (NumOps != getNumOperandsNoGlue(Load1)) | |||
222 | return false; | |||
223 | ||||
224 | // Check base reg. | |||
225 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
226 | return false; | |||
227 | ||||
228 | // Match register offsets, if both register and immediate offsets present. | |||
229 | assert(NumOps == 4 || NumOps == 5); | |||
230 | if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1)) | |||
231 | return false; | |||
232 | ||||
233 | const ConstantSDNode *Load0Offset = | |||
234 | dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3)); | |||
235 | const ConstantSDNode *Load1Offset = | |||
236 | dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3)); | |||
237 | ||||
238 | if (!Load0Offset || !Load1Offset) | |||
239 | return false; | |||
240 | ||||
241 | Offset0 = Load0Offset->getZExtValue(); | |||
242 | Offset1 = Load1Offset->getZExtValue(); | |||
243 | return true; | |||
244 | } | |||
245 | ||||
246 | // MUBUF and MTBUF can access the same addresses. | |||
247 | if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { | |||
248 | ||||
249 | // MUBUF and MTBUF have vaddr at different indices. | |||
250 | if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || | |||
251 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || | |||
252 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) | |||
253 | return false; | |||
254 | ||||
255 | int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
256 | int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
257 | ||||
258 | if (OffIdx0 == -1 || OffIdx1 == -1) | |||
259 | return false; | |||
260 | ||||
261 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
262 | // include the output in the operand list, but SDNodes don't, we need to | |||
263 | // subtract the index by one. | |||
264 | OffIdx0 -= get(Opc0).NumDefs; | |||
265 | OffIdx1 -= get(Opc1).NumDefs; | |||
266 | ||||
267 | SDValue Off0 = Load0->getOperand(OffIdx0); | |||
268 | SDValue Off1 = Load1->getOperand(OffIdx1); | |||
269 | ||||
270 | // The offset might be a FrameIndexSDNode. | |||
271 | if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) | |||
272 | return false; | |||
273 | ||||
274 | Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); | |||
275 | Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); | |||
276 | return true; | |||
277 | } | |||
278 | ||||
279 | return false; | |||
280 | } | |||
281 | ||||
282 | static bool isStride64(unsigned Opc) { | |||
283 | switch (Opc) { | |||
284 | case AMDGPU::DS_READ2ST64_B32: | |||
285 | case AMDGPU::DS_READ2ST64_B64: | |||
286 | case AMDGPU::DS_WRITE2ST64_B32: | |||
287 | case AMDGPU::DS_WRITE2ST64_B64: | |||
288 | return true; | |||
289 | default: | |||
290 | return false; | |||
291 | } | |||
292 | } | |||
293 | ||||
294 | bool SIInstrInfo::getMemOperandsWithOffsetWidth( | |||
295 | const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, | |||
296 | int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, | |||
297 | const TargetRegisterInfo *TRI) const { | |||
298 | if (!LdSt.mayLoadOrStore()) | |||
299 | return false; | |||
300 | ||||
301 | unsigned Opc = LdSt.getOpcode(); | |||
302 | OffsetIsScalable = false; | |||
303 | const MachineOperand *BaseOp, *OffsetOp; | |||
304 | int DataOpIdx; | |||
305 | ||||
306 | if (isDS(LdSt)) { | |||
307 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); | |||
308 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
309 | if (OffsetOp) { | |||
310 | // Normal, single offset LDS instruction. | |||
311 | if (!BaseOp) { | |||
312 | // DS_CONSUME/DS_APPEND use M0 for the base address. | |||
313 | // TODO: find the implicit use operand for M0 and use that as BaseOp? | |||
314 | return false; | |||
315 | } | |||
316 | BaseOps.push_back(BaseOp); | |||
317 | Offset = OffsetOp->getImm(); | |||
318 | // Get appropriate operand, and compute width accordingly. | |||
319 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
320 | if (DataOpIdx == -1) | |||
321 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
322 | Width = getOpSize(LdSt, DataOpIdx); | |||
323 | } else { | |||
324 | // The 2 offset instructions use offset0 and offset1 instead. We can treat | |||
325 | // these as a load with a single offset if the 2 offsets are consecutive. | |||
326 | // We will use this for some partially aligned loads. | |||
327 | const MachineOperand *Offset0Op = | |||
328 | getNamedOperand(LdSt, AMDGPU::OpName::offset0); | |||
329 | const MachineOperand *Offset1Op = | |||
330 | getNamedOperand(LdSt, AMDGPU::OpName::offset1); | |||
331 | ||||
332 | unsigned Offset0 = Offset0Op->getImm(); | |||
333 | unsigned Offset1 = Offset1Op->getImm(); | |||
334 | if (Offset0 + 1 != Offset1) | |||
335 | return false; | |||
336 | ||||
337 | // Each of these offsets is in element sized units, so we need to convert | |||
338 | // to bytes of the individual reads. | |||
339 | ||||
340 | unsigned EltSize; | |||
341 | if (LdSt.mayLoad()) | |||
342 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; | |||
343 | else { | |||
344 | assert(LdSt.mayStore()); | |||
345 | int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
346 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; | |||
347 | } | |||
348 | ||||
349 | if (isStride64(Opc)) | |||
350 | EltSize *= 64; | |||
351 | ||||
352 | BaseOps.push_back(BaseOp); | |||
353 | Offset = EltSize * Offset0; | |||
354 | // Get appropriate operand(s), and compute width accordingly. | |||
355 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
356 | if (DataOpIdx == -1) { | |||
357 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
358 | Width = getOpSize(LdSt, DataOpIdx); | |||
359 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); | |||
360 | Width += getOpSize(LdSt, DataOpIdx); | |||
361 | } else { | |||
362 | Width = getOpSize(LdSt, DataOpIdx); | |||
363 | } | |||
364 | } | |||
365 | return true; | |||
366 | } | |||
367 | ||||
368 | if (isMUBUF(LdSt) || isMTBUF(LdSt)) { | |||
369 | const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); | |||
370 | if (!RSrc) // e.g. BUFFER_WBINVL1_VOL | |||
371 | return false; | |||
372 | BaseOps.push_back(RSrc); | |||
373 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
374 | if (BaseOp && !BaseOp->isFI()) | |||
375 | BaseOps.push_back(BaseOp); | |||
376 | const MachineOperand *OffsetImm = | |||
377 | getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
378 | Offset = OffsetImm->getImm(); | |||
379 | const MachineOperand *SOffset = | |||
380 | getNamedOperand(LdSt, AMDGPU::OpName::soffset); | |||
381 | if (SOffset) { | |||
382 | if (SOffset->isReg()) | |||
383 | BaseOps.push_back(SOffset); | |||
384 | else | |||
385 | Offset += SOffset->getImm(); | |||
386 | } | |||
387 | // Get appropriate operand, and compute width accordingly. | |||
388 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
389 | if (DataOpIdx == -1) | |||
390 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
391 | if (DataOpIdx == -1) // LDS DMA | |||
392 | return false; | |||
393 | Width = getOpSize(LdSt, DataOpIdx); | |||
394 | return true; | |||
395 | } | |||
396 | ||||
397 | if (isMIMG(LdSt)) { | |||
398 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); | |||
399 | BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); | |||
400 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); | |||
401 | if (VAddr0Idx >= 0) { | |||
402 | // GFX10 possible NSA encoding. | |||
403 | for (int I = VAddr0Idx; I < SRsrcIdx; ++I) | |||
404 | BaseOps.push_back(&LdSt.getOperand(I)); | |||
405 | } else { | |||
406 | BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); | |||
407 | } | |||
408 | Offset = 0; | |||
409 | // Get appropriate operand, and compute width accordingly. | |||
410 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
411 | Width = getOpSize(LdSt, DataOpIdx); | |||
412 | return true; | |||
413 | } | |||
414 | ||||
415 | if (isSMRD(LdSt)) { | |||
416 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); | |||
417 | if (!BaseOp) // e.g. S_MEMTIME | |||
418 | return false; | |||
419 | BaseOps.push_back(BaseOp); | |||
420 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
421 | Offset = OffsetOp ? OffsetOp->getImm() : 0; | |||
422 | // Get appropriate operand, and compute width accordingly. | |||
423 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); | |||
424 | Width = getOpSize(LdSt, DataOpIdx); | |||
425 | return true; | |||
426 | } | |||
427 | ||||
428 | if (isFLAT(LdSt)) { | |||
429 | // Instructions have either vaddr or saddr or both or none. | |||
430 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
431 | if (BaseOp) | |||
432 | BaseOps.push_back(BaseOp); | |||
433 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); | |||
434 | if (BaseOp) | |||
435 | BaseOps.push_back(BaseOp); | |||
436 | Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); | |||
437 | // Get appropriate operand, and compute width accordingly. | |||
438 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
439 | if (DataOpIdx == -1) | |||
440 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
441 | if (DataOpIdx == -1) // LDS DMA | |||
442 | return false; | |||
443 | Width = getOpSize(LdSt, DataOpIdx); | |||
444 | return true; | |||
445 | } | |||
446 | ||||
447 | return false; | |||
448 | } | |||
449 | ||||
450 | static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, | |||
451 | ArrayRef<const MachineOperand *> BaseOps1, | |||
452 | const MachineInstr &MI2, | |||
453 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
454 | // Only examine the first "base" operand of each instruction, on the | |||
455 | // assumption that it represents the real base address of the memory access. | |||
456 | // Other operands are typically offsets or indices from this base address. | |||
457 | if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) | |||
458 | return true; | |||
459 | ||||
460 | if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) | |||
461 | return false; | |||
462 | ||||
463 | auto MO1 = *MI1.memoperands_begin(); | |||
464 | auto MO2 = *MI2.memoperands_begin(); | |||
465 | if (MO1->getAddrSpace() != MO2->getAddrSpace()) | |||
466 | return false; | |||
467 | ||||
468 | auto Base1 = MO1->getValue(); | |||
469 | auto Base2 = MO2->getValue(); | |||
470 | if (!Base1 || !Base2) | |||
471 | return false; | |||
472 | Base1 = getUnderlyingObject(Base1); | |||
473 | Base2 = getUnderlyingObject(Base2); | |||
474 | ||||
475 | if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) | |||
476 | return false; | |||
477 | ||||
478 | return Base1 == Base2; | |||
479 | } | |||
480 | ||||
481 | bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, | |||
482 | ArrayRef<const MachineOperand *> BaseOps2, | |||
483 | unsigned NumLoads, | |||
484 | unsigned NumBytes) const { | |||
485 | // If the mem ops (to be clustered) do not have the same base ptr, then they | |||
486 | // should not be clustered | |||
487 | if (!BaseOps1.empty() && !BaseOps2.empty()) { | |||
488 | const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); | |||
489 | const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); | |||
490 | if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) | |||
491 | return false; | |||
492 | } else if (!BaseOps1.empty() || !BaseOps2.empty()) { | |||
493 | // If only one base op is empty, they do not have the same base ptr | |||
494 | return false; | |||
495 | } | |||
496 | ||||
497 | // To avoid register pressure, on average, the number of DWORDs loaded | |||
498 | // together by all clustered mem ops should not exceed 8. This is an | |||
499 | // empirical value based on certain observations and performance-related | |||
500 | // experiments. | |||
501 | // The good thing about this heuristic is that it avoids clustering too many | |||
502 | // sub-word loads and also avoids clustering wide loads. Below is a brief | |||
503 | // summary of how the heuristic behaves for various `LoadSize` values. | |||
504 | // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops | |||
505 | // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops | |||
506 | // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops | |||
507 | // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops | |||
508 | // (5) LoadSize >= 17: do not cluster | |||
509 | const unsigned LoadSize = NumBytes / NumLoads; | |||
510 | const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; | |||
511 | return NumDWORDs <= 8; | |||
512 | } | |||
513 | ||||
514 | // FIXME: This behaves strangely. If, for example, you have 32 load + stores, | |||
515 | // the first 16 loads will be interleaved with the stores, and the next 16 will | |||
516 | // be clustered as expected. It should really split into two batches of 16 stores. | |||
517 | // | |||
518 | // Loads are clustered until this returns false, rather than trying to schedule | |||
519 | // groups of stores. This also means we have to deal with saying different | |||
520 | // address space loads should be clustered, and ones which might cause bank | |||
521 | // conflicts. | |||
522 | // | |||
523 | // This might be deprecated so it might not be worth that much effort to fix. | |||
524 | bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, | |||
525 | int64_t Offset0, int64_t Offset1, | |||
526 | unsigned NumLoads) const { | |||
527 | assert(Offset1 > Offset0 && | |||
528 | "Second offset should be larger than first offset!"); | |||
529 | // If we have less than 16 loads in a row, and the offsets are within 64 | |||
530 | // bytes, then schedule together. | |||
531 | ||||
532 | // A cacheline is 64 bytes (for global memory). | |||
533 | return (NumLoads <= 16 && (Offset1 - Offset0) < 64); | |||
534 | } | |||
535 | ||||
536 | static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, | |||
537 | MachineBasicBlock::iterator MI, | |||
538 | const DebugLoc &DL, MCRegister DestReg, | |||
539 | MCRegister SrcReg, bool KillSrc, | |||
540 | const char *Msg = "illegal VGPR to SGPR copy") { | |||
541 | MachineFunction *MF = MBB.getParent(); | |||
542 | DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); | |||
543 | LLVMContext &C = MF->getFunction().getContext(); | |||
544 | C.diagnose(IllegalCopy); | |||
545 | ||||
546 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) | |||
547 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
548 | } | |||
549 | ||||
550 | /// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not | |||
551 | /// possible to have a direct copy in these cases on GFX908, so an intermediate | |||
552 | /// VGPR copy is required. | |||
553 | static void indirectCopyToAGPR(const SIInstrInfo &TII, | |||
554 | MachineBasicBlock &MBB, | |||
555 | MachineBasicBlock::iterator MI, | |||
556 | const DebugLoc &DL, MCRegister DestReg, | |||
557 | MCRegister SrcReg, bool KillSrc, | |||
558 | RegScavenger &RS, bool RegsOverlap, | |||
559 | Register ImpDefSuperReg = Register(), | |||
560 | Register ImpUseSuperReg = Register()) { | |||
561 | assert((TII.getSubtarget().hasMAIInsts() && | |||
562 | !TII.getSubtarget().hasGFX90AInsts()) && | |||
563 | "Expected GFX908 subtarget."); | |||
564 | ||||
565 | assert((AMDGPU::SReg_32RegClass.contains(SrcReg) || | |||
566 | AMDGPU::AGPR_32RegClass.contains(SrcReg)) && | |||
567 | "Source register of the copy should be either an SGPR or an AGPR."); | |||
568 | ||||
569 | assert(AMDGPU::AGPR_32RegClass.contains(DestReg) && | |||
570 | "Destination register of the copy should be an AGPR."); | |||
571 | ||||
572 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
573 | ||||
574 | // First try to find defining accvgpr_write to avoid temporary registers. | |||
575 | // In the case of copies of overlapping AGPRs, we conservatively do not | |||
576 | // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up | |||
577 | // an accvgpr_write used for this same copy due to implicit-defs | |||
578 | if (!RegsOverlap) { | |||
579 | for (auto Def = MI, E = MBB.begin(); Def != E; ) { | |||
580 | --Def; | |||
581 | if (!Def->definesRegister(SrcReg, &RI)) | |||
582 | continue; | |||
583 | if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) | |||
584 | break; | |||
585 | ||||
586 | MachineOperand &DefOp = Def->getOperand(1); | |||
587 | assert(DefOp.isReg() || DefOp.isImm()); | |||
588 | ||||
589 | if (DefOp.isReg()) { | |||
590 | bool SafeToPropagate = true; | |||
591 | // Check that register source operand is not clobbered before MI. | |||
592 | // Immediate operands are always safe to propagate. | |||
593 | for (auto I = Def; I != MI && SafeToPropagate; ++I) | |||
594 | if (I->modifiesRegister(DefOp.getReg(), &RI)) | |||
595 | SafeToPropagate = false; | |||
596 | ||||
597 | if (!SafeToPropagate) | |||
598 | break; | |||
599 | ||||
600 | DefOp.setIsKill(false); | |||
601 | } | |||
602 | ||||
603 | MachineInstrBuilder Builder = | |||
604 | BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
605 | .add(DefOp); | |||
606 | if (ImpDefSuperReg) | |||
607 | Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
608 | ||||
609 | if (ImpUseSuperReg) { | |||
610 | Builder.addReg(ImpUseSuperReg, | |||
611 | getKillRegState(KillSrc) | RegState::Implicit); | |||
612 | } | |||
613 | ||||
614 | return; | |||
615 | } | |||
616 | } | |||
617 | ||||
618 | RS.enterBasicBlock(MBB); | |||
619 | RS.forward(MI); | |||
620 | ||||
621 | // Ideally we want to have three registers for a long reg_sequence copy | |||
622 | // to hide 2 waitstates between v_mov_b32 and accvgpr_write. | |||
623 | unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, | |||
624 | *MBB.getParent()); | |||
625 | ||||
626 | // Registers in the sequence are allocated contiguously so we can just | |||
627 | // use register number to pick one of three round-robin temps. | |||
628 | unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3; | |||
629 | Register Tmp = | |||
630 | MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy(); | |||
631 | assert(MBB.getParent()->getRegInfo().isReserved(Tmp) && | |||
632 | "VGPR used for an intermediate copy should have been reserved."); | |||
633 | ||||
634 | // Only loop through if there are any free registers left; otherwise the | |||
635 | // scavenger may report a fatal error when there is no emergency spill slot, | |||
636 | // or it may spill using the slot. | |||
637 | while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { | |||
638 | Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
639 | if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) | |||
640 | break; | |||
641 | Tmp = Tmp2; | |||
642 | RS.setRegUsed(Tmp); | |||
643 | } | |||
644 | ||||
645 | // Insert copy to temporary VGPR. | |||
646 | unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; | |||
647 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { | |||
648 | TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
649 | } else { | |||
650 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); | |||
651 | } | |||
652 | ||||
653 | MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) | |||
654 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
655 | if (ImpUseSuperReg) { | |||
656 | UseBuilder.addReg(ImpUseSuperReg, | |||
657 | getKillRegState(KillSrc) | RegState::Implicit); | |||
658 | } | |||
659 | ||||
660 | MachineInstrBuilder DefBuilder | |||
661 | = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
662 | .addReg(Tmp, RegState::Kill); | |||
663 | ||||
664 | if (ImpDefSuperReg) | |||
665 | DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
666 | } | |||
667 | ||||
668 | static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, | |||
669 | MachineBasicBlock::iterator MI, const DebugLoc &DL, | |||
670 | MCRegister DestReg, MCRegister SrcReg, bool KillSrc, | |||
671 | const TargetRegisterClass *RC, bool Forward) { | |||
672 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
673 | ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4); | |||
674 | MachineBasicBlock::iterator I = MI; | |||
675 | MachineInstr *FirstMI = nullptr, *LastMI = nullptr; | |||
676 | ||||
677 | for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { | |||
678 | int16_t SubIdx = BaseIndices[Idx]; | |||
679 | Register Reg = RI.getSubReg(DestReg, SubIdx); | |||
680 | unsigned Opcode = AMDGPU::S_MOV_B32; | |||
681 | ||||
682 | // Is SGPR aligned? If so try to combine with next. | |||
683 | Register Src = RI.getSubReg(SrcReg, SubIdx); | |||
684 | bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; | |||
685 | bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; | |||
686 | if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { | |||
687 | // Can use SGPR64 copy | |||
688 | unsigned Channel = RI.getChannelFromSubReg(SubIdx); | |||
689 | SubIdx = RI.getSubRegFromChannel(Channel, 2); | |||
690 | Opcode = AMDGPU::S_MOV_B64; | |||
691 | Idx++; | |||
692 | } | |||
693 | ||||
694 | LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
695 | .addReg(RI.getSubReg(SrcReg, SubIdx)) | |||
696 | .addReg(SrcReg, RegState::Implicit); | |||
697 | ||||
698 | if (!FirstMI) | |||
699 | FirstMI = LastMI; | |||
700 | ||||
701 | if (!Forward) | |||
702 | I--; | |||
703 | } | |||
704 | ||||
705 | assert(FirstMI && LastMI); | |||
706 | if (!Forward) | |||
707 | std::swap(FirstMI, LastMI); | |||
708 | ||||
709 | FirstMI->addOperand( | |||
710 | MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); | |||
711 | ||||
712 | if (KillSrc) | |||
713 | LastMI->addRegisterKilled(SrcReg, &RI); | |||
714 | } | |||
715 | ||||
716 | void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, | |||
717 | MachineBasicBlock::iterator MI, | |||
718 | const DebugLoc &DL, MCRegister DestReg, | |||
719 | MCRegister SrcReg, bool KillSrc) const { | |||
720 | const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg); | |||
721 | ||||
722 | // FIXME: This is a hack to resolve copies between 16-bit and 32-bit | |||
723 | // registers until all patterns are fixed. | |||
724 | if (Fix16BitCopies && | |||
725 | ((RI.getRegSizeInBits(*RC) == 16) ^ | |||
726 | (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) { | |||
727 | MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; | |||
728 | MCRegister Super = RI.get32BitRegister(RegToFix); | |||
729 | assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); | |||
730 | RegToFix = Super; | |||
731 | ||||
732 | if (DestReg == SrcReg) { | |||
733 | // Insert empty bundle since ExpandPostRA expects an instruction here. | |||
734 | BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); | |||
735 | return; | |||
736 | } | |||
737 | ||||
738 | RC = RI.getPhysRegBaseClass(DestReg); | |||
739 | } | |||
740 | ||||
741 | if (RC == &AMDGPU::VGPR_32RegClass) { | |||
742 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || | |||
743 | AMDGPU::SReg_32RegClass.contains(SrcReg) || | |||
744 | AMDGPU::AGPR_32RegClass.contains(SrcReg)); | |||
745 | unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? | |||
746 | AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32; | |||
747 | BuildMI(MBB, MI, DL, get(Opc), DestReg) | |||
748 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
749 | return; | |||
750 | } | |||
751 | ||||
752 | if (RC == &AMDGPU::SReg_32_XM0RegClass || | |||
753 | RC == &AMDGPU::SReg_32RegClass) { | |||
754 | if (SrcReg == AMDGPU::SCC) { | |||
755 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) | |||
756 | .addImm(1) | |||
757 | .addImm(0); | |||
758 | return; | |||
759 | } | |||
760 | ||||
761 | if (DestReg == AMDGPU::VCC_LO) { | |||
762 | if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
763 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) | |||
764 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
765 | } else { | |||
766 | // FIXME: Hack until VReg_1 removed. | |||
767 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); | |||
768 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
769 | .addImm(0) | |||
770 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
771 | } | |||
772 | ||||
773 | return; | |||
774 | } | |||
775 | ||||
776 | if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
777 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
778 | return; | |||
779 | } | |||
780 | ||||
781 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
782 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
783 | return; | |||
784 | } | |||
785 | ||||
786 | if (RC == &AMDGPU::SReg_64RegClass) { | |||
787 | if (SrcReg == AMDGPU::SCC) { | |||
788 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) | |||
789 | .addImm(1) | |||
790 | .addImm(0); | |||
791 | return; | |||
792 | } | |||
793 | ||||
794 | if (DestReg == AMDGPU::VCC) { | |||
795 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
796 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) | |||
797 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
798 | } else { | |||
799 | // FIXME: Hack until VReg_1 removed. | |||
800 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); | |||
801 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
802 | .addImm(0) | |||
803 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
804 | } | |||
805 | ||||
806 | return; | |||
807 | } | |||
808 | ||||
809 | if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
810 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
811 | return; | |||
812 | } | |||
813 | ||||
814 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
815 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
816 | return; | |||
817 | } | |||
818 | ||||
819 | if (DestReg == AMDGPU::SCC) { | |||
820 | // Copying 64-bit or 32-bit sources to SCC barely makes sense, | |||
821 | // but SelectionDAG emits such copies for i1 sources. | |||
822 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
823 | // This copy can only be produced by patterns | |||
824 | // with explicit SCC, which are known to be enabled | |||
825 | // only for subtargets with S_CMP_LG_U64 present. | |||
826 | assert(ST.hasScalarCompareEq64()); | |||
827 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64)) | |||
828 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
829 | .addImm(0); | |||
830 | } else { | |||
831 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); | |||
832 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) | |||
833 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
834 | .addImm(0); | |||
835 | } | |||
836 | ||||
837 | return; | |||
838 | } | |||
839 | ||||
840 | if (RC == &AMDGPU::AGPR_32RegClass) { | |||
841 | if (AMDGPU::VGPR_32RegClass.contains(SrcReg) || | |||
842 | (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) { | |||
843 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
844 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
845 | return; | |||
846 | } | |||
847 | ||||
848 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { | |||
849 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) | |||
850 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
851 | return; | |||
852 | } | |||
853 | ||||
854 | // FIXME: Pass should maintain scavenger to avoid scan through the block on | |||
855 | // every AGPR spill. | |||
856 | RegScavenger RS; | |||
857 | const bool Overlap = RI.regsOverlap(SrcReg, DestReg); | |||
858 | indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap); | |||
859 | return; | |||
860 | } | |||
861 | ||||
862 | const unsigned Size = RI.getRegSizeInBits(*RC); | |||
863 | if (Size == 16) { | |||
864 | assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
865 | AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || | |||
866 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
867 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); | |||
868 | ||||
869 | bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); | |||
870 | bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); | |||
871 | bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
872 | bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
873 | bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || | |||
874 | AMDGPU::SReg_LO16RegClass.contains(DestReg) || | |||
875 | AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
876 | bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
877 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
878 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
879 | MCRegister NewDestReg = RI.get32BitRegister(DestReg); | |||
880 | MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); | |||
881 | ||||
882 | if (IsSGPRDst) { | |||
883 | if (!IsSGPRSrc) { | |||
884 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
885 | return; | |||
886 | } | |||
887 | ||||
888 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) | |||
889 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
890 | return; | |||
891 | } | |||
892 | ||||
893 | if (IsAGPRDst || IsAGPRSrc) { | |||
894 | if (!DstLow || !SrcLow) { | |||
895 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
896 | "Cannot use hi16 subreg with an AGPR!"); | |||
897 | } | |||
898 | ||||
899 | copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); | |||
900 | return; | |||
901 | } | |||
902 | ||||
903 | if (IsSGPRSrc && !ST.hasSDWAScalar()) { | |||
904 | if (!DstLow || !SrcLow) { | |||
905 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
906 | "Cannot use hi16 subreg on VI!"); | |||
907 | } | |||
908 | ||||
909 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) | |||
910 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
911 | return; | |||
912 | } | |||
913 | ||||
914 | auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) | |||
915 | .addImm(0) // src0_modifiers | |||
916 | .addReg(NewSrcReg) | |||
917 | .addImm(0) // clamp | |||
918 | .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
919 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
920 | .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) | |||
921 | .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
922 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
923 | .addReg(NewDestReg, RegState::Implicit | RegState::Undef); | |||
924 | // First implicit operand is $exec. | |||
925 | MIB->tieOperands(0, MIB->getNumOperands() - 1); | |||
926 | return; | |||
927 | } | |||
928 | ||||
929 | const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); | |||
930 | if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { | |||
931 | if (ST.hasMovB64()) { | |||
932 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) | |||
933 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
934 | return; | |||
935 | } | |||
936 | if (ST.hasPackedFP32Ops()) { | |||
937 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) | |||
938 | .addImm(SISrcMods::OP_SEL_1) | |||
939 | .addReg(SrcReg) | |||
940 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) | |||
941 | .addReg(SrcReg) | |||
942 | .addImm(0) // op_sel_lo | |||
943 | .addImm(0) // op_sel_hi | |||
944 | .addImm(0) // neg_lo | |||
945 | .addImm(0) // neg_hi | |||
946 | .addImm(0) // clamp | |||
947 | .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); | |||
948 | return; | |||
949 | } | |||
950 | } | |||
951 | ||||
952 | const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); | |||
953 | if (RI.isSGPRClass(RC)) { | |||
954 | if (!RI.isSGPRClass(SrcRC)) { | |||
955 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
956 | return; | |||
957 | } | |||
958 | const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); | |||
959 | expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, | |||
960 | Forward); | |||
961 | return; | |||
962 | } | |||
963 | ||||
964 | unsigned EltSize = 4; | |||
965 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
966 | if (RI.isAGPRClass(RC)) { | |||
967 | if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) | |||
968 | Opcode = AMDGPU::V_ACCVGPR_MOV_B32; | |||
969 | else if (RI.hasVGPRs(SrcRC) || | |||
970 | (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC))) | |||
971 | Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; | |||
972 | else | |||
973 | Opcode = AMDGPU::INSTRUCTION_LIST_END; | |||
974 | } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { | |||
975 | Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
976 | } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && | |||
977 | (RI.isProperlyAlignedRC(*RC) && | |||
978 | (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { | |||
979 | // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. | |||
980 | if (ST.hasMovB64()) { | |||
981 | Opcode = AMDGPU::V_MOV_B64_e32; | |||
982 | EltSize = 8; | |||
983 | } else if (ST.hasPackedFP32Ops()) { | |||
984 | Opcode = AMDGPU::V_PK_MOV_B32; | |||
985 | EltSize = 8; | |||
986 | } | |||
987 | } | |||
988 | ||||
989 | // For the cases where we need an intermediate instruction/temporary register | |||
990 | // (destination is an AGPR), we need a scavenger. | |||
991 | // | |||
992 | // FIXME: The pass should maintain this for us so we don't have to re-scan the | |||
993 | // whole block for every handled copy. | |||
994 | std::unique_ptr<RegScavenger> RS; | |||
995 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) | |||
996 | RS.reset(new RegScavenger()); | |||
997 | ||||
998 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); | |||
999 | ||||
1000 | // If there is an overlap, we can't kill the super-register on the last | |||
1001 | // instruction, since it will also kill the components made live by this def. | |||
1002 | const bool Overlap = RI.regsOverlap(SrcReg, DestReg); | |||
1003 | const bool CanKillSuperReg = KillSrc && !Overlap; | |||
1004 | ||||
1005 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
1006 | unsigned SubIdx; | |||
1007 | if (Forward) | |||
1008 | SubIdx = SubIndices[Idx]; | |||
1009 | else | |||
1010 | SubIdx = SubIndices[SubIndices.size() - Idx - 1]; | |||
1011 | ||||
1012 | bool IsFirstSubreg = Idx == 0; | |||
1013 | bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; | |||
1014 | ||||
1015 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
1016 | Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register(); | |||
1017 | Register ImpUseSuper = SrcReg; | |||
1018 | indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), | |||
1019 | RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, Overlap, | |||
1020 | ImpDefSuper, ImpUseSuper); | |||
1021 | } else if (Opcode == AMDGPU::V_PK_MOV_B32) { | |||
1022 | Register DstSubReg = RI.getSubReg(DestReg, SubIdx); | |||
1023 | Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); | |||
1024 | MachineInstrBuilder MIB = | |||
1025 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) | |||
1026 | .addImm(SISrcMods::OP_SEL_1) | |||
1027 | .addReg(SrcSubReg) | |||
1028 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) | |||
1029 | .addReg(SrcSubReg) | |||
1030 | .addImm(0) // op_sel_lo | |||
1031 | .addImm(0) // op_sel_hi | |||
1032 | .addImm(0) // neg_lo | |||
1033 | .addImm(0) // neg_hi | |||
1034 | .addImm(0) // clamp | |||
1035 | .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
1036 | if (IsFirstSubreg) | |||
1037 | MIB.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
1038 | } else { | |||
1039 | MachineInstrBuilder Builder = | |||
1040 | BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
1041 | .addReg(RI.getSubReg(SrcReg, SubIdx)); | |||
1042 | if (IsFirstSubreg) | |||
1043 | Builder.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
1044 | ||||
1045 | Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
1046 | } | |||
1047 | } | |||
1048 | } | |||
1049 | ||||
1050 | int SIInstrInfo::commuteOpcode(unsigned Opcode) const { | |||
1051 | int NewOpc; | |||
1052 | ||||
1053 | // Try to map original to commuted opcode | |||
1054 | NewOpc = AMDGPU::getCommuteRev(Opcode); | |||
1055 | if (NewOpc != -1) | |||
1056 | // Check if the commuted (REV) opcode exists on the target. | |||
1057 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
1058 | ||||
1059 | // Try to map commuted to original opcode | |||
1060 | NewOpc = AMDGPU::getCommuteOrig(Opcode); | |||
1061 | if (NewOpc != -1) | |||
1062 | // Check if the original (non-REV) opcode exists on the target. | |||
1063 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
1064 | ||||
1065 | return Opcode; | |||
1066 | } | |||
1067 | ||||
1068 | void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, | |||
1069 | MachineBasicBlock::iterator MI, | |||
1070 | const DebugLoc &DL, Register DestReg, | |||
1071 | int64_t Value) const { | |||
1072 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
1073 | const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); | |||
1074 | if (RegClass == &AMDGPU::SReg_32RegClass || | |||
1075 | RegClass == &AMDGPU::SGPR_32RegClass || | |||
1076 | RegClass == &AMDGPU::SReg_32_XM0RegClass || | |||
1077 | RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { | |||
1078 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
1079 | .addImm(Value); | |||
1080 | return; | |||
1081 | } | |||
1082 | ||||
1083 | if (RegClass == &AMDGPU::SReg_64RegClass || | |||
1084 | RegClass == &AMDGPU::SGPR_64RegClass || | |||
1085 | RegClass == &AMDGPU::SReg_64_XEXECRegClass) { | |||
1086 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
1087 | .addImm(Value); | |||
1088 | return; | |||
1089 | } | |||
1090 | ||||
1091 | if (RegClass == &AMDGPU::VGPR_32RegClass) { | |||
1092 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) | |||
1093 | .addImm(Value); | |||
1094 | return; | |||
1095 | } | |||
1096 | if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { | |||
1097 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) | |||
1098 | .addImm(Value); | |||
1099 | return; | |||
1100 | } | |||
1101 | ||||
1102 | unsigned EltSize = 4; | |||
1103 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
1104 | if (RI.isSGPRClass(RegClass)) { | |||
1105 | if (RI.getRegSizeInBits(*RegClass) > 32) { | |||
1106 | Opcode = AMDGPU::S_MOV_B64; | |||
1107 | EltSize = 8; | |||
1108 | } else { | |||
1109 | Opcode = AMDGPU::S_MOV_B32; | |||
1110 | EltSize = 4; | |||
1111 | } | |||
1112 | } | |||
1113 | ||||
1114 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); | |||
1115 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
1116 | int64_t IdxValue = Idx == 0 ? Value : 0; | |||
1117 | ||||
1118 | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, | |||
1119 | get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); | |||
1120 | Builder.addImm(IdxValue); | |||
1121 | } | |||
1122 | } | |||
1123 | ||||
1124 | const TargetRegisterClass * | |||
1125 | SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { | |||
1126 | return &AMDGPU::VGPR_32RegClass; | |||
1127 | } | |||
1128 | ||||
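| // Lower a select on the given branch condition into a V_CNDMASK_B32 between | |||
| // FalseReg and TrueReg, materializing the condition mask in an SGPR first. | |||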
1129 | void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, | |||
1130 | MachineBasicBlock::iterator I, | |||
1131 | const DebugLoc &DL, Register DstReg, | |||
1132 | ArrayRef<MachineOperand> Cond, | |||
1133 | Register TrueReg, | |||
1134 | Register FalseReg) const { | |||
1135 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
1136 | const TargetRegisterClass *BoolXExecRC = | |||
1137 | RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
1138 | assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && | |||
1139 |        "Not a VGPR32 reg"); | |||
1140 | ||||
1141 | if (Cond.size() == 1) { | |||
1142 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1143 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1144 | .add(Cond[0]); | |||
1145 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1146 | .addImm(0) | |||
1147 | .addReg(FalseReg) | |||
1148 | .addImm(0) | |||
1149 | .addReg(TrueReg) | |||
1150 | .addReg(SReg); | |||
1151 | } else if (Cond.size() == 2) { | |||
1152 | assert(Cond[0].isImm() && "Cond[0] is not an immediate"); | |||
1153 | switch (Cond[0].getImm()) { | |||
1154 | case SIInstrInfo::SCC_TRUE: { | |||
1155 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1156 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1157 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1158 | .addImm(1) | |||
1159 | .addImm(0); | |||
1160 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1161 | .addImm(0) | |||
1162 | .addReg(FalseReg) | |||
1163 | .addImm(0) | |||
1164 | .addReg(TrueReg) | |||
1165 | .addReg(SReg); | |||
1166 | break; | |||
1167 | } | |||
1168 | case SIInstrInfo::SCC_FALSE: { | |||
1169 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1170 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1171 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1172 | .addImm(0) | |||
1173 | .addImm(1); | |||
1174 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1175 | .addImm(0) | |||
1176 | .addReg(FalseReg) | |||
1177 | .addImm(0) | |||
1178 | .addReg(TrueReg) | |||
1179 | .addReg(SReg); | |||
1180 | break; | |||
1181 | } | |||
1182 | case SIInstrInfo::VCCNZ: { | |||
1183 | MachineOperand RegOp = Cond[1]; | |||
1184 | RegOp.setImplicit(false); | |||
1185 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1186 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1187 | .add(RegOp); | |||
1188 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1189 | .addImm(0) | |||
1190 | .addReg(FalseReg) | |||
1191 | .addImm(0) | |||
1192 | .addReg(TrueReg) | |||
1193 | .addReg(SReg); | |||
1194 | break; | |||
1195 | } | |||
1196 | case SIInstrInfo::VCCZ: { | |||
1197 | MachineOperand RegOp = Cond[1]; | |||
1198 | RegOp.setImplicit(false); | |||
1199 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1200 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1201 | .add(RegOp); | |||
1202 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1203 | .addImm(0) | |||
1204 | .addReg(TrueReg) | |||
1205 | .addImm(0) | |||
1206 | .addReg(FalseReg) | |||
1207 | .addReg(SReg); | |||
1208 | break; | |||
1209 | } | |||
1210 | case SIInstrInfo::EXECNZ: { | |||
1211 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1212 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1213 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1214 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1215 | .addImm(0); | |||
1216 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1217 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1218 | .addImm(1) | |||
1219 | .addImm(0); | |||
1220 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1221 | .addImm(0) | |||
1222 | .addReg(FalseReg) | |||
1223 | .addImm(0) | |||
1224 | .addReg(TrueReg) | |||
1225 | .addReg(SReg); | |||
1226 | break; | |||
1227 | } | |||
1228 | case SIInstrInfo::EXECZ: { | |||
1229 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1230 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1231 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1232 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1233 | .addImm(0); | |||
1234 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1235 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1236 | .addImm(0) | |||
1237 | .addImm(1); | |||
1238 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1239 | .addImm(0) | |||
1240 | .addReg(FalseReg) | |||
1241 | .addImm(0) | |||
1242 | .addReg(TrueReg) | |||
1243 | .addReg(SReg); | |||
1244 | llvm_unreachable("Unhandled branch predicate EXECZ"); | |||
1245 | break; | |||
1246 | } | |||
1247 | default: | |||
1248 | llvm_unreachable("invalid branch predicate"); | |||
1249 | } | |||
1250 | } else { | |||
1251 | llvm_unreachable("Can only handle Cond size 1 or 2"); | |||
1252 | } | |||
1253 | } | |||
1254 | ||||
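| // insertEQ/insertNE emit a V_CMP of SrcReg against an immediate and return | |||
| // the virtual bool register holding the comparison mask. | |||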
1255 | Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, | |||
1256 | MachineBasicBlock::iterator I, | |||
1257 | const DebugLoc &DL, | |||
1258 | Register SrcReg, int Value) const { | |||
1259 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1260 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1261 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) | |||
1262 | .addImm(Value) | |||
1263 | .addReg(SrcReg); | |||
1264 | ||||
1265 | return Reg; | |||
1266 | } | |||
1267 | ||||
1268 | Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, | |||
1269 | MachineBasicBlock::iterator I, | |||
1270 | const DebugLoc &DL, | |||
1271 | Register SrcReg, int Value) const { | |||
1272 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1273 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1274 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) | |||
1275 | .addImm(Value) | |||
1276 | .addReg(SrcReg); | |||
1277 | ||||
1278 | return Reg; | |||
1279 | } | |||
1280 | ||||
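| // Pick the plain mov opcode for copies into DstRC; classes without a single | |||
| // mov (e.g. AGPRs) fall back to COPY. | |||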
1281 | unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { | |||
1282 | ||||
1283 | if (RI.isAGPRClass(DstRC)) | |||
1284 | return AMDGPU::COPY; | |||
1285 | if (RI.getRegSizeInBits(*DstRC) == 32) { | |||
1286 | return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; | |||
1287 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { | |||
1288 | return AMDGPU::S_MOV_B64; | |||
1289 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { | |||
1290 | return AMDGPU::V_MOV_B64_PSEUDO; | |||
1291 | } | |||
1292 | return AMDGPU::COPY; | |||
1293 | } | |||
1294 | ||||
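| // Map a vector register size in bits to the GPR-index indirect read or write | |||
| // pseudo that covers it. | |||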
1295 | const MCInstrDesc & | |||
1296 | SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, | |||
1297 | bool IsIndirectSrc) const { | |||
1298 | if (IsIndirectSrc) { | |||
1299 | if (VecSize <= 32) // 4 bytes | |||
1300 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); | |||
1301 | if (VecSize <= 64) // 8 bytes | |||
1302 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); | |||
1303 | if (VecSize <= 96) // 12 bytes | |||
1304 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); | |||
1305 | if (VecSize <= 128) // 16 bytes | |||
1306 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); | |||
1307 | if (VecSize <= 160) // 20 bytes | |||
1308 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); | |||
1309 | if (VecSize <= 256) // 32 bytes | |||
1310 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); | |||
1311 | if (VecSize <= 288) // 36 bytes | |||
1312 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9); | |||
1313 | if (VecSize <= 320) // 40 bytes | |||
1314 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10); | |||
1315 | if (VecSize <= 352) // 44 bytes | |||
1316 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11); | |||
1317 | if (VecSize <= 384) // 48 bytes | |||
1318 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12); | |||
1319 | if (VecSize <= 512) // 64 bytes | |||
1320 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); | |||
1321 | if (VecSize <= 1024) // 128 bytes | |||
1322 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); | |||
1323 | ||||
1324 | llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos"); | |||
1325 | } | |||
1326 | ||||
1327 | if (VecSize <= 32) // 4 bytes | |||
1328 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); | |||
1329 | if (VecSize <= 64) // 8 bytes | |||
1330 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); | |||
1331 | if (VecSize <= 96) // 12 bytes | |||
1332 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); | |||
1333 | if (VecSize <= 128) // 16 bytes | |||
1334 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); | |||
1335 | if (VecSize <= 160) // 20 bytes | |||
1336 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); | |||
1337 | if (VecSize <= 256) // 32 bytes | |||
1338 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); | |||
1339 | if (VecSize <= 288) // 36 bytes | |||
1340 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9); | |||
1341 | if (VecSize <= 320) // 40 bytes | |||
1342 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10); | |||
1343 | if (VecSize <= 352) // 44 bytes | |||
1344 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11); | |||
1345 | if (VecSize <= 384) // 48 bytes | |||
1346 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12); | |||
1347 | if (VecSize <= 512) // 64 bytes | |||
1348 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); | |||
1349 | if (VecSize <= 1024) // 128 bytes | |||
1350 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); | |||
1351 | ||||
1352 | llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos"); | |||
1353 | } | |||
1354 | ||||
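| // The MOVREL-based indirect-write pseudos below are selected the same way, | |||
| // by the total vector size in bits. | |||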
1355 | static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { | |||
1356 | if (VecSize <= 32) // 4 bytes | |||
1357 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1358 | if (VecSize <= 64) // 8 bytes | |||
1359 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1360 | if (VecSize <= 96) // 12 bytes | |||
1361 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1362 | if (VecSize <= 128) // 16 bytes | |||
1363 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1364 | if (VecSize <= 160) // 20 bytes | |||
1365 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1366 | if (VecSize <= 256) // 32 bytes | |||
1367 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1368 | if (VecSize <= 288) // 36 bytes | |||
1369 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9; | |||
1370 | if (VecSize <= 320) // 40 bytes | |||
1371 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10; | |||
1372 | if (VecSize <= 352) // 44 bytes | |||
1373 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11; | |||
1374 | if (VecSize <= 384) // 48 bytes | |||
1375 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12; | |||
1376 | if (VecSize <= 512) // 64 bytes | |||
1377 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1378 | if (VecSize <= 1024) // 128 bytes | |||
1379 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1380 | ||||
1381 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); | |||
1382 | } | |||
1383 | ||||
1384 | static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { | |||
1385 | if (VecSize <= 32) // 4 bytes | |||
1386 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1387 | if (VecSize <= 64) // 8 bytes | |||
1388 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1389 | if (VecSize <= 96) // 12 bytes | |||
1390 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1391 | if (VecSize <= 128) // 16 bytes | |||
1392 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1393 | if (VecSize <= 160) // 20 bytes | |||
1394 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1395 | if (VecSize <= 256) // 32 bytes | |||
1396 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1397 | if (VecSize <= 512) // 64 bytes | |||
1398 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1399 | if (VecSize <= 1024) // 128 bytes | |||
1400 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1401 | ||||
1402 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); | |||
1403 | } | |||
1404 | ||||
1405 | static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { | |||
1406 | if (VecSize <= 64) // 8 bytes | |||
1407 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; | |||
1408 | if (VecSize <= 128) // 16 bytes | |||
1409 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; | |||
1410 | if (VecSize <= 256) // 32 bytes | |||
1411 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; | |||
1412 | if (VecSize <= 512) // 64 bytes | |||
1413 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; | |||
1414 | if (VecSize <= 1024) // 128 bytes | |||
1415 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; | |||
1416 | ||||
1417 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); | |||
1418 | } | |||
1419 | ||||
1420 | const MCInstrDesc & | |||
1421 | SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, | |||
1422 | bool IsSGPR) const { | |||
1423 | if (IsSGPR) { | |||
1424 | switch (EltSize) { | |||
1425 | case 32: | |||
1426 | return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); | |||
1427 | case 64: | |||
1428 | return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); | |||
1429 | default: | |||
1430 | llvm_unreachable("invalid reg indexing elt size"); | |||
1431 | } | |||
1432 | } | |||
1433 | ||||
1434 | assert(EltSize == 32 && "invalid reg indexing elt size"); | |||
1435 | return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); | |||
1436 | } | |||
1437 | ||||
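| // Spill-save pseudo opcodes for SGPR, VGPR, AGPR and AV registers, keyed by | |||
| // the spill size in bytes. | |||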
1438 | static unsigned getSGPRSpillSaveOpcode(unsigned Size) { | |||
1439 | switch (Size) { | |||
1440 | case 4: | |||
1441 | return AMDGPU::SI_SPILL_S32_SAVE; | |||
1442 | case 8: | |||
1443 | return AMDGPU::SI_SPILL_S64_SAVE; | |||
1444 | case 12: | |||
1445 | return AMDGPU::SI_SPILL_S96_SAVE; | |||
1446 | case 16: | |||
1447 | return AMDGPU::SI_SPILL_S128_SAVE; | |||
1448 | case 20: | |||
1449 | return AMDGPU::SI_SPILL_S160_SAVE; | |||
1450 | case 24: | |||
1451 | return AMDGPU::SI_SPILL_S192_SAVE; | |||
1452 | case 28: | |||
1453 | return AMDGPU::SI_SPILL_S224_SAVE; | |||
1454 | case 32: | |||
1455 | return AMDGPU::SI_SPILL_S256_SAVE; | |||
1456 | case 36: | |||
1457 | return AMDGPU::SI_SPILL_S288_SAVE; | |||
1458 | case 40: | |||
1459 | return AMDGPU::SI_SPILL_S320_SAVE; | |||
1460 | case 44: | |||
1461 | return AMDGPU::SI_SPILL_S352_SAVE; | |||
1462 | case 48: | |||
1463 | return AMDGPU::SI_SPILL_S384_SAVE; | |||
1464 | case 64: | |||
1465 | return AMDGPU::SI_SPILL_S512_SAVE; | |||
1466 | case 128: | |||
1467 | return AMDGPU::SI_SPILL_S1024_SAVE; | |||
1468 | default: | |||
1469 | llvm_unreachable("unknown register size"); | |||
1470 | } | |||
1471 | } | |||
1472 | ||||
1473 | static unsigned getVGPRSpillSaveOpcode(unsigned Size) { | |||
1474 | switch (Size) { | |||
1475 | case 4: | |||
1476 | return AMDGPU::SI_SPILL_V32_SAVE; | |||
1477 | case 8: | |||
1478 | return AMDGPU::SI_SPILL_V64_SAVE; | |||
1479 | case 12: | |||
1480 | return AMDGPU::SI_SPILL_V96_SAVE; | |||
1481 | case 16: | |||
1482 | return AMDGPU::SI_SPILL_V128_SAVE; | |||
1483 | case 20: | |||
1484 | return AMDGPU::SI_SPILL_V160_SAVE; | |||
1485 | case 24: | |||
1486 | return AMDGPU::SI_SPILL_V192_SAVE; | |||
1487 | case 28: | |||
1488 | return AMDGPU::SI_SPILL_V224_SAVE; | |||
1489 | case 32: | |||
1490 | return AMDGPU::SI_SPILL_V256_SAVE; | |||
1491 | case 36: | |||
1492 | return AMDGPU::SI_SPILL_V288_SAVE; | |||
1493 | case 40: | |||
1494 | return AMDGPU::SI_SPILL_V320_SAVE; | |||
1495 | case 44: | |||
1496 | return AMDGPU::SI_SPILL_V352_SAVE; | |||
1497 | case 48: | |||
1498 | return AMDGPU::SI_SPILL_V384_SAVE; | |||
1499 | case 64: | |||
1500 | return AMDGPU::SI_SPILL_V512_SAVE; | |||
1501 | case 128: | |||
1502 | return AMDGPU::SI_SPILL_V1024_SAVE; | |||
1503 | default: | |||
1504 | llvm_unreachable("unknown register size"); | |||
1505 | } | |||
1506 | } | |||
1507 | ||||
1508 | static unsigned getAGPRSpillSaveOpcode(unsigned Size) { | |||
1509 | switch (Size) { | |||
1510 | case 4: | |||
1511 | return AMDGPU::SI_SPILL_A32_SAVE; | |||
1512 | case 8: | |||
1513 | return AMDGPU::SI_SPILL_A64_SAVE; | |||
1514 | case 12: | |||
1515 | return AMDGPU::SI_SPILL_A96_SAVE; | |||
1516 | case 16: | |||
1517 | return AMDGPU::SI_SPILL_A128_SAVE; | |||
1518 | case 20: | |||
1519 | return AMDGPU::SI_SPILL_A160_SAVE; | |||
1520 | case 24: | |||
1521 | return AMDGPU::SI_SPILL_A192_SAVE; | |||
1522 | case 28: | |||
1523 | return AMDGPU::SI_SPILL_A224_SAVE; | |||
1524 | case 32: | |||
1525 | return AMDGPU::SI_SPILL_A256_SAVE; | |||
1526 | case 36: | |||
1527 | return AMDGPU::SI_SPILL_A288_SAVE; | |||
1528 | case 40: | |||
1529 | return AMDGPU::SI_SPILL_A320_SAVE; | |||
1530 | case 44: | |||
1531 | return AMDGPU::SI_SPILL_A352_SAVE; | |||
1532 | case 48: | |||
1533 | return AMDGPU::SI_SPILL_A384_SAVE; | |||
1534 | case 64: | |||
1535 | return AMDGPU::SI_SPILL_A512_SAVE; | |||
1536 | case 128: | |||
1537 | return AMDGPU::SI_SPILL_A1024_SAVE; | |||
1538 | default: | |||
1539 | llvm_unreachable("unknown register size"); | |||
1540 | } | |||
1541 | } | |||
1542 | ||||
1543 | static unsigned getAVSpillSaveOpcode(unsigned Size) { | |||
1544 | switch (Size) { | |||
1545 | case 4: | |||
1546 | return AMDGPU::SI_SPILL_AV32_SAVE; | |||
1547 | case 8: | |||
1548 | return AMDGPU::SI_SPILL_AV64_SAVE; | |||
1549 | case 12: | |||
1550 | return AMDGPU::SI_SPILL_AV96_SAVE; | |||
1551 | case 16: | |||
1552 | return AMDGPU::SI_SPILL_AV128_SAVE; | |||
1553 | case 20: | |||
1554 | return AMDGPU::SI_SPILL_AV160_SAVE; | |||
1555 | case 24: | |||
1556 | return AMDGPU::SI_SPILL_AV192_SAVE; | |||
1557 | case 28: | |||
1558 | return AMDGPU::SI_SPILL_AV224_SAVE; | |||
1559 | case 32: | |||
1560 | return AMDGPU::SI_SPILL_AV256_SAVE; | |||
1561 | case 36: | |||
1562 | return AMDGPU::SI_SPILL_AV288_SAVE; | |||
1563 | case 40: | |||
1564 | return AMDGPU::SI_SPILL_AV320_SAVE; | |||
1565 | case 44: | |||
1566 | return AMDGPU::SI_SPILL_AV352_SAVE; | |||
1567 | case 48: | |||
1568 | return AMDGPU::SI_SPILL_AV384_SAVE; | |||
1569 | case 64: | |||
1570 | return AMDGPU::SI_SPILL_AV512_SAVE; | |||
1571 | case 128: | |||
1572 | return AMDGPU::SI_SPILL_AV1024_SAVE; | |||
1573 | default: | |||
1574 | llvm_unreachable("unknown register size"); | |||
1575 | } | |||
1576 | } | |||
1577 | ||||
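| // Spill SrcReg to the stack slot at FrameIndex: SGPRs go through the SGPR | |||
| // spill pseudos (optionally lowered to VGPR lanes), vector registers through | |||
| // the VGPR/AGPR/AV save pseudos. | |||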
1578 | void SIInstrInfo::storeRegToStackSlot( | |||
1579 | MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, | |||
1580 | bool isKill, int FrameIndex, const TargetRegisterClass *RC, | |||
1581 | const TargetRegisterInfo *TRI, Register VReg) const { | |||
1582 | MachineFunction *MF = MBB.getParent(); | |||
1583 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1584 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1585 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1586 | ||||
1587 | MachinePointerInfo PtrInfo | |||
1588 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1589 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1590 | PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), | |||
1591 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1592 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1593 | ||||
1594 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1595 | if (RI.isSGPRClass(RC)) { | |||
1596 | MFI->setHasSpilledSGPRs(); | |||
1597 | assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); | |||
1598 | assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && | |||
1599 |        SrcReg != AMDGPU::EXEC && "exec should not be spilled"); | |||
1600 | ||||
1601 | // We are only allowed to create one new instruction when spilling | |||
1602 | // registers, so we need to use a pseudo instruction for spilling SGPRs. | |||
1603 | const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); | |||
1604 | ||||
1605 | // The SGPR spill/restore instructions only work on numbered SGPRs, so we need | |||
1606 | // to make sure we are using the correct register class. | |||
1607 | if (SrcReg.isVirtual() && SpillSize == 4) { | |||
1608 | MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1609 | } | |||
1610 | ||||
1611 | BuildMI(MBB, MI, DL, OpDesc) | |||
1612 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1613 | .addFrameIndex(FrameIndex) // addr | |||
1614 | .addMemOperand(MMO) | |||
1615 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1616 | ||||
1617 | if (RI.spillSGPRToVGPR()) | |||
1618 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1619 | return; | |||
1620 | } | |||
1621 | ||||
1622 | unsigned Opcode = RI.isVectorSuperClass(RC) | |||
1623 | ? getAVSpillSaveOpcode(SpillSize) | |||
1624 | : RI.isAGPRClass(RC) | |||
1625 | ? getAGPRSpillSaveOpcode(SpillSize) | |||
1626 | : getVGPRSpillSaveOpcode(SpillSize); | |||
1627 | MFI->setHasSpilledVGPRs(); | |||
1628 | ||||
1629 | BuildMI(MBB, MI, DL, get(Opcode)) | |||
1630 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1631 | .addFrameIndex(FrameIndex) // addr | |||
1632 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1633 | .addImm(0) // offset | |||
1634 | .addMemOperand(MMO); | |||
1635 | } | |||
1636 | ||||
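| // Matching restore pseudo opcodes, again keyed by the spill size in bytes. | |||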
1637 | static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { | |||
1638 | switch (Size) { | |||
1639 | case 4: | |||
1640 | return AMDGPU::SI_SPILL_S32_RESTORE; | |||
1641 | case 8: | |||
1642 | return AMDGPU::SI_SPILL_S64_RESTORE; | |||
1643 | case 12: | |||
1644 | return AMDGPU::SI_SPILL_S96_RESTORE; | |||
1645 | case 16: | |||
1646 | return AMDGPU::SI_SPILL_S128_RESTORE; | |||
1647 | case 20: | |||
1648 | return AMDGPU::SI_SPILL_S160_RESTORE; | |||
1649 | case 24: | |||
1650 | return AMDGPU::SI_SPILL_S192_RESTORE; | |||
1651 | case 28: | |||
1652 | return AMDGPU::SI_SPILL_S224_RESTORE; | |||
1653 | case 32: | |||
1654 | return AMDGPU::SI_SPILL_S256_RESTORE; | |||
1655 | case 36: | |||
1656 | return AMDGPU::SI_SPILL_S288_RESTORE; | |||
1657 | case 40: | |||
1658 | return AMDGPU::SI_SPILL_S320_RESTORE; | |||
1659 | case 44: | |||
1660 | return AMDGPU::SI_SPILL_S352_RESTORE; | |||
1661 | case 48: | |||
1662 | return AMDGPU::SI_SPILL_S384_RESTORE; | |||
1663 | case 64: | |||
1664 | return AMDGPU::SI_SPILL_S512_RESTORE; | |||
1665 | case 128: | |||
1666 | return AMDGPU::SI_SPILL_S1024_RESTORE; | |||
1667 | default: | |||
1668 | llvm_unreachable("unknown register size"); | |||
1669 | } | |||
1670 | } | |||
1671 | ||||
1672 | static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { | |||
1673 | switch (Size) { | |||
1674 | case 4: | |||
1675 | return AMDGPU::SI_SPILL_V32_RESTORE; | |||
1676 | case 8: | |||
1677 | return AMDGPU::SI_SPILL_V64_RESTORE; | |||
1678 | case 12: | |||
1679 | return AMDGPU::SI_SPILL_V96_RESTORE; | |||
1680 | case 16: | |||
1681 | return AMDGPU::SI_SPILL_V128_RESTORE; | |||
1682 | case 20: | |||
1683 | return AMDGPU::SI_SPILL_V160_RESTORE; | |||
1684 | case 24: | |||
1685 | return AMDGPU::SI_SPILL_V192_RESTORE; | |||
1686 | case 28: | |||
1687 | return AMDGPU::SI_SPILL_V224_RESTORE; | |||
1688 | case 32: | |||
1689 | return AMDGPU::SI_SPILL_V256_RESTORE; | |||
1690 | case 36: | |||
1691 | return AMDGPU::SI_SPILL_V288_RESTORE; | |||
1692 | case 40: | |||
1693 | return AMDGPU::SI_SPILL_V320_RESTORE; | |||
1694 | case 44: | |||
1695 | return AMDGPU::SI_SPILL_V352_RESTORE; | |||
1696 | case 48: | |||
1697 | return AMDGPU::SI_SPILL_V384_RESTORE; | |||
1698 | case 64: | |||
1699 | return AMDGPU::SI_SPILL_V512_RESTORE; | |||
1700 | case 128: | |||
1701 | return AMDGPU::SI_SPILL_V1024_RESTORE; | |||
1702 | default: | |||
1703 | llvm_unreachable("unknown register size"); | |||
1704 | } | |||
1705 | } | |||
1706 | ||||
1707 | static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { | |||
1708 | switch (Size) { | |||
1709 | case 4: | |||
1710 | return AMDGPU::SI_SPILL_A32_RESTORE; | |||
1711 | case 8: | |||
1712 | return AMDGPU::SI_SPILL_A64_RESTORE; | |||
1713 | case 12: | |||
1714 | return AMDGPU::SI_SPILL_A96_RESTORE; | |||
1715 | case 16: | |||
1716 | return AMDGPU::SI_SPILL_A128_RESTORE; | |||
1717 | case 20: | |||
1718 | return AMDGPU::SI_SPILL_A160_RESTORE; | |||
1719 | case 24: | |||
1720 | return AMDGPU::SI_SPILL_A192_RESTORE; | |||
1721 | case 28: | |||
1722 | return AMDGPU::SI_SPILL_A224_RESTORE; | |||
1723 | case 32: | |||
1724 | return AMDGPU::SI_SPILL_A256_RESTORE; | |||
1725 | case 36: | |||
1726 | return AMDGPU::SI_SPILL_A288_RESTORE; | |||
1727 | case 40: | |||
1728 | return AMDGPU::SI_SPILL_A320_RESTORE; | |||
1729 | case 44: | |||
1730 | return AMDGPU::SI_SPILL_A352_RESTORE; | |||
1731 | case 48: | |||
1732 | return AMDGPU::SI_SPILL_A384_RESTORE; | |||
1733 | case 64: | |||
1734 | return AMDGPU::SI_SPILL_A512_RESTORE; | |||
1735 | case 128: | |||
1736 | return AMDGPU::SI_SPILL_A1024_RESTORE; | |||
1737 | default: | |||
1738 | llvm_unreachable("unknown register size"); | |||
1739 | } | |||
1740 | } | |||
1741 | ||||
1742 | static unsigned getAVSpillRestoreOpcode(unsigned Size) { | |||
1743 | switch (Size) { | |||
1744 | case 4: | |||
1745 | return AMDGPU::SI_SPILL_AV32_RESTORE; | |||
1746 | case 8: | |||
1747 | return AMDGPU::SI_SPILL_AV64_RESTORE; | |||
1748 | case 12: | |||
1749 | return AMDGPU::SI_SPILL_AV96_RESTORE; | |||
1750 | case 16: | |||
1751 | return AMDGPU::SI_SPILL_AV128_RESTORE; | |||
1752 | case 20: | |||
1753 | return AMDGPU::SI_SPILL_AV160_RESTORE; | |||
1754 | case 24: | |||
1755 | return AMDGPU::SI_SPILL_AV192_RESTORE; | |||
1756 | case 28: | |||
1757 | return AMDGPU::SI_SPILL_AV224_RESTORE; | |||
1758 | case 32: | |||
1759 | return AMDGPU::SI_SPILL_AV256_RESTORE; | |||
1760 | case 36: | |||
1761 | return AMDGPU::SI_SPILL_AV288_RESTORE; | |||
1762 | case 40: | |||
1763 | return AMDGPU::SI_SPILL_AV320_RESTORE; | |||
1764 | case 44: | |||
1765 | return AMDGPU::SI_SPILL_AV352_RESTORE; | |||
1766 | case 48: | |||
1767 | return AMDGPU::SI_SPILL_AV384_RESTORE; | |||
1768 | case 64: | |||
1769 | return AMDGPU::SI_SPILL_AV512_RESTORE; | |||
1770 | case 128: | |||
1771 | return AMDGPU::SI_SPILL_AV1024_RESTORE; | |||
1772 | default: | |||
1773 | llvm_unreachable("unknown register size"); | |||
1774 | } | |||
1775 | } | |||
1776 | ||||
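| // Reload DestReg from the stack slot at FrameIndex, mirroring | |||
| // storeRegToStackSlot above. | |||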
1777 | void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, | |||
1778 | MachineBasicBlock::iterator MI, | |||
1779 | Register DestReg, int FrameIndex, | |||
1780 | const TargetRegisterClass *RC, | |||
1781 | const TargetRegisterInfo *TRI, | |||
1782 | Register VReg) const { | |||
1783 | MachineFunction *MF = MBB.getParent(); | |||
1784 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1785 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1786 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1787 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1788 | ||||
1789 | MachinePointerInfo PtrInfo | |||
1790 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1791 | ||||
1792 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1793 | PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), | |||
1794 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1795 | ||||
1796 | if (RI.isSGPRClass(RC)) { | |||
1797 | MFI->setHasSpilledSGPRs(); | |||
1798 | assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); | |||
1799 | assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && | |||
1800 |        DestReg != AMDGPU::EXEC && "exec should not be spilled"); | |||
1801 | ||||
1802 | // FIXME: Maybe this should not include a memoperand because it will be | |||
1803 | // lowered to non-memory instructions. | |||
1804 | const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); | |||
1805 | if (DestReg.isVirtual() && SpillSize == 4) { | |||
1806 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1807 | MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1808 | } | |||
1809 | ||||
1810 | if (RI.spillSGPRToVGPR()) | |||
1811 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1812 | BuildMI(MBB, MI, DL, OpDesc, DestReg) | |||
1813 | .addFrameIndex(FrameIndex) // addr | |||
1814 | .addMemOperand(MMO) | |||
1815 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1816 | ||||
1817 | return; | |||
1818 | } | |||
1819 | ||||
1820 | unsigned Opcode = RI.isVectorSuperClass(RC) | |||
1821 | ? getAVSpillRestoreOpcode(SpillSize) | |||
1822 | : RI.isAGPRClass(RC) | |||
1823 | ? getAGPRSpillRestoreOpcode(SpillSize) | |||
1824 | : getVGPRSpillRestoreOpcode(SpillSize); | |||
1825 | BuildMI(MBB, MI, DL, get(Opcode), DestReg) | |||
1826 | .addFrameIndex(FrameIndex) // vaddr | |||
1827 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1828 | .addImm(0) // offset | |||
1829 | .addMemOperand(MMO); | |||
1830 | } | |||
1831 | ||||
1832 | void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, | |||
1833 | MachineBasicBlock::iterator MI) const { | |||
1834 | insertNoops(MBB, MI, 1); | |||
1835 | } | |||
1836 | ||||
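| // S_NOP encodes up to 8 wait states per instruction (imm = count - 1), so | |||
| // larger requests are emitted as a sequence of S_NOPs. | |||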
1837 | void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, | |||
1838 | MachineBasicBlock::iterator MI, | |||
1839 | unsigned Quantity) const { | |||
1840 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1841 | while (Quantity > 0) { | |||
1842 | unsigned Arg = std::min(Quantity, 8u); | |||
1843 | Quantity -= Arg; | |||
1844 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); | |||
1845 | } | |||
1846 | } | |||
1847 | ||||
1848 | void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { | |||
1849 | auto MF = MBB.getParent(); | |||
1850 | SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); | |||
1851 | ||||
1852 | assert(Info->isEntryFunction()); | |||
1853 | ||||
1854 | if (MBB.succ_empty()) { | |||
1855 | bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); | |||
1856 | if (HasNoTerminator) { | |||
1857 | if (Info->returnsVoid()) { | |||
1858 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); | |||
1859 | } else { | |||
1860 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); | |||
1861 | } | |||
1862 | } | |||
1863 | } | |||
1864 | } | |||
1865 | ||||
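| // Wait states occupied by an instruction: meta instructions take none, S_NOP | |||
| // takes its immediate plus one, everything else currently counts as one. | |||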
1866 | unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { | |||
1867 | switch (MI.getOpcode()) { | |||
1868 | default: | |||
1869 | if (MI.isMetaInstruction()) | |||
1870 | return 0; | |||
1871 | return 1; // FIXME: Do wait states equal cycles? | |||
1872 | ||||
1873 | case AMDGPU::S_NOP: | |||
1874 | return MI.getOperand(0).getImm() + 1; | |||
1875 | // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The | |||
1876 | // hazard, even if one exists, won't really be visible. Should we handle it? | |||
1877 | } | |||
1878 | } | |||
1879 | ||||
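| // Expand pseudos that survive until after register allocation: terminator | |||
| // moves, 64-bit immediate/DPP moves, indirect register reads/writes, WWM/WQM | |||
| // markers and returns. | |||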
1880 | bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { | |||
1881 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
1882 | MachineBasicBlock &MBB = *MI.getParent(); | |||
1883 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1884 | switch (MI.getOpcode()) { | |||
1885 | default: return TargetInstrInfo::expandPostRAPseudo(MI); | |||
1886 | case AMDGPU::S_MOV_B64_term: | |||
1887 | // This is only a terminator to get the correct spill code placement during | |||
1888 | // register allocation. | |||
1889 | MI.setDesc(get(AMDGPU::S_MOV_B64)); | |||
1890 | break; | |||
1891 | ||||
1892 | case AMDGPU::S_MOV_B32_term: | |||
1893 | // This is only a terminator to get the correct spill code placement during | |||
1894 | // register allocation. | |||
1895 | MI.setDesc(get(AMDGPU::S_MOV_B32)); | |||
1896 | break; | |||
1897 | ||||
1898 | case AMDGPU::S_XOR_B64_term: | |||
1899 | // This is only a terminator to get the correct spill code placement during | |||
1900 | // register allocation. | |||
1901 | MI.setDesc(get(AMDGPU::S_XOR_B64)); | |||
1902 | break; | |||
1903 | ||||
1904 | case AMDGPU::S_XOR_B32_term: | |||
1905 | // This is only a terminator to get the correct spill code placement during | |||
1906 | // register allocation. | |||
1907 | MI.setDesc(get(AMDGPU::S_XOR_B32)); | |||
1908 | break; | |||
1909 | case AMDGPU::S_OR_B64_term: | |||
1910 | // This is only a terminator to get the correct spill code placement during | |||
1911 | // register allocation. | |||
1912 | MI.setDesc(get(AMDGPU::S_OR_B64)); | |||
1913 | break; | |||
1914 | case AMDGPU::S_OR_B32_term: | |||
1915 | // This is only a terminator to get the correct spill code placement during | |||
1916 | // register allocation. | |||
1917 | MI.setDesc(get(AMDGPU::S_OR_B32)); | |||
1918 | break; | |||
1919 | ||||
1920 | case AMDGPU::S_ANDN2_B64_term: | |||
1921 | // This is only a terminator to get the correct spill code placement during | |||
1922 | // register allocation. | |||
1923 | MI.setDesc(get(AMDGPU::S_ANDN2_B64)); | |||
1924 | break; | |||
1925 | ||||
1926 | case AMDGPU::S_ANDN2_B32_term: | |||
1927 | // This is only a terminator to get the correct spill code placement during | |||
1928 | // register allocation. | |||
1929 | MI.setDesc(get(AMDGPU::S_ANDN2_B32)); | |||
1930 | break; | |||
1931 | ||||
1932 | case AMDGPU::S_AND_B64_term: | |||
1933 | // This is only a terminator to get the correct spill code placement during | |||
1934 | // register allocation. | |||
1935 | MI.setDesc(get(AMDGPU::S_AND_B64)); | |||
1936 | break; | |||
1937 | ||||
1938 | case AMDGPU::S_AND_B32_term: | |||
1939 | // This is only a terminator to get the correct spill code placement during | |||
1940 | // register allocation. | |||
1941 | MI.setDesc(get(AMDGPU::S_AND_B32)); | |||
1942 | break; | |||
1943 | ||||
1944 | case AMDGPU::V_MOV_B64_PSEUDO: { | |||
1945 | Register Dst = MI.getOperand(0).getReg(); | |||
1946 | Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); | |||
1947 | Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); | |||
1948 | ||||
1949 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
1950 | // FIXME: Will this work for 64-bit floating point immediates? | |||
1951 | assert(!SrcOp.isFPImm()); | |||
1952 | if (ST.hasMovB64()) { | |||
1953 | MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); | |||
1954 | if (SrcOp.isReg() || isInlineConstant(MI, 1) || | |||
1955 | isUInt<32>(SrcOp.getImm())) | |||
1956 | break; | |||
1957 | } | |||
1958 | if (SrcOp.isImm()) { | |||
1959 | APInt Imm(64, SrcOp.getImm()); | |||
1960 | APInt Lo(32, Imm.getLoBits(32).getZExtValue()); | |||
1961 | APInt Hi(32, Imm.getHiBits(32).getZExtValue()); | |||
1962 | if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { | |||
1963 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) | |||
1964 | .addImm(SISrcMods::OP_SEL_1) | |||
1965 | .addImm(Lo.getSExtValue()) | |||
1966 | .addImm(SISrcMods::OP_SEL_1) | |||
1967 | .addImm(Lo.getSExtValue()) | |||
1968 | .addImm(0) // op_sel_lo | |||
1969 | .addImm(0) // op_sel_hi | |||
1970 | .addImm(0) // neg_lo | |||
1971 | .addImm(0) // neg_hi | |||
1972 | .addImm(0); // clamp | |||
1973 | } else { | |||
1974 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1975 | .addImm(Lo.getSExtValue()) | |||
1976 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1977 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1978 | .addImm(Hi.getSExtValue()) | |||
1979 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1980 | } | |||
1981 | } else { | |||
1982 | assert(SrcOp.isReg()); | |||
1983 | if (ST.hasPackedFP32Ops() && | |||
1984 | !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { | |||
1985 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) | |||
1986 | .addImm(SISrcMods::OP_SEL_1) // src0_mod | |||
1987 | .addReg(SrcOp.getReg()) | |||
1988 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod | |||
1989 | .addReg(SrcOp.getReg()) | |||
1990 | .addImm(0) // op_sel_lo | |||
1991 | .addImm(0) // op_sel_hi | |||
1992 | .addImm(0) // neg_lo | |||
1993 | .addImm(0) // neg_hi | |||
1994 | .addImm(0); // clamp | |||
1995 | } else { | |||
1996 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1997 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) | |||
1998 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1999 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
2000 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) | |||
2001 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
2002 | } | |||
2003 | } | |||
2004 | MI.eraseFromParent(); | |||
2005 | break; | |||
2006 | } | |||
2007 | case AMDGPU::V_MOV_B64_DPP_PSEUDO: { | |||
2008 | expandMovDPP64(MI); | |||
2009 | break; | |||
2010 | } | |||
2011 | case AMDGPU::S_MOV_B64_IMM_PSEUDO: { | |||
2012 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
2013 | assert(!SrcOp.isFPImm()); | |||
2014 | APInt Imm(64, SrcOp.getImm()); | |||
2015 | if (Imm.isIntN(32) || isInlineConstant(Imm)) { | |||
2016 | MI.setDesc(get(AMDGPU::S_MOV_B64)); | |||
2017 | break; | |||
2018 | } | |||
2019 | ||||
2020 | Register Dst = MI.getOperand(0).getReg(); | |||
2021 | Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); | |||
2022 | Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); | |||
2023 | ||||
2024 | APInt Lo(32, Imm.getLoBits(32).getZExtValue()); | |||
2025 | APInt Hi(32, Imm.getHiBits(32).getZExtValue()); | |||
2026 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) | |||
2027 | .addImm(Lo.getSExtValue()) | |||
2028 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
2029 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) | |||
2030 | .addImm(Hi.getSExtValue()) | |||
2031 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
2032 | MI.eraseFromParent(); | |||
2033 | break; | |||
2034 | } | |||
2035 | case AMDGPU::V_SET_INACTIVE_B32: { | |||
2036 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
2037 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
2038 | // FIXME: We may be able to optimize the COPY once we find ways to make LLVM | |||
2039 | // optimizations (mainly Register Coalescer) aware of WWM register liveness. | |||
2040 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) | |||
2041 | .add(MI.getOperand(1)); | |||
2042 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
2043 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
2044 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) | |||
2045 | .add(MI.getOperand(2)); | |||
2046 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
2047 | .addReg(Exec); | |||
2048 | MI.eraseFromParent(); | |||
2049 | break; | |||
2050 | } | |||
2051 | case AMDGPU::V_SET_INACTIVE_B64: { | |||
2052 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
2053 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
2054 | MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), | |||
2055 | MI.getOperand(0).getReg()) | |||
2056 | .add(MI.getOperand(1)); | |||
2057 | expandPostRAPseudo(*Copy); | |||
2058 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
2059 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
2060 | Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), | |||
2061 | MI.getOperand(0).getReg()) | |||
2062 | .add(MI.getOperand(2)); | |||
2063 | expandPostRAPseudo(*Copy); | |||
2064 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
2065 | .addReg(Exec); | |||
2066 | MI.eraseFromParent(); | |||
2067 | break; | |||
2068 | } | |||
2069 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
2070 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
2071 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
2072 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
2073 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
2074 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
2075 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9: | |||
2076 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10: | |||
2077 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11: | |||
2078 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12: | |||
2079 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
2080 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
2081 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
2082 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
2083 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
2084 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
2085 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
2086 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
2087 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
2088 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
2089 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: | |||
2090 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: | |||
2091 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: | |||
2092 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: | |||
2093 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { | |||
2094 | const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); | |||
2095 | ||||
2096 | unsigned Opc; | |||
2097 | if (RI.hasVGPRs(EltRC)) { | |||
2098 | Opc = AMDGPU::V_MOVRELD_B32_e32; | |||
2099 | } else { | |||
2100 | Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 | |||
2101 | : AMDGPU::S_MOVRELD_B32; | |||
2102 | } | |||
2103 | ||||
2104 | const MCInstrDesc &OpDesc = get(Opc); | |||
2105 | Register VecReg = MI.getOperand(0).getReg(); | |||
2106 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
2107 | unsigned SubReg = MI.getOperand(3).getImm(); | |||
2108 | assert(VecReg == MI.getOperand(1).getReg()); | |||
2109 | ||||
2110 | MachineInstrBuilder MIB = | |||
2111 | BuildMI(MBB, MI, DL, OpDesc) | |||
2112 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
2113 | .add(MI.getOperand(2)) | |||
2114 | .addReg(VecReg, RegState::ImplicitDefine) | |||
2115 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
2116 | ||||
2117 | const int ImpDefIdx = | |||
2118 | OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); | |||
2119 | const int ImpUseIdx = ImpDefIdx + 1; | |||
2120 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
2121 | MI.eraseFromParent(); | |||
2122 | break; | |||
2123 | } | |||
2124 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: | |||
2125 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: | |||
2126 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: | |||
2127 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: | |||
2128 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: | |||
2129 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: | |||
2130 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9: | |||
2131 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10: | |||
2132 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11: | |||
2133 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12: | |||
2134 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: | |||
2135 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { | |||
2136 | assert(ST.useVGPRIndexMode()); | |||
2137 | Register VecReg = MI.getOperand(0).getReg(); | |||
2138 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
2139 | Register Idx = MI.getOperand(3).getReg(); | |||
2140 | Register SubReg = MI.getOperand(4).getImm(); | |||
2141 | ||||
2142 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
2143 | .addReg(Idx) | |||
2144 | .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); | |||
2145 | SetOn->getOperand(3).setIsUndef(); | |||
2146 | ||||
2147 | const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write); | |||
2148 | MachineInstrBuilder MIB = | |||
2149 | BuildMI(MBB, MI, DL, OpDesc) | |||
2150 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
2151 | .add(MI.getOperand(2)) | |||
2152 | .addReg(VecReg, RegState::ImplicitDefine) | |||
2153 | .addReg(VecReg, | |||
2154 | RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
2155 | ||||
2156 | const int ImpDefIdx = | |||
2157 | OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); | |||
2158 | const int ImpUseIdx = ImpDefIdx + 1; | |||
2159 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
2160 | ||||
2161 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
2162 | ||||
2163 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
2164 | ||||
2165 | MI.eraseFromParent(); | |||
2166 | break; | |||
2167 | } | |||
2168 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: | |||
2169 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: | |||
2170 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: | |||
2171 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: | |||
2172 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: | |||
2173 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: | |||
2174 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9: | |||
2175 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10: | |||
2176 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11: | |||
2177 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12: | |||
2178 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: | |||
2179 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { | |||
2180 | assert(ST.useVGPRIndexMode()); | |||
2181 | Register Dst = MI.getOperand(0).getReg(); | |||
2182 | Register VecReg = MI.getOperand(1).getReg(); | |||
2183 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
2184 | Register Idx = MI.getOperand(2).getReg(); | |||
2185 | Register SubReg = MI.getOperand(3).getImm(); | |||
2186 | ||||
2187 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
2188 | .addReg(Idx) | |||
2189 | .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); | |||
2190 | SetOn->getOperand(3).setIsUndef(); | |||
2191 | ||||
2192 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read)) | |||
2193 | .addDef(Dst) | |||
2194 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
2195 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
2196 | ||||
2197 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
2198 | ||||
2199 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
2200 | ||||
2201 | MI.eraseFromParent(); | |||
2202 | break; | |||
2203 | } | |||
2204 | case AMDGPU::SI_PC_ADD_REL_OFFSET: { | |||
2205 | MachineFunction &MF = *MBB.getParent(); | |||
2206 | Register Reg = MI.getOperand(0).getReg(); | |||
2207 | Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); | |||
2208 | Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); | |||
2209 | ||||
2210 | // Create a bundle so these instructions won't be re-ordered by the | |||
2211 | // post-RA scheduler. | |||
2212 | MIBundleBuilder Bundler(MBB, MI); | |||
2213 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); | |||
2214 | ||||
2215 | // Add 32-bit offset from this instruction to the start of the | |||
2216 | // constant data. | |||
2217 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) | |||
2218 | .addReg(RegLo) | |||
2219 | .add(MI.getOperand(1))); | |||
2220 | ||||
2221 | MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) | |||
2222 | .addReg(RegHi); | |||
2223 | MIB.add(MI.getOperand(2)); | |||
2224 | ||||
2225 | Bundler.append(MIB); | |||
2226 | finalizeBundle(MBB, Bundler.begin()); | |||
2227 | ||||
2228 | MI.eraseFromParent(); | |||
2229 | break; | |||
2230 | } | |||
2231 | case AMDGPU::ENTER_STRICT_WWM: { | |||
2232 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
2233 | // Whole Wave Mode is entered. | |||
2234 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
2235 | : AMDGPU::S_OR_SAVEEXEC_B64)); | |||
2236 | break; | |||
2237 | } | |||
2238 | case AMDGPU::ENTER_STRICT_WQM: { | |||
2239 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
2240 | // STRICT_WQM is entered. | |||
2241 | const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
2242 | const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; | |||
2243 | const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
2244 | BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); | |||
2245 | BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); | |||
2246 | ||||
2247 | MI.eraseFromParent(); | |||
2248 | break; | |||
2249 | } | |||
2250 | case AMDGPU::EXIT_STRICT_WWM: | |||
2251 | case AMDGPU::EXIT_STRICT_WQM: { | |||
2252 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
2253 | // WWM/STRICT_WQM is exited. | |||
2254 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); | |||
2255 | break; | |||
2256 | } | |||
2257 | case AMDGPU::ENTER_PSEUDO_WM: | |||
2258 | case AMDGPU::EXIT_PSEUDO_WM: { | |||
2259 | // These do nothing. | |||
2260 | MI.eraseFromParent(); | |||
2261 | break; | |||
2262 | } | |||
2263 | case AMDGPU::SI_RETURN: { | |||
2264 | const MachineFunction *MF = MBB.getParent(); | |||
2265 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); | |||
2266 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
2267 | // Hiding the return address use with SI_RETURN may lead to extra kills in | |||
2268 | // the function and missing live-ins. We are fine in practice because callee | |||
2269 | // saved register handling ensures the register value is restored before | |||
2270 | // RET, but we need the undef flag here to appease the MachineVerifier | |||
2271 | // liveness checks. | |||
2272 | MachineInstrBuilder MIB = | |||
2273 | BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return)) | |||
2274 | .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef); | |||
2275 | ||||
2276 | MIB.copyImplicitOps(MI); | |||
2277 | MI.eraseFromParent(); | |||
2278 | break; | |||
2279 | } | |||
2280 | } | |||
2281 | return true; | |||
2282 | } | |||
2283 | ||||
2284 | std::pair<MachineInstr*, MachineInstr*> | |||
2285 | SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { | |||
2286 | assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); | |||
2287 | ||||
2288 | if (ST.hasMovB64() && | |||
2289 | AMDGPU::isLegal64BitDPPControl( | |||
2290 | getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { | |||
2291 | MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); | |||
2292 | return std::pair(&MI, nullptr); | |||
2293 | } | |||
2294 | ||||
2295 | MachineBasicBlock &MBB = *MI.getParent(); | |||
2296 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
2297 | MachineFunction *MF = MBB.getParent(); | |||
2298 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2299 | Register Dst = MI.getOperand(0).getReg(); | |||
2300 | unsigned Part = 0; | |||
2301 | MachineInstr *Split[2]; | |||
2302 | ||||
2303 | for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { | |||
2304 | auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); | |||
2305 | if (Dst.isPhysical()) { | |||
2306 | MovDPP.addDef(RI.getSubReg(Dst, Sub)); | |||
2307 | } else { | |||
2308 | assert(MRI.isSSA()); | |||
2309 | auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
2310 | MovDPP.addDef(Tmp); | |||
2311 | } | |||
2312 | ||||
2313 | for (unsigned I = 1; I <= 2; ++I) { // old and src operands. | |||
2314 | const MachineOperand &SrcOp = MI.getOperand(I); | |||
2315 | assert(!SrcOp.isFPImm()); | |||
2316 | if (SrcOp.isImm()) { | |||
2317 | APInt Imm(64, SrcOp.getImm()); | |||
2318 | Imm.ashrInPlace(Part * 32); | |||
2319 | MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); | |||
2320 | } else { | |||
2321 | assert(SrcOp.isReg()); | |||
2322 | Register Src = SrcOp.getReg(); | |||
2323 | if (Src.isPhysical()) | |||
2324 | MovDPP.addReg(RI.getSubReg(Src, Sub)); | |||
2325 | else | |||
2326 | MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); | |||
2327 | } | |||
2328 | } | |||
2329 | ||||
2330 | for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3)) | |||
2331 | MovDPP.addImm(MO.getImm()); | |||
2332 | ||||
2333 | Split[Part] = MovDPP; | |||
2334 | ++Part; | |||
2335 | } | |||
2336 | ||||
2337 | if (Dst.isVirtual()) | |||
2338 | BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) | |||
2339 | .addReg(Split[0]->getOperand(0).getReg()) | |||
| ||||
2340 | .addImm(AMDGPU::sub0) | |||
2341 | .addReg(Split[1]->getOperand(0).getReg()) | |||
2342 | .addImm(AMDGPU::sub1); | |||
2343 | ||||
2344 | MI.eraseFromParent(); | |||
2345 | return std::pair(Split[0], Split[1]); | |||
2346 | } | |||
2347 | ||||
2348 | bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, | |||
2349 | MachineOperand &Src0, | |||
2350 | unsigned Src0OpName, | |||
2351 | MachineOperand &Src1, | |||
2352 | unsigned Src1OpName) const { | |||
2353 | MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); | |||
2354 | if (!Src0Mods) | |||
2355 | return false; | |||
2356 | ||||
2357 | MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); | |||
2358 | assert(Src1Mods && | |||
2359 | "All commutable instructions have both src0 and src1 modifiers"); | |||
2360 | ||||
2361 | int Src0ModsVal = Src0Mods->getImm(); | |||
2362 | int Src1ModsVal = Src1Mods->getImm(); | |||
2363 | ||||
2364 | Src1Mods->setImm(Src0ModsVal); | |||
2365 | Src0Mods->setImm(Src1ModsVal); | |||
2366 | return true; | |||
2367 | } | |||
2368 | ||||
2369 | static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, | |||
2370 | MachineOperand &RegOp, | |||
2371 | MachineOperand &NonRegOp) { | |||
2372 | Register Reg = RegOp.getReg(); | |||
2373 | unsigned SubReg = RegOp.getSubReg(); | |||
2374 | bool IsKill = RegOp.isKill(); | |||
2375 | bool IsDead = RegOp.isDead(); | |||
2376 | bool IsUndef = RegOp.isUndef(); | |||
2377 | bool IsDebug = RegOp.isDebug(); | |||
2378 | ||||
2379 | if (NonRegOp.isImm()) | |||
2380 | RegOp.ChangeToImmediate(NonRegOp.getImm()); | |||
2381 | else if (NonRegOp.isFI()) | |||
2382 | RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); | |||
2383 | else if (NonRegOp.isGlobal()) { | |||
2384 | RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), | |||
2385 | NonRegOp.getTargetFlags()); | |||
2386 | } else | |||
2387 | return nullptr; | |||
2388 | ||||
2389 | // Make sure we don't reinterpret a subreg index in the target flags. | |||
2390 | RegOp.setTargetFlags(NonRegOp.getTargetFlags()); | |||
2391 | ||||
2392 | NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); | |||
2393 | NonRegOp.setSubReg(SubReg); | |||
2394 | ||||
2395 | return &MI; | |||
2396 | } | |||
2397 | ||||
2398 | MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, | |||
2399 | unsigned Src0Idx, | |||
2400 | unsigned Src1Idx) const { | |||
2401 | assert(!NewMI && "this should never be used"); | |||
2402 | ||||
2403 | unsigned Opc = MI.getOpcode(); | |||
2404 | int CommutedOpcode = commuteOpcode(Opc); | |||
2405 | if (CommutedOpcode == -1) | |||
2406 | return nullptr; | |||
2407 | ||||
2408 | assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == | |||
2409 | static_cast<int>(Src0Idx) && | |||
2410 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == | |||
2411 | static_cast<int>(Src1Idx) && | |||
2412 | "inconsistency with findCommutedOpIndices"); | |||
2413 | ||||
2414 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
2415 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
2416 | ||||
2417 | MachineInstr *CommutedMI = nullptr; | |||
2418 | if (Src0.isReg() && Src1.isReg()) { | |||
2419 | if (isOperandLegal(MI, Src1Idx, &Src0)) { | |||
2420 | // Be sure to copy the source modifiers to the right place. | |||
2421 | CommutedMI | |||
2422 | = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); | |||
2423 | } | |||
2424 | ||||
2425 | } else if (Src0.isReg() && !Src1.isReg()) { | |||
2426 | // src0 should always be able to support any operand type, so no need to | |||
2427 | // check operand legality. | |||
2428 | CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); | |||
2429 | } else if (!Src0.isReg() && Src1.isReg()) { | |||
2430 | if (isOperandLegal(MI, Src1Idx, &Src0)) | |||
2431 | CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); | |||
2432 | } else { | |||
2433 | // FIXME: Found two non-register operands to commute. This does happen. | |||
2434 | return nullptr; | |||
2435 | } | |||
2436 | ||||
2437 | if (CommutedMI) { | |||
2438 | swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, | |||
2439 | Src1, AMDGPU::OpName::src1_modifiers); | |||
2440 | ||||
2441 | CommutedMI->setDesc(get(CommutedOpcode)); | |||
2442 | } | |||
2443 | ||||
2444 | return CommutedMI; | |||
2445 | } | |||
2446 | ||||
2447 | // This needs to be implemented because the source modifiers may be inserted | |||
2448 | // between the true commutable operands, and the base | |||
2449 | // TargetInstrInfo::commuteInstruction uses it. | |||
2450 | bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, | |||
2451 | unsigned &SrcOpIdx0, | |||
2452 | unsigned &SrcOpIdx1) const { | |||
2453 | return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); | |||
2454 | } | |||
2455 | ||||
2456 | bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc, | |||
2457 | unsigned &SrcOpIdx0, | |||
2458 | unsigned &SrcOpIdx1) const { | |||
2459 | if (!Desc.isCommutable()) | |||
2460 | return false; | |||
2461 | ||||
2462 | unsigned Opc = Desc.getOpcode(); | |||
2463 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
2464 | if (Src0Idx == -1) | |||
2465 | return false; | |||
2466 | ||||
2467 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
2468 | if (Src1Idx == -1) | |||
2469 | return false; | |||
2470 | ||||
2471 | return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); | |||
2472 | } | |||
2473 | ||||
2474 | bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, | |||
2475 | int64_t BrOffset) const { | |||
2476 | // BranchRelaxation should never have to check s_setpc_b64 because its dest | |||
2477 | // block is unanalyzable. | |||
2478 | assert(BranchOp != AMDGPU::S_SETPC_B64); | |||
2479 | ||||
2480 | // Convert to dwords. | |||
2481 | BrOffset /= 4; | |||
2482 | ||||
2483 | // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is | |||
2484 | // from the next instruction. | |||
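| // For example, with the default 16 offset bits a forward branch spanning | |||
| // 131072 bytes is 32768 dwords, encoded as 32767 after the adjustment, | |||
| // which still fits; a span of 131076 bytes would not. | |||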
2485 | BrOffset -= 1; | |||
2486 | ||||
2487 | return isIntN(BranchOffsetBits, BrOffset); | |||
2488 | } | |||
2489 | ||||
2490 | MachineBasicBlock *SIInstrInfo::getBranchDestBlock( | |||
2491 | const MachineInstr &MI) const { | |||
2492 | if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { | |||
2493 | // This would be a difficult analysis to perform, but it is always legal, so | |||
2494 | // there's no need to analyze it. | |||
2495 | return nullptr; | |||
2496 | } | |||
2497 | ||||
2498 | return MI.getOperand(0).getMBB(); | |||
2499 | } | |||
2500 | ||||
2501 | bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const { | |||
2502 | for (const MachineInstr &MI : MBB->terminators()) { | |||
2503 | if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO || | |||
2504 | MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE || | |||
2505 | MI.getOpcode() == AMDGPU::SI_LOOP) | |||
2506 | return true; | |||
2507 | } | |||
2508 | return false; | |||
2509 | } | |||
2510 | ||||
2511 | void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, | |||
2512 | MachineBasicBlock &DestBB, | |||
2513 | MachineBasicBlock &RestoreBB, | |||
2514 | const DebugLoc &DL, int64_t BrOffset, | |||
2515 | RegScavenger *RS) const { | |||
2516 | assert(RS && "RegScavenger required for long branching"); | |||
2517 | assert(MBB.empty() && | |||
2518 | "new block should be inserted for expanding unconditional branch"); | |||
2519 | assert(MBB.pred_size() == 1); | |||
2520 | assert(RestoreBB.empty() && | |||
2521 | "restore block should be inserted for restoring clobbered registers"); | |||
2522 | ||||
2523 | MachineFunction *MF = MBB.getParent(); | |||
2524 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2525 | ||||
2526 | // FIXME: Virtual register workaround for RegScavenger not working with empty | |||
2527 | // blocks. | |||
2528 | Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
2529 | ||||
2530 | auto I = MBB.end(); | |||
2531 | ||||
2532 | // We need to compute the offset relative to the instruction immediately after | |||
2533 | // s_getpc_b64. Insert pc arithmetic code before last terminator. | |||
2534 | MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); | |||
2535 | ||||
2536 | auto &MCCtx = MF->getContext(); | |||
2537 | MCSymbol *PostGetPCLabel = | |||
2538 | MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); | |||
2539 | GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); | |||
2540 | ||||
2541 | MCSymbol *OffsetLo = | |||
2542 | MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); | |||
2543 | MCSymbol *OffsetHi = | |||
2544 | MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); | |||
2545 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) | |||
2546 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) | |||
2547 | .addReg(PCReg, 0, AMDGPU::sub0) | |||
2548 | .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); | |||
2549 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) | |||
2550 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) | |||
2551 | .addReg(PCReg, 0, AMDGPU::sub1) | |||
2552 | .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); | |||
2553 | ||||
2554 | // Insert the indirect branch after the other terminator. | |||
2555 | BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) | |||
2556 | .addReg(PCReg); | |||
2557 | ||||
2558 | // If a spill is needed for the pc register pair, we need to insert a spill | |||
2559 | // restore block right before the destination block, and insert a short branch | |||
2560 | // into the old destination block's fallthrough predecessor. | |||
2561 | // e.g.: | |||
2562 | // | |||
2563 | // s_cbranch_scc0 skip_long_branch: | |||
2564 | // | |||
2565 | // long_branch_bb: | |||
2566 | // spill s[8:9] | |||
2567 | // s_getpc_b64 s[8:9] | |||
2568 | // s_add_u32 s8, s8, restore_bb | |||
2569 | // s_addc_u32 s9, s9, 0 | |||
2570 | // s_setpc_b64 s[8:9] | |||
2571 | // | |||
2572 | // skip_long_branch: | |||
2573 | // foo; | |||
2574 | // | |||
2575 | // ..... | |||
2576 | // | |||
2577 | // dest_bb_fallthrough_predecessor: | |||
2578 | // bar; | |||
2579 | // s_branch dest_bb | |||
2580 | // | |||
2581 | // restore_bb: | |||
2582 | // restore s[8:9] | |||
2583 | // fallthrough dest_bb | |||
2584 | // | |||
2585 | // dest_bb: | |||
2586 | // buzz; | |||
2587 | ||||
2588 | RS->enterBasicBlockEnd(MBB); | |||
2589 | Register Scav = RS->scavengeRegisterBackwards( | |||
2590 | AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), | |||
2591 | /* RestoreAfter */ false, 0, /* AllowSpill */ false); | |||
2592 | if (Scav) { | |||
2593 | RS->setRegUsed(Scav); | |||
2594 | MRI.replaceRegWith(PCReg, Scav); | |||
2595 | MRI.clearVirtRegs(); | |||
2596 | } else { | |||
2597 | // Spilling an SGPR requires a VGPR, so reuse the temporary VGPR's spill slot | |||
2598 | // for the SGPR spill. | |||
2599 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); | |||
2600 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
2601 | TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS); | |||
2602 | MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1); | |||
2603 | MRI.clearVirtRegs(); | |||
2604 | } | |||
2605 | ||||
2606 | MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol(); | |||
2607 | // Now the distance can be defined. | |||
2608 | auto *Offset = MCBinaryExpr::createSub( | |||
2609 | MCSymbolRefExpr::create(DestLabel, MCCtx), | |||
2610 | MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); | |||
2611 | // Add offset assignments. | |||
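| // The label difference is a 64-bit byte distance: the AND with 0xFFFFFFFF | |||
| // extracts the low half consumed by s_add_u32, and the arithmetic shift by | |||
| // 32 produces the (sign-extended) high half consumed by s_addc_u32. | |||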
2612 | auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); | |||
2613 | OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); | |||
2614 | auto *ShAmt = MCConstantExpr::create(32, MCCtx); | |||
2615 | OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); | |||
2616 | } | |||
2617 | ||||
2618 | unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { | |||
2619 | switch (Cond) { | |||
2620 | case SIInstrInfo::SCC_TRUE: | |||
2621 | return AMDGPU::S_CBRANCH_SCC1; | |||
2622 | case SIInstrInfo::SCC_FALSE: | |||
2623 | return AMDGPU::S_CBRANCH_SCC0; | |||
2624 | case SIInstrInfo::VCCNZ: | |||
2625 | return AMDGPU::S_CBRANCH_VCCNZ; | |||
2626 | case SIInstrInfo::VCCZ: | |||
2627 | return AMDGPU::S_CBRANCH_VCCZ; | |||
2628 | case SIInstrInfo::EXECNZ: | |||
2629 | return AMDGPU::S_CBRANCH_EXECNZ; | |||
2630 | case SIInstrInfo::EXECZ: | |||
2631 | return AMDGPU::S_CBRANCH_EXECZ; | |||
2632 | default: | |||
2633 | llvm_unreachable("invalid branch predicate"); | |||
2634 | } | |||
2635 | } | |||
2636 | ||||
2637 | SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { | |||
2638 | switch (Opcode) { | |||
2639 | case AMDGPU::S_CBRANCH_SCC0: | |||
2640 | return SCC_FALSE; | |||
2641 | case AMDGPU::S_CBRANCH_SCC1: | |||
2642 | return SCC_TRUE; | |||
2643 | case AMDGPU::S_CBRANCH_VCCNZ: | |||
2644 | return VCCNZ; | |||
2645 | case AMDGPU::S_CBRANCH_VCCZ: | |||
2646 | return VCCZ; | |||
2647 | case AMDGPU::S_CBRANCH_EXECNZ: | |||
2648 | return EXECNZ; | |||
2649 | case AMDGPU::S_CBRANCH_EXECZ: | |||
2650 | return EXECZ; | |||
2651 | default: | |||
2652 | return INVALID_BR; | |||
2653 | } | |||
2654 | } | |||
2655 | ||||
2656 | bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, | |||
2657 | MachineBasicBlock::iterator I, | |||
2658 | MachineBasicBlock *&TBB, | |||
2659 | MachineBasicBlock *&FBB, | |||
2660 | SmallVectorImpl<MachineOperand> &Cond, | |||
2661 | bool AllowModify) const { | |||
2662 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2663 | // Unconditional Branch | |||
2664 | TBB = I->getOperand(0).getMBB(); | |||
2665 | return false; | |||
2666 | } | |||
2667 | ||||
2668 | MachineBasicBlock *CondBB = nullptr; | |||
2669 | ||||
2670 | if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
2671 | CondBB = I->getOperand(1).getMBB(); | |||
2672 | Cond.push_back(I->getOperand(0)); | |||
2673 | } else { | |||
2674 | BranchPredicate Pred = getBranchPredicate(I->getOpcode()); | |||
2675 | if (Pred == INVALID_BR) | |||
2676 | return true; | |||
2677 | ||||
2678 | CondBB = I->getOperand(0).getMBB(); | |||
2679 | Cond.push_back(MachineOperand::CreateImm(Pred)); | |||
2680 | Cond.push_back(I->getOperand(1)); // Save the branch register. | |||
2681 | } | |||
2682 | ++I; | |||
2683 | ||||
2684 | if (I == MBB.end()) { | |||
2685 | // Conditional branch followed by fall-through. | |||
2686 | TBB = CondBB; | |||
2687 | return false; | |||
2688 | } | |||
2689 | ||||
2690 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2691 | TBB = CondBB; | |||
2692 | FBB = I->getOperand(0).getMBB(); | |||
2693 | return false; | |||
2694 | } | |||
2695 | ||||
2696 | return true; | |||
2697 | } | |||
2698 | ||||
2699 | bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, | |||
2700 | MachineBasicBlock *&FBB, | |||
2701 | SmallVectorImpl<MachineOperand> &Cond, | |||
2702 | bool AllowModify) const { | |||
2703 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); | |||
2704 | auto E = MBB.end(); | |||
2705 | if (I == E) | |||
2706 | return false; | |||
2707 | ||||
2708 | // Skip over the instructions that are artificially terminators for special | |||
2709 | // exec management. | |||
2710 | while (I != E && !I->isBranch() && !I->isReturn()) { | |||
2711 | switch (I->getOpcode()) { | |||
2712 | case AMDGPU::S_MOV_B64_term: | |||
2713 | case AMDGPU::S_XOR_B64_term: | |||
2714 | case AMDGPU::S_OR_B64_term: | |||
2715 | case AMDGPU::S_ANDN2_B64_term: | |||
2716 | case AMDGPU::S_AND_B64_term: | |||
2717 | case AMDGPU::S_MOV_B32_term: | |||
2718 | case AMDGPU::S_XOR_B32_term: | |||
2719 | case AMDGPU::S_OR_B32_term: | |||
2720 | case AMDGPU::S_ANDN2_B32_term: | |||
2721 | case AMDGPU::S_AND_B32_term: | |||
2722 | break; | |||
2723 | case AMDGPU::SI_IF: | |||
2724 | case AMDGPU::SI_ELSE: | |||
2725 | case AMDGPU::SI_KILL_I1_TERMINATOR: | |||
2726 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: | |||
2727 | // FIXME: It's messy that these need to be considered here at all. | |||
2728 | return true; | |||
2729 | default: | |||
2730 | llvm_unreachable("unexpected non-branch terminator inst"); | |||
2731 | } | |||
2732 | ||||
2733 | ++I; | |||
2734 | } | |||
2735 | ||||
2736 | if (I == E) | |||
2737 | return false; | |||
2738 | ||||
2739 | return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); | |||
2740 | } | |||
2741 | ||||
2742 | unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, | |||
2743 | int *BytesRemoved) const { | |||
2744 | unsigned Count = 0; | |||
2745 | unsigned RemovedSize = 0; | |||
2746 | for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) { | |||
2747 | // Skip over artificial terminators when removing instructions. | |||
2748 | if (MI.isBranch() || MI.isReturn()) { | |||
2749 | RemovedSize += getInstSizeInBytes(MI); | |||
2750 | MI.eraseFromParent(); | |||
2751 | ++Count; | |||
2752 | } | |||
2753 | } | |||
2754 | ||||
2755 | if (BytesRemoved) | |||
2756 | *BytesRemoved = RemovedSize; | |||
2757 | ||||
2758 | return Count; | |||
2759 | } | |||
2760 | ||||
2761 | // Copy the flags onto the implicit condition register operand. | |||
2762 | static void preserveCondRegFlags(MachineOperand &CondReg, | |||
2763 | const MachineOperand &OrigCond) { | |||
2764 | CondReg.setIsUndef(OrigCond.isUndef()); | |||
2765 | CondReg.setIsKill(OrigCond.isKill()); | |||
2766 | } | |||
2767 | ||||
2768 | unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, | |||
2769 | MachineBasicBlock *TBB, | |||
2770 | MachineBasicBlock *FBB, | |||
2771 | ArrayRef<MachineOperand> Cond, | |||
2772 | const DebugLoc &DL, | |||
2773 | int *BytesAdded) const { | |||
2774 | if (!FBB && Cond.empty()) { | |||
2775 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2776 | .addMBB(TBB); | |||
2777 | if (BytesAdded) | |||
2778 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2779 | return 1; | |||
2780 | } | |||
2781 | ||||
2782 | if (Cond.size() == 1 && Cond[0].isReg()) { | |||
2783 | BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) | |||
2784 | .add(Cond[0]) | |||
2785 | .addMBB(TBB); | |||
2786 | return 1; | |||
2787 | } | |||
2788 | ||||
2789 | assert(TBB && Cond[0].isImm()); | |||
2790 | ||||
2791 | unsigned Opcode | |||
2792 | = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); | |||
2793 | ||||
2794 | if (!FBB) { | |||
2795 | Cond[1].isUndef(); | |||
2796 | MachineInstr *CondBr = | |||
2797 | BuildMI(&MBB, DL, get(Opcode)) | |||
2798 | .addMBB(TBB); | |||
2799 | ||||
2800 | // Copy the flags onto the implicit condition register operand. | |||
2801 | preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); | |||
2802 | fixImplicitOperands(*CondBr); | |||
2803 | ||||
2804 | if (BytesAdded) | |||
2805 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2806 | return 1; | |||
2807 | } | |||
2808 | ||||
2809 | assert(TBB && FBB); | |||
2810 | ||||
2811 | MachineInstr *CondBr = | |||
2812 | BuildMI(&MBB, DL, get(Opcode)) | |||
2813 | .addMBB(TBB); | |||
2814 | fixImplicitOperands(*CondBr); | |||
2815 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2816 | .addMBB(FBB); | |||
2817 | ||||
2818 | MachineOperand &CondReg = CondBr->getOperand(1); | |||
2819 | CondReg.setIsUndef(Cond[1].isUndef()); | |||
2820 | CondReg.setIsKill(Cond[1].isKill()); | |||
2821 | ||||
2822 | if (BytesAdded) | |||
2823 | *BytesAdded = ST.hasOffset3fBug() ? 16 : 8; | |||
2824 | ||||
2825 | return 2; | |||
2826 | } | |||
2827 | ||||
2828 | bool SIInstrInfo::reverseBranchCondition( | |||
2829 | SmallVectorImpl<MachineOperand> &Cond) const { | |||
2830 | if (Cond.size() != 2) { | |||
2831 | return true; | |||
2832 | } | |||
2833 | ||||
2834 | if (Cond[0].isImm()) { | |||
2835 | Cond[0].setImm(-Cond[0].getImm()); | |||
2836 | return false; | |||
2837 | } | |||
2838 | ||||
2839 | return true; | |||
2840 | } | |||
2841 | ||||
2842 | bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, | |||
2843 | ArrayRef<MachineOperand> Cond, | |||
2844 | Register DstReg, Register TrueReg, | |||
2845 | Register FalseReg, int &CondCycles, | |||
2846 | int &TrueCycles, int &FalseCycles) const { | |||
2847 | switch (Cond[0].getImm()) { | |||
2848 | case VCCNZ: | |||
2849 | case VCCZ: { | |||
2850 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2851 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2852 | if (MRI.getRegClass(FalseReg) != RC) | |||
2853 | return false; | |||
2854 | ||||
2855 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2856 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2857 | ||||
2858 | // Limit to equal cost for branch vs. N v_cndmask_b32s. | |||
2859 | return RI.hasVGPRs(RC) && NumInsts <= 6; | |||
2860 | } | |||
2861 | case SCC_TRUE: | |||
2862 | case SCC_FALSE: { | |||
2863 | // FIXME: We could insert for VGPRs if we could replace the original compare | |||
2864 | // with a vector one. | |||
2865 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2866 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2867 | if (MRI.getRegClass(FalseReg) != RC) | |||
2868 | return false; | |||
2869 | ||||
2870 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2871 | ||||
2872 | // Multiples of 64 bits (8 bytes) can use s_cselect_b64. | |||
2873 | if (NumInsts % 2 == 0) | |||
2874 | NumInsts /= 2; | |||
2875 | ||||
2876 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2877 | return RI.isSGPRClass(RC); | |||
2878 | } | |||
2879 | default: | |||
2880 | return false; | |||
2881 | } | |||
2882 | } | |||
2883 | ||||
2884 | void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, | |||
2885 | MachineBasicBlock::iterator I, const DebugLoc &DL, | |||
2886 | Register DstReg, ArrayRef<MachineOperand> Cond, | |||
2887 | Register TrueReg, Register FalseReg) const { | |||
2888 | BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); | |||
2889 | if (Pred == VCCZ || Pred == SCC_FALSE) { | |||
2890 | Pred = static_cast<BranchPredicate>(-Pred); | |||
2891 | std::swap(TrueReg, FalseReg); | |||
2892 | } | |||
2893 | ||||
2894 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2895 | const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); | |||
2896 | unsigned DstSize = RI.getRegSizeInBits(*DstRC); | |||
2897 | ||||
2898 | if (DstSize == 32) { | |||
2899 | MachineInstr *Select; | |||
2900 | if (Pred == SCC_TRUE) { | |||
2901 | Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) | |||
2902 | .addReg(TrueReg) | |||
2903 | .addReg(FalseReg); | |||
2904 | } else { | |||
2905 | // Instruction's operands are backwards from what is expected. | |||
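| // (v_cndmask_b32 dst, src0, src1, vcc computes dst = vcc ? src1 : src0, so | |||
| // the "false" value is placed in src0 and the "true" value in src1.) | |||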
2906 | Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) | |||
2907 | .addReg(FalseReg) | |||
2908 | .addReg(TrueReg); | |||
2909 | } | |||
2910 | ||||
2911 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2912 | return; | |||
2913 | } | |||
2914 | ||||
2915 | if (DstSize == 64 && Pred == SCC_TRUE) { | |||
2916 | MachineInstr *Select = | |||
2917 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) | |||
2918 | .addReg(TrueReg) | |||
2919 | .addReg(FalseReg); | |||
2920 | ||||
2921 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2922 | return; | |||
2923 | } | |||
2924 | ||||
2925 | static const int16_t Sub0_15[] = { | |||
2926 | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, | |||
2927 | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, | |||
2928 | AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, | |||
2929 | AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, | |||
2930 | }; | |||
2931 | ||||
2932 | static const int16_t Sub0_15_64[] = { | |||
2933 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, | |||
2934 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, | |||
2935 | AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, | |||
2936 | AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, | |||
2937 | }; | |||
2938 | ||||
2939 | unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; | |||
2940 | const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; | |||
2941 | const int16_t *SubIndices = Sub0_15; | |||
2942 | int NElts = DstSize / 32; | |||
2943 | ||||
2944 | // 64-bit select is only available for SALU. | |||
2945 | // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. | |||
2946 | if (Pred == SCC_TRUE) { | |||
2947 | if (NElts % 2) { | |||
2948 | SelOp = AMDGPU::S_CSELECT_B32; | |||
2949 | EltRC = &AMDGPU::SGPR_32RegClass; | |||
2950 | } else { | |||
2951 | SelOp = AMDGPU::S_CSELECT_B64; | |||
2952 | EltRC = &AMDGPU::SGPR_64RegClass; | |||
2953 | SubIndices = Sub0_15_64; | |||
2954 | NElts /= 2; | |||
2955 | } | |||
2956 | } | |||
2957 | ||||
2958 | MachineInstrBuilder MIB = BuildMI( | |||
2959 | MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); | |||
2960 | ||||
2961 | I = MIB->getIterator(); | |||
2962 | ||||
2963 | SmallVector<Register, 8> Regs; | |||
2964 | for (int Idx = 0; Idx != NElts; ++Idx) { | |||
2965 | Register DstElt = MRI.createVirtualRegister(EltRC); | |||
2966 | Regs.push_back(DstElt); | |||
2967 | ||||
2968 | unsigned SubIdx = SubIndices[Idx]; | |||
2969 | ||||
2970 | MachineInstr *Select; | |||
2971 | if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { | |||
2972 | Select = | |||
2973 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2974 | .addReg(FalseReg, 0, SubIdx) | |||
2975 | .addReg(TrueReg, 0, SubIdx); | |||
2976 | } else { | |||
2977 | Select = | |||
2978 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2979 | .addReg(TrueReg, 0, SubIdx) | |||
2980 | .addReg(FalseReg, 0, SubIdx); | |||
2981 | } | |||
2982 | ||||
2983 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2984 | fixImplicitOperands(*Select); | |||
2985 | ||||
2986 | MIB.addReg(DstElt) | |||
2987 | .addImm(SubIdx); | |||
2988 | } | |||
2989 | } | |||
2990 | ||||
2991 | bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { | |||
2992 | switch (MI.getOpcode()) { | |||
2993 | case AMDGPU::V_MOV_B32_e32: | |||
2994 | case AMDGPU::V_MOV_B32_e64: | |||
2995 | case AMDGPU::V_MOV_B64_PSEUDO: | |||
2996 | case AMDGPU::V_MOV_B64_e32: | |||
2997 | case AMDGPU::V_MOV_B64_e64: | |||
2998 | case AMDGPU::S_MOV_B32: | |||
2999 | case AMDGPU::S_MOV_B64: | |||
3000 | case AMDGPU::COPY: | |||
3001 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
3002 | case AMDGPU::V_ACCVGPR_READ_B32_e64: | |||
3003 | case AMDGPU::V_ACCVGPR_MOV_B32: | |||
3004 | return true; | |||
3005 | default: | |||
3006 | return false; | |||
3007 | } | |||
3008 | } | |||
3009 | ||||
3010 | static constexpr unsigned ModifierOpNames[] = { | |||
3011 | AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, | |||
3012 | AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp, | |||
3013 | AMDGPU::OpName::omod, AMDGPU::OpName::op_sel}; | |||
3014 | ||||
3015 | void SIInstrInfo::removeModOperands(MachineInstr &MI) const { | |||
3016 | unsigned Opc = MI.getOpcode(); | |||
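| // Iterating in reverse removes the higher-indexed operands first, so the | |||
| // indices looked up for the remaining (lower-indexed) operands stay valid. | |||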
3017 | for (unsigned Name : reverse(ModifierOpNames)) { | |||
3018 | int Idx = AMDGPU::getNamedOperandIdx(Opc, Name); | |||
3019 | if (Idx >= 0) | |||
3020 | MI.removeOperand(Idx); | |||
3021 | } | |||
3022 | } | |||
3023 | ||||
3024 | bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, | |||
3025 | Register Reg, MachineRegisterInfo *MRI) const { | |||
3026 | if (!MRI->hasOneNonDBGUse(Reg)) | |||
3027 | return false; | |||
3028 | ||||
3029 | switch (DefMI.getOpcode()) { | |||
3030 | default: | |||
3031 | return false; | |||
3032 | case AMDGPU::S_MOV_B64: | |||
3033 | // TODO: We could fold 64-bit immediates, but this gets complicated | |||
3034 | // when there are sub-registers. | |||
3035 | return false; | |||
3036 | ||||
3037 | case AMDGPU::V_MOV_B32_e32: | |||
3038 | case AMDGPU::S_MOV_B32: | |||
3039 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
3040 | break; | |||
3041 | } | |||
3042 | ||||
3043 | const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); | |||
3044 | assert(ImmOp); | |||
3045 | // FIXME: We could handle FrameIndex values here. | |||
3046 | if (!ImmOp->isImm()) | |||
3047 | return false; | |||
3048 | ||||
3049 | unsigned Opc = UseMI.getOpcode(); | |||
3050 | if (Opc == AMDGPU::COPY) { | |||
3051 | Register DstReg = UseMI.getOperand(0).getReg(); | |||
3052 | bool Is16Bit = getOpSize(UseMI, 0) == 2; | |||
3053 | bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); | |||
3054 | unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; | |||
3055 | APInt Imm(32, ImmOp->getImm()); | |||
3056 | ||||
3057 | if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) | |||
3058 | Imm = Imm.ashr(16); | |||
3059 | ||||
3060 | if (RI.isAGPR(*MRI, DstReg)) { | |||
3061 | if (!isInlineConstant(Imm)) | |||
3062 | return false; | |||
3063 | NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; | |||
3064 | } | |||
3065 | ||||
3066 | if (Is16Bit) { | |||
3067 | if (isVGPRCopy) | |||
3068 | return false; // Do not clobber vgpr_hi16 | |||
3069 | ||||
3070 | if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) | |||
3071 | return false; | |||
3072 | ||||
3073 | UseMI.getOperand(0).setSubReg(0); | |||
3074 | if (DstReg.isPhysical()) { | |||
3075 | DstReg = RI.get32BitRegister(DstReg); | |||
3076 | UseMI.getOperand(0).setReg(DstReg); | |||
3077 | } | |||
3078 | assert(UseMI.getOperand(1).getReg().isVirtual()); | |||
3079 | } | |||
3080 | ||||
3081 | UseMI.setDesc(get(NewOpc)); | |||
3082 | UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); | |||
3083 | UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); | |||
3084 | return true; | |||
3085 | } | |||
3086 | ||||
3087 | if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
3088 | Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
3089 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3090 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || | |||
3091 | Opc == AMDGPU::V_FMAC_F16_t16_e64) { | |||
3092 | // Don't fold if we are using source or output modifiers. The new VOP2 | |||
3093 | // instructions don't have them. | |||
3094 | if (hasAnyModifiersSet(UseMI)) | |||
3095 | return false; | |||
3096 | ||||
3097 | // If this is a free constant, there's no reason to do this. | |||
3098 | // TODO: We could fold this here instead of letting SIFoldOperands do it | |||
3099 | // later. | |||
3100 | MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); | |||
3101 | ||||
3102 | // Any src operand can be used for the legality check. | |||
3103 | if (isInlineConstant(UseMI, *Src0, *ImmOp)) | |||
3104 | return false; | |||
3105 | ||||
3106 | bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
3107 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; | |||
3108 | bool IsFMA = | |||
3109 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3110 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || | |||
3111 | Opc == AMDGPU::V_FMAC_F16_t16_e64; | |||
3112 | MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); | |||
3113 | MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); | |||
3114 | ||||
3115 | // Multiplied part is the constant: Use v_madmk_{f16, f32}. | |||
3116 | // We should only expect these to be on src0 due to canonicalization. | |||
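| // e.g. mad dst, K, v1, v2 (where Reg holds the literal K) is rewritten as | |||
| // v_madmk dst, v1, K, v2, which computes v1 * K + v2. | |||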
3117 | if (Src0->isReg() && Src0->getReg() == Reg) { | |||
3118 | if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) | |||
3119 | return false; | |||
3120 | ||||
3121 | if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) | |||
3122 | return false; | |||
3123 | ||||
3124 | unsigned NewOpc = | |||
3125 | IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 | |||
3126 | : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 | |||
3127 | : AMDGPU::V_FMAMK_F16) | |||
3128 | : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); | |||
3129 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3130 | return false; | |||
3131 | ||||
3132 | // We need to swap operands 0 and 1 since madmk constant is at operand 1. | |||
3133 | ||||
3134 | const int64_t Imm = ImmOp->getImm(); | |||
3135 | ||||
3136 | // FIXME: This would be a lot easier if we could return a new instruction | |||
3137 | // instead of having to modify in place. | |||
3138 | ||||
3139 | Register Src1Reg = Src1->getReg(); | |||
3140 | unsigned Src1SubReg = Src1->getSubReg(); | |||
3141 | Src0->setReg(Src1Reg); | |||
3142 | Src0->setSubReg(Src1SubReg); | |||
3143 | Src0->setIsKill(Src1->isKill()); | |||
3144 | ||||
3145 | if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
3146 | Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || | |||
3147 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
3148 | UseMI.untieRegOperand( | |||
3149 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
3150 | ||||
3151 | Src1->ChangeToImmediate(Imm); | |||
3152 | ||||
3153 | removeModOperands(UseMI); | |||
3154 | UseMI.setDesc(get(NewOpc)); | |||
3155 | ||||
3156 | bool DeleteDef = MRI->use_nodbg_empty(Reg); | |||
3157 | if (DeleteDef) | |||
3158 | DefMI.eraseFromParent(); | |||
3159 | ||||
3160 | return true; | |||
3161 | } | |||
3162 | ||||
3163 | // Added part is the constant: Use v_madak_{f16, f32}. | |||
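| // e.g. mad dst, v0, v1, K (where Reg holds the literal K) is rewritten as | |||
| // v_madak dst, v0, v1, K, which computes v0 * v1 + K. | |||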
3164 | if (Src2->isReg() && Src2->getReg() == Reg) { | |||
3165 | // Not allowed to use constant bus for another operand. | |||
3166 | // We can however allow an inline immediate as src0. | |||
3167 | bool Src0Inlined = false; | |||
3168 | if (Src0->isReg()) { | |||
3169 | // Try to inline constant if possible. | |||
3170 | // If the def is a move-immediate and this is its only use, folding the | |||
3171 | // immediate here saves a VGPR. | |||
3172 | MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); | |||
3173 | if (Def && Def->isMoveImmediate() && | |||
3174 | isInlineConstant(Def->getOperand(1)) && | |||
3175 | MRI->hasOneUse(Src0->getReg())) { | |||
3176 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
3177 | Src0Inlined = true; | |||
3178 | } else if ((Src0->getReg().isPhysical() && | |||
3179 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
3180 | RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) || | |||
3181 | (Src0->getReg().isVirtual() && | |||
3182 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
3183 | RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) | |||
3184 | return false; | |||
3185 | // VGPR is okay as Src0 - fallthrough | |||
3186 | } | |||
3187 | ||||
3188 | if (Src1->isReg() && !Src0Inlined) { | |||
3189 | // We have one slot for an inlinable constant so far - try to fill it. | |||
3190 | MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); | |||
3191 | if (Def && Def->isMoveImmediate() && | |||
3192 | isInlineConstant(Def->getOperand(1)) && | |||
3193 | MRI->hasOneUse(Src1->getReg()) && | |||
3194 | commuteInstruction(UseMI)) { | |||
3195 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
3196 | } else if ((Src1->getReg().isPhysical() && | |||
3197 | RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) || | |||
3198 | (Src1->getReg().isVirtual() && | |||
3199 | RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) | |||
3200 | return false; | |||
3201 | // VGPR is okay as Src1 - fallthrough | |||
3202 | } | |||
3203 | ||||
3204 | unsigned NewOpc = | |||
3205 | IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 | |||
3206 | : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 | |||
3207 | : AMDGPU::V_FMAAK_F16) | |||
3208 | : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); | |||
3209 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3210 | return false; | |||
3211 | ||||
3212 | const int64_t Imm = ImmOp->getImm(); | |||
3213 | ||||
3214 | // FIXME: This would be a lot easier if we could return a new instruction | |||
3215 | // instead of having to modify in place. | |||
3216 | ||||
3217 | if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
3218 | Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || | |||
3219 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
3220 | UseMI.untieRegOperand( | |||
3221 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
3222 | ||||
3223 | // ChangeToImmediate adds Src2 back to the instruction. | |||
3224 | Src2->ChangeToImmediate(Imm); | |||
3225 | ||||
3226 | // These come before src2. | |||
3227 | removeModOperands(UseMI); | |||
3228 | UseMI.setDesc(get(NewOpc)); | |||
3229 | // UseMI may have been commuted, leaving an SGPR in src1. In that case an | |||
3230 | // inline constant plus the SGPR would violate the constant bus limit, so | |||
3231 | // the operands must be re-legalized. | |||
3232 | legalizeOperands(UseMI); | |||
3233 | ||||
3234 | bool DeleteDef = MRI->use_nodbg_empty(Reg); | |||
3235 | if (DeleteDef) | |||
3236 | DefMI.eraseFromParent(); | |||
3237 | ||||
3238 | return true; | |||
3239 | } | |||
3240 | } | |||
3241 | ||||
3242 | return false; | |||
3243 | } | |||
3244 | ||||
3245 | static bool | |||
3246 | memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, | |||
3247 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
3248 | if (BaseOps1.size() != BaseOps2.size()) | |||
3249 | return false; | |||
3250 | for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { | |||
3251 | if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) | |||
3252 | return false; | |||
3253 | } | |||
3254 | return true; | |||
3255 | } | |||
3256 | ||||
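| // Returns true when the access [OffsetA, OffsetA + WidthA) cannot overlap | |||
| // [OffsetB, OffsetB + WidthB), e.g. an 8-byte access at offset 0 and a | |||
| // 4-byte access at offset 8. | |||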
3257 | static bool offsetsDoNotOverlap(int WidthA, int OffsetA, | |||
3258 | int WidthB, int OffsetB) { | |||
3259 | int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; | |||
3260 | int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; | |||
3261 | int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; | |||
3262 | return LowOffset + LowWidth <= HighOffset; | |||
3263 | } | |||
3264 | ||||
3265 | bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, | |||
3266 | const MachineInstr &MIb) const { | |||
3267 | SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; | |||
3268 | int64_t Offset0, Offset1; | |||
3269 | unsigned Dummy0, Dummy1; | |||
3270 | bool Offset0IsScalable, Offset1IsScalable; | |||
3271 | if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, | |||
3272 | Dummy0, &RI) || | |||
3273 | !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, | |||
3274 | Dummy1, &RI)) | |||
3275 | return false; | |||
3276 | ||||
3277 | if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) | |||
3278 | return false; | |||
3279 | ||||
3280 | if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { | |||
3281 | // FIXME: Handle ds_read2 / ds_write2. | |||
3282 | return false; | |||
3283 | } | |||
3284 | unsigned Width0 = MIa.memoperands().front()->getSize(); | |||
3285 | unsigned Width1 = MIb.memoperands().front()->getSize(); | |||
3286 | return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); | |||
3287 | } | |||
3288 | ||||
3289 | bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, | |||
3290 | const MachineInstr &MIb) const { | |||
3291 | assert(MIa.mayLoadOrStore() && | |||
3292 | "MIa must load from or modify a memory location"); | |||
3293 | assert(MIb.mayLoadOrStore() && | |||
3294 | "MIb must load from or modify a memory location"); | |||
3295 | ||||
3296 | if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) | |||
3297 | return false; | |||
3298 | ||||
3299 | // XXX - Can we relax this between address spaces? | |||
3300 | if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) | |||
3301 | return false; | |||
3302 | ||||
3303 | // TODO: Should we check the address space from the MachineMemOperand? That | |||
3304 | // would allow us to distinguish objects we know don't alias based on the | |||
3305 | // underlying address space, even if it was lowered to a different one, | |||
3306 | // e.g. private accesses lowered to use MUBUF instructions on a scratch | |||
3307 | // buffer. | |||
3308 | if (isDS(MIa)) { | |||
3309 | if (isDS(MIb)) | |||
3310 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3311 | ||||
3312 | return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); | |||
3313 | } | |||
3314 | ||||
3315 | if (isMUBUF(MIa) || isMTBUF(MIa)) { | |||
3316 | if (isMUBUF(MIb) || isMTBUF(MIb)) | |||
3317 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3318 | ||||
3319 | return !isFLAT(MIb) && !isSMRD(MIb); | |||
3320 | } | |||
3321 | ||||
3322 | if (isSMRD(MIa)) { | |||
3323 | if (isSMRD(MIb)) | |||
3324 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3325 | ||||
3326 | return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); | |||
3327 | } | |||
3328 | ||||
3329 | if (isFLAT(MIa)) { | |||
3330 | if (isFLAT(MIb)) | |||
3331 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3332 | ||||
3333 | return false; | |||
3334 | } | |||
3335 | ||||
3336 | return false; | |||
3337 | } | |||
3338 | ||||
3339 | static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, | |||
3340 | int64_t &Imm, MachineInstr **DefMI = nullptr) { | |||
3341 | if (Reg.isPhysical()) | |||
3342 | return false; | |||
3343 | auto *Def = MRI.getUniqueVRegDef(Reg); | |||
3344 | if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { | |||
3345 | Imm = Def->getOperand(1).getImm(); | |||
3346 | if (DefMI) | |||
3347 | *DefMI = Def; | |||
3348 | return true; | |||
3349 | } | |||
3350 | return false; | |||
3351 | } | |||
3352 | ||||
3353 | static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, | |||
3354 | MachineInstr **DefMI = nullptr) { | |||
3355 | if (!MO->isReg()) | |||
3356 | return false; | |||
3357 | const MachineFunction *MF = MO->getParent()->getParent()->getParent(); | |||
3358 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
3359 | return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); | |||
3360 | } | |||
3361 | ||||
3362 | static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, | |||
3363 | MachineInstr &NewMI) { | |||
3364 | if (LV) { | |||
3365 | unsigned NumOps = MI.getNumOperands(); | |||
3366 | for (unsigned I = 1; I < NumOps; ++I) { | |||
3367 | MachineOperand &Op = MI.getOperand(I); | |||
3368 | if (Op.isReg() && Op.isKill()) | |||
3369 | LV->replaceKillInstruction(Op.getReg(), MI, NewMI); | |||
3370 | } | |||
3371 | } | |||
3372 | } | |||
3373 | ||||
3374 | MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, | |||
3375 | LiveVariables *LV, | |||
3376 | LiveIntervals *LIS) const { | |||
3377 | MachineBasicBlock &MBB = *MI.getParent(); | |||
3378 | unsigned Opc = MI.getOpcode(); | |||
3379 | ||||
3380 | // Handle MFMA. | |||
3381 | int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); | |||
3382 | if (NewMFMAOpc != -1) { | |||
3383 | MachineInstrBuilder MIB = | |||
3384 | BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); | |||
3385 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) | |||
3386 | MIB.add(MI.getOperand(I)); | |||
3387 | updateLiveVariables(LV, MI, *MIB); | |||
3388 | if (LIS) | |||
3389 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3390 | return MIB; | |||
3391 | } | |||
3392 | ||||
3393 | if (SIInstrInfo::isWMMA(MI)) { | |||
3394 | unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); | |||
3395 | MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3396 | .setMIFlags(MI.getFlags()); | |||
3397 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) | |||
3398 | MIB->addOperand(MI.getOperand(I)); | |||
3399 | ||||
3400 | updateLiveVariables(LV, MI, *MIB); | |||
3401 | if (LIS) | |||
3402 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3403 | ||||
3404 | return MIB; | |||
3405 | } | |||
3406 | ||||
3407 | assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 && | |||
3408 | "V_FMAC_F16_t16_e32 is not supported and not expected to be present " | |||
3409 | "pre-RA"); | |||
3410 | ||||
3411 | // Handle MAC/FMAC. | |||
3412 | bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
3413 | Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || | |||
3414 | Opc == AMDGPU::V_FMAC_F16_t16_e64; | |||
3415 | bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3416 | Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || | |||
3417 | Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || | |||
3418 | Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || | |||
3419 | Opc == AMDGPU::V_FMAC_F16_t16_e64 || | |||
3420 | Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; | |||
3421 | bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; | |||
3422 | bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || | |||
3423 | Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || | |||
3424 | Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || | |||
3425 | Opc == AMDGPU::V_FMAC_LEGACY_F32_e64; | |||
3426 | bool Src0Literal = false; | |||
3427 | ||||
3428 | switch (Opc) { | |||
3429 | default: | |||
3430 | return nullptr; | |||
3431 | case AMDGPU::V_MAC_F16_e64: | |||
3432 | case AMDGPU::V_FMAC_F16_e64: | |||
3433 | case AMDGPU::V_FMAC_F16_t16_e64: | |||
3434 | case AMDGPU::V_MAC_F32_e64: | |||
3435 | case AMDGPU::V_MAC_LEGACY_F32_e64: | |||
3436 | case AMDGPU::V_FMAC_F32_e64: | |||
3437 | case AMDGPU::V_FMAC_LEGACY_F32_e64: | |||
3438 | case AMDGPU::V_FMAC_F64_e64: | |||
3439 | break; | |||
3440 | case AMDGPU::V_MAC_F16_e32: | |||
3441 | case AMDGPU::V_FMAC_F16_e32: | |||
3442 | case AMDGPU::V_MAC_F32_e32: | |||
3443 | case AMDGPU::V_MAC_LEGACY_F32_e32: | |||
3444 | case AMDGPU::V_FMAC_F32_e32: | |||
3445 | case AMDGPU::V_FMAC_LEGACY_F32_e32: | |||
3446 | case AMDGPU::V_FMAC_F64_e32: { | |||
3447 | int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3448 | AMDGPU::OpName::src0); | |||
3449 | const MachineOperand *Src0 = &MI.getOperand(Src0Idx); | |||
3450 | if (!Src0->isReg() && !Src0->isImm()) | |||
3451 | return nullptr; | |||
3452 | ||||
3453 | if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) | |||
3454 | Src0Literal = true; | |||
3455 | ||||
3456 | break; | |||
3457 | } | |||
3458 | } | |||
3459 | ||||
3460 | MachineInstrBuilder MIB; | |||
3461 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
3462 | const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); | |||
3463 | const MachineOperand *Src0Mods = | |||
3464 | getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); | |||
3465 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3466 | const MachineOperand *Src1Mods = | |||
3467 | getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); | |||
3468 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3469 | const MachineOperand *Src2Mods = | |||
3470 | getNamedOperand(MI, AMDGPU::OpName::src2_modifiers); | |||
3471 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
3472 | const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3473 | const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel); | |||
3474 | ||||
3475 | if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && | |||
3476 | !IsLegacy && | |||
3477 | // If we have an SGPR input, we will violate the constant bus restriction. | |||
3478 | (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || | |||
3479 | !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { | |||
3480 | MachineInstr *DefMI; | |||
3481 | const auto killDef = [&]() -> void { | |||
3482 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
3483 | // The only user is the instruction which will be killed. | |||
3484 | Register DefReg = DefMI->getOperand(0).getReg(); | |||
3485 | if (!MRI.hasOneNonDBGUse(DefReg)) | |||
3486 | return; | |||
3487 | // We cannot just remove the DefMI here; the calling pass would crash. | |||
3488 | DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); | |||
3489 | for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) | |||
3490 | DefMI->removeOperand(I); | |||
3491 | if (LV) | |||
3492 | LV->getVarInfo(DefReg).AliveBlocks.clear(); | |||
3493 | }; | |||
3494 | ||||
3495 | int64_t Imm; | |||
3496 | if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { | |||
3497 | unsigned NewOpc = | |||
3498 | IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 | |||
3499 | : AMDGPU::V_FMAAK_F16) | |||
3500 | : AMDGPU::V_FMAAK_F32) | |||
3501 | : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); | |||
3502 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3503 | MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3504 | .add(*Dst) | |||
3505 | .add(*Src0) | |||
3506 | .add(*Src1) | |||
3507 | .addImm(Imm); | |||
3508 | updateLiveVariables(LV, MI, *MIB); | |||
3509 | if (LIS) | |||
3510 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3511 | killDef(); | |||
3512 | return MIB; | |||
3513 | } | |||
3514 | } | |||
3515 | unsigned NewOpc = | |||
3516 | IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 | |||
3517 | : AMDGPU::V_FMAMK_F16) | |||
3518 | : AMDGPU::V_FMAMK_F32) | |||
3519 | : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); | |||
3520 | if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { | |||
3521 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3522 | MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3523 | .add(*Dst) | |||
3524 | .add(*Src0) | |||
3525 | .addImm(Imm) | |||
3526 | .add(*Src2); | |||
3527 | updateLiveVariables(LV, MI, *MIB); | |||
3528 | if (LIS) | |||
3529 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3530 | killDef(); | |||
3531 | return MIB; | |||
3532 | } | |||
3533 | } | |||
3534 | if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) { | |||
3535 | if (Src0Literal) { | |||
3536 | Imm = Src0->getImm(); | |||
3537 | DefMI = nullptr; | |||
3538 | } | |||
3539 | if (pseudoToMCOpcode(NewOpc) != -1 && | |||
3540 | isOperandLegal( | |||
3541 | MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), | |||
3542 | Src1)) { | |||
3543 | MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3544 | .add(*Dst) | |||
3545 | .add(*Src1) | |||
3546 | .addImm(Imm) | |||
3547 | .add(*Src2); | |||
3548 | updateLiveVariables(LV, MI, *MIB); | |||
3549 | if (LIS) | |||
3550 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3551 | if (DefMI) | |||
3552 | killDef(); | |||
3553 | return MIB; | |||
3554 | } | |||
3555 | } | |||
3556 | } | |||
3557 | ||||
3558 | // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma | |||
3559 | // if VOP3 does not allow a literal operand. | |||
3560 | if (Src0Literal && !ST.hasVOP3Literal()) | |||
3561 | return nullptr; | |||
3562 | ||||
3563 | unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 | |||
3564 | : IsF64 ? AMDGPU::V_FMA_F64_e64 | |||
3565 | : IsLegacy | |||
3566 | ? AMDGPU::V_FMA_LEGACY_F32_e64 | |||
3567 | : AMDGPU::V_FMA_F32_e64 | |||
3568 | : IsF16 ? AMDGPU::V_MAD_F16_e64 | |||
3569 | : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 | |||
3570 | : AMDGPU::V_MAD_F32_e64; | |||
3571 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3572 | return nullptr; | |||
3573 | ||||
3574 | MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3575 | .add(*Dst) | |||
3576 | .addImm(Src0Mods ? Src0Mods->getImm() : 0) | |||
3577 | .add(*Src0) | |||
3578 | .addImm(Src1Mods ? Src1Mods->getImm() : 0) | |||
3579 | .add(*Src1) | |||
3580 | .addImm(Src2Mods ? Src2Mods->getImm() : 0) | |||
3581 | .add(*Src2) | |||
3582 | .addImm(Clamp ? Clamp->getImm() : 0) | |||
3583 | .addImm(Omod ? Omod->getImm() : 0); | |||
3584 | if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) | |||
3585 | MIB.addImm(OpSel ? OpSel->getImm() : 0); | |||
3586 | updateLiveVariables(LV, MI, *MIB); | |||
3587 | if (LIS) | |||
3588 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3589 | return MIB; | |||
3590 | } | |||
3591 | ||||
3592 | // It's not generally safe to move VALU instructions across these, since they | |||
3593 | // would start using the register as a base index rather than accessing it directly. | |||
3594 | // XXX - Why isn't hasSideEffects sufficient for these? | |||
3595 | static bool changesVGPRIndexingMode(const MachineInstr &MI) { | |||
3596 | switch (MI.getOpcode()) { | |||
3597 | case AMDGPU::S_SET_GPR_IDX_ON: | |||
3598 | case AMDGPU::S_SET_GPR_IDX_MODE: | |||
3599 | case AMDGPU::S_SET_GPR_IDX_OFF: | |||
3600 | return true; | |||
3601 | default: | |||
3602 | return false; | |||
3603 | } | |||
3604 | } | |||
3605 | ||||
3606 | bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, | |||
3607 | const MachineBasicBlock *MBB, | |||
3608 | const MachineFunction &MF) const { | |||
3609 | // Skip the check for SP writes that the base implementation does. It was | |||
3610 | // apparently added there due to compile-time concerns. | |||
3611 | // | |||
3612 | // TODO: Do we really want this barrier? It triggers unnecessary hazard nops | |||
3613 | // but is probably avoidable. | |||
3614 | ||||
3615 | // Copied from base implementation. | |||
3616 | // Terminators and labels can't be scheduled around. | |||
3617 | if (MI.isTerminator() || MI.isPosition()) | |||
3618 | return true; | |||
3619 | ||||
3620 | // INLINEASM_BR can jump to another block | |||
3621 | if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) | |||
3622 | return true; | |||
3623 | ||||
3624 | if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0) | |||
3625 | return true; | |||
3626 | ||||
3627 | // Target-independent instructions do not have an implicit-use of EXEC, even | |||
3628 | // when they operate on VGPRs. Treating EXEC modifications as scheduling | |||
3629 | // boundaries prevents incorrect movements of such instructions. | |||
3630 | return MI.modifiesRegister(AMDGPU::EXEC, &RI) || | |||
3631 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || | |||
3632 | MI.getOpcode() == AMDGPU::S_SETREG_B32 || | |||
3633 | MI.getOpcode() == AMDGPU::S_SETPRIO || | |||
3634 | changesVGPRIndexingMode(MI); | |||
3635 | } | |||
3636 | ||||
3637 | bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { | |||
3638 | return Opcode == AMDGPU::DS_ORDERED_COUNT || | |||
3639 | Opcode == AMDGPU::DS_GWS_INIT || | |||
3640 | Opcode == AMDGPU::DS_GWS_SEMA_V || | |||
3641 | Opcode == AMDGPU::DS_GWS_SEMA_BR || | |||
3642 | Opcode == AMDGPU::DS_GWS_SEMA_P || | |||
3643 | Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || | |||
3644 | Opcode == AMDGPU::DS_GWS_BARRIER; | |||
3645 | } | |||
3646 | ||||
3647 | bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { | |||
3648 | // Skip the full operand and register alias search that modifiesRegister | |||
3649 | // does. Only a handful of instructions touch MODE, it is only an | |||
3650 | // implicit def, and it doesn't alias any other registers. | |||
3651 | return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE); | |||
3652 | } | |||
3653 | ||||
3654 | bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { | |||
3655 | unsigned Opcode = MI.getOpcode(); | |||
3656 | ||||
3657 | if (MI.mayStore() && isSMRD(MI)) | |||
3658 | return true; // scalar store or atomic | |||
3659 | ||||
3660 | // This will terminate the function when other lanes may need to continue. | |||
3661 | if (MI.isReturn()) | |||
3662 | return true; | |||
3663 | ||||
3664 | // These instructions cause shader I/O that may cause hardware lockups | |||
3665 | // when executed with an empty EXEC mask. | |||
3666 | // | |||
3667 | // Note: exp with VM = DONE = 0 is automatically skipped by hardware when | |||
3668 | // EXEC = 0, but checking for that case here seems not worth it | |||
3669 | // given the typical code patterns. | |||
3670 | if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || | |||
3671 | isEXP(Opcode) || | |||
3672 | Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || | |||
3673 | Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) | |||
3674 | return true; | |||
3675 | ||||
3676 | if (MI.isCall() || MI.isInlineAsm()) | |||
3677 | return true; // conservative assumption | |||
3678 | ||||
3679 | // A mode change is a scalar operation that influences vector instructions. | |||
3680 | if (modifiesModeRegister(MI)) | |||
3681 | return true; | |||
3682 | ||||
3683 | // These are like SALU instructions in terms of effects, so it's questionable | |||
3684 | // whether we should return true for those. | |||
3685 | // | |||
3686 | // However, executing them with EXEC = 0 causes them to operate on undefined | |||
3687 | // data, which we avoid by returning true here. | |||
3688 | if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || | |||
3689 | Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) | |||
3690 | return true; | |||
3691 | ||||
3692 | return false; | |||
3693 | } | |||
3694 | ||||
3695 | bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, | |||
3696 | const MachineInstr &MI) const { | |||
3697 | if (MI.isMetaInstruction()) | |||
3698 | return false; | |||
3699 | ||||
3700 | // This won't read exec if this is an SGPR->SGPR copy. | |||
3701 | if (MI.isCopyLike()) { | |||
3702 | if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) | |||
3703 | return true; | |||
3704 | ||||
3705 | // Make sure this isn't copying exec as a normal operand | |||
3706 | return MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3707 | } | |||
3708 | ||||
3709 | // Make a conservative assumption about the callee. | |||
3710 | if (MI.isCall()) | |||
3711 | return true; | |||
3712 | ||||
3713 | // Be conservative with any unhandled generic opcodes. | |||
3714 | if (!isTargetSpecificOpcode(MI.getOpcode())) | |||
3715 | return true; | |||
3716 | ||||
3717 | return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3718 | } | |||
3719 | ||||
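// Informal summary (see AMDGPU::isInlinableLiteral32/64/16 in AMDGPUBaseInfo
// for the authoritative check): 32-bit inline constants are the integers
// -16..64 plus a small set of FP values (+-0.5, +-1.0, +-2.0, +-4.0, and
// 1/(2*pi) when the subtarget has hasInv2PiInlineImm()). For example,
// isInlineConstant(APInt(32, 64)) is true, while APInt(32, 65) would need a
// literal.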
3720 | bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { | |||
3721 | switch (Imm.getBitWidth()) { | |||
3722 | case 1: // This likely will be a condition code mask. | |||
3723 | return true; | |||
3724 | ||||
3725 | case 32: | |||
3726 | return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), | |||
3727 | ST.hasInv2PiInlineImm()); | |||
3728 | case 64: | |||
3729 | return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), | |||
3730 | ST.hasInv2PiInlineImm()); | |||
3731 | case 16: | |||
3732 | return ST.has16BitInsts() && | |||
3733 | AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), | |||
3734 | ST.hasInv2PiInlineImm()); | |||
3735 | default: | |||
3736 | llvm_unreachable("invalid bitwidth"); | |||
3737 | } | |||
3738 | } | |||
3739 | ||||
3740 | bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, | |||
3741 | uint8_t OperandType) const { | |||
3742 | assert(!MO.isReg() && "isInlineConstant called on register operand!"); | |||
3743 | if (!MO.isImm() || | |||
3744 | OperandType < AMDGPU::OPERAND_SRC_FIRST || | |||
3745 | OperandType > AMDGPU::OPERAND_SRC_LAST) | |||
3746 | return false; | |||
3747 | ||||
3748 | // MachineOperand provides no way to tell the true operand size, since it only | |||
3749 | // records a 64-bit value. We need to know the size to determine if a 32-bit | |||
3750 | // floating point immediate bit pattern is legal for an integer immediate. It | |||
3751 | // would be for any 32-bit integer operand, but would not be for a 64-bit one. | |||
3752 | ||||
3753 | int64_t Imm = MO.getImm(); | |||
3754 | switch (OperandType) { | |||
3755 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
3756 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
3757 | case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: | |||
3758 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
3759 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
3760 | case AMDGPU::OPERAND_REG_IMM_V2FP32: | |||
3761 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: | |||
3762 | case AMDGPU::OPERAND_REG_IMM_V2INT32: | |||
3763 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: | |||
3764 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
3765 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { | |||
3766 | int32_t Trunc = static_cast<int32_t>(Imm); | |||
3767 | return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); | |||
3768 | } | |||
3769 | case AMDGPU::OPERAND_REG_IMM_INT64: | |||
3770 | case AMDGPU::OPERAND_REG_IMM_FP64: | |||
3771 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
3772 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
3773 | case AMDGPU::OPERAND_REG_INLINE_AC_FP64: | |||
3774 | return AMDGPU::isInlinableLiteral64(MO.getImm(), | |||
3775 | ST.hasInv2PiInlineImm()); | |||
3776 | case AMDGPU::OPERAND_REG_IMM_INT16: | |||
3777 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
3778 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
3779 | // We would expect inline immediates to not be concerned with an integer/fp | |||
3780 | // distinction. However, in the case of 16-bit integer operations, the | |||
3781 | // "floating point" values appear to not work. It seems read the low 16-bits | |||
3782 | // of 32-bit immediates, which happens to always work for the integer | |||
3783 | // values. | |||
3784 | // | |||
3785 | // See llvm bugzilla 46302. | |||
3786 | // | |||
3787 | // TODO: Theoretically we could use op-sel to use the high bits of the | |||
3788 | // 32-bit FP values. | |||
3789 | return AMDGPU::isInlinableIntLiteral(Imm); | |||
3790 | case AMDGPU::OPERAND_REG_IMM_V2INT16: | |||
3791 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: | |||
3792 | case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: | |||
3793 | // This suffers the same problem as the scalar 16-bit cases. | |||
3794 | return AMDGPU::isInlinableIntLiteralV216(Imm); | |||
3795 | case AMDGPU::OPERAND_REG_IMM_FP16: | |||
3796 | case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: | |||
3797 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
3798 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { | |||
3799 | if (isInt<16>(Imm) || isUInt<16>(Imm)) { | |||
3800 | // A few special case instructions have 16-bit operands on subtargets | |||
3801 | // where 16-bit instructions are not legal. | |||
3802 | // TODO: Do the 32-bit immediates work? We shouldn't really need to handle | |||
3803 | // constants in these cases | |||
3804 | int16_t Trunc = static_cast<int16_t>(Imm); | |||
3805 | return ST.has16BitInsts() && | |||
3806 | AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); | |||
3807 | } | |||
3808 | ||||
3809 | return false; | |||
3810 | } | |||
3811 | case AMDGPU::OPERAND_REG_IMM_V2FP16: | |||
3812 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: | |||
3813 | case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { | |||
3814 | uint32_t Trunc = static_cast<uint32_t>(Imm); | |||
3815 | return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); | |||
3816 | } | |||
3817 | case AMDGPU::OPERAND_KIMM32: | |||
3818 | case AMDGPU::OPERAND_KIMM16: | |||
3819 | return false; | |||
3820 | default: | |||
3821 | llvm_unreachable("invalid bitwidth"); | |||
3822 | } | |||
3823 | } | |||
3824 | ||||
3825 | static bool compareMachineOp(const MachineOperand &Op0, | |||
3826 | const MachineOperand &Op1) { | |||
3827 | if (Op0.getType() != Op1.getType()) | |||
3828 | return false; | |||
3829 | ||||
3830 | switch (Op0.getType()) { | |||
3831 | case MachineOperand::MO_Register: | |||
3832 | return Op0.getReg() == Op1.getReg(); | |||
3833 | case MachineOperand::MO_Immediate: | |||
3834 | return Op0.getImm() == Op1.getImm(); | |||
3835 | default: | |||
3836 | llvm_unreachable("Didn't expect to be comparing these operand types"); | |||
3837 | } | |||
3838 | } | |||
3839 | ||||
3840 | bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, | |||
3841 | const MachineOperand &MO) const { | |||
3842 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
3843 | const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; | |||
3844 | ||||
3845 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); | |||
3846 | ||||
3847 | if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) | |||
3848 | return true; | |||
3849 | ||||
3850 | if (OpInfo.RegClass < 0) | |||
3851 | return false; | |||
3852 | ||||
3853 | if (MO.isImm() && isInlineConstant(MO, OpInfo)) { | |||
3854 | if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && | |||
3855 | OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3856 | AMDGPU::OpName::src2)) | |||
3857 | return false; | |||
3858 | return RI.opCanUseInlineConstant(OpInfo.OperandType); | |||
3859 | } | |||
3860 | ||||
3861 | if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) | |||
3862 | return false; | |||
3863 | ||||
3864 | if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) | |||
3865 | return true; | |||
3866 | ||||
3867 | return ST.hasVOP3Literal(); | |||
3868 | } | |||
3869 | ||||
3870 | bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { | |||
3871 | // GFX90A does not have V_MUL_LEGACY_F32_e32. | |||
3872 | if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) | |||
3873 | return false; | |||
3874 | ||||
3875 | int Op32 = AMDGPU::getVOPe32(Opcode); | |||
3876 | if (Op32 == -1) | |||
3877 | return false; | |||
3878 | ||||
3879 | return pseudoToMCOpcode(Op32) != -1; | |||
3880 | } | |||
3881 | ||||
3882 | bool SIInstrInfo::hasModifiers(unsigned Opcode) const { | |||
3883 | // The src0_modifiers operand is present on all instructions | |||
3884 | // that have modifiers. | |||
3885 | ||||
3886 | return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers); | |||
3887 | } | |||
3888 | ||||
3889 | bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, | |||
3890 | unsigned OpName) const { | |||
3891 | const MachineOperand *Mods = getNamedOperand(MI, OpName); | |||
3892 | return Mods && Mods->getImm(); | |||
3893 | } | |||
3894 | ||||
3895 | bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { | |||
3896 | return any_of(ModifierOpNames, | |||
3897 | [&](unsigned Name) { return hasModifiersSet(MI, Name); }); | |||
3898 | } | |||
3899 | ||||
3900 | bool SIInstrInfo::canShrink(const MachineInstr &MI, | |||
3901 | const MachineRegisterInfo &MRI) const { | |||
3902 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3903 | // Can't shrink instruction with three operands. | |||
3904 | if (Src2) { | |||
3905 | switch (MI.getOpcode()) { | |||
3906 | default: return false; | |||
3907 | ||||
3908 | case AMDGPU::V_ADDC_U32_e64: | |||
3909 | case AMDGPU::V_SUBB_U32_e64: | |||
3910 | case AMDGPU::V_SUBBREV_U32_e64: { | |||
3911 | const MachineOperand *Src1 | |||
3912 | = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3913 | if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) | |||
3914 | return false; | |||
3915 | // Additional verification is needed for sdst/src2. | |||
3916 | return true; | |||
3917 | } | |||
3918 | case AMDGPU::V_MAC_F16_e64: | |||
3919 | case AMDGPU::V_MAC_F32_e64: | |||
3920 | case AMDGPU::V_MAC_LEGACY_F32_e64: | |||
3921 | case AMDGPU::V_FMAC_F16_e64: | |||
3922 | case AMDGPU::V_FMAC_F16_t16_e64: | |||
3923 | case AMDGPU::V_FMAC_F32_e64: | |||
3924 | case AMDGPU::V_FMAC_F64_e64: | |||
3925 | case AMDGPU::V_FMAC_LEGACY_F32_e64: | |||
3926 | if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || | |||
3927 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) | |||
3928 | return false; | |||
3929 | break; | |||
3930 | ||||
3931 | case AMDGPU::V_CNDMASK_B32_e64: | |||
3932 | break; | |||
3933 | } | |||
3934 | } | |||
3935 | ||||
3936 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3937 | if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || | |||
3938 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) | |||
3939 | return false; | |||
3940 | ||||
3941 | // We don't need to check src0, as all input types are legal there; just | |||
3942 | // make sure src0 isn't using any modifiers. | |||
3943 | if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) | |||
3944 | return false; | |||
3945 | ||||
3946 | // Can it be shrunk to a valid 32-bit opcode? | |||
3947 | if (!hasVALU32BitEncoding(MI.getOpcode())) | |||
3948 | return false; | |||
3949 | ||||
3950 | // Check output modifiers | |||
3951 | return !hasModifiersSet(MI, AMDGPU::OpName::omod) && | |||
3952 | !hasModifiersSet(MI, AMDGPU::OpName::clamp); | |||
3953 | } | |||
3954 | ||||
3955 | // Set VCC operand with all flags from \p Orig, except for setting it as | |||
3956 | // implicit. | |||
3957 | static void copyFlagsToImplicitVCC(MachineInstr &MI, | |||
3958 | const MachineOperand &Orig) { | |||
3959 | ||||
3960 | for (MachineOperand &Use : MI.implicit_operands()) { | |||
3961 | if (Use.isUse() && | |||
3962 | (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { | |||
3963 | Use.setIsUndef(Orig.isUndef()); | |||
3964 | Use.setIsKill(Orig.isKill()); | |||
3965 | return; | |||
3966 | } | |||
3967 | } | |||
3968 | } | |||
3969 | ||||
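// Illustrative shrink (hypothetical operands): a VOP3 instruction with no
// source modifiers, clamp, or omod, e.g.
//   %d = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
// can be rebuilt by buildShrunkInst as its 32-bit encoding
//   %d = V_ADD_F32_e32 %a, %b
// once canShrink above has accepted it.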
3970 | MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, | |||
3971 | unsigned Op32) const { | |||
3972 | MachineBasicBlock *MBB = MI.getParent(); | |||
3973 | MachineInstrBuilder Inst32 = | |||
3974 | BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) | |||
3975 | .setMIFlags(MI.getFlags()); | |||
3976 | ||||
3977 | // Add the dst operand if the 32-bit encoding also has an explicit $vdst. | |||
3978 | // For VOPC instructions, this is replaced by an implicit def of vcc. | |||
3979 | if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) { | |||
3980 | // dst | |||
3981 | Inst32.add(MI.getOperand(0)); | |||
3982 | } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) { | |||
3983 | // VOPCX instructions won't be writing to an explicit dst, so this should | |||
3984 | // not fail for these instructions. | |||
3985 | assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || | |||
3986 |         (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && | |||
3987 |        "Unexpected case"); | |||
3988 | } | |||
3989 | ||||
3990 | Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); | |||
3991 | ||||
3992 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3993 | if (Src1) | |||
3994 | Inst32.add(*Src1); | |||
3995 | ||||
3996 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3997 | ||||
3998 | if (Src2) { | |||
3999 | int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); | |||
4000 | if (Op32Src2Idx != -1) { | |||
4001 | Inst32.add(*Src2); | |||
4002 | } else { | |||
4003 | // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is | |||
4004 | // replaced with an implicit read of vcc or vcc_lo. The implicit read | |||
4005 | // of vcc was already added during the initial BuildMI, but we | |||
4006 | // 1) may need to change vcc to vcc_lo to preserve the original register | |||
4007 | // 2) have to preserve the original flags. | |||
4008 | fixImplicitOperands(*Inst32); | |||
4009 | copyFlagsToImplicitVCC(*Inst32, *Src2); | |||
4010 | } | |||
4011 | } | |||
4012 | ||||
4013 | return Inst32; | |||
4014 | } | |||
4015 | ||||
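// Informal note: a source operand occupies the constant bus if it is an SGPR
// read or a literal (non-inline) immediate. On subtargets where
// getConstantBusLimit() is 1, something like
//   %v = V_ADD_F32_e32 %sgpr, %vgpr
// is legal (one SGPR read), while a second distinct SGPR or an extra literal
// would violate the limit; verifyInstruction below reports such violations.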
4016 | bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, | |||
4017 | const MachineOperand &MO, | |||
4018 | const MCOperandInfo &OpInfo) const { | |||
4019 | // Literal constants use the constant bus. | |||
4020 | if (!MO.isReg()) | |||
4021 | return !isInlineConstant(MO, OpInfo); | |||
4022 | ||||
4023 | if (!MO.isUse()) | |||
4024 | return false; | |||
4025 | ||||
4026 | if (MO.getReg().isVirtual()) | |||
4027 | return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); | |||
4028 | ||||
4029 | // Null is free | |||
4030 | if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64) | |||
4031 | return false; | |||
4032 | ||||
4033 | // SGPRs use the constant bus | |||
4034 | if (MO.isImplicit()) { | |||
4035 | return MO.getReg() == AMDGPU::M0 || | |||
4036 | MO.getReg() == AMDGPU::VCC || | |||
4037 | MO.getReg() == AMDGPU::VCC_LO; | |||
4038 | } else { | |||
4039 | return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || | |||
4040 | AMDGPU::SReg_64RegClass.contains(MO.getReg()); | |||
4041 | } | |||
4042 | } | |||
4043 | ||||
4044 | static Register findImplicitSGPRRead(const MachineInstr &MI) { | |||
4045 | for (const MachineOperand &MO : MI.implicit_operands()) { | |||
4046 | // We only care about reads. | |||
4047 | if (MO.isDef()) | |||
4048 | continue; | |||
4049 | ||||
4050 | switch (MO.getReg()) { | |||
4051 | case AMDGPU::VCC: | |||
4052 | case AMDGPU::VCC_LO: | |||
4053 | case AMDGPU::VCC_HI: | |||
4054 | case AMDGPU::M0: | |||
4055 | case AMDGPU::FLAT_SCR: | |||
4056 | return MO.getReg(); | |||
4057 | ||||
4058 | default: | |||
4059 | break; | |||
4060 | } | |||
4061 | } | |||
4062 | ||||
4063 | return Register(); | |||
4064 | } | |||
4065 | ||||
4066 | static bool shouldReadExec(const MachineInstr &MI) { | |||
4067 | if (SIInstrInfo::isVALU(MI)) { | |||
4068 | switch (MI.getOpcode()) { | |||
4069 | case AMDGPU::V_READLANE_B32: | |||
4070 | case AMDGPU::V_WRITELANE_B32: | |||
4071 | return false; | |||
4072 | } | |||
4073 | ||||
4074 | return true; | |||
4075 | } | |||
4076 | ||||
4077 | if (MI.isPreISelOpcode() || | |||
4078 | SIInstrInfo::isGenericOpcode(MI.getOpcode()) || | |||
4079 | SIInstrInfo::isSALU(MI) || | |||
4080 | SIInstrInfo::isSMRD(MI)) | |||
4081 | return false; | |||
4082 | ||||
4083 | return true; | |||
4084 | } | |||
4085 | ||||
4086 | static bool isSubRegOf(const SIRegisterInfo &TRI, | |||
4087 | const MachineOperand &SuperVec, | |||
4088 | const MachineOperand &SubReg) { | |||
4089 | if (SubReg.getReg().isPhysical()) | |||
4090 | return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); | |||
4091 | ||||
4092 | return SubReg.getSubReg() != AMDGPU::NoSubRegister && | |||
4093 | SubReg.getReg() == SuperVec.getReg(); | |||
4094 | } | |||
4095 | ||||
4096 | bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, | |||
4097 | StringRef &ErrInfo) const { | |||
4098 | uint16_t Opcode = MI.getOpcode(); | |||
4099 | if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) | |||
4100 | return true; | |||
4101 | ||||
4102 | const MachineFunction *MF = MI.getParent()->getParent(); | |||
4103 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
4104 | ||||
4105 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); | |||
4106 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); | |||
4107 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); | |||
4108 | int Src3Idx = -1; | |||
4109 | if (Src0Idx == -1) { | |||
4110 | // VOPD V_DUAL_* instructions use different operand names. | |||
4111 | Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X); | |||
4112 | Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X); | |||
4113 | Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y); | |||
4114 | Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y); | |||
4115 | } | |||
4116 | ||||
4117 | // Make sure the number of operands is correct. | |||
4118 | const MCInstrDesc &Desc = get(Opcode); | |||
4119 | if (!Desc.isVariadic() && | |||
4120 | Desc.getNumOperands() != MI.getNumExplicitOperands()) { | |||
4121 | ErrInfo = "Instruction has wrong number of operands."; | |||
4122 | return false; | |||
4123 | } | |||
4124 | ||||
4125 | if (MI.isInlineAsm()) { | |||
4126 | // Verify register classes for inlineasm constraints. | |||
4127 | for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); | |||
4128 | I != E; ++I) { | |||
4129 | const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); | |||
4130 | if (!RC) | |||
4131 | continue; | |||
4132 | ||||
4133 | const MachineOperand &Op = MI.getOperand(I); | |||
4134 | if (!Op.isReg()) | |||
4135 | continue; | |||
4136 | ||||
4137 | Register Reg = Op.getReg(); | |||
4138 | if (!Reg.isVirtual() && !RC->contains(Reg)) { | |||
4139 | ErrInfo = "inlineasm operand has incorrect register class."; | |||
4140 | return false; | |||
4141 | } | |||
4142 | } | |||
4143 | ||||
4144 | return true; | |||
4145 | } | |||
4146 | ||||
4147 | if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { | |||
4148 | ErrInfo = "missing memory operand from MIMG instruction."; | |||
4149 | return false; | |||
4150 | } | |||
4151 | ||||
4152 | // Make sure the register classes are correct. | |||
4153 | for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { | |||
4154 | const MachineOperand &MO = MI.getOperand(i); | |||
4155 | if (MO.isFPImm()) { | |||
4156 | ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " | |||
4157 | "all fp values to integers."; | |||
4158 | return false; | |||
4159 | } | |||
4160 | ||||
4161 | int RegClass = Desc.operands()[i].RegClass; | |||
4162 | ||||
4163 | switch (Desc.operands()[i].OperandType) { | |||
4164 | case MCOI::OPERAND_REGISTER: | |||
4165 | if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { | |||
4166 | ErrInfo = "Illegal immediate value for operand."; | |||
4167 | return false; | |||
4168 | } | |||
4169 | break; | |||
4170 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
4171 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
4172 | case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: | |||
4173 | case AMDGPU::OPERAND_REG_IMM_V2FP32: | |||
4174 | break; | |||
4175 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
4176 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
4177 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
4178 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
4179 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
4180 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
4181 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
4182 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: | |||
4183 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
4184 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: | |||
4185 | case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { | |||
4186 | if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { | |||
4187 | ErrInfo = "Illegal immediate value for operand."; | |||
4188 | return false; | |||
4189 | } | |||
4190 | break; | |||
4191 | } | |||
4192 | case MCOI::OPERAND_IMMEDIATE: | |||
4193 | case AMDGPU::OPERAND_KIMM32: | |||
4194 | // Check if this operand is an immediate. | |||
4195 | // FrameIndex operands will be replaced by immediates, so they are | |||
4196 | // allowed. | |||
4197 | if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { | |||
4198 | ErrInfo = "Expected immediate, but got non-immediate"; | |||
4199 | return false; | |||
4200 | } | |||
4201 | [[fallthrough]]; | |||
4202 | default: | |||
4203 | continue; | |||
4204 | } | |||
4205 | ||||
4206 | if (!MO.isReg()) | |||
4207 | continue; | |||
4208 | Register Reg = MO.getReg(); | |||
4209 | if (!Reg) | |||
4210 | continue; | |||
4211 | ||||
4212 | // FIXME: Ideally we would have separate instruction definitions with the | |||
4213 | // aligned register constraint. | |||
4214 | // FIXME: We do not verify inline asm operands, but custom inline asm | |||
4215 | // verification is broken anyway | |||
4216 | if (ST.needsAlignedVGPRs()) { | |||
4217 | const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); | |||
4218 | if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { | |||
4219 | const TargetRegisterClass *SubRC = | |||
4220 | RI.getSubRegisterClass(RC, MO.getSubReg()); | |||
4221 | RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); | |||
4222 | if (RC) | |||
4223 | RC = SubRC; | |||
4224 | } | |||
4225 | ||||
4226 | // Check that this is the aligned version of the class. | |||
4227 | if (!RC || !RI.isProperlyAlignedRC(*RC)) { | |||
4228 | ErrInfo = "Subtarget requires even aligned vector registers"; | |||
4229 | return false; | |||
4230 | } | |||
4231 | } | |||
4232 | ||||
4233 | if (RegClass != -1) { | |||
4234 | if (Reg.isVirtual()) | |||
4235 | continue; | |||
4236 | ||||
4237 | const TargetRegisterClass *RC = RI.getRegClass(RegClass); | |||
4238 | if (!RC->contains(Reg)) { | |||
4239 | ErrInfo = "Operand has incorrect register class."; | |||
4240 | return false; | |||
4241 | } | |||
4242 | } | |||
4243 | } | |||
4244 | ||||
4245 | // Verify SDWA | |||
4246 | if (isSDWA(MI)) { | |||
4247 | if (!ST.hasSDWA()) { | |||
4248 | ErrInfo = "SDWA is not supported on this target"; | |||
4249 | return false; | |||
4250 | } | |||
4251 | ||||
4252 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
4253 | ||||
4254 | for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) { | |||
4255 | if (OpIdx == -1) | |||
4256 | continue; | |||
4257 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4258 | ||||
4259 | if (!ST.hasSDWAScalar()) { | |||
4260 | // Only VGPRs on VI | |||
4261 | if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { | |||
4262 | ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; | |||
4263 | return false; | |||
4264 | } | |||
4265 | } else { | |||
4266 | // No immediates on GFX9 | |||
4267 | if (!MO.isReg()) { | |||
4268 | ErrInfo = | |||
4269 | "Only reg allowed as operands in SDWA instructions on GFX9+"; | |||
4270 | return false; | |||
4271 | } | |||
4272 | } | |||
4273 | } | |||
4274 | ||||
4275 | if (!ST.hasSDWAOmod()) { | |||
4276 | // No omod allowed on VI | |||
4277 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
4278 | if (OMod != nullptr && | |||
4279 | (!OMod->isImm() || OMod->getImm() != 0)) { | |||
4280 | ErrInfo = "OMod not allowed in SDWA instructions on VI"; | |||
4281 | return false; | |||
4282 | } | |||
4283 | } | |||
4284 | ||||
4285 | uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); | |||
4286 | if (isVOPC(BasicOpcode)) { | |||
4287 | if (!ST.hasSDWASdst() && DstIdx != -1) { | |||
4288 | // Only vcc allowed as dst on VI for VOPC | |||
4289 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
4290 | if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { | |||
4291 | ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; | |||
4292 | return false; | |||
4293 | } | |||
4294 | } else if (!ST.hasSDWAOutModsVOPC()) { | |||
4295 | // No clamp allowed on GFX9 for VOPC | |||
4296 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
4297 | if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { | |||
4298 | ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; | |||
4299 | return false; | |||
4300 | } | |||
4301 | ||||
4302 | // No omod allowed on GFX9 for VOPC | |||
4303 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
4304 | if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { | |||
4305 | ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; | |||
4306 | return false; | |||
4307 | } | |||
4308 | } | |||
4309 | } | |||
4310 | ||||
4311 | const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
4312 | if (DstUnused && DstUnused->isImm() && | |||
4313 | DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { | |||
4314 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
4315 | if (!Dst.isReg() || !Dst.isTied()) { | |||
4316 | ErrInfo = "Dst register should have tied register"; | |||
4317 | return false; | |||
4318 | } | |||
4319 | ||||
4320 | const MachineOperand &TiedMO = | |||
4321 | MI.getOperand(MI.findTiedOperandIdx(DstIdx)); | |||
4322 | if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { | |||
4323 | ErrInfo = | |||
4324 | "Dst register should be tied to implicit use of preserved register"; | |||
4325 | return false; | |||
4326 | } else if (TiedMO.getReg().isPhysical() && | |||
4327 | Dst.getReg() != TiedMO.getReg()) { | |||
4328 | ErrInfo = "Dst register should use same physical register as preserved"; | |||
4329 | return false; | |||
4330 | } | |||
4331 | } | |||
4332 | } | |||
4333 | ||||
4334 | // Verify MIMG | |||
4335 | if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { | |||
4336 | // Ensure that the return type used is large enough for all the options | |||
4337 | // being used. TFE/LWE require an extra result register. | |||
4338 | const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); | |||
4339 | if (DMask) { | |||
4340 | uint64_t DMaskImm = DMask->getImm(); | |||
4341 | uint32_t RegCount = | |||
4342 | isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm); | |||
4343 | const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); | |||
4344 | const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); | |||
4345 | const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); | |||
4346 | ||||
4347 | // Adjust for packed 16 bit values | |||
4348 | if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) | |||
4349 | RegCount = divideCeil(RegCount, 2); | |||
4350 | ||||
4351 | // Adjust if using LWE or TFE | |||
4352 | if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) | |||
4353 | RegCount += 1; | |||
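// Worked example (illustrative): dmask = 0b0111 gives RegCount = 3; with
// packed D16 that becomes ceil(3/2) = 2, and TFE or LWE then adds one more,
// so vdata must span at least 3 32-bit registers in that case.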
4354 | ||||
4355 | const uint32_t DstIdx = | |||
4356 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); | |||
4357 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
4358 | if (Dst.isReg()) { | |||
4359 | const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); | |||
4360 | uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; | |||
4361 | if (RegCount > DstSize) { | |||
4362 | ErrInfo = "Image instruction returns too many registers for dst " | |||
4363 | "register class"; | |||
4364 | return false; | |||
4365 | } | |||
4366 | } | |||
4367 | } | |||
4368 | } | |||
4369 | ||||
4370 | // Verify VOP*. Ignore multiple sgpr operands on writelane. | |||
4371 | if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) { | |||
4372 | unsigned ConstantBusCount = 0; | |||
4373 | bool UsesLiteral = false; | |||
4374 | const MachineOperand *LiteralVal = nullptr; | |||
4375 | ||||
4376 | int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm); | |||
4377 | if (ImmIdx != -1) { | |||
4378 | ++ConstantBusCount; | |||
4379 | UsesLiteral = true; | |||
4380 | LiteralVal = &MI.getOperand(ImmIdx); | |||
4381 | } | |||
4382 | ||||
4383 | SmallVector<Register, 2> SGPRsUsed; | |||
4384 | Register SGPRUsed; | |||
4385 | ||||
4386 | // Only look at the true operands. Only a real operand can use the constant | |||
4387 | // bus, and we don't want to check pseudo-operands like the source modifier | |||
4388 | // flags. | |||
4389 | for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) { | |||
4390 | if (OpIdx == -1) | |||
4391 | continue; | |||
4392 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4393 | if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) { | |||
4394 | if (MO.isReg()) { | |||
4395 | SGPRUsed = MO.getReg(); | |||
4396 | if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) { | |||
4397 | ++ConstantBusCount; | |||
4398 | SGPRsUsed.push_back(SGPRUsed); | |||
4399 | } | |||
4400 | } else { | |||
4401 | if (!UsesLiteral) { | |||
4402 | ++ConstantBusCount; | |||
4403 | UsesLiteral = true; | |||
4404 | LiteralVal = &MO; | |||
4405 | } else if (!MO.isIdenticalTo(*LiteralVal)) { | |||
4406 | assert(isVOP2(MI) || isVOP3(MI)); | |||
4407 | ErrInfo = "VOP2/VOP3 instruction uses more than one literal"; | |||
4408 | return false; | |||
4409 | } | |||
4410 | } | |||
4411 | } | |||
4412 | } | |||
4413 | ||||
4414 | SGPRUsed = findImplicitSGPRRead(MI); | |||
4415 | if (SGPRUsed) { | |||
4416 | // Implicit uses may safely overlap true operands | |||
4417 | if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { | |||
4418 | return !RI.regsOverlap(SGPRUsed, SGPR); | |||
4419 | })) { | |||
4420 | ++ConstantBusCount; | |||
4421 | SGPRsUsed.push_back(SGPRUsed); | |||
4422 | } | |||
4423 | } | |||
4424 | ||||
4425 | // v_writelane_b32 is an exception to the constant bus restriction: | |||
4426 | // vsrc0 can be an SGPR, a constant, or m0, and the lane select can be an SGPR, m0, or an inline constant. | |||
4427 | if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && | |||
4428 | Opcode != AMDGPU::V_WRITELANE_B32) { | |||
4429 | ErrInfo = "VOP* instruction violates constant bus restriction"; | |||
4430 | return false; | |||
4431 | } | |||
4432 | ||||
4433 | if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) { | |||
4434 | ErrInfo = "VOP3 instruction uses literal"; | |||
4435 | return false; | |||
4436 | } | |||
4437 | } | |||
4438 | ||||
4439 | // Special case for writelane - it is exempt from the constant bus check above, | |||
4440 | // but still can't use more than one SGPR register. | |||
4441 | if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { | |||
4442 | unsigned SGPRCount = 0; | |||
4443 | Register SGPRUsed; | |||
4444 | ||||
4445 | for (int OpIdx : {Src0Idx, Src1Idx}) { | |||
4446 | if (OpIdx == -1) | |||
4447 | break; | |||
4448 | ||||
4449 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4450 | ||||
4451 | if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) { | |||
4452 | if (MO.isReg() && MO.getReg() != AMDGPU::M0) { | |||
4453 | if (MO.getReg() != SGPRUsed) | |||
4454 | ++SGPRCount; | |||
4455 | SGPRUsed = MO.getReg(); | |||
4456 | } | |||
4457 | } | |||
4458 | if (SGPRCount > ST.getConstantBusLimit(Opcode)) { | |||
4459 | ErrInfo = "WRITELANE instruction violates constant bus restriction"; | |||
4460 | return false; | |||
4461 | } | |||
4462 | } | |||
4463 | } | |||
4464 | ||||
4465 | // Verify misc. restrictions on specific instructions. | |||
4466 | if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || | |||
4467 | Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { | |||
4468 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4469 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4470 | const MachineOperand &Src2 = MI.getOperand(Src2Idx); | |||
4471 | if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { | |||
4472 | if (!compareMachineOp(Src0, Src1) && | |||
4473 | !compareMachineOp(Src0, Src2)) { | |||
4474 | ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; | |||
4475 | return false; | |||
4476 | } | |||
4477 | } | |||
4478 | if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & | |||
4479 | SISrcMods::ABS) || | |||
4480 | (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & | |||
4481 | SISrcMods::ABS) || | |||
4482 | (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & | |||
4483 | SISrcMods::ABS)) { | |||
4484 | ErrInfo = "ABS not allowed in VOP3B instructions"; | |||
4485 | return false; | |||
4486 | } | |||
4487 | } | |||
4488 | ||||
4489 | if (isSOP2(MI) || isSOPC(MI)) { | |||
4490 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4491 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4492 | ||||
4493 | if (!Src0.isReg() && !Src1.isReg() && | |||
4494 | !isInlineConstant(Src0, Desc.operands()[Src0Idx]) && | |||
4495 | !isInlineConstant(Src1, Desc.operands()[Src1Idx]) && | |||
4496 | !Src0.isIdenticalTo(Src1)) { | |||
4497 | ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; | |||
4498 | return false; | |||
4499 | } | |||
4500 | } | |||
4501 | ||||
4502 | if (isSOPK(MI)) { | |||
4503 | auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); | |||
4504 | if (Desc.isBranch()) { | |||
4505 | if (!Op->isMBB()) { | |||
4506 | ErrInfo = "invalid branch target for SOPK instruction"; | |||
4507 | return false; | |||
4508 | } | |||
4509 | } else { | |||
4510 | uint64_t Imm = Op->getImm(); | |||
4511 | if (sopkIsZext(MI)) { | |||
4512 | if (!isUInt<16>(Imm)) { | |||
4513 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4514 | return false; | |||
4515 | } | |||
4516 | } else { | |||
4517 | if (!isInt<16>(Imm)) { | |||
4518 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4519 | return false; | |||
4520 | } | |||
4521 | } | |||
4522 | } | |||
4523 | } | |||
4524 | ||||
4525 | if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || | |||
4526 | Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || | |||
4527 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4528 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { | |||
4529 | const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4530 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; | |||
4531 | ||||
4532 | const unsigned StaticNumOps = | |||
4533 | Desc.getNumOperands() + Desc.implicit_uses().size(); | |||
4534 | const unsigned NumImplicitOps = IsDst ? 2 : 1; | |||
4535 | ||||
4536 | // Allow additional implicit operands. This allows a fixup done by the | |||
4537 | // post-RA scheduler where the main implicit operand is killed and implicit-defs | |||
4538 | // are added for sub-registers that remain live after this instruction. | |||
4539 | if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { | |||
4540 | ErrInfo = "missing implicit register operands"; | |||
4541 | return false; | |||
4542 | } | |||
4543 | ||||
4544 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4545 | if (IsDst) { | |||
4546 | if (!Dst->isUse()) { | |||
4547 | ErrInfo = "v_movreld_b32 vdst should be a use operand"; | |||
4548 | return false; | |||
4549 | } | |||
4550 | ||||
4551 | unsigned UseOpIdx; | |||
4552 | if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || | |||
4553 | UseOpIdx != StaticNumOps + 1) { | |||
4554 | ErrInfo = "movrel implicit operands should be tied"; | |||
4555 | return false; | |||
4556 | } | |||
4557 | } | |||
4558 | ||||
4559 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4560 | const MachineOperand &ImpUse | |||
4561 | = MI.getOperand(StaticNumOps + NumImplicitOps - 1); | |||
4562 | if (!ImpUse.isReg() || !ImpUse.isUse() || | |||
4563 | !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { | |||
4564 | ErrInfo = "src0 should be subreg of implicit vector use"; | |||
4565 | return false; | |||
4566 | } | |||
4567 | } | |||
4568 | ||||
4569 | // Make sure we aren't losing exec uses in the td files. This mostly requires | |||
4570 | // being careful when using 'let Uses' to add other use registers. | |||
4571 | if (shouldReadExec(MI)) { | |||
4572 | if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { | |||
4573 | ErrInfo = "VALU instruction does not implicitly read exec mask"; | |||
4574 | return false; | |||
4575 | } | |||
4576 | } | |||
4577 | ||||
4578 | if (isSMRD(MI)) { | |||
4579 | if (MI.mayStore() && | |||
4580 | ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { | |||
4581 | // The register offset form of scalar stores may only use m0 as the | |||
4582 | // soffset register. | |||
4583 | const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset); | |||
4584 | if (Soff && Soff->getReg() != AMDGPU::M0) { | |||
4585 | ErrInfo = "scalar stores must use m0 as offset register"; | |||
4586 | return false; | |||
4587 | } | |||
4588 | } | |||
4589 | } | |||
4590 | ||||
4591 | if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { | |||
4592 | const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
4593 | if (Offset->getImm() != 0) { | |||
4594 | ErrInfo = "subtarget does not support offsets in flat instructions"; | |||
4595 | return false; | |||
4596 | } | |||
4597 | } | |||
4598 | ||||
4599 | if (isMIMG(MI)) { | |||
4600 | const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); | |||
4601 | if (DimOp) { | |||
4602 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, | |||
4603 | AMDGPU::OpName::vaddr0); | |||
4604 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); | |||
4605 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); | |||
4606 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | |||
4607 | AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); | |||
4608 | const AMDGPU::MIMGDimInfo *Dim = | |||
4609 | AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); | |||
4610 | ||||
4611 | if (!Dim) { | |||
4612 | ErrInfo = "dim is out of range"; | |||
4613 | return false; | |||
4614 | } | |||
4615 | ||||
4616 | bool IsA16 = false; | |||
4617 | if (ST.hasR128A16()) { | |||
4618 | const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); | |||
4619 | IsA16 = R128A16->getImm() != 0; | |||
4620 | } else if (ST.hasA16()) { | |||
4621 | const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); | |||
4622 | IsA16 = A16->getImm() != 0; | |||
4623 | } | |||
4624 | ||||
4625 | bool IsNSA = SRsrcIdx - VAddr0Idx > 1; | |||
4626 | ||||
4627 | unsigned AddrWords = | |||
4628 | AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); | |||
4629 | ||||
4630 | unsigned VAddrWords; | |||
4631 | if (IsNSA) { | |||
4632 | VAddrWords = SRsrcIdx - VAddr0Idx; | |||
4633 | if (ST.hasPartialNSAEncoding() && AddrWords > ST.getNSAMaxSize()) { | |||
4634 | unsigned LastVAddrIdx = SRsrcIdx - 1; | |||
4635 | VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1; | |||
4636 | } | |||
4637 | } else { | |||
4638 | VAddrWords = getOpSize(MI, VAddr0Idx) / 4; | |||
4639 | if (AddrWords > 12) | |||
4640 | AddrWords = 16; | |||
4641 | } | |||
4642 | ||||
4643 | if (VAddrWords != AddrWords) { | |||
4644 | LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords | |||
4645 |                   << " but got " << VAddrWords << "\n"); | |||
4646 | ErrInfo = "bad vaddr size"; | |||
4647 | return false; | |||
4648 | } | |||
4649 | } | |||
4650 | } | |||
4651 | ||||
4652 | const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); | |||
4653 | if (DppCt) { | |||
4654 | using namespace AMDGPU::DPP; | |||
4655 | ||||
4656 | unsigned DC = DppCt->getImm(); | |||
4657 | if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || | |||
4658 | DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || | |||
4659 | (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || | |||
4660 | (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || | |||
4661 | (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || | |||
4662 | (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || | |||
4663 | (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { | |||
4664 | ErrInfo = "Invalid dpp_ctrl value"; | |||
4665 | return false; | |||
4666 | } | |||
4667 | if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && | |||
4668 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4669 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4670 | "wavefront shifts are not supported on GFX10+"; | |||
4671 | return false; | |||
4672 | } | |||
4673 | if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && | |||
4674 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4675 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4676 | "broadcasts are not supported on GFX10+"; | |||
4677 | return false; | |||
4678 | } | |||
4679 | if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && | |||
4680 | ST.getGeneration() < AMDGPUSubtarget::GFX10) { | |||
4681 | if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && | |||
4682 | DC <= DppCtrl::ROW_NEWBCAST_LAST && | |||
4683 | !ST.hasGFX90AInsts()) { | |||
4684 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4685 | "row_newbroadcast/row_share is not supported before " | |||
4686 | "GFX90A/GFX10"; | |||
4687 | return false; | |||
4688 | } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { | |||
4689 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4690 | "row_share and row_xmask are not supported before GFX10"; | |||
4691 | return false; | |||
4692 | } | |||
4693 | } | |||
4694 | ||||
4695 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
4696 | ||||
4697 | if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && | |||
4698 | ((DstIdx >= 0 && | |||
4699 | (Desc.operands()[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || | |||
4700 | Desc.operands()[DstIdx].RegClass == | |||
4701 | AMDGPU::VReg_64_Align2RegClassID)) || | |||
4702 | ((Src0Idx >= 0 && | |||
4703 | (Desc.operands()[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || | |||
4704 | Desc.operands()[Src0Idx].RegClass == | |||
4705 | AMDGPU::VReg_64_Align2RegClassID)))) && | |||
4706 | !AMDGPU::isLegal64BitDPPControl(DC)) { | |||
4707 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4708 | "64 bit dpp only support row_newbcast"; | |||
4709 | return false; | |||
4710 | } | |||
4711 | } | |||
4712 | ||||
4713 | if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { | |||
4714 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4715 | uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0 | |||
4716 | : AMDGPU::OpName::vdata; | |||
4717 | const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); | |||
4718 | const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); | |||
4719 | if (Data && !Data->isReg()) | |||
4720 | Data = nullptr; | |||
4721 | ||||
4722 | if (ST.hasGFX90AInsts()) { | |||
4723 | if (Dst && Data && | |||
4724 | (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { | |||
4725 | ErrInfo = "Invalid register class: " | |||
4726 | "vdata and vdst should be both VGPR or AGPR"; | |||
4727 | return false; | |||
4728 | } | |||
4729 | if (Data && Data2 && | |||
4730 | (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { | |||
4731 | ErrInfo = "Invalid register class: " | |||
4732 | "both data operands should be VGPR or AGPR"; | |||
4733 | return false; | |||
4734 | } | |||
4735 | } else { | |||
4736 | if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || | |||
4737 | (Data && RI.isAGPR(MRI, Data->getReg())) || | |||
4738 | (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { | |||
4739 | ErrInfo = "Invalid register class: " | |||
4740 | "agpr loads and stores not supported on this GPU"; | |||
4741 | return false; | |||
4742 | } | |||
4743 | } | |||
4744 | } | |||
4745 | ||||
4746 | if (ST.needsAlignedVGPRs()) { | |||
4747 | const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool { | |||
4748 | const MachineOperand *Op = getNamedOperand(MI, OpName); | |||
4749 | if (!Op) | |||
4750 | return true; | |||
4751 | Register Reg = Op->getReg(); | |||
4752 | if (Reg.isPhysical()) | |||
4753 | return !(RI.getHWRegIndex(Reg) & 1); | |||
4754 | const TargetRegisterClass &RC = *MRI.getRegClass(Reg); | |||
4755 | return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && | |||
4756 | !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); | |||
4757 | }; | |||
4758 | ||||
4759 | if (MI.getOpcode() == AMDGPU::DS_GWS_INIT || | |||
4760 | MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || | |||
4761 | MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) { | |||
4762 | ||||
4763 | if (!isAlignedReg(AMDGPU::OpName::data0)) { | |||
4764 | ErrInfo = "Subtarget requires even aligned vector registers " | |||
4765 | "for DS_GWS instructions"; | |||
4766 | return false; | |||
4767 | } | |||
4768 | } | |||
4769 | ||||
4770 | if (isMIMG(MI)) { | |||
4771 | if (!isAlignedReg(AMDGPU::OpName::vaddr)) { | |||
4772 | ErrInfo = "Subtarget requires even aligned vector registers " | |||
4773 | "for vaddr operand of image instructions"; | |||
4774 | return false; | |||
4775 | } | |||
4776 | } | |||
4777 | } | |||
4778 | ||||
4779 | if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && | |||
4780 | !ST.hasGFX90AInsts()) { | |||
4781 | const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0); | |||
4782 | if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) { | |||
4783 | ErrInfo = "Invalid register class: " | |||
4784 | "v_accvgpr_write with an SGPR is not supported on this GPU"; | |||
4785 | return false; | |||
4786 | } | |||
4787 | } | |||
4788 | ||||
4789 | if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) { | |||
4790 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
4791 | if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) { | |||
4792 | ErrInfo = "pseudo expects only physical SGPRs"; | |||
4793 | return false; | |||
4794 | } | |||
4795 | } | |||
4796 | ||||
4797 | return true; | |||
4798 | } | |||
4799 | ||||
4800 | unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { | |||
4801 | switch (MI.getOpcode()) { | |||
4802 | default: return AMDGPU::INSTRUCTION_LIST_END; | |||
4803 | case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; | |||
4804 | case AMDGPU::COPY: return AMDGPU::COPY; | |||
4805 | case AMDGPU::PHI: return AMDGPU::PHI; | |||
4806 | case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; | |||
4807 | case AMDGPU::WQM: return AMDGPU::WQM; | |||
4808 | case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; | |||
4809 | case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; | |||
4810 | case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; | |||
4811 | case AMDGPU::S_MOV_B32: { | |||
4812 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4813 | return MI.getOperand(1).isReg() || | |||
4814 | RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? | |||
4815 | AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; | |||
4816 | } | |||
4817 | case AMDGPU::S_ADD_I32: | |||
4818 | return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; | |||
4819 | case AMDGPU::S_ADDC_U32: | |||
4820 | return AMDGPU::V_ADDC_U32_e32; | |||
4821 | case AMDGPU::S_SUB_I32: | |||
4822 | return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; | |||
4823 | // FIXME: These are not consistently handled, and selected when the carry is | |||
4824 | // used. | |||
4825 | case AMDGPU::S_ADD_U32: | |||
4826 | return AMDGPU::V_ADD_CO_U32_e32; | |||
4827 | case AMDGPU::S_SUB_U32: | |||
4828 | return AMDGPU::V_SUB_CO_U32_e32; | |||
4829 | case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; | |||
4830 | case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; | |||
4831 | case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; | |||
4832 | case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; | |||
4833 | case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; | |||
4834 | case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; | |||
4835 | case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; | |||
4836 | case AMDGPU::S_XNOR_B32: | |||
4837 | return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; | |||
4838 | case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; | |||
4839 | case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; | |||
4840 | case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; | |||
4841 | case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; | |||
4842 | case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; | |||
4843 | case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; | |||
4844 | case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; | |||
4845 | case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; | |||
4846 | case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; | |||
4847 | case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; | |||
4848 | case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; | |||
4849 | case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; | |||
4850 | case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; | |||
4851 | case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; | |||
4852 | case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; | |||
4853 | case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; | |||
4854 | case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; | |||
4855 | case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; | |||
4856 | case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64; | |||
4857 | case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64; | |||
4858 | case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64; | |||
4859 | case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64; | |||
4860 | case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64; | |||
4861 | case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64; | |||
4862 | case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64; | |||
4863 | case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64; | |||
4864 | case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64; | |||
4865 | case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64; | |||
4866 | case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64; | |||
4867 | case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64; | |||
4868 | case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64; | |||
4869 | case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64; | |||
4870 | case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; | |||
4871 | case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; | |||
4872 | case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; | |||
4873 | case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; | |||
4874 | case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; | |||
4875 | case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; | |||
4876 | } | |||
4877 | llvm_unreachable( | |||
4878 | "Unexpected scalar opcode without corresponding vector one!"); | |||
4879 | } | |||
4880 | ||||
4881 | static const TargetRegisterClass * | |||
4882 | adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, | |||
4883 | const MachineRegisterInfo &MRI, | |||
4884 | const MCInstrDesc &TID, unsigned RCID, | |||
4885 | bool IsAllocatable) { | |||
4886 | if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && | |||
4887 | (((TID.mayLoad() || TID.mayStore()) && | |||
4888 | !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || | |||
4889 | (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { | |||
4890 | switch (RCID) { | |||
4891 | case AMDGPU::AV_32RegClassID: | |||
4892 | RCID = AMDGPU::VGPR_32RegClassID; | |||
4893 | break; | |||
4894 | case AMDGPU::AV_64RegClassID: | |||
4895 | RCID = AMDGPU::VReg_64RegClassID; | |||
4896 | break; | |||
4897 | case AMDGPU::AV_96RegClassID: | |||
4898 | RCID = AMDGPU::VReg_96RegClassID; | |||
4899 | break; | |||
4900 | case AMDGPU::AV_128RegClassID: | |||
4901 | RCID = AMDGPU::VReg_128RegClassID; | |||
4902 | break; | |||
4903 | case AMDGPU::AV_160RegClassID: | |||
4904 | RCID = AMDGPU::VReg_160RegClassID; | |||
4905 | break; | |||
4906 | case AMDGPU::AV_512RegClassID: | |||
4907 | RCID = AMDGPU::VReg_512RegClassID; | |||
4908 | break; | |||
4909 | default: | |||
4910 | break; | |||
4911 | } | |||
4912 | } | |||
4913 | ||||
4914 | return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); | |||
4915 | } | |||
4916 | ||||
4917 | const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, | |||
4918 | unsigned OpNum, const TargetRegisterInfo *TRI, | |||
4919 | const MachineFunction &MF) | |||
4920 | const { | |||
4921 | if (OpNum >= TID.getNumOperands()) | |||
4922 | return nullptr; | |||
4923 | auto RegClass = TID.operands()[OpNum].RegClass; | |||
4924 | bool IsAllocatable = false; | |||
4925 | if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { | |||
4926 | // vdst and vdata should be both VGPR or AGPR, same for the DS instructions | |||
4928 | // with two data operands. Request a register class constrained to VGPR only | |||
4929 | // if both operands are present, as Machine Copy Propagation cannot check this | |||
4930 | // constraint, and possibly other passes cannot either. | |||
4930 | // | |||
4931 | // The check is limited to FLAT and DS because atomics in non-flat encoding | |||
4932 | // have their vdst and vdata tied to be the same register. | |||
4933 | const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4934 | AMDGPU::OpName::vdst); | |||
4935 | const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4936 | (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 | |||
4937 | : AMDGPU::OpName::vdata); | |||
4938 | if (DataIdx != -1) { | |||
4939 | IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand( | |||
4940 | TID.Opcode, AMDGPU::OpName::data1); | |||
4941 | } | |||
4942 | } | |||
4943 | return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass, | |||
4944 | IsAllocatable); | |||
4945 | } | |||
4946 | ||||
4947 | const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, | |||
4948 | unsigned OpNo) const { | |||
4949 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4950 | const MCInstrDesc &Desc = get(MI.getOpcode()); | |||
4951 | if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || | |||
4952 | Desc.operands()[OpNo].RegClass == -1) { | |||
4953 | Register Reg = MI.getOperand(OpNo).getReg(); | |||
4954 | ||||
4955 | if (Reg.isVirtual()) | |||
4956 | return MRI.getRegClass(Reg); | |||
4957 | return RI.getPhysRegBaseClass(Reg); | |||
4958 | } | |||
4959 | ||||
4960 | unsigned RCID = Desc.operands()[OpNo].RegClass; | |||
4961 | return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true); | |||
4962 | } | |||
4963 | ||||
4964 | void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { | |||
4965 | MachineBasicBlock::iterator I = MI; | |||
4966 | MachineBasicBlock *MBB = MI.getParent(); | |||
4967 | MachineOperand &MO = MI.getOperand(OpIdx); | |||
4968 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
4969 | unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; | |||
4970 | const TargetRegisterClass *RC = RI.getRegClass(RCID); | |||
4971 | unsigned Size = RI.getRegSizeInBits(*RC); | |||
4972 | unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; | |||
4973 | if (MO.isReg()) | |||
4974 | Opcode = AMDGPU::COPY; | |||
4975 | else if (RI.isSGPRClass(RC)) | |||
4976 | Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; | |||
4977 | ||||
4978 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); | |||
4979 | const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); | |||
4980 | if (RI.getCommonSubClass(VRC64, VRC)) | |||
4981 | VRC = VRC64; | |||
4982 | else | |||
4983 | VRC = &AMDGPU::VGPR_32RegClass; | |||
4984 | ||||
4985 | Register Reg = MRI.createVirtualRegister(VRC); | |||
4986 | DebugLoc DL = MBB->findDebugLoc(I); | |||
4987 | BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); | |||
4988 | MO.ChangeToRegister(Reg, false); | |||
4989 | } | |||
4990 | ||||
4991 | unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, | |||
4992 | MachineRegisterInfo &MRI, | |||
4993 | MachineOperand &SuperReg, | |||
4994 | const TargetRegisterClass *SuperRC, | |||
4995 | unsigned SubIdx, | |||
4996 | const TargetRegisterClass *SubRC) | |||
4997 | const { | |||
4998 | MachineBasicBlock *MBB = MI->getParent(); | |||
4999 | DebugLoc DL = MI->getDebugLoc(); | |||
5000 | Register SubReg = MRI.createVirtualRegister(SubRC); | |||
5001 | ||||
5002 | if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { | |||
5003 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
5004 | .addReg(SuperReg.getReg(), 0, SubIdx); | |||
5005 | return SubReg; | |||
5006 | } | |||
5007 | ||||
5008 | // Just in case the super register is itself a sub-register, copy it to a new | |||
5009 | // value so we don't need to worry about merging its subreg index with the | |||
5010 | // SubIdx passed to this function. The register coalescer should be able to | |||
5011 | // eliminate this extra copy. | |||
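// For example (register names illustrative): extracting SubIdx == sub0 from an
// operand such as %0.sub2_sub3 first copies %0.sub2_sub3 into a fresh SuperRC
// register and then copies sub0 of that copy, rather than composing the two
// subregister indices here.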
5012 | Register NewSuperReg = MRI.createVirtualRegister(SuperRC); | |||
5013 | ||||
5014 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) | |||
5015 | .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); | |||
5016 | ||||
5017 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
5018 | .addReg(NewSuperReg, 0, SubIdx); | |||
5019 | ||||
5020 | return SubReg; | |||
5021 | } | |||
5022 | ||||
5023 | MachineOperand SIInstrInfo::buildExtractSubRegOrImm( | |||
5024 | MachineBasicBlock::iterator MII, | |||
5025 | MachineRegisterInfo &MRI, | |||
5026 | MachineOperand &Op, | |||
5027 | const TargetRegisterClass *SuperRC, | |||
5028 | unsigned SubIdx, | |||
5029 | const TargetRegisterClass *SubRC) const { | |||
5030 | if (Op.isImm()) { | |||
5031 | if (SubIdx == AMDGPU::sub0) | |||
5032 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); | |||
5033 | if (SubIdx == AMDGPU::sub1) | |||
5034 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); | |||
5035 | ||||
5036 | llvm_unreachable("Unhandled register index for immediate"); | |||
5037 | } | |||
5038 | ||||
5039 | unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, | |||
5040 | SubIdx, SubRC); | |||
5041 | return MachineOperand::CreateReg(SubReg, false); | |||
5042 | } | |||
5043 | ||||
5044 | // Change the order of operands from (0, 1, 2) to (0, 2, 1) | |||
5045 | void SIInstrInfo::swapOperands(MachineInstr &Inst) const { | |||
5046 | assert(Inst.getNumExplicitOperands() == 3); | |||
5047 | MachineOperand Op1 = Inst.getOperand(1); | |||
5048 | Inst.removeOperand(1); | |||
5049 | Inst.addOperand(Op1); | |||
5050 | } | |||
5051 | ||||
5052 | bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, | |||
5053 | const MCOperandInfo &OpInfo, | |||
5054 | const MachineOperand &MO) const { | |||
5055 | if (!MO.isReg()) | |||
5056 | return false; | |||
5057 | ||||
5058 | Register Reg = MO.getReg(); | |||
5059 | ||||
5060 | const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); | |||
5061 | if (Reg.isPhysical()) | |||
5062 | return DRC->contains(Reg); | |||
5063 | ||||
5064 | const TargetRegisterClass *RC = MRI.getRegClass(Reg); | |||
5065 | ||||
5066 | if (MO.getSubReg()) { | |||
5067 | const MachineFunction *MF = MO.getParent()->getParent()->getParent(); | |||
5068 | const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); | |||
5069 | if (!SuperRC) | |||
5070 | return false; | |||
5071 | ||||
5072 | DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); | |||
5073 | if (!DRC) | |||
5074 | return false; | |||
5075 | } | |||
5076 | return RC->hasSuperClassEq(DRC); | |||
5077 | } | |||
5078 | ||||
5079 | bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, | |||
5080 | const MCOperandInfo &OpInfo, | |||
5081 | const MachineOperand &MO) const { | |||
5082 | if (MO.isReg()) | |||
5083 | return isLegalRegOperand(MRI, OpInfo, MO); | |||
5084 | ||||
5085 | // Handle non-register types that are treated like immediates. | |||
5086 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); | |||
5087 | return true; | |||
5088 | } | |||
5089 | ||||
5090 | bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, | |||
5091 | const MachineOperand *MO) const { | |||
5092 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
5093 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5094 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
5095 | const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; | |||
5096 | const TargetRegisterClass *DefinedRC = | |||
5097 | OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; | |||
5098 | if (!MO) | |||
5099 | MO = &MI.getOperand(OpIdx); | |||
5100 | ||||
5101 | int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); | |||
5102 | int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; | |||
5103 | if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { | |||
5104 | if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--) | |||
5105 | return false; | |||
5106 | ||||
5107 | SmallDenseSet<RegSubRegPair> SGPRsUsed; | |||
5108 | if (MO->isReg()) | |||
5109 | SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); | |||
5110 | ||||
5111 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { | |||
5112 | if (i == OpIdx) | |||
5113 | continue; | |||
5114 | const MachineOperand &Op = MI.getOperand(i); | |||
5115 | if (Op.isReg()) { | |||
5116 | RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); | |||
5117 | if (!SGPRsUsed.count(SGPR) && | |||
5118 | // FIXME: This can access off the end of the operands() array. | |||
5119 | usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) { | |||
5120 | if (--ConstantBusLimit <= 0) | |||
5121 | return false; | |||
5122 | SGPRsUsed.insert(SGPR); | |||
5123 | } | |||
5124 | } else if (InstDesc.operands()[i].OperandType == AMDGPU::OPERAND_KIMM32 || | |||
5125 | (AMDGPU::isSISrcOperand(InstDesc, i) && | |||
5126 | !isInlineConstant(Op, InstDesc.operands()[i]))) { | |||
5127 | if (!LiteralLimit--) | |||
5128 | return false; | |||
5129 | if (--ConstantBusLimit <= 0) | |||
5130 | return false; | |||
5131 | } | |||
5132 | } | |||
5133 | } | |||
5134 | ||||
5135 | if (MO->isReg()) { | |||
5136 | if (!DefinedRC) | |||
5137 | return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN; | |||
5138 | if (!isLegalRegOperand(MRI, OpInfo, *MO)) | |||
5139 | return false; | |||
5140 | bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); | |||
5141 | if (IsAGPR && !ST.hasMAIInsts()) | |||
5142 | return false; | |||
5143 | unsigned Opc = MI.getOpcode(); | |||
5144 | if (IsAGPR && | |||
5145 | (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && | |||
5146 | (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) | |||
5147 | return false; | |||
5148 | // Atomics should have both vdst and vdata either vgpr or agpr. | |||
5149 | const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
5150 | const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
5151 | isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); | |||
5152 | if ((int)OpIdx == VDstIdx && DataIdx != -1 && | |||
5153 | MI.getOperand(DataIdx).isReg() && | |||
5154 | RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) | |||
5155 | return false; | |||
5156 | if ((int)OpIdx == DataIdx) { | |||
5157 | if (VDstIdx != -1 && | |||
5158 | RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) | |||
5159 | return false; | |||
5160 | // DS instructions with 2 src operands also must have tied RC. | |||
5161 | const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, | |||
5162 | AMDGPU::OpName::data1); | |||
5163 | if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && | |||
5164 | RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) | |||
5165 | return false; | |||
5166 | } | |||
5167 | if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && | |||
5168 | (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && | |||
5169 | RI.isSGPRReg(MRI, MO->getReg())) | |||
5170 | return false; | |||
5171 | return true; | |||
5172 | } | |||
5173 | ||||
5174 | // Handle non-register types that are treated like immediates. | |||
5175 | assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); | |||
5176 | ||||
5177 | if (!DefinedRC) { | |||
5178 | // This operand expects an immediate. | |||
5179 | return true; | |||
5180 | } | |||
5181 | ||||
5182 | return isImmOperandLegal(MI, OpIdx, *MO); | |||
5183 | } | |||
5184 | ||||
5185 | void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, | |||
5186 | MachineInstr &MI) const { | |||
5187 | unsigned Opc = MI.getOpcode(); | |||
5188 | const MCInstrDesc &InstrDesc = get(Opc); | |||
5189 | ||||
5190 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
5191 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
5192 | ||||
5193 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
5194 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
5195 | ||||
5196 | // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 | |||
5197 | // we need to only have one constant bus use before GFX10. | |||
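// For example, V_ADDC_U32_e32 implicitly reads VCC, which already occupies the
// single pre-GFX10 constant bus slot, so an SGPR in src0 would be a second
// constant bus read and is moved into a VGPR below.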
5198 | bool HasImplicitSGPR = findImplicitSGPRRead(MI); | |||
5199 | if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() && | |||
5200 | RI.isSGPRReg(MRI, Src0.getReg())) | |||
5201 | legalizeOpWithMove(MI, Src0Idx); | |||
5202 | ||||
5203 | // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for | |||
5204 | // both the value to write (src0) and lane select (src1). Fix up non-SGPR | |||
5205 | // src0/src1 with V_READFIRSTLANE. | |||
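// A minimal sketch of the rewrite (virtual register names hypothetical):
//   V_WRITELANE_B32 %vdst, %vgpr_val, %lane
// becomes
//   %sval = V_READFIRSTLANE_B32 %vgpr_val
//   V_WRITELANE_B32 %vdst, %sval, %lane
// and a VGPR lane-select operand is handled the same way.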
5206 | if (Opc == AMDGPU::V_WRITELANE_B32) { | |||
5207 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5208 | if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { | |||
5209 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5210 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5211 | .add(Src0); | |||
5212 | Src0.ChangeToRegister(Reg, false); | |||
5213 | } | |||
5214 | if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { | |||
5215 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5216 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5217 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5218 | .add(Src1); | |||
5219 | Src1.ChangeToRegister(Reg, false); | |||
5220 | } | |||
5221 | return; | |||
5222 | } | |||
5223 | ||||
5224 | // No VOP2 instructions support AGPRs. | |||
5225 | if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) | |||
5226 | legalizeOpWithMove(MI, Src0Idx); | |||
5227 | ||||
5228 | if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) | |||
5229 | legalizeOpWithMove(MI, Src1Idx); | |||
5230 | ||||
5231 | // VOP2 src0 instructions support all operand types, so we don't need to check | |||
5232 | // their legality. If src1 is already legal, we don't need to do anything. | |||
5233 | if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1)) | |||
5234 | return; | |||
5235 | ||||
5236 | // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for | |||
5237 | // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane | |||
5238 | // select is uniform. | |||
5239 | if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && | |||
5240 | RI.isVGPR(MRI, Src1.getReg())) { | |||
5241 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5242 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5243 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5244 | .add(Src1); | |||
5245 | Src1.ChangeToRegister(Reg, false); | |||
5246 | return; | |||
5247 | } | |||
5248 | ||||
5249 | // We do not use commuteInstruction here because it is too aggressive and will | |||
5250 | // commute if it is possible. We only want to commute here if it improves | |||
5251 | // legality. This can be called a fairly large number of times so don't waste | |||
5252 | // compile time pointlessly swapping and checking legality again. | |||
5253 | if (HasImplicitSGPR || !MI.isCommutable()) { | |||
5254 | legalizeOpWithMove(MI, Src1Idx); | |||
5255 | return; | |||
5256 | } | |||
5257 | ||||
5258 | // If src0 can be used as src1, commuting will make the operands legal. | |||
5259 | // Otherwise we have to give up and insert a move. | |||
5260 | // | |||
5261 | // TODO: Other immediate-like operand kinds could be commuted if there was a | |||
5262 | // MachineOperand::ChangeTo* for them. | |||
5263 | if ((!Src1.isImm() && !Src1.isReg()) || | |||
5264 | !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) { | |||
5265 | legalizeOpWithMove(MI, Src1Idx); | |||
5266 | return; | |||
5267 | } | |||
5268 | ||||
5269 | int CommutedOpc = commuteOpcode(MI); | |||
5270 | if (CommutedOpc == -1) { | |||
5271 | legalizeOpWithMove(MI, Src1Idx); | |||
5272 | return; | |||
5273 | } | |||
5274 | ||||
5275 | MI.setDesc(get(CommutedOpc)); | |||
5276 | ||||
5277 | Register Src0Reg = Src0.getReg(); | |||
5278 | unsigned Src0SubReg = Src0.getSubReg(); | |||
5279 | bool Src0Kill = Src0.isKill(); | |||
5280 | ||||
5281 | if (Src1.isImm()) | |||
5282 | Src0.ChangeToImmediate(Src1.getImm()); | |||
5283 | else if (Src1.isReg()) { | |||
5284 | Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); | |||
5285 | Src0.setSubReg(Src1.getSubReg()); | |||
5286 | } else | |||
5287 | llvm_unreachable("Should only have register or immediate operands"); | |||
5288 | ||||
5289 | Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); | |||
5290 | Src1.setSubReg(Src0SubReg); | |||
5291 | fixImplicitOperands(MI); | |||
5292 | } | |||
5293 | ||||
5294 | // Legalize VOP3 operands. All operand types are supported for any operand, | |||
5295 | // but only one literal constant may be used, and only starting from GFX10. | |||
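// For example, prior to GFX10 at most one SGPR and no literal may appear
// across src0..src2 of a VOP3 instruction; on GFX10+ getConstantBusLimit()
// reports a higher limit and hasVOP3Literal() additionally permits one
// literal. The bookkeeping below enforces exactly these limits.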
5296 | void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, | |||
5297 | MachineInstr &MI) const { | |||
5298 | unsigned Opc = MI.getOpcode(); | |||
5299 | ||||
5300 | int VOP3Idx[3] = { | |||
5301 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), | |||
5302 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), | |||
5303 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) | |||
5304 | }; | |||
5305 | ||||
5306 | if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || | |||
5307 | Opc == AMDGPU::V_PERMLANEX16_B32_e64) { | |||
5308 | // src1 and src2 must be scalar | |||
5309 | MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); | |||
5310 | MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); | |||
5311 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5312 | if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { | |||
5313 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5314 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5315 | .add(Src1); | |||
5316 | Src1.ChangeToRegister(Reg, false); | |||
5317 | } | |||
5318 | if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { | |||
5319 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5320 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5321 | .add(Src2); | |||
5322 | Src2.ChangeToRegister(Reg, false); | |||
5323 | } | |||
5324 | } | |||
5325 | ||||
5326 | // Find the one SGPR operand we are allowed to use. | |||
5327 | int ConstantBusLimit = ST.getConstantBusLimit(Opc); | |||
5328 | int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
5329 | SmallDenseSet<unsigned> SGPRsUsed; | |||
5330 | Register SGPRReg = findUsedSGPR(MI, VOP3Idx); | |||
5331 | if (SGPRReg) { | |||
5332 | SGPRsUsed.insert(SGPRReg); | |||
5333 | --ConstantBusLimit; | |||
5334 | } | |||
5335 | ||||
5336 | for (int Idx : VOP3Idx) { | |||
5337 | if (Idx == -1) | |||
5338 | break; | |||
5339 | MachineOperand &MO = MI.getOperand(Idx); | |||
5340 | ||||
5341 | if (!MO.isReg()) { | |||
5342 | if (isInlineConstant(MO, get(Opc).operands()[Idx])) | |||
5343 | continue; | |||
5344 | ||||
5345 | if (LiteralLimit > 0 && ConstantBusLimit > 0) { | |||
5346 | --LiteralLimit; | |||
5347 | --ConstantBusLimit; | |||
5348 | continue; | |||
5349 | } | |||
5350 | ||||
5351 | --LiteralLimit; | |||
5352 | --ConstantBusLimit; | |||
5353 | legalizeOpWithMove(MI, Idx); | |||
5354 | continue; | |||
5355 | } | |||
5356 | ||||
5357 | if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) && | |||
5358 | !isOperandLegal(MI, Idx, &MO)) { | |||
5359 | legalizeOpWithMove(MI, Idx); | |||
5360 | continue; | |||
5361 | } | |||
5362 | ||||
5363 | if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg()))) | |||
5364 | continue; // VGPRs are legal | |||
5365 | ||||
5366 | // We can use one SGPR in each VOP3 instruction prior to GFX10 | |||
5367 | // and two starting from GFX10. | |||
5368 | if (SGPRsUsed.count(MO.getReg())) | |||
5369 | continue; | |||
5370 | if (ConstantBusLimit > 0) { | |||
5371 | SGPRsUsed.insert(MO.getReg()); | |||
5372 | --ConstantBusLimit; | |||
5373 | continue; | |||
5374 | } | |||
5375 | ||||
5376 | // If we make it this far, then the operand is not legal and we must | |||
5377 | // legalize it. | |||
5378 | legalizeOpWithMove(MI, Idx); | |||
5379 | } | |||
5380 | } | |||
5381 | ||||
5382 | Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, | |||
5383 | MachineRegisterInfo &MRI) const { | |||
5384 | const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); | |||
5385 | const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); | |||
5386 | Register DstReg = MRI.createVirtualRegister(SRC); | |||
5387 | unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; | |||
5388 | ||||
5389 | if (RI.hasAGPRs(VRC)) { | |||
5390 | VRC = RI.getEquivalentVGPRClass(VRC); | |||
5391 | Register NewSrcReg = MRI.createVirtualRegister(VRC); | |||
5392 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5393 | get(TargetOpcode::COPY), NewSrcReg) | |||
5394 | .addReg(SrcReg); | |||
5395 | SrcReg = NewSrcReg; | |||
5396 | } | |||
5397 | ||||
5398 | if (SubRegs == 1) { | |||
5399 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5400 | get(AMDGPU::V_READFIRSTLANE_B32), DstReg) | |||
5401 | .addReg(SrcReg); | |||
5402 | return DstReg; | |||
5403 | } | |||
5404 | ||||
5405 | SmallVector<Register, 8> SRegs; | |||
5406 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
5407 | Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5408 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5409 | get(AMDGPU::V_READFIRSTLANE_B32), SGPR) | |||
5410 | .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); | |||
5411 | SRegs.push_back(SGPR); | |||
5412 | } | |||
5413 | ||||
5414 | MachineInstrBuilder MIB = | |||
5415 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5416 | get(AMDGPU::REG_SEQUENCE), DstReg); | |||
5417 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
5418 | MIB.addReg(SRegs[i]); | |||
5419 | MIB.addImm(RI.getSubRegFromChannel(i)); | |||
5420 | } | |||
5421 | return DstReg; | |||
5422 | } | |||
5423 | ||||
5424 | void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, | |||
5425 | MachineInstr &MI) const { | |||
5426 | ||||
5427 | // If the pointer is stored in VGPRs, then we need to move it to | |||
5428 | // SGPRs using v_readfirstlane. This is safe because we only select | |||
5429 | // loads with uniform pointers to SMRD instructions, so we know the | |||
5430 | // pointer value is uniform. | |||
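// The rewrite below replaces a VGPR sbase/soffset with the result of
// readlaneVGPRToSGPR(), which emits one V_READFIRSTLANE_B32 per 32-bit piece
// (plus a REG_SEQUENCE for multi-dword pointers).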
5431 | MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); | |||
5432 | if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { | |||
5433 | Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); | |||
5434 | SBase->setReg(SGPR); | |||
5435 | } | |||
5436 | MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); | |||
5437 | if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { | |||
5438 | Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); | |||
5439 | SOff->setReg(SGPR); | |||
5440 | } | |||
5441 | } | |||
5442 | ||||
5443 | bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { | |||
5444 | unsigned Opc = Inst.getOpcode(); | |||
5445 | int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); | |||
5446 | if (OldSAddrIdx < 0) | |||
5447 | return false; | |||
5448 | ||||
5449 | assert(isSegmentSpecificFLAT(Inst)); | |||
5450 | ||||
5451 | int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); | |||
5452 | if (NewOpc < 0) | |||
5453 | NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc); | |||
5454 | if (NewOpc < 0) | |||
5455 | return false; | |||
5456 | ||||
5457 | MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo(); | |||
5458 | MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx); | |||
5459 | if (RI.isSGPRReg(MRI, SAddr.getReg())) | |||
5460 | return false; | |||
5461 | ||||
5462 | int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr); | |||
5463 | if (NewVAddrIdx < 0) | |||
5464 | return false; | |||
5465 | ||||
5466 | int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); | |||
5467 | ||||
5468 | // Check vaddr, it shall be zero or absent. | |||
5469 | MachineInstr *VAddrDef = nullptr; | |||
5470 | if (OldVAddrIdx >= 0) { | |||
5471 | MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); | |||
5472 | VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); | |||
5473 | if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || | |||
5474 | !VAddrDef->getOperand(1).isImm() || | |||
5475 | VAddrDef->getOperand(1).getImm() != 0) | |||
5476 | return false; | |||
5477 | } | |||
5478 | ||||
5479 | const MCInstrDesc &NewDesc = get(NewOpc); | |||
5480 | Inst.setDesc(NewDesc); | |||
5481 | ||||
5482 | // Callers expect iterator to be valid after this call, so modify the | |||
5483 | // instruction in place. | |||
5484 | if (OldVAddrIdx == NewVAddrIdx) { | |||
5485 | MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); | |||
5486 | // Clear use list from the old vaddr holding a zero register. | |||
5487 | MRI.removeRegOperandFromUseList(&NewVAddr); | |||
5488 | MRI.moveOperands(&NewVAddr, &SAddr, 1); | |||
5489 | Inst.removeOperand(OldSAddrIdx); | |||
5490 | // Update the use list with the pointer we have just moved from the saddr to | |||
5491 | // the vaddr position. Otherwise the new vaddr will be missing from the use list. | |||
5492 | MRI.removeRegOperandFromUseList(&NewVAddr); | |||
5493 | MRI.addRegOperandToUseList(&NewVAddr); | |||
5494 | } else { | |||
5495 | assert(OldSAddrIdx == NewVAddrIdx); | |||
5496 | ||||
5497 | if (OldVAddrIdx >= 0) { | |||
5498 | int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, | |||
5499 | AMDGPU::OpName::vdst_in); | |||
5500 | ||||
5501 | // removeOperand doesn't try to fixup tied operand indexes as it goes, so | |||
5502 | // it asserts. Untie the operands for now and retie them afterwards. | |||
5503 | if (NewVDstIn != -1) { | |||
5504 | int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); | |||
5505 | Inst.untieRegOperand(OldVDstIn); | |||
5506 | } | |||
5507 | ||||
5508 | Inst.removeOperand(OldVAddrIdx); | |||
5509 | ||||
5510 | if (NewVDstIn != -1) { | |||
5511 | int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); | |||
5512 | Inst.tieOperands(NewVDst, NewVDstIn); | |||
5513 | } | |||
5514 | } | |||
5515 | } | |||
5516 | ||||
5517 | if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg())) | |||
5518 | VAddrDef->eraseFromParent(); | |||
5519 | ||||
5520 | return true; | |||
5521 | } | |||
5522 | ||||
5523 | // FIXME: Remove this when SelectionDAG is obsoleted. | |||
5524 | void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, | |||
5525 | MachineInstr &MI) const { | |||
5526 | if (!isSegmentSpecificFLAT(MI)) | |||
5527 | return; | |||
5528 | ||||
5529 | // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence | |||
5530 | // thinks they are uniform, so a readfirstlane should be valid. | |||
5531 | MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); | |||
5532 | if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) | |||
5533 | return; | |||
5534 | ||||
5535 | if (moveFlatAddrToVGPR(MI)) | |||
5536 | return; | |||
5537 | ||||
5538 | Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); | |||
5539 | SAddr->setReg(ToSGPR); | |||
5540 | } | |||
5541 | ||||
5542 | void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, | |||
5543 | MachineBasicBlock::iterator I, | |||
5544 | const TargetRegisterClass *DstRC, | |||
5545 | MachineOperand &Op, | |||
5546 | MachineRegisterInfo &MRI, | |||
5547 | const DebugLoc &DL) const { | |||
5548 | Register OpReg = Op.getReg(); | |||
5549 | unsigned OpSubReg = Op.getSubReg(); | |||
5550 | ||||
5551 | const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( | |||
5552 | RI.getRegClassForReg(MRI, OpReg), OpSubReg); | |||
5553 | ||||
5554 | // Check if operand is already the correct register class. | |||
5555 | if (DstRC == OpRC) | |||
5556 | return; | |||
5557 | ||||
5558 | Register DstReg = MRI.createVirtualRegister(DstRC); | |||
5559 | auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); | |||
5560 | ||||
5561 | Op.setReg(DstReg); | |||
5562 | Op.setSubReg(0); | |||
5563 | ||||
5564 | MachineInstr *Def = MRI.getVRegDef(OpReg); | |||
5565 | if (!Def) | |||
5566 | return; | |||
5567 | ||||
5568 | // Try to eliminate the copy if it is copying an immediate value. | |||
5569 | if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) | |||
5570 | FoldImmediate(*Copy, *Def, OpReg, &MRI); | |||
5571 | ||||
5572 | bool ImpDef = Def->isImplicitDef(); | |||
5573 | while (!ImpDef && Def && Def->isCopy()) { | |||
5574 | if (Def->getOperand(1).getReg().isPhysical()) | |||
5575 | break; | |||
5576 | Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); | |||
5577 | ImpDef = Def && Def->isImplicitDef(); | |||
5578 | } | |||
5579 | if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && | |||
5580 | !ImpDef) | |||
5581 | Copy.addReg(AMDGPU::EXEC, RegState::Implicit); | |||
5582 | } | |||
5583 | ||||
5584 | // Emit the actual waterfall loop, executing the wrapped instruction for each | |||
5585 | // unique value of \p Rsrc across all lanes. In the best case we execute 1 | |||
5586 | // iteration, in the worst case we execute 64 (once per lane). | |||
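// Rough shape of the emitted code (illustrative pseudo-MIR, wave64 shown):
//   LoopBB:
//     read each 64-bit chunk of the VGPR rsrc with a pair of V_READFIRSTLANE_B32
//     Cond    &= V_CMP_EQ_U64 (scalar chunk, VGPR chunk)
//     SaveExec = S_AND_SAVEEXEC (Cond)   ; EXEC := Cond & EXEC, old EXEC saved
//   BodyBB:
//     <original instruction, now using the scalarized rsrc>
//     EXEC    ^= SaveExec                ; S_XOR_*_term retires finished lanes
//     SI_WATERFALL_LOOP LoopBB           ; loop back while any lanes remain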
5587 | static void | |||
5588 | emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, | |||
5589 | MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, | |||
5590 | MachineBasicBlock &BodyBB, const DebugLoc &DL, | |||
5591 | MachineOperand &Rsrc) { | |||
5592 | MachineFunction &MF = *OrigBB.getParent(); | |||
5593 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
5594 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
5595 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5596 | unsigned SaveExecOpc = | |||
5597 | ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; | |||
5598 | unsigned XorTermOpc = | |||
5599 | ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; | |||
5600 | unsigned AndOpc = | |||
5601 | ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; | |||
5602 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5603 | ||||
5604 | MachineBasicBlock::iterator I = LoopBB.begin(); | |||
5605 | ||||
5606 | SmallVector<Register, 8> ReadlanePieces; | |||
5607 | Register CondReg; | |||
5608 | ||||
5609 | Register VRsrc = Rsrc.getReg(); | |||
5610 | unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); | |||
5611 | ||||
5612 | unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI); | |||
5613 | unsigned NumSubRegs = RegSize / 32; | |||
5614 | assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); | |||
5615 | ||||
5616 | for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { | |||
5617 | ||||
5618 | Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5619 | Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5620 | ||||
5621 | // Read the next variant <- also loop target. | |||
5622 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) | |||
5623 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx)); | |||
5624 | ||||
5625 | // Read the next variant <- also loop target. | |||
5626 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) | |||
5627 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1)); | |||
5628 | ||||
5629 | ReadlanePieces.push_back(CurRegLo); | |||
5630 | ReadlanePieces.push_back(CurRegHi); | |||
5631 | ||||
5632 | // Comparison is to be done as 64-bit. | |||
5633 | Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); | |||
5634 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) | |||
5635 | .addReg(CurRegLo) | |||
5636 | .addImm(AMDGPU::sub0) | |||
5637 | .addReg(CurRegHi) | |||
5638 | .addImm(AMDGPU::sub1); | |||
5639 | ||||
5640 | Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); | |||
5641 | auto Cmp = | |||
5642 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) | |||
5643 | .addReg(CurReg); | |||
5644 | if (NumSubRegs <= 2) | |||
5645 | Cmp.addReg(VRsrc); | |||
5646 | else | |||
5647 | Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); | |||
5648 | ||||
5649 | // Combine the comparison results with AND. | |||
5650 | if (!CondReg) // First. | |||
5651 | CondReg = NewCondReg; | |||
5652 | else { // If not the first, we create an AND. | |||
5653 | Register AndReg = MRI.createVirtualRegister(BoolXExecRC); | |||
5654 | BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) | |||
5655 | .addReg(CondReg) | |||
5656 | .addReg(NewCondReg); | |||
5657 | CondReg = AndReg; | |||
5658 | } | |||
5659 | } // End for loop. | |||
5660 | ||||
5661 | auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc)); | |||
5662 | Register SRsrc = MRI.createVirtualRegister(SRsrcRC); | |||
5663 | ||||
5664 | // Build scalar Rsrc. | |||
5665 | auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc); | |||
5666 | unsigned Channel = 0; | |||
5667 | for (Register Piece : ReadlanePieces) { | |||
5668 | Merge.addReg(Piece) | |||
5669 | .addImm(TRI->getSubRegFromChannel(Channel++)); | |||
5670 | } | |||
5671 | ||||
5672 | // Update Rsrc operand to use the SGPR Rsrc. | |||
5673 | Rsrc.setReg(SRsrc); | |||
5674 | Rsrc.setIsKill(); | |||
5675 | ||||
5676 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
5677 | MRI.setSimpleHint(SaveExec, CondReg); | |||
5678 | ||||
5679 | // Update EXEC to matching lanes, saving original to SaveExec. | |||
5680 | BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) | |||
5681 | .addReg(CondReg, RegState::Kill); | |||
5682 | ||||
5683 | // The original instruction is here; we insert the terminators after it. | |||
5684 | I = BodyBB.end(); | |||
5685 | ||||
5686 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. | |||
5687 | BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) | |||
5688 | .addReg(Exec) | |||
5689 | .addReg(SaveExec); | |||
5690 | ||||
5691 | BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); | |||
5692 | } | |||
5693 | ||||
5694 | // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register | |||
5695 | // with SGPRs by iterating over all unique values across all lanes. | |||
5696 | // Returns the loop basic block that now contains \p MI. | |||
5697 | static MachineBasicBlock * | |||
5698 | loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, | |||
5699 | MachineOperand &Rsrc, MachineDominatorTree *MDT, | |||
5700 | MachineBasicBlock::iterator Begin = nullptr, | |||
5701 | MachineBasicBlock::iterator End = nullptr) { | |||
5702 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5703 | MachineFunction &MF = *MBB.getParent(); | |||
5704 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
5705 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
5706 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5707 | if (!Begin.isValid()) | |||
5708 | Begin = &MI; | |||
5709 | if (!End.isValid()) { | |||
5710 | End = &MI; | |||
5711 | ++End; | |||
5712 | } | |||
5713 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5714 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5715 | unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
5716 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5717 | ||||
5718 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
5719 | ||||
5720 | // Save the EXEC mask | |||
5721 | BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); | |||
5722 | ||||
5723 | // Killed uses in the instruction we are waterfalling around will be | |||
5724 | // incorrect due to the added control-flow. | |||
5725 | MachineBasicBlock::iterator AfterMI = MI; | |||
5726 | ++AfterMI; | |||
5727 | for (auto I = Begin; I != AfterMI; I++) { | |||
5728 | for (auto &MO : I->uses()) { | |||
5729 | if (MO.isReg() && MO.isUse()) { | |||
5730 | MRI.clearKillFlags(MO.getReg()); | |||
5731 | } | |||
5732 | } | |||
5733 | } | |||
5734 | ||||
5735 | // To insert the loop we need to split the block. Move everything after this | |||
5736 | // point to a new block, and insert a new empty block between the two. | |||
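// Resulting control flow (illustrative):
//   MBB -> LoopBB -> BodyBB -> RemainderBB -> <former successors of MBB>
//            ^__________/       (BodyBB branches back while lanes remain)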
5737 | MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); | |||
5738 | MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); | |||
5739 | MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); | |||
5740 | MachineFunction::iterator MBBI(MBB); | |||
5741 | ++MBBI; | |||
5742 | ||||
5743 | MF.insert(MBBI, LoopBB); | |||
5744 | MF.insert(MBBI, BodyBB); | |||
5745 | MF.insert(MBBI, RemainderBB); | |||
5746 | ||||
5747 | LoopBB->addSuccessor(BodyBB); | |||
5748 | BodyBB->addSuccessor(LoopBB); | |||
5749 | BodyBB->addSuccessor(RemainderBB); | |||
5750 | ||||
5751 | // Move the instructions in the range [Begin, End) into BodyBB, and the | |||
5752 | // remainder of the block to RemainderBB. | |||
5753 | RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); | |||
5754 | RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); | |||
5755 | BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end()); | |||
5756 | ||||
5757 | MBB.addSuccessor(LoopBB); | |||
5758 | ||||
5759 | // Update dominators. We know that MBB immediately dominates LoopBB, that | |||
5760 | // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates | |||
5761 | // RemainderBB. RemainderBB immediately dominates all of the successors | |||
5762 | // transferred to it from MBB that MBB used to properly dominate. | |||
5763 | if (MDT) { | |||
5764 | MDT->addNewBlock(LoopBB, &MBB); | |||
5765 | MDT->addNewBlock(BodyBB, LoopBB); | |||
5766 | MDT->addNewBlock(RemainderBB, BodyBB); | |||
5767 | for (auto &Succ : RemainderBB->successors()) { | |||
5768 | if (MDT->properlyDominates(&MBB, Succ)) { | |||
5769 | MDT->changeImmediateDominator(Succ, RemainderBB); | |||
5770 | } | |||
5771 | } | |||
5772 | } | |||
5773 | ||||
5774 | emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc); | |||
5775 | ||||
5776 | // Restore the EXEC mask | |||
5777 | MachineBasicBlock::iterator First = RemainderBB->begin(); | |||
5778 | BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); | |||
5779 | return BodyBB; | |||
5780 | } | |||
5781 | ||||
5782 | // Extract pointer from Rsrc and return a zero-value Rsrc replacement. | |||
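// The replacement descriptor keeps the default data format but zeroes the
// 64-bit base pointer: NewSRsrc = { 0, 0, RSRC_DATA_FORMAT[31:0],
// RSRC_DATA_FORMAT[63:32] }. The extracted 64-bit pointer is returned
// separately; the caller is expected to fold it into the VGPR address instead.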
5783 | static std::tuple<unsigned, unsigned> | |||
5784 | extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { | |||
5785 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5786 | MachineFunction &MF = *MBB.getParent(); | |||
5787 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5788 | ||||
5789 | // Extract the ptr from the resource descriptor. | |||
5790 | unsigned RsrcPtr = | |||
5791 | TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, | |||
5792 | AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); | |||
5793 | ||||
5794 | // Create an empty resource descriptor | |||
5795 | Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
5796 | Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5797 | Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5798 | Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); | |||
5799 | uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); | |||
5800 | ||||
5801 | // Zero64 = 0 | |||
5802 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) | |||
5803 | .addImm(0); | |||
5804 | ||||
5805 | // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} | |||
5806 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) | |||
5807 | .addImm(RsrcDataFormat & 0xFFFFFFFF); | |||
5808 | ||||
5809 | // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} | |||
5810 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) | |||
5811 | .addImm(RsrcDataFormat >> 32); | |||
5812 | ||||
5813 | // NewSRsrc = {Zero64, SRsrcFormat} | |||
5814 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) | |||
5815 | .addReg(Zero64) | |||
5816 | .addImm(AMDGPU::sub0_sub1) | |||
5817 | .addReg(SRsrcFormatLo) | |||
5818 | .addImm(AMDGPU::sub2) | |||
5819 | .addReg(SRsrcFormatHi) | |||
5820 | .addImm(AMDGPU::sub3); | |||
5821 | ||||
5822 | return std::tuple(RsrcPtr, NewSRsrc); | |||
5823 | } | |||
5824 | ||||
5825 | MachineBasicBlock * | |||
5826 | SIInstrInfo::legalizeOperands(MachineInstr &MI, | |||
5827 | MachineDominatorTree *MDT) const { | |||
5828 | MachineFunction &MF = *MI.getParent()->getParent(); | |||
5829 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5830 | MachineBasicBlock *CreatedBB = nullptr; | |||
5831 | ||||
5832 | // Legalize VOP2 | |||
5833 | if (isVOP2(MI) || isVOPC(MI)) { | |||
5834 | legalizeOperandsVOP2(MRI, MI); | |||
5835 | return CreatedBB; | |||
5836 | } | |||
5837 | ||||
5838 | // Legalize VOP3 | |||
5839 | if (isVOP3(MI)) { | |||
5840 | legalizeOperandsVOP3(MRI, MI); | |||
5841 | return CreatedBB; | |||
5842 | } | |||
5843 | ||||
5844 | // Legalize SMRD | |||
5845 | if (isSMRD(MI)) { | |||
5846 | legalizeOperandsSMRD(MRI, MI); | |||
5847 | return CreatedBB; | |||
5848 | } | |||
5849 | ||||
5850 | // Legalize FLAT | |||
5851 | if (isFLAT(MI)) { | |||
5852 | legalizeOperandsFLAT(MRI, MI); | |||
5853 | return CreatedBB; | |||
5854 | } | |||
5855 | ||||
5856 | // Legalize REG_SEQUENCE and PHI | |||
5857 | // The register class of the operands much be the same type as the register | |||
5858 | // class of the output. | |||
5859 | if (MI.getOpcode() == AMDGPU::PHI) { | |||
5860 | const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; | |||
5861 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { | |||
5862 | if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) | |||
5863 | continue; | |||
5864 | const TargetRegisterClass *OpRC = | |||
5865 | MRI.getRegClass(MI.getOperand(i).getReg()); | |||
5866 | if (RI.hasVectorRegisters(OpRC)) { | |||
5867 | VRC = OpRC; | |||
5868 | } else { | |||
5869 | SRC = OpRC; | |||
5870 | } | |||
5871 | } | |||
5872 | ||||
5873 | // If any of the operands are VGPR registers, then they all must be, | |||
5874 | // otherwise we will create illegal VGPR->SGPR copies when legalizing | |||
5875 | // them. | |||
5876 | if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { | |||
5877 | if (!VRC) { | |||
5878 | assert(SRC); | |||
5879 | if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { | |||
5880 | VRC = &AMDGPU::VReg_1RegClass; | |||
5881 | } else | |||
5882 | VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) | |||
5883 | ? RI.getEquivalentAGPRClass(SRC) | |||
5884 | : RI.getEquivalentVGPRClass(SRC); | |||
5885 | } else { | |||
5886 | VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) | |||
5887 | ? RI.getEquivalentAGPRClass(VRC) | |||
5888 | : RI.getEquivalentVGPRClass(VRC); | |||
5889 | } | |||
5890 | RC = VRC; | |||
5891 | } else { | |||
5892 | RC = SRC; | |||
5893 | } | |||
5894 | ||||
5895 | // Update all the operands so they have the same type. | |||
5896 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5897 | MachineOperand &Op = MI.getOperand(I); | |||
5898 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5899 | continue; | |||
5900 | ||||
5901 | // MI is a PHI instruction. | |||
5902 | MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); | |||
5903 | MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); | |||
5904 | ||||
5905 | // Avoid creating no-op copies with the same src and dst reg class. These | |||
5906 | // confuse some of the machine passes. | |||
5907 | legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); | |||
5908 | } | |||
5909 | } | |||
5910 | ||||
5911 | // REG_SEQUENCE doesn't really require operand legalization, but if one has a | |||
5912 | // VGPR dest type and SGPR sources, insert copies so all operands are | |||
5913 | // VGPRs. This seems to help operand folding / the register coalescer. | |||
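// For example, given
//   %d:vreg_128 = REG_SEQUENCE %s0:sreg_64, sub0_sub1, %s1:sgpr_32, sub2, ...
// each SGPR source is copied into a register of the equivalent VGPR class
// (and marked killed) before the REG_SEQUENCE reads it.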
5914 | if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { | |||
5915 | MachineBasicBlock *MBB = MI.getParent(); | |||
5916 | const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); | |||
5917 | if (RI.hasVGPRs(DstRC)) { | |||
5918 | // Update all the operands so they are VGPR register classes. These may | |||
5919 | // not be the same register class because REG_SEQUENCE supports mixing | |||
5920 | // subregister index types e.g. sub0_sub1 + sub2 + sub3 | |||
5921 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5922 | MachineOperand &Op = MI.getOperand(I); | |||
5923 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5924 | continue; | |||
5925 | ||||
5926 | const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); | |||
5927 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); | |||
5928 | if (VRC == OpRC) | |||
5929 | continue; | |||
5930 | ||||
5931 | legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); | |||
5932 | Op.setIsKill(); | |||
5933 | } | |||
5934 | } | |||
5935 | ||||
5936 | return CreatedBB; | |||
5937 | } | |||
5938 | ||||
5939 | // Legalize INSERT_SUBREG | |||
5940 | // src0 must have the same register class as dst | |||
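// e.g. if the destination is vreg_64 but src0 is sreg_64, src0 is first
// copied into a fresh vreg_64 register and that copy is used instead.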
5941 | if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { | |||
5942 | Register Dst = MI.getOperand(0).getReg(); | |||
5943 | Register Src0 = MI.getOperand(1).getReg(); | |||
5944 | const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); | |||
5945 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); | |||
5946 | if (DstRC != Src0RC) { | |||
5947 | MachineBasicBlock *MBB = MI.getParent(); | |||
5948 | MachineOperand &Op = MI.getOperand(1); | |||
5949 | legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); | |||
5950 | } | |||
5951 | return CreatedBB; | |||
5952 | } | |||
5953 | ||||
5954 | // Legalize SI_INIT_M0 | |||
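// M0 is an SGPR, so a source value living in a vector register is first
// made uniform with readlaneVGPRToSGPR before it can be used here.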
5955 | if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { | |||
5956 | MachineOperand &Src = MI.getOperand(0); | |||
5957 | if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) | |||
5958 | Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); | |||
5959 | return CreatedBB; | |||
5960 | } | |||
5961 | ||||
5962 | // Legalize MIMG and MUBUF/MTBUF for shaders. | |||
5963 | // | |||
5964 | // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via | |||
5965 | // scratch memory access. In both cases, the legalization never involves | |||
5966 | // conversion to the addr64 form. | |||
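// A divergent rsrc or sampler operand is instead handled with the waterfall
// loop built by loadSRsrcFromVGPR, which re-executes the instruction once
// per distinct descriptor value among the active lanes.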
5967 | if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && | |||
5968 | (isMUBUF(MI) || isMTBUF(MI)))) { | |||
5969 | MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); | |||
5970 | if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) | |||
5971 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); | |||
5972 | ||||
5973 | MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); | |||
5974 | if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) | |||
5975 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); | |||
5976 | ||||
5977 | return CreatedBB; | |||
5978 | } | |||
5979 | ||||
5980 | // Legalize SI_CALL | |||
5981 | if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { | |||
5982 | MachineOperand *Dest = &MI.getOperand(0); | |||
5983 | if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { | |||
5984 | // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with | |||
5985 | // the copies that follow the call, into the loop block; copies from and to | |||
5986 | // physical registers need to move there as well. | |||
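// Otherwise the call sequence would be split across the waterfall-loop
// boundary: the call-frame pseudos and argument copies would execute once
// while the call itself re-executes for every distinct callee address.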
5987 | unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); | |||
5988 | unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); | |||
5989 | ||||
5990 | // Also move the copies to physical registers into the loop block | |||
5991 | MachineBasicBlock &MBB = *MI.getParent(); | |||