File: build/llvm-toolchain-snapshot-15~++20220310101044+47f652d69517/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Warning: line 2204, column 15: Called C++ object pointer is uninitialized
1 | //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | /// \file | |||
10 | /// SI Implementation of TargetInstrInfo. | |||
11 | // | |||
12 | //===----------------------------------------------------------------------===// | |||
13 | ||||
14 | #include "SIInstrInfo.h" | |||
15 | #include "AMDGPU.h" | |||
16 | #include "AMDGPUInstrInfo.h" | |||
17 | #include "GCNHazardRecognizer.h" | |||
18 | #include "GCNSubtarget.h" | |||
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
20 | #include "SIMachineFunctionInfo.h" | |||
21 | #include "llvm/Analysis/ValueTracking.h" | |||
22 | #include "llvm/CodeGen/LiveIntervals.h" | |||
23 | #include "llvm/CodeGen/LiveVariables.h" | |||
24 | #include "llvm/CodeGen/MachineDominators.h" | |||
25 | #include "llvm/CodeGen/MachineScheduler.h" | |||
26 | #include "llvm/CodeGen/RegisterScavenging.h" | |||
27 | #include "llvm/CodeGen/ScheduleDAG.h" | |||
28 | #include "llvm/IR/DiagnosticInfo.h" | |||
29 | #include "llvm/IR/IntrinsicsAMDGPU.h" | |||
30 | #include "llvm/MC/MCContext.h" | |||
31 | #include "llvm/Support/CommandLine.h" | |||
32 | #include "llvm/Target/TargetMachine.h" | |||
33 | ||||
34 | using namespace llvm; | |||
35 | ||||
36 | #define DEBUG_TYPE "si-instr-info" | |||
37 | ||||
38 | #define GET_INSTRINFO_CTOR_DTOR | |||
39 | #include "AMDGPUGenInstrInfo.inc" | |||
40 | ||||
41 | namespace llvm { | |||
42 | ||||
43 | class AAResults; | |||
44 | ||||
45 | namespace AMDGPU { | |||
46 | #define GET_D16ImageDimIntrinsics_IMPL | |||
47 | #define GET_ImageDimIntrinsicTable_IMPL | |||
48 | #define GET_RsrcIntrinsics_IMPL | |||
49 | #include "AMDGPUGenSearchableTables.inc" | |||
50 | } | |||
51 | } | |||
52 | ||||
53 | ||||
54 | // Must be at least 4 to be able to branch over minimum unconditional branch | |||
55 | // code. This is only for making it possible to write reasonably small tests for | |||
56 | // long branches. | |||
57 | static cl::opt<unsigned> | |||
58 | BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), | |||
59 | cl::desc("Restrict range of branch instructions (DEBUG)")); | |||
60 | ||||
61 | static cl::opt<bool> Fix16BitCopies( | |||
62 | "amdgpu-fix-16-bit-physreg-copies", | |||
63 | cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), | |||
64 | cl::init(true), | |||
65 | cl::ReallyHidden); | |||
66 | ||||
67 | SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) | |||
68 | : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), | |||
69 | RI(ST), ST(ST) { | |||
70 | SchedModel.init(&ST); | |||
71 | } | |||
72 | ||||
73 | //===----------------------------------------------------------------------===// | |||
74 | // TargetInstrInfo callbacks | |||
75 | //===----------------------------------------------------------------------===// | |||
76 | ||||
77 | static unsigned getNumOperandsNoGlue(SDNode *Node) { | |||
78 | unsigned N = Node->getNumOperands(); | |||
79 | while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) | |||
80 | --N; | |||
81 | return N; | |||
82 | } | |||
83 | ||||
84 | /// Returns true if both nodes have the same value for the given | |||
85 | /// operand \p Op, or if both nodes do not have this operand. | |||
86 | static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { | |||
87 | unsigned Opc0 = N0->getMachineOpcode(); | |||
88 | unsigned Opc1 = N1->getMachineOpcode(); | |||
89 | ||||
90 | int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); | |||
91 | int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); | |||
92 | ||||
93 | if (Op0Idx == -1 && Op1Idx == -1) | |||
94 | return true; | |||
95 | ||||
96 | ||||
97 | if ((Op0Idx == -1 && Op1Idx != -1) || | |||
98 | (Op1Idx == -1 && Op0Idx != -1)) | |||
99 | return false; | |||
100 | ||||
101 | // getNamedOperandIdx returns the index for the MachineInstr's operands, | |||
102 | // which includes the result as the first operand. We are indexing into the | |||
103 | // MachineSDNode's operands, so we need to skip the result operand to get | |||
104 | // the real index. | |||
105 | --Op0Idx; | |||
106 | --Op1Idx; | |||
107 | ||||
108 | return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); | |||
109 | } | |||
110 | ||||
111 | bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, | |||
112 | AAResults *AA) const { | |||
113 | if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) { | |||
114 | // Normally a VALU use of exec would block rematerialization, but it is OK | |||
115 | // in this case to have an implicit exec read, as all VALU instructions do. | |||
116 | // We really want all of the generic logic for this except for that check. | |||
117 | ||||
118 | // Another potential implicit use is mode register. The core logic of | |||
119 | // the RA will not attempt rematerialization if mode is set anywhere | |||
120 | // in the function, otherwise it is safe since mode is not changed. | |||
121 | ||||
122 | // This differs from the generic method, which does not allow | |||
123 | // rematerialization if there are virtual register uses. We allow this, | |||
124 | // which is why this method includes SOP instructions as well. | |||
125 | return !MI.hasImplicitDef() && | |||
126 | MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() && | |||
127 | !MI.mayRaiseFPException(); | |||
128 | } | |||
129 | ||||
130 | return false; | |||
131 | } | |||
132 | ||||
133 | // Returns true if the scalar result of a VALU instruction depends on exec. | |||
134 | static bool resultDependsOnExec(const MachineInstr &MI) { | |||
135 | // Ignore comparisons which are only used masked with exec. | |||
136 | // This allows some hoisting/sinking of VALU comparisons. | |||
137 | if (MI.isCompare()) { | |||
138 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
139 | Register DstReg = MI.getOperand(0).getReg(); | |||
140 | if (!DstReg.isVirtual()) | |||
141 | return true; | |||
142 | for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) { | |||
143 | switch (Use.getOpcode()) { | |||
144 | case AMDGPU::S_AND_SAVEEXEC_B32: | |||
145 | case AMDGPU::S_AND_SAVEEXEC_B64: | |||
146 | break; | |||
147 | case AMDGPU::S_AND_B32: | |||
148 | case AMDGPU::S_AND_B64: | |||
149 | if (!Use.readsRegister(AMDGPU::EXEC)) | |||
150 | return true; | |||
151 | break; | |||
152 | default: | |||
153 | return true; | |||
154 | } | |||
155 | } | |||
156 | return false; | |||
157 | } | |||
158 | ||||
159 | switch (MI.getOpcode()) { | |||
160 | default: | |||
161 | break; | |||
162 | case AMDGPU::V_READFIRSTLANE_B32: | |||
163 | return true; | |||
164 | } | |||
165 | ||||
166 | return false; | |||
167 | } | |||
168 | ||||
169 | bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { | |||
170 | // Any implicit use of exec by VALU is not a real register read. | |||
171 | return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && | |||
172 | isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()); | |||
173 | } | |||
174 | ||||
175 | bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, | |||
176 | int64_t &Offset0, | |||
177 | int64_t &Offset1) const { | |||
178 | if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) | |||
179 | return false; | |||
180 | ||||
181 | unsigned Opc0 = Load0->getMachineOpcode(); | |||
182 | unsigned Opc1 = Load1->getMachineOpcode(); | |||
183 | ||||
184 | // Make sure both are actually loads. | |||
185 | if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) | |||
186 | return false; | |||
187 | ||||
188 | if (isDS(Opc0) && isDS(Opc1)) { | |||
189 | ||||
190 | // FIXME: Handle this case: | |||
191 | if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) | |||
192 | return false; | |||
193 | ||||
194 | // Check base reg. | |||
195 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
196 | return false; | |||
197 | ||||
198 | // Skip read2 / write2 variants for simplicity. | |||
199 | // TODO: We should report true if the used offsets are adjacent (excluding | |||
200 | // st64 versions). | |||
201 | int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
202 | int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
203 | if (Offset0Idx == -1 || Offset1Idx == -1) | |||
204 | return false; | |||
205 | ||||
206 | // XXX - be careful of dataless loads | |||
207 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
208 | // include the output in the operand list, but SDNodes don't, we need to | |||
209 | // subtract the number of defs from the index. | |||
210 | Offset0Idx -= get(Opc0).NumDefs; | |||
211 | Offset1Idx -= get(Opc1).NumDefs; | |||
212 | Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); | |||
213 | Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); | |||
214 | return true; | |||
215 | } | |||
216 | ||||
217 | if (isSMRD(Opc0) && isSMRD(Opc1)) { | |||
218 | // Skip time and cache invalidation instructions. | |||
219 | if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || | |||
220 | AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) | |||
221 | return false; | |||
222 | ||||
223 | assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); | |||
224 | ||||
225 | // Check base reg. | |||
226 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
227 | return false; | |||
228 | ||||
229 | const ConstantSDNode *Load0Offset = | |||
230 | dyn_cast<ConstantSDNode>(Load0->getOperand(1)); | |||
231 | const ConstantSDNode *Load1Offset = | |||
232 | dyn_cast<ConstantSDNode>(Load1->getOperand(1)); | |||
233 | ||||
234 | if (!Load0Offset || !Load1Offset) | |||
235 | return false; | |||
236 | ||||
237 | Offset0 = Load0Offset->getZExtValue(); | |||
238 | Offset1 = Load1Offset->getZExtValue(); | |||
239 | return true; | |||
240 | } | |||
241 | ||||
242 | // MUBUF and MTBUF can access the same addresses. | |||
243 | if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { | |||
244 | ||||
245 | // MUBUF and MTBUF have vaddr at different indices. | |||
246 | if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || | |||
247 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || | |||
248 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) | |||
249 | return false; | |||
250 | ||||
251 | int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
252 | int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
253 | ||||
254 | if (OffIdx0 == -1 || OffIdx1 == -1) | |||
255 | return false; | |||
256 | ||||
257 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
258 | // include the output in the operand list, but SDNodes don't, we need to | |||
259 | // subtract the number of defs from the index. | |||
260 | OffIdx0 -= get(Opc0).NumDefs; | |||
261 | OffIdx1 -= get(Opc1).NumDefs; | |||
262 | ||||
263 | SDValue Off0 = Load0->getOperand(OffIdx0); | |||
264 | SDValue Off1 = Load1->getOperand(OffIdx1); | |||
265 | ||||
266 | // The offset might be a FrameIndexSDNode. | |||
267 | if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) | |||
268 | return false; | |||
269 | ||||
270 | Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); | |||
271 | Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); | |||
272 | return true; | |||
273 | } | |||
274 | ||||
275 | return false; | |||
276 | } | |||
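The NumDefs adjustment used twice above (for the DS and the MUBUF/MTBUF offset operands) is easy to get wrong, so here is a minimal standalone sketch of the same index shift. The operand layouts and names below are invented for illustration; this snippet is not part of SIInstrInfo.cpp.

// Editorial sketch, not part of SIInstrInfo.cpp: why a MachineInstr operand
// index must be shifted by the number of defs before indexing an SDNode's
// operand list. The operand layouts are hypothetical.
#include <cstdio>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> MIOps = {"vdst", "addr", "offset"}; // defs come first
  std::vector<std::string> SDOps = {"addr", "offset"};         // no result operand
  unsigned NumDefs = 1;                         // as in get(Opc).NumDefs for a load
  unsigned MIOffsetIdx = 2;                     // what getNamedOperandIdx would report
  unsigned SDOffsetIdx = MIOffsetIdx - NumDefs; // index into the SDNode operands
  std::printf("%s == %s\n", MIOps[MIOffsetIdx].c_str(), SDOps[SDOffsetIdx].c_str());
  return 0;
}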
277 | ||||
278 | static bool isStride64(unsigned Opc) { | |||
279 | switch (Opc) { | |||
280 | case AMDGPU::DS_READ2ST64_B32: | |||
281 | case AMDGPU::DS_READ2ST64_B64: | |||
282 | case AMDGPU::DS_WRITE2ST64_B32: | |||
283 | case AMDGPU::DS_WRITE2ST64_B64: | |||
284 | return true; | |||
285 | default: | |||
286 | return false; | |||
287 | } | |||
288 | } | |||
289 | ||||
290 | bool SIInstrInfo::getMemOperandsWithOffsetWidth( | |||
291 | const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, | |||
292 | int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, | |||
293 | const TargetRegisterInfo *TRI) const { | |||
294 | if (!LdSt.mayLoadOrStore()) | |||
295 | return false; | |||
296 | ||||
297 | unsigned Opc = LdSt.getOpcode(); | |||
298 | OffsetIsScalable = false; | |||
299 | const MachineOperand *BaseOp, *OffsetOp; | |||
300 | int DataOpIdx; | |||
301 | ||||
302 | if (isDS(LdSt)) { | |||
303 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); | |||
304 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
305 | if (OffsetOp) { | |||
306 | // Normal, single offset LDS instruction. | |||
307 | if (!BaseOp) { | |||
308 | // DS_CONSUME/DS_APPEND use M0 for the base address. | |||
309 | // TODO: find the implicit use operand for M0 and use that as BaseOp? | |||
310 | return false; | |||
311 | } | |||
312 | BaseOps.push_back(BaseOp); | |||
313 | Offset = OffsetOp->getImm(); | |||
314 | // Get appropriate operand, and compute width accordingly. | |||
315 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
316 | if (DataOpIdx == -1) | |||
317 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
318 | Width = getOpSize(LdSt, DataOpIdx); | |||
319 | } else { | |||
320 | // The 2 offset instructions use offset0 and offset1 instead. We can treat | |||
321 | // these as a load with a single offset if the 2 offsets are consecutive. | |||
322 | // We will use this for some partially aligned loads. | |||
323 | const MachineOperand *Offset0Op = | |||
324 | getNamedOperand(LdSt, AMDGPU::OpName::offset0); | |||
325 | const MachineOperand *Offset1Op = | |||
326 | getNamedOperand(LdSt, AMDGPU::OpName::offset1); | |||
327 | ||||
328 | unsigned Offset0 = Offset0Op->getImm(); | |||
329 | unsigned Offset1 = Offset1Op->getImm(); | |||
330 | if (Offset0 + 1 != Offset1) | |||
331 | return false; | |||
332 | ||||
333 | // Each of these offsets is in element sized units, so we need to convert | |||
334 | // to bytes of the individual reads. | |||
335 | ||||
336 | unsigned EltSize; | |||
337 | if (LdSt.mayLoad()) | |||
338 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; | |||
339 | else { | |||
340 | assert(LdSt.mayStore()); | |||
341 | int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
342 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; | |||
343 | } | |||
344 | ||||
345 | if (isStride64(Opc)) | |||
346 | EltSize *= 64; | |||
347 | ||||
348 | BaseOps.push_back(BaseOp); | |||
349 | Offset = EltSize * Offset0; | |||
350 | // Get appropriate operand(s), and compute width accordingly. | |||
351 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
352 | if (DataOpIdx == -1) { | |||
353 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
354 | Width = getOpSize(LdSt, DataOpIdx); | |||
355 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); | |||
356 | Width += getOpSize(LdSt, DataOpIdx); | |||
357 | } else { | |||
358 | Width = getOpSize(LdSt, DataOpIdx); | |||
359 | } | |||
360 | } | |||
361 | return true; | |||
362 | } | |||
363 | ||||
364 | if (isMUBUF(LdSt) || isMTBUF(LdSt)) { | |||
365 | const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); | |||
366 | if (!RSrc) // e.g. BUFFER_WBINVL1_VOL | |||
367 | return false; | |||
368 | BaseOps.push_back(RSrc); | |||
369 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
370 | if (BaseOp && !BaseOp->isFI()) | |||
371 | BaseOps.push_back(BaseOp); | |||
372 | const MachineOperand *OffsetImm = | |||
373 | getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
374 | Offset = OffsetImm->getImm(); | |||
375 | const MachineOperand *SOffset = | |||
376 | getNamedOperand(LdSt, AMDGPU::OpName::soffset); | |||
377 | if (SOffset) { | |||
378 | if (SOffset->isReg()) | |||
379 | BaseOps.push_back(SOffset); | |||
380 | else | |||
381 | Offset += SOffset->getImm(); | |||
382 | } | |||
383 | // Get appropriate operand, and compute width accordingly. | |||
384 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
385 | if (DataOpIdx == -1) | |||
386 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
387 | Width = getOpSize(LdSt, DataOpIdx); | |||
388 | return true; | |||
389 | } | |||
390 | ||||
391 | if (isMIMG(LdSt)) { | |||
392 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); | |||
393 | BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); | |||
394 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); | |||
395 | if (VAddr0Idx >= 0) { | |||
396 | // GFX10 possible NSA encoding. | |||
397 | for (int I = VAddr0Idx; I < SRsrcIdx; ++I) | |||
398 | BaseOps.push_back(&LdSt.getOperand(I)); | |||
399 | } else { | |||
400 | BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); | |||
401 | } | |||
402 | Offset = 0; | |||
403 | // Get appropriate operand, and compute width accordingly. | |||
404 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
405 | Width = getOpSize(LdSt, DataOpIdx); | |||
406 | return true; | |||
407 | } | |||
408 | ||||
409 | if (isSMRD(LdSt)) { | |||
410 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); | |||
411 | if (!BaseOp) // e.g. S_MEMTIME | |||
412 | return false; | |||
413 | BaseOps.push_back(BaseOp); | |||
414 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
415 | Offset = OffsetOp ? OffsetOp->getImm() : 0; | |||
416 | // Get appropriate operand, and compute width accordingly. | |||
417 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); | |||
418 | Width = getOpSize(LdSt, DataOpIdx); | |||
419 | return true; | |||
420 | } | |||
421 | ||||
422 | if (isFLAT(LdSt)) { | |||
423 | // Instructions have either vaddr or saddr or both or none. | |||
424 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
425 | if (BaseOp) | |||
426 | BaseOps.push_back(BaseOp); | |||
427 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); | |||
428 | if (BaseOp) | |||
429 | BaseOps.push_back(BaseOp); | |||
430 | Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); | |||
431 | // Get appropriate operand, and compute width accordingly. | |||
432 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
433 | if (DataOpIdx == -1) | |||
434 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
435 | Width = getOpSize(LdSt, DataOpIdx); | |||
436 | return true; | |||
437 | } | |||
438 | ||||
439 | return false; | |||
440 | } | |||
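To make the read2/write2 handling above concrete, here is a small worked sketch of the offset and width computation for consecutive element-sized offsets. The values are invented for illustration and the snippet is not part of SIInstrInfo.cpp.

// Editorial sketch, not part of SIInstrInfo.cpp: the offset/width arithmetic
// for the DS two-offset (read2/write2) path, on made-up example values.
#include <cstdio>

int main() {
  unsigned EltSize = 4;                 // e.g. a b32 read2: 4-byte elements
  bool Stride64 = false;                // a *st64 variant would scale by 64
  unsigned Offset0 = 10, Offset1 = 11;  // offsets are in elements, not bytes

  if (Stride64)
    EltSize *= 64;

  if (Offset0 + 1 != Offset1) {
    std::puts("offsets not consecutive: treated as separate accesses");
    return 0;
  }

  unsigned ByteOffset = EltSize * Offset0; // 40 bytes from the base address
  unsigned Width = 2 * EltSize;            // both data operands together: 8 bytes
  std::printf("offset=%u width=%u\n", ByteOffset, Width);
  return 0;
}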
441 | ||||
442 | static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, | |||
443 | ArrayRef<const MachineOperand *> BaseOps1, | |||
444 | const MachineInstr &MI2, | |||
445 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
446 | // Only examine the first "base" operand of each instruction, on the | |||
447 | // assumption that it represents the real base address of the memory access. | |||
448 | // Other operands are typically offsets or indices from this base address. | |||
449 | if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) | |||
450 | return true; | |||
451 | ||||
452 | if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) | |||
453 | return false; | |||
454 | ||||
455 | auto MO1 = *MI1.memoperands_begin(); | |||
456 | auto MO2 = *MI2.memoperands_begin(); | |||
457 | if (MO1->getAddrSpace() != MO2->getAddrSpace()) | |||
458 | return false; | |||
459 | ||||
460 | auto Base1 = MO1->getValue(); | |||
461 | auto Base2 = MO2->getValue(); | |||
462 | if (!Base1 || !Base2) | |||
463 | return false; | |||
464 | Base1 = getUnderlyingObject(Base1); | |||
465 | Base2 = getUnderlyingObject(Base2); | |||
466 | ||||
467 | if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) | |||
468 | return false; | |||
469 | ||||
470 | return Base1 == Base2; | |||
471 | } | |||
472 | ||||
473 | bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, | |||
474 | ArrayRef<const MachineOperand *> BaseOps2, | |||
475 | unsigned NumLoads, | |||
476 | unsigned NumBytes) const { | |||
477 | // If the mem ops (to be clustered) do not have the same base ptr, then they | |||
478 | // should not be clustered | |||
479 | if (!BaseOps1.empty() && !BaseOps2.empty()) { | |||
480 | const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); | |||
481 | const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); | |||
482 | if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) | |||
483 | return false; | |||
484 | } else if (!BaseOps1.empty() || !BaseOps2.empty()) { | |||
485 | // If only one base op is empty, they do not have the same base ptr | |||
486 | return false; | |||
487 | } | |||
488 | ||||
489 | // To avoid register pressure, the number of DWORDs loaded together by all | |||
490 | // clustered mem ops should not, on average, exceed 8. This is an | |||
491 | // empirical value based on certain observations and performance-related | |||
492 | // experiments. | |||
493 | // The good thing about this heuristic is that it avoids clustering too many | |||
494 | // sub-word loads and also avoids clustering wide loads. Below is a | |||
495 | // brief summary of how the heuristic behaves for various `LoadSize` values: | |||
496 | // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops | |||
497 | // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops | |||
498 | // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops | |||
499 | // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops | |||
500 | // (5) LoadSize >= 17: do not cluster | |||
501 | const unsigned LoadSize = NumBytes / NumLoads; | |||
502 | const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; | |||
503 | return NumDWORDs <= 8; | |||
504 | } | |||
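A minimal standalone sketch of the DWORD cap described in the comment above; the helper name is invented and the snippet is not part of SIInstrInfo.cpp.

// Editorial sketch, not part of SIInstrInfo.cpp: the DWORD-based clustering
// cap from shouldClusterMemOps, reproduced as a tiny standalone helper.
#include <cstdio>

static bool wouldCluster(unsigned NumLoads, unsigned NumBytes) {
  const unsigned LoadSize = NumBytes / NumLoads;              // average bytes per op
  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; // rounded up to dwords
  return NumDWORDs <= 8;
}

int main() {
  std::printf("%d\n", wouldCluster(8, 32)); // 8 x 4-byte ops  -> 8 dwords  -> 1
  std::printf("%d\n", wouldCluster(4, 32)); // 4 x 8-byte ops  -> 8 dwords  -> 1
  std::printf("%d\n", wouldCluster(2, 32)); // 2 x 16-byte ops -> 8 dwords  -> 1
  std::printf("%d\n", wouldCluster(2, 40)); // 2 x 20-byte ops -> 10 dwords -> 0
  return 0;
}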
505 | ||||
506 | // FIXME: This behaves strangely. If, for example, you have 32 loads + stores, | |||
507 | // the first 16 loads will be interleaved with the stores, and the next 16 will | |||
508 | // be clustered as expected. It should really split into two batches of 16 each. | |||
509 | // | |||
510 | // Loads are clustered until this returns false, rather than trying to schedule | |||
511 | // groups of stores. This also means we have to deal with saying that loads from | |||
512 | // different address spaces should be clustered, including ones which might | |||
513 | // cause bank conflicts. | |||
514 | // | |||
515 | // This might be deprecated so it might not be worth that much effort to fix. | |||
516 | bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, | |||
517 | int64_t Offset0, int64_t Offset1, | |||
518 | unsigned NumLoads) const { | |||
519 | assert(Offset1 > Offset0 && | |||
520 | "Second offset should be larger than first offset!"); | |||
521 | // If we have less than 16 loads in a row, and the offsets are within 64 | |||
522 | // bytes, then schedule together. | |||
523 | ||||
524 | // A cacheline is 64 bytes (for global memory). | |||
525 | return (NumLoads <= 16 && (Offset1 - Offset0) < 64); | |||
526 | } | |||
527 | ||||
528 | static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, | |||
529 | MachineBasicBlock::iterator MI, | |||
530 | const DebugLoc &DL, MCRegister DestReg, | |||
531 | MCRegister SrcReg, bool KillSrc, | |||
532 | const char *Msg = "illegal SGPR to VGPR copy") { | |||
533 | MachineFunction *MF = MBB.getParent(); | |||
534 | DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); | |||
535 | LLVMContext &C = MF->getFunction().getContext(); | |||
536 | C.diagnose(IllegalCopy); | |||
537 | ||||
538 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) | |||
539 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
540 | } | |||
541 | ||||
542 | /// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible | |||
543 | /// to directly copy, so an intermediate VGPR needs to be used. | |||
544 | static void indirectCopyToAGPR(const SIInstrInfo &TII, | |||
545 | MachineBasicBlock &MBB, | |||
546 | MachineBasicBlock::iterator MI, | |||
547 | const DebugLoc &DL, MCRegister DestReg, | |||
548 | MCRegister SrcReg, bool KillSrc, | |||
549 | RegScavenger &RS, | |||
550 | Register ImpDefSuperReg = Register(), | |||
551 | Register ImpUseSuperReg = Register()) { | |||
552 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
553 | ||||
554 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || | |||
555 | AMDGPU::AGPR_32RegClass.contains(SrcReg)); | |||
556 | ||||
557 | // First try to find defining accvgpr_write to avoid temporary registers. | |||
558 | for (auto Def = MI, E = MBB.begin(); Def != E; ) { | |||
559 | --Def; | |||
560 | if (!Def->definesRegister(SrcReg, &RI)) | |||
561 | continue; | |||
562 | if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) | |||
563 | break; | |||
564 | ||||
565 | MachineOperand &DefOp = Def->getOperand(1); | |||
566 | assert(DefOp.isReg() || DefOp.isImm()); | |||
567 | ||||
568 | if (DefOp.isReg()) { | |||
569 | // Check that the register source operand is not clobbered before MI. | |||
570 | // Immediate operands are always safe to propagate. | |||
571 | bool SafeToPropagate = true; | |||
572 | for (auto I = Def; I != MI && SafeToPropagate; ++I) | |||
573 | if (I->modifiesRegister(DefOp.getReg(), &RI)) | |||
574 | SafeToPropagate = false; | |||
575 | ||||
576 | if (!SafeToPropagate) | |||
577 | break; | |||
578 | ||||
579 | DefOp.setIsKill(false); | |||
580 | } | |||
581 | ||||
582 | MachineInstrBuilder Builder = | |||
583 | BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
584 | .add(DefOp); | |||
585 | if (ImpDefSuperReg) | |||
586 | Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
587 | ||||
588 | if (ImpUseSuperReg) { | |||
589 | Builder.addReg(ImpUseSuperReg, | |||
590 | getKillRegState(KillSrc) | RegState::Implicit); | |||
591 | } | |||
592 | ||||
593 | return; | |||
594 | } | |||
595 | ||||
596 | RS.enterBasicBlock(MBB); | |||
597 | RS.forward(MI); | |||
598 | ||||
599 | // Ideally we want to have three registers for a long reg_sequence copy | |||
600 | // to hide 2 waitstates between v_mov_b32 and accvgpr_write. | |||
601 | unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, | |||
602 | *MBB.getParent()); | |||
603 | ||||
604 | // Registers in the sequence are allocated contiguously so we can just | |||
605 | // use register number to pick one of three round-robin temps. | |||
606 | unsigned RegNo = DestReg % 3; | |||
607 | Register Tmp; | |||
608 | if (!TII.getSubtarget().hasGFX90AInsts()) { | |||
609 | Tmp = AMDGPU::VGPR32; | |||
610 | assert(MBB.getParent()->getRegInfo().isReserved(AMDGPU::VGPR32)); | |||
611 | ||||
612 | // Only loop through if there are any free registers left; otherwise the | |||
613 | // scavenger may report a fatal error when there is no emergency spill slot, | |||
614 | // or it may spill using that slot. | |||
615 | while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { | |||
616 | Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
617 | if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) | |||
618 | break; | |||
619 | Tmp = Tmp2; | |||
620 | RS.setRegUsed(Tmp); | |||
621 | } | |||
622 | } else { | |||
623 | Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
624 | RS.setRegUsed(Tmp); | |||
625 | } | |||
626 | ||||
627 | // Insert copy to temporary VGPR. | |||
628 | unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; | |||
629 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { | |||
630 | TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
631 | } else { | |||
632 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); | |||
633 | } | |||
634 | ||||
635 | MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) | |||
636 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
637 | if (ImpUseSuperReg) { | |||
638 | UseBuilder.addReg(ImpUseSuperReg, | |||
639 | getKillRegState(KillSrc) | RegState::Implicit); | |||
640 | } | |||
641 | ||||
642 | MachineInstrBuilder DefBuilder | |||
643 | = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
644 | .addReg(Tmp, RegState::Kill); | |||
645 | ||||
646 | if (ImpDefSuperReg) | |||
647 | DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
648 | } | |||
649 | ||||
650 | static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, | |||
651 | MachineBasicBlock::iterator MI, const DebugLoc &DL, | |||
652 | MCRegister DestReg, MCRegister SrcReg, bool KillSrc, | |||
653 | const TargetRegisterClass *RC, bool Forward) { | |||
654 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
655 | ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4); | |||
656 | MachineBasicBlock::iterator I = MI; | |||
657 | MachineInstr *FirstMI = nullptr, *LastMI = nullptr; | |||
658 | ||||
659 | for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { | |||
660 | int16_t SubIdx = BaseIndices[Idx]; | |||
661 | Register Reg = RI.getSubReg(DestReg, SubIdx); | |||
662 | unsigned Opcode = AMDGPU::S_MOV_B32; | |||
663 | ||||
664 | // Is SGPR aligned? If so try to combine with next. | |||
665 | Register Src = RI.getSubReg(SrcReg, SubIdx); | |||
666 | bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; | |||
667 | bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; | |||
668 | if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { | |||
669 | // Can use SGPR64 copy | |||
670 | unsigned Channel = RI.getChannelFromSubReg(SubIdx); | |||
671 | SubIdx = RI.getSubRegFromChannel(Channel, 2); | |||
672 | Opcode = AMDGPU::S_MOV_B64; | |||
673 | Idx++; | |||
674 | } | |||
675 | ||||
676 | LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
677 | .addReg(RI.getSubReg(SrcReg, SubIdx)) | |||
678 | .addReg(SrcReg, RegState::Implicit); | |||
679 | ||||
680 | if (!FirstMI) | |||
681 | FirstMI = LastMI; | |||
682 | ||||
683 | if (!Forward) | |||
684 | I--; | |||
685 | } | |||
686 | ||||
687 | assert(FirstMI && LastMI); | |||
688 | if (!Forward) | |||
689 | std::swap(FirstMI, LastMI); | |||
690 | ||||
691 | FirstMI->addOperand( | |||
692 | MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); | |||
693 | ||||
694 | if (KillSrc) | |||
695 | LastMI->addRegisterKilled(SrcReg, &RI); | |||
696 | } | |||
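The pairing logic above is easiest to see on an example. Below is a sketch of how the loop would pick opcodes for a hypothetical 128-bit copy; the SGPR numbers are invented and the snippet is not part of SIInstrInfo.cpp.

// Editorial sketch, not part of SIInstrInfo.cpp: how expandSGPRCopy pairs
// even-aligned 32-bit pieces into 64-bit moves. SGPR numbers are hypothetical.
#include <cstdio>

int main() {
  const unsigned NumParts = 4;        // a 128-bit copy split into 32-bit parts
  unsigned DstBase = 4, SrcBase = 12; // copy s[4:7] <- s[12:15]
  for (unsigned Idx = 0; Idx < NumParts; ++Idx) {
    bool AlignedDst = (DstBase + Idx) % 2 == 0;
    bool AlignedSrc = (SrcBase + Idx) % 2 == 0;
    if (AlignedDst && AlignedSrc && Idx + 1 < NumParts) {
      std::printf("S_MOV_B64 s[%u:%u], s[%u:%u]\n", DstBase + Idx,
                  DstBase + Idx + 1, SrcBase + Idx, SrcBase + Idx + 1);
      ++Idx; // consumed two parts at once
    } else {
      std::printf("S_MOV_B32 s%u, s%u\n", DstBase + Idx, SrcBase + Idx);
    }
  }
  return 0;
}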
697 | ||||
698 | void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, | |||
699 | MachineBasicBlock::iterator MI, | |||
700 | const DebugLoc &DL, MCRegister DestReg, | |||
701 | MCRegister SrcReg, bool KillSrc) const { | |||
702 | const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); | |||
703 | ||||
704 | // FIXME: This is a hack to resolve copies between 16-bit and 32-bit | |||
705 | // registers until all patterns are fixed. | |||
706 | if (Fix16BitCopies && | |||
707 | ((RI.getRegSizeInBits(*RC) == 16) ^ | |||
708 | (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { | |||
709 | MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; | |||
710 | MCRegister Super = RI.get32BitRegister(RegToFix); | |||
711 | assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); | |||
712 | RegToFix = Super; | |||
713 | ||||
714 | if (DestReg == SrcReg) { | |||
715 | // Insert empty bundle since ExpandPostRA expects an instruction here. | |||
716 | BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); | |||
717 | return; | |||
718 | } | |||
719 | ||||
720 | RC = RI.getPhysRegClass(DestReg); | |||
721 | } | |||
722 | ||||
723 | if (RC == &AMDGPU::VGPR_32RegClass) { | |||
724 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || | |||
725 | AMDGPU::SReg_32RegClass.contains(SrcReg) || | |||
726 | AMDGPU::AGPR_32RegClass.contains(SrcReg)); | |||
727 | unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? | |||
728 | AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32; | |||
729 | BuildMI(MBB, MI, DL, get(Opc), DestReg) | |||
730 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
731 | return; | |||
732 | } | |||
733 | ||||
734 | if (RC == &AMDGPU::SReg_32_XM0RegClass || | |||
735 | RC == &AMDGPU::SReg_32RegClass) { | |||
736 | if (SrcReg == AMDGPU::SCC) { | |||
737 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) | |||
738 | .addImm(1) | |||
739 | .addImm(0); | |||
740 | return; | |||
741 | } | |||
742 | ||||
743 | if (DestReg == AMDGPU::VCC_LO) { | |||
744 | if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
745 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) | |||
746 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
747 | } else { | |||
748 | // FIXME: Hack until VReg_1 removed. | |||
749 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); | |||
750 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
751 | .addImm(0) | |||
752 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
753 | } | |||
754 | ||||
755 | return; | |||
756 | } | |||
757 | ||||
758 | if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
759 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
760 | return; | |||
761 | } | |||
762 | ||||
763 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
764 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
765 | return; | |||
766 | } | |||
767 | ||||
768 | if (RC == &AMDGPU::SReg_64RegClass) { | |||
769 | if (SrcReg == AMDGPU::SCC) { | |||
770 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) | |||
771 | .addImm(1) | |||
772 | .addImm(0); | |||
773 | return; | |||
774 | } | |||
775 | ||||
776 | if (DestReg == AMDGPU::VCC) { | |||
777 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
778 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) | |||
779 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
780 | } else { | |||
781 | // FIXME: Hack until VReg_1 removed. | |||
782 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); | |||
783 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
784 | .addImm(0) | |||
785 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
786 | } | |||
787 | ||||
788 | return; | |||
789 | } | |||
790 | ||||
791 | if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
792 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
793 | return; | |||
794 | } | |||
795 | ||||
796 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
797 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
798 | return; | |||
799 | } | |||
800 | ||||
801 | if (DestReg == AMDGPU::SCC) { | |||
802 | // Copying 64-bit or 32-bit sources to SCC barely makes sense, | |||
803 | // but SelectionDAG emits such copies for i1 sources. | |||
804 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
805 | // This copy can only be produced by patterns | |||
806 | // with explicit SCC, which are known to be enabled | |||
807 | // only for subtargets with S_CMP_LG_U64 present. | |||
808 | assert(ST.hasScalarCompareEq64()); | |||
809 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64)) | |||
810 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
811 | .addImm(0); | |||
812 | } else { | |||
813 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); | |||
814 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) | |||
815 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
816 | .addImm(0); | |||
817 | } | |||
818 | ||||
819 | return; | |||
820 | } | |||
821 | ||||
822 | if (RC == &AMDGPU::AGPR_32RegClass) { | |||
823 | if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { | |||
824 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
825 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
826 | return; | |||
827 | } | |||
828 | ||||
829 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { | |||
830 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) | |||
831 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
832 | return; | |||
833 | } | |||
834 | ||||
835 | // FIXME: Pass should maintain scavenger to avoid scan through the block on | |||
836 | // every AGPR spill. | |||
837 | RegScavenger RS; | |||
838 | indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS); | |||
839 | return; | |||
840 | } | |||
841 | ||||
842 | const unsigned Size = RI.getRegSizeInBits(*RC); | |||
843 | if (Size == 16) { | |||
844 | assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
845 | AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || | |||
846 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
847 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); | |||
848 | ||||
849 | bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); | |||
850 | bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); | |||
851 | bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
852 | bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
853 | bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || | |||
854 | AMDGPU::SReg_LO16RegClass.contains(DestReg) || | |||
855 | AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
856 | bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
857 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
858 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
859 | MCRegister NewDestReg = RI.get32BitRegister(DestReg); | |||
860 | MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); | |||
861 | ||||
862 | if (IsSGPRDst) { | |||
863 | if (!IsSGPRSrc) { | |||
864 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
865 | return; | |||
866 | } | |||
867 | ||||
868 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) | |||
869 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
870 | return; | |||
871 | } | |||
872 | ||||
873 | if (IsAGPRDst || IsAGPRSrc) { | |||
874 | if (!DstLow || !SrcLow) { | |||
875 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
876 | "Cannot use hi16 subreg with an AGPR!"); | |||
877 | } | |||
878 | ||||
879 | copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); | |||
880 | return; | |||
881 | } | |||
882 | ||||
883 | if (IsSGPRSrc && !ST.hasSDWAScalar()) { | |||
884 | if (!DstLow || !SrcLow) { | |||
885 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
886 | "Cannot use hi16 subreg on VI!"); | |||
887 | } | |||
888 | ||||
889 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) | |||
890 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
891 | return; | |||
892 | } | |||
893 | ||||
894 | auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) | |||
895 | .addImm(0) // src0_modifiers | |||
896 | .addReg(NewSrcReg) | |||
897 | .addImm(0) // clamp | |||
898 | .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
899 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
900 | .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) | |||
901 | .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
902 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
903 | .addReg(NewDestReg, RegState::Implicit | RegState::Undef); | |||
904 | // First implicit operand is $exec. | |||
905 | MIB->tieOperands(0, MIB->getNumOperands() - 1); | |||
906 | return; | |||
907 | } | |||
908 | ||||
909 | const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg); | |||
910 | if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { | |||
911 | if (ST.hasMovB64()) { | |||
912 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) | |||
913 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
914 | return; | |||
915 | } | |||
916 | if (ST.hasPackedFP32Ops()) { | |||
917 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) | |||
918 | .addImm(SISrcMods::OP_SEL_1) | |||
919 | .addReg(SrcReg) | |||
920 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) | |||
921 | .addReg(SrcReg) | |||
922 | .addImm(0) // op_sel_lo | |||
923 | .addImm(0) // op_sel_hi | |||
924 | .addImm(0) // neg_lo | |||
925 | .addImm(0) // neg_hi | |||
926 | .addImm(0) // clamp | |||
927 | .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); | |||
928 | return; | |||
929 | } | |||
930 | } | |||
931 | ||||
932 | const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); | |||
933 | if (RI.isSGPRClass(RC)) { | |||
934 | if (!RI.isSGPRClass(SrcRC)) { | |||
935 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
936 | return; | |||
937 | } | |||
938 | const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); | |||
939 | expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, | |||
940 | Forward); | |||
941 | return; | |||
942 | } | |||
943 | ||||
944 | unsigned EltSize = 4; | |||
945 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
946 | if (RI.isAGPRClass(RC)) { | |||
947 | if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) | |||
948 | Opcode = AMDGPU::V_ACCVGPR_MOV_B32; | |||
949 | else if (RI.hasVGPRs(SrcRC)) | |||
950 | Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; | |||
951 | else | |||
952 | Opcode = AMDGPU::INSTRUCTION_LIST_END; | |||
953 | } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { | |||
954 | Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
955 | } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && | |||
956 | (RI.isProperlyAlignedRC(*RC) && | |||
957 | (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { | |||
958 | // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. | |||
959 | if (ST.hasMovB64()) { | |||
960 | Opcode = AMDGPU::V_MOV_B64_e32; | |||
961 | EltSize = 8; | |||
962 | } else if (ST.hasPackedFP32Ops()) { | |||
963 | Opcode = AMDGPU::V_PK_MOV_B32; | |||
964 | EltSize = 8; | |||
965 | } | |||
966 | } | |||
967 | ||||
968 | // For the cases where we need an intermediate instruction/temporary register | |||
969 | // (destination is an AGPR), we need a scavenger. | |||
970 | // | |||
971 | // FIXME: The pass should maintain this for us so we don't have to re-scan the | |||
972 | // whole block for every handled copy. | |||
973 | std::unique_ptr<RegScavenger> RS; | |||
974 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) | |||
975 | RS.reset(new RegScavenger()); | |||
976 | ||||
977 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); | |||
978 | ||||
979 | // If there is an overlap, we can't kill the super-register on the last | |||
980 | // instruction, since it will also kill the components made live by this def. | |||
981 | const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); | |||
982 | ||||
983 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
984 | unsigned SubIdx; | |||
985 | if (Forward) | |||
986 | SubIdx = SubIndices[Idx]; | |||
987 | else | |||
988 | SubIdx = SubIndices[SubIndices.size() - Idx - 1]; | |||
989 | ||||
990 | bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; | |||
991 | ||||
992 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
993 | Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register(); | |||
994 | Register ImpUseSuper = SrcReg; | |||
995 | indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), | |||
996 | RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, | |||
997 | ImpDefSuper, ImpUseSuper); | |||
998 | } else if (Opcode == AMDGPU::V_PK_MOV_B32) { | |||
999 | Register DstSubReg = RI.getSubReg(DestReg, SubIdx); | |||
1000 | Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); | |||
1001 | MachineInstrBuilder MIB = | |||
1002 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) | |||
1003 | .addImm(SISrcMods::OP_SEL_1) | |||
1004 | .addReg(SrcSubReg) | |||
1005 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) | |||
1006 | .addReg(SrcSubReg) | |||
1007 | .addImm(0) // op_sel_lo | |||
1008 | .addImm(0) // op_sel_hi | |||
1009 | .addImm(0) // neg_lo | |||
1010 | .addImm(0) // neg_hi | |||
1011 | .addImm(0) // clamp | |||
1012 | .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
1013 | if (Idx == 0) | |||
1014 | MIB.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
1015 | } else { | |||
1016 | MachineInstrBuilder Builder = | |||
1017 | BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
1018 | .addReg(RI.getSubReg(SrcReg, SubIdx)); | |||
1019 | if (Idx == 0) | |||
1020 | Builder.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
1021 | ||||
1022 | Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
1023 | } | |||
1024 | } | |||
1025 | } | |||
1026 | ||||
1027 | int SIInstrInfo::commuteOpcode(unsigned Opcode) const { | |||
1028 | int NewOpc; | |||
1029 | ||||
1030 | // Try to map original to commuted opcode | |||
1031 | NewOpc = AMDGPU::getCommuteRev(Opcode); | |||
1032 | if (NewOpc != -1) | |||
1033 | // Check if the commuted (REV) opcode exists on the target. | |||
1034 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
1035 | ||||
1036 | // Try to map commuted to original opcode | |||
1037 | NewOpc = AMDGPU::getCommuteOrig(Opcode); | |||
1038 | if (NewOpc != -1) | |||
1039 | // Check if the original (non-REV) opcode exists on the target. | |||
1040 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
1041 | ||||
1042 | return Opcode; | |||
1043 | } | |||
1044 | ||||
1045 | void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, | |||
1046 | MachineBasicBlock::iterator MI, | |||
1047 | const DebugLoc &DL, unsigned DestReg, | |||
1048 | int64_t Value) const { | |||
1049 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
1050 | const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); | |||
1051 | if (RegClass == &AMDGPU::SReg_32RegClass || | |||
1052 | RegClass == &AMDGPU::SGPR_32RegClass || | |||
1053 | RegClass == &AMDGPU::SReg_32_XM0RegClass || | |||
1054 | RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { | |||
1055 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
1056 | .addImm(Value); | |||
1057 | return; | |||
1058 | } | |||
1059 | ||||
1060 | if (RegClass == &AMDGPU::SReg_64RegClass || | |||
1061 | RegClass == &AMDGPU::SGPR_64RegClass || | |||
1062 | RegClass == &AMDGPU::SReg_64_XEXECRegClass) { | |||
1063 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
1064 | .addImm(Value); | |||
1065 | return; | |||
1066 | } | |||
1067 | ||||
1068 | if (RegClass == &AMDGPU::VGPR_32RegClass) { | |||
1069 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) | |||
1070 | .addImm(Value); | |||
1071 | return; | |||
1072 | } | |||
1073 | if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { | |||
1074 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) | |||
1075 | .addImm(Value); | |||
1076 | return; | |||
1077 | } | |||
1078 | ||||
1079 | unsigned EltSize = 4; | |||
1080 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
1081 | if (RI.isSGPRClass(RegClass)) { | |||
1082 | if (RI.getRegSizeInBits(*RegClass) > 32) { | |||
1083 | Opcode = AMDGPU::S_MOV_B64; | |||
1084 | EltSize = 8; | |||
1085 | } else { | |||
1086 | Opcode = AMDGPU::S_MOV_B32; | |||
1087 | EltSize = 4; | |||
1088 | } | |||
1089 | } | |||
1090 | ||||
1091 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); | |||
1092 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
1093 | int64_t IdxValue = Idx == 0 ? Value : 0; | |||
1094 | ||||
1095 | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, | |||
1096 | get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); | |||
1097 | Builder.addImm(IdxValue); | |||
1098 | } | |||
1099 | } | |||
1100 | ||||
1101 | const TargetRegisterClass * | |||
1102 | SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { | |||
1103 | return &AMDGPU::VGPR_32RegClass; | |||
1104 | } | |||
1105 | ||||
1106 | void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, | |||
1107 | MachineBasicBlock::iterator I, | |||
1108 | const DebugLoc &DL, Register DstReg, | |||
1109 | ArrayRef<MachineOperand> Cond, | |||
1110 | Register TrueReg, | |||
1111 | Register FalseReg) const { | |||
1112 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
1113 | const TargetRegisterClass *BoolXExecRC = | |||
1114 | RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
1115 | assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && | |||
1116 | "Not a VGPR32 reg"); | |||
1117 | ||||
1118 | if (Cond.size() == 1) { | |||
1119 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1120 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1121 | .add(Cond[0]); | |||
1122 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1123 | .addImm(0) | |||
1124 | .addReg(FalseReg) | |||
1125 | .addImm(0) | |||
1126 | .addReg(TrueReg) | |||
1127 | .addReg(SReg); | |||
1128 | } else if (Cond.size() == 2) { | |||
1129 | assert(Cond[0].isImm() && "Cond[0] is not an immediate"); | |||
1130 | switch (Cond[0].getImm()) { | |||
1131 | case SIInstrInfo::SCC_TRUE: { | |||
1132 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1133 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1134 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1135 | .addImm(1) | |||
1136 | .addImm(0); | |||
1137 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1138 | .addImm(0) | |||
1139 | .addReg(FalseReg) | |||
1140 | .addImm(0) | |||
1141 | .addReg(TrueReg) | |||
1142 | .addReg(SReg); | |||
1143 | break; | |||
1144 | } | |||
1145 | case SIInstrInfo::SCC_FALSE: { | |||
1146 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1147 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1148 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1149 | .addImm(0) | |||
1150 | .addImm(1); | |||
1151 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1152 | .addImm(0) | |||
1153 | .addReg(FalseReg) | |||
1154 | .addImm(0) | |||
1155 | .addReg(TrueReg) | |||
1156 | .addReg(SReg); | |||
1157 | break; | |||
1158 | } | |||
1159 | case SIInstrInfo::VCCNZ: { | |||
1160 | MachineOperand RegOp = Cond[1]; | |||
1161 | RegOp.setImplicit(false); | |||
1162 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1163 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1164 | .add(RegOp); | |||
1165 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1166 | .addImm(0) | |||
1167 | .addReg(FalseReg) | |||
1168 | .addImm(0) | |||
1169 | .addReg(TrueReg) | |||
1170 | .addReg(SReg); | |||
1171 | break; | |||
1172 | } | |||
1173 | case SIInstrInfo::VCCZ: { | |||
1174 | MachineOperand RegOp = Cond[1]; | |||
1175 | RegOp.setImplicit(false); | |||
1176 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1177 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1178 | .add(RegOp); | |||
1179 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1180 | .addImm(0) | |||
1181 | .addReg(TrueReg) | |||
1182 | .addImm(0) | |||
1183 | .addReg(FalseReg) | |||
1184 | .addReg(SReg); | |||
1185 | break; | |||
1186 | } | |||
1187 | case SIInstrInfo::EXECNZ: { | |||
1188 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1189 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1190 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1191 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1192 | .addImm(0); | |||
1193 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1194 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1195 | .addImm(1) | |||
1196 | .addImm(0); | |||
1197 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1198 | .addImm(0) | |||
1199 | .addReg(FalseReg) | |||
1200 | .addImm(0) | |||
1201 | .addReg(TrueReg) | |||
1202 | .addReg(SReg); | |||
1203 | break; | |||
1204 | } | |||
1205 | case SIInstrInfo::EXECZ: { | |||
1206 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1207 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1208 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1209 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1210 | .addImm(0); | |||
1211 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1212 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1213 | .addImm(0) | |||
1214 | .addImm(1); | |||
1215 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1216 | .addImm(0) | |||
1217 | .addReg(FalseReg) | |||
1218 | .addImm(0) | |||
1219 | .addReg(TrueReg) | |||
1220 | .addReg(SReg); | |||
1221 | llvm_unreachable("Unhandled branch predicate EXECZ")::llvm::llvm_unreachable_internal("Unhandled branch predicate EXECZ" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1221); | |||
1222 | break; | |||
1223 | } | |||
1224 | default: | |||
1225 | llvm_unreachable("invalid branch predicate")::llvm::llvm_unreachable_internal("invalid branch predicate", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1225); | |||
1226 | } | |||
1227 | } else { | |||
1228 | llvm_unreachable("Can only handle Cond size 1 or 2")::llvm::llvm_unreachable_internal("Can only handle Cond size 1 or 2" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1228); | |||
1229 | } | |||
1230 | } | |||
1231 | ||||
1232 | Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, | |||
1233 | MachineBasicBlock::iterator I, | |||
1234 | const DebugLoc &DL, | |||
1235 | Register SrcReg, int Value) const { | |||
1236 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1237 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1238 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) | |||
1239 | .addImm(Value) | |||
1240 | .addReg(SrcReg); | |||
1241 | ||||
1242 | return Reg; | |||
1243 | } | |||
1244 | ||||
1245 | Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, | |||
1246 | MachineBasicBlock::iterator I, | |||
1247 | const DebugLoc &DL, | |||
1248 | Register SrcReg, int Value) const { | |||
1249 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1250 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1251 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) | |||
1252 | .addImm(Value) | |||
1253 | .addReg(SrcReg); | |||
1254 | ||||
1255 | return Reg; | |||
1256 | } | |||
1257 | ||||
1258 | unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { | |||
1259 | ||||
1260 | if (RI.isAGPRClass(DstRC)) | |||
1261 | return AMDGPU::COPY; | |||
1262 | if (RI.getRegSizeInBits(*DstRC) == 32) { | |||
1263 | return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; | |||
1264 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { | |||
1265 | return AMDGPU::S_MOV_B64; | |||
1266 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { | |||
1267 | return AMDGPU::V_MOV_B64_PSEUDO; | |||
1268 | } | |||
1269 | return AMDGPU::COPY; | |||
1270 | } | |||
1271 | ||||
1272 | const MCInstrDesc & | |||
1273 | SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, | |||
1274 | bool IsIndirectSrc) const { | |||
1275 | if (IsIndirectSrc) { | |||
1276 | if (VecSize <= 32) // 4 bytes | |||
1277 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); | |||
1278 | if (VecSize <= 64) // 8 bytes | |||
1279 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); | |||
1280 | if (VecSize <= 96) // 12 bytes | |||
1281 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); | |||
1282 | if (VecSize <= 128) // 16 bytes | |||
1283 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); | |||
1284 | if (VecSize <= 160) // 20 bytes | |||
1285 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); | |||
1286 | if (VecSize <= 256) // 32 bytes | |||
1287 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); | |||
1288 | if (VecSize <= 512) // 64 bytes | |||
1289 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); | |||
1290 | if (VecSize <= 1024) // 128 bytes | |||
1291 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); | |||
1292 | ||||
1293 | llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegReadGPRIDX pseudos" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1293); | |||
1294 | } | |||
1295 | ||||
1296 | if (VecSize <= 32) // 4 bytes | |||
1297 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); | |||
1298 | if (VecSize <= 64) // 8 bytes | |||
1299 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); | |||
1300 | if (VecSize <= 96) // 12 bytes | |||
1301 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); | |||
1302 | if (VecSize <= 128) // 16 bytes | |||
1303 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); | |||
1304 | if (VecSize <= 160) // 20 bytes | |||
1305 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); | |||
1306 | if (VecSize <= 256) // 32 bytes | |||
1307 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); | |||
1308 | if (VecSize <= 512) // 64 bytes | |||
1309 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); | |||
1310 | if (VecSize <= 1024) // 128 bytes | |||
1311 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); | |||
1312 | ||||
1313 | llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWriteGPRIDX pseudos" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1313); | |||
1314 | } | |||
1315 | ||||
1316 | static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { | |||
1317 | if (VecSize <= 32) // 4 bytes | |||
1318 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1319 | if (VecSize <= 64) // 8 bytes | |||
1320 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1321 | if (VecSize <= 96) // 12 bytes | |||
1322 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1323 | if (VecSize <= 128) // 16 bytes | |||
1324 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1325 | if (VecSize <= 160) // 20 bytes | |||
1326 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1327 | if (VecSize <= 256) // 32 bytes | |||
1328 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1329 | if (VecSize <= 512) // 64 bytes | |||
1330 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1331 | if (VecSize <= 1024) // 128 bytes | |||
1332 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1333 | ||||
1334 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1334); | |||
1335 | } | |||
1336 | ||||
1337 | static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { | |||
1338 | if (VecSize <= 32) // 4 bytes | |||
1339 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1340 | if (VecSize <= 64) // 8 bytes | |||
1341 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1342 | if (VecSize <= 96) // 12 bytes | |||
1343 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1344 | if (VecSize <= 128) // 16 bytes | |||
1345 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1346 | if (VecSize <= 160) // 20 bytes | |||
1347 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1348 | if (VecSize <= 256) // 32 bytes | |||
1349 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1350 | if (VecSize <= 512) // 64 bytes | |||
1351 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1352 | if (VecSize <= 1024) // 128 bytes | |||
1353 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1354 | ||||
1355 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1355); | |||
1356 | } | |||
1357 | ||||
1358 | static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { | |||
1359 | if (VecSize <= 64) // 8 bytes | |||
1360 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; | |||
1361 | if (VecSize <= 128) // 16 bytes | |||
1362 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; | |||
1363 | if (VecSize <= 256) // 32 bytes | |||
1364 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; | |||
1365 | if (VecSize <= 512) // 64 bytes | |||
1366 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; | |||
1367 | if (VecSize <= 1024) // 128 bytes | |||
1368 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; | |||
1369 | ||||
1370 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1370); | |||
1371 | } | |||
1372 | ||||
1373 | const MCInstrDesc & | |||
1374 | SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, | |||
1375 | bool IsSGPR) const { | |||
1376 | if (IsSGPR) { | |||
1377 | switch (EltSize) { | |||
1378 | case 32: | |||
1379 | return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); | |||
1380 | case 64: | |||
1381 | return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); | |||
1382 | default: | |||
1383 | llvm_unreachable("invalid reg indexing elt size")::llvm::llvm_unreachable_internal("invalid reg indexing elt size" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 1383); | |||
1384 | } | |||
1385 | } | |||
1386 | ||||
1387 | assert(EltSize == 32 && "invalid reg indexing elt size"); | |||
1388 | return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); | |||
1389 | } | |||
1390 | ||||
1391 | static unsigned getSGPRSpillSaveOpcode(unsigned Size) { | |||
1392 | switch (Size) { | |||
1393 | case 4: | |||
1394 | return AMDGPU::SI_SPILL_S32_SAVE; | |||
1395 | case 8: | |||
1396 | return AMDGPU::SI_SPILL_S64_SAVE; | |||
1397 | case 12: | |||
1398 | return AMDGPU::SI_SPILL_S96_SAVE; | |||
1399 | case 16: | |||
1400 | return AMDGPU::SI_SPILL_S128_SAVE; | |||
1401 | case 20: | |||
1402 | return AMDGPU::SI_SPILL_S160_SAVE; | |||
1403 | case 24: | |||
1404 | return AMDGPU::SI_SPILL_S192_SAVE; | |||
1405 | case 28: | |||
1406 | return AMDGPU::SI_SPILL_S224_SAVE; | |||
1407 | case 32: | |||
1408 | return AMDGPU::SI_SPILL_S256_SAVE; | |||
1409 | case 64: | |||
1410 | return AMDGPU::SI_SPILL_S512_SAVE; | |||
1411 | case 128: | |||
1412 | return AMDGPU::SI_SPILL_S1024_SAVE; | |||
1413 | default: | |||
1414 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1414); | |||
1415 | } | |||
1416 | } | |||
1417 | ||||
1418 | static unsigned getVGPRSpillSaveOpcode(unsigned Size) { | |||
1419 | switch (Size) { | |||
1420 | case 4: | |||
1421 | return AMDGPU::SI_SPILL_V32_SAVE; | |||
1422 | case 8: | |||
1423 | return AMDGPU::SI_SPILL_V64_SAVE; | |||
1424 | case 12: | |||
1425 | return AMDGPU::SI_SPILL_V96_SAVE; | |||
1426 | case 16: | |||
1427 | return AMDGPU::SI_SPILL_V128_SAVE; | |||
1428 | case 20: | |||
1429 | return AMDGPU::SI_SPILL_V160_SAVE; | |||
1430 | case 24: | |||
1431 | return AMDGPU::SI_SPILL_V192_SAVE; | |||
1432 | case 28: | |||
1433 | return AMDGPU::SI_SPILL_V224_SAVE; | |||
1434 | case 32: | |||
1435 | return AMDGPU::SI_SPILL_V256_SAVE; | |||
1436 | case 64: | |||
1437 | return AMDGPU::SI_SPILL_V512_SAVE; | |||
1438 | case 128: | |||
1439 | return AMDGPU::SI_SPILL_V1024_SAVE; | |||
1440 | default: | |||
1441 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1441); | |||
1442 | } | |||
1443 | } | |||
1444 | ||||
1445 | static unsigned getAGPRSpillSaveOpcode(unsigned Size) { | |||
1446 | switch (Size) { | |||
1447 | case 4: | |||
1448 | return AMDGPU::SI_SPILL_A32_SAVE; | |||
1449 | case 8: | |||
1450 | return AMDGPU::SI_SPILL_A64_SAVE; | |||
1451 | case 12: | |||
1452 | return AMDGPU::SI_SPILL_A96_SAVE; | |||
1453 | case 16: | |||
1454 | return AMDGPU::SI_SPILL_A128_SAVE; | |||
1455 | case 20: | |||
1456 | return AMDGPU::SI_SPILL_A160_SAVE; | |||
1457 | case 24: | |||
1458 | return AMDGPU::SI_SPILL_A192_SAVE; | |||
1459 | case 28: | |||
1460 | return AMDGPU::SI_SPILL_A224_SAVE; | |||
1461 | case 32: | |||
1462 | return AMDGPU::SI_SPILL_A256_SAVE; | |||
1463 | case 64: | |||
1464 | return AMDGPU::SI_SPILL_A512_SAVE; | |||
1465 | case 128: | |||
1466 | return AMDGPU::SI_SPILL_A1024_SAVE; | |||
1467 | default: | |||
1468 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1468); | |||
1469 | } | |||
1470 | } | |||
1471 | ||||
1472 | static unsigned getAVSpillSaveOpcode(unsigned Size) { | |||
1473 | switch (Size) { | |||
1474 | case 4: | |||
1475 | return AMDGPU::SI_SPILL_AV32_SAVE; | |||
1476 | case 8: | |||
1477 | return AMDGPU::SI_SPILL_AV64_SAVE; | |||
1478 | case 12: | |||
1479 | return AMDGPU::SI_SPILL_AV96_SAVE; | |||
1480 | case 16: | |||
1481 | return AMDGPU::SI_SPILL_AV128_SAVE; | |||
1482 | case 20: | |||
1483 | return AMDGPU::SI_SPILL_AV160_SAVE; | |||
1484 | case 24: | |||
1485 | return AMDGPU::SI_SPILL_AV192_SAVE; | |||
1486 | case 28: | |||
1487 | return AMDGPU::SI_SPILL_AV224_SAVE; | |||
1488 | case 32: | |||
1489 | return AMDGPU::SI_SPILL_AV256_SAVE; | |||
1490 | case 64: | |||
1491 | return AMDGPU::SI_SPILL_AV512_SAVE; | |||
1492 | case 128: | |||
1493 | return AMDGPU::SI_SPILL_AV1024_SAVE; | |||
1494 | default: | |||
1495 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1495); | |||
1496 | } | |||
1497 | } | |||
1498 | ||||
1499 | void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, | |||
1500 | MachineBasicBlock::iterator MI, | |||
1501 | Register SrcReg, bool isKill, | |||
1502 | int FrameIndex, | |||
1503 | const TargetRegisterClass *RC, | |||
1504 | const TargetRegisterInfo *TRI) const { | |||
1505 | MachineFunction *MF = MBB.getParent(); | |||
1506 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1507 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1508 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1509 | ||||
1510 | MachinePointerInfo PtrInfo | |||
1511 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1512 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1513 | PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), | |||
1514 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1515 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1516 | ||||
1517 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1518 | if (RI.isSGPRClass(RC)) { | |||
1519 | MFI->setHasSpilledSGPRs(); | |||
1520 | assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); | |||
1521 | assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && | |||
1522 | SrcReg != AMDGPU::EXEC && "exec should not be spilled"); | |||
1523 | ||||
1524 | // We are only allowed to create one new instruction when spilling | |||
1525 | // registers, so we need to use a pseudo instruction for spilling SGPRs. | |||
1526 | const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); | |||
1527 | ||||
1528 | // The SGPR spill/restore instructions only work on numbered SGPRs, so we need | |||
1529 | // to make sure we are using the correct register class. | |||
1530 | if (SrcReg.isVirtual() && SpillSize == 4) { | |||
1531 | MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1532 | } | |||
1533 | ||||
1534 | BuildMI(MBB, MI, DL, OpDesc) | |||
1535 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1536 | .addFrameIndex(FrameIndex) // addr | |||
1537 | .addMemOperand(MMO) | |||
1538 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1539 | ||||
1540 | if (RI.spillSGPRToVGPR()) | |||
1541 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1542 | return; | |||
1543 | } | |||
1544 | ||||
1545 | unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize) | |||
1546 | : RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) | |||
1547 | : getVGPRSpillSaveOpcode(SpillSize); | |||
1548 | MFI->setHasSpilledVGPRs(); | |||
1549 | ||||
1550 | BuildMI(MBB, MI, DL, get(Opcode)) | |||
1551 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1552 | .addFrameIndex(FrameIndex) // addr | |||
1553 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1554 | .addImm(0) // offset | |||
1555 | .addMemOperand(MMO); | |||
1556 | } | |||
1557 | ||||
1558 | static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { | |||
1559 | switch (Size) { | |||
1560 | case 4: | |||
1561 | return AMDGPU::SI_SPILL_S32_RESTORE; | |||
1562 | case 8: | |||
1563 | return AMDGPU::SI_SPILL_S64_RESTORE; | |||
1564 | case 12: | |||
1565 | return AMDGPU::SI_SPILL_S96_RESTORE; | |||
1566 | case 16: | |||
1567 | return AMDGPU::SI_SPILL_S128_RESTORE; | |||
1568 | case 20: | |||
1569 | return AMDGPU::SI_SPILL_S160_RESTORE; | |||
1570 | case 24: | |||
1571 | return AMDGPU::SI_SPILL_S192_RESTORE; | |||
1572 | case 28: | |||
1573 | return AMDGPU::SI_SPILL_S224_RESTORE; | |||
1574 | case 32: | |||
1575 | return AMDGPU::SI_SPILL_S256_RESTORE; | |||
1576 | case 64: | |||
1577 | return AMDGPU::SI_SPILL_S512_RESTORE; | |||
1578 | case 128: | |||
1579 | return AMDGPU::SI_SPILL_S1024_RESTORE; | |||
1580 | default: | |||
1581 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1581); | |||
1582 | } | |||
1583 | } | |||
1584 | ||||
1585 | static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { | |||
1586 | switch (Size) { | |||
1587 | case 4: | |||
1588 | return AMDGPU::SI_SPILL_V32_RESTORE; | |||
1589 | case 8: | |||
1590 | return AMDGPU::SI_SPILL_V64_RESTORE; | |||
1591 | case 12: | |||
1592 | return AMDGPU::SI_SPILL_V96_RESTORE; | |||
1593 | case 16: | |||
1594 | return AMDGPU::SI_SPILL_V128_RESTORE; | |||
1595 | case 20: | |||
1596 | return AMDGPU::SI_SPILL_V160_RESTORE; | |||
1597 | case 24: | |||
1598 | return AMDGPU::SI_SPILL_V192_RESTORE; | |||
1599 | case 28: | |||
1600 | return AMDGPU::SI_SPILL_V224_RESTORE; | |||
1601 | case 32: | |||
1602 | return AMDGPU::SI_SPILL_V256_RESTORE; | |||
1603 | case 64: | |||
1604 | return AMDGPU::SI_SPILL_V512_RESTORE; | |||
1605 | case 128: | |||
1606 | return AMDGPU::SI_SPILL_V1024_RESTORE; | |||
1607 | default: | |||
1608 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1608); | |||
1609 | } | |||
1610 | } | |||
1611 | ||||
1612 | static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { | |||
1613 | switch (Size) { | |||
1614 | case 4: | |||
1615 | return AMDGPU::SI_SPILL_A32_RESTORE; | |||
1616 | case 8: | |||
1617 | return AMDGPU::SI_SPILL_A64_RESTORE; | |||
1618 | case 12: | |||
1619 | return AMDGPU::SI_SPILL_A96_RESTORE; | |||
1620 | case 16: | |||
1621 | return AMDGPU::SI_SPILL_A128_RESTORE; | |||
1622 | case 20: | |||
1623 | return AMDGPU::SI_SPILL_A160_RESTORE; | |||
1624 | case 24: | |||
1625 | return AMDGPU::SI_SPILL_A192_RESTORE; | |||
1626 | case 28: | |||
1627 | return AMDGPU::SI_SPILL_A224_RESTORE; | |||
1628 | case 32: | |||
1629 | return AMDGPU::SI_SPILL_A256_RESTORE; | |||
1630 | case 64: | |||
1631 | return AMDGPU::SI_SPILL_A512_RESTORE; | |||
1632 | case 128: | |||
1633 | return AMDGPU::SI_SPILL_A1024_RESTORE; | |||
1634 | default: | |||
1635 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1635); | |||
1636 | } | |||
1637 | } | |||
1638 | ||||
1639 | static unsigned getAVSpillRestoreOpcode(unsigned Size) { | |||
1640 | switch (Size) { | |||
1641 | case 4: | |||
1642 | return AMDGPU::SI_SPILL_AV32_RESTORE; | |||
1643 | case 8: | |||
1644 | return AMDGPU::SI_SPILL_AV64_RESTORE; | |||
1645 | case 12: | |||
1646 | return AMDGPU::SI_SPILL_AV96_RESTORE; | |||
1647 | case 16: | |||
1648 | return AMDGPU::SI_SPILL_AV128_RESTORE; | |||
1649 | case 20: | |||
1650 | return AMDGPU::SI_SPILL_AV160_RESTORE; | |||
1651 | case 24: | |||
1652 | return AMDGPU::SI_SPILL_AV192_RESTORE; | |||
1653 | case 28: | |||
1654 | return AMDGPU::SI_SPILL_AV224_RESTORE; | |||
1655 | case 32: | |||
1656 | return AMDGPU::SI_SPILL_AV256_RESTORE; | |||
1657 | case 64: | |||
1658 | return AMDGPU::SI_SPILL_AV512_RESTORE; | |||
1659 | case 128: | |||
1660 | return AMDGPU::SI_SPILL_AV1024_RESTORE; | |||
1661 | default: | |||
1662 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1662); | |||
1663 | } | |||
1664 | } | |||
1665 | ||||
1666 | void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, | |||
1667 | MachineBasicBlock::iterator MI, | |||
1668 | Register DestReg, int FrameIndex, | |||
1669 | const TargetRegisterClass *RC, | |||
1670 | const TargetRegisterInfo *TRI) const { | |||
1671 | MachineFunction *MF = MBB.getParent(); | |||
1672 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1673 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1674 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1675 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1676 | ||||
1677 | MachinePointerInfo PtrInfo | |||
1678 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1679 | ||||
1680 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1681 | PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), | |||
1682 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1683 | ||||
1684 | if (RI.isSGPRClass(RC)) { | |||
1685 | MFI->setHasSpilledSGPRs(); | |||
1686 | assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); | |||
1687 | assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && | |||
1688 | DestReg != AMDGPU::EXEC && "exec should not be spilled"); | |||
1689 | ||||
1690 | // FIXME: Maybe this should not include a memoperand because it will be | |||
1691 | // lowered to non-memory instructions. | |||
1692 | const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); | |||
1693 | if (DestReg.isVirtual() && SpillSize == 4) { | |||
1694 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1695 | MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1696 | } | |||
1697 | ||||
1698 | if (RI.spillSGPRToVGPR()) | |||
1699 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1700 | BuildMI(MBB, MI, DL, OpDesc, DestReg) | |||
1701 | .addFrameIndex(FrameIndex) // addr | |||
1702 | .addMemOperand(MMO) | |||
1703 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1704 | ||||
1705 | return; | |||
1706 | } | |||
1707 | ||||
1708 | unsigned Opcode = RI.isVectorSuperClass(RC) | |||
1709 | ? getAVSpillRestoreOpcode(SpillSize) | |||
1710 | : RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) | |||
1711 | : getVGPRSpillRestoreOpcode(SpillSize); | |||
1712 | BuildMI(MBB, MI, DL, get(Opcode), DestReg) | |||
1713 | .addFrameIndex(FrameIndex) // vaddr | |||
1714 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1715 | .addImm(0) // offset | |||
1716 | .addMemOperand(MMO); | |||
1717 | } | |||
1718 | ||||
1719 | void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, | |||
1720 | MachineBasicBlock::iterator MI) const { | |||
1721 | insertNoops(MBB, MI, 1); | |||
1722 | } | |||
1723 | ||||
1724 | void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, | |||
1725 | MachineBasicBlock::iterator MI, | |||
1726 | unsigned Quantity) const { | |||
1727 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1728 | while (Quantity > 0) { | |||
1729 | unsigned Arg = std::min(Quantity, 8u); | |||
1730 | Quantity -= Arg; | |||
1731 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); | |||
1732 | } | |||
1733 | } | |||
1734 | ||||
1735 | void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { | |||
1736 | auto MF = MBB.getParent(); | |||
1737 | SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); | |||
1738 | ||||
1739 | assert(Info->isEntryFunction()); | |||
1740 | ||||
1741 | if (MBB.succ_empty()) { | |||
1742 | bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); | |||
1743 | if (HasNoTerminator) { | |||
1744 | if (Info->returnsVoid()) { | |||
1745 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); | |||
1746 | } else { | |||
1747 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); | |||
1748 | } | |||
1749 | } | |||
1750 | } | |||
1751 | } | |||
1752 | ||||
1753 | unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { | |||
1754 | switch (MI.getOpcode()) { | |||
1755 | default: | |||
1756 | if (MI.isMetaInstruction()) | |||
1757 | return 0; | |||
1758 | return 1; // FIXME: Do wait states equal cycles? | |||
1759 | ||||
1760 | case AMDGPU::S_NOP: | |||
1761 | return MI.getOperand(0).getImm() + 1; | |||
1762 | ||||
1763 | // FIXME: Any other pseudo instruction? | |||
1764 | // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The | |||
1765 | // hazard, even if one exists, won't really be visible. Should we handle it? | |||
1766 | case AMDGPU::SI_MASKED_UNREACHABLE: | |||
1767 | case AMDGPU::WAVE_BARRIER: | |||
1768 | return 0; | |||
1769 | } | |||
1770 | } | |||
1771 | ||||
1772 | bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { | |||
1773 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
1774 | MachineBasicBlock &MBB = *MI.getParent(); | |||
1775 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1776 | switch (MI.getOpcode()) { | |||
| ||||
1777 | default: return TargetInstrInfo::expandPostRAPseudo(MI); | |||
1778 | case AMDGPU::S_MOV_B64_term: | |||
1779 | // This is only a terminator to get the correct spill code placement during | |||
1780 | // register allocation. | |||
1781 | MI.setDesc(get(AMDGPU::S_MOV_B64)); | |||
1782 | break; | |||
1783 | ||||
1784 | case AMDGPU::S_MOV_B32_term: | |||
1785 | // This is only a terminator to get the correct spill code placement during | |||
1786 | // register allocation. | |||
1787 | MI.setDesc(get(AMDGPU::S_MOV_B32)); | |||
1788 | break; | |||
1789 | ||||
1790 | case AMDGPU::S_XOR_B64_term: | |||
1791 | // This is only a terminator to get the correct spill code placement during | |||
1792 | // register allocation. | |||
1793 | MI.setDesc(get(AMDGPU::S_XOR_B64)); | |||
1794 | break; | |||
1795 | ||||
1796 | case AMDGPU::S_XOR_B32_term: | |||
1797 | // This is only a terminator to get the correct spill code placement during | |||
1798 | // register allocation. | |||
1799 | MI.setDesc(get(AMDGPU::S_XOR_B32)); | |||
1800 | break; | |||
1801 | case AMDGPU::S_OR_B64_term: | |||
1802 | // This is only a terminator to get the correct spill code placement during | |||
1803 | // register allocation. | |||
1804 | MI.setDesc(get(AMDGPU::S_OR_B64)); | |||
1805 | break; | |||
1806 | case AMDGPU::S_OR_B32_term: | |||
1807 | // This is only a terminator to get the correct spill code placement during | |||
1808 | // register allocation. | |||
1809 | MI.setDesc(get(AMDGPU::S_OR_B32)); | |||
1810 | break; | |||
1811 | ||||
1812 | case AMDGPU::S_ANDN2_B64_term: | |||
1813 | // This is only a terminator to get the correct spill code placement during | |||
1814 | // register allocation. | |||
1815 | MI.setDesc(get(AMDGPU::S_ANDN2_B64)); | |||
1816 | break; | |||
1817 | ||||
1818 | case AMDGPU::S_ANDN2_B32_term: | |||
1819 | // This is only a terminator to get the correct spill code placement during | |||
1820 | // register allocation. | |||
1821 | MI.setDesc(get(AMDGPU::S_ANDN2_B32)); | |||
1822 | break; | |||
1823 | ||||
1824 | case AMDGPU::S_AND_B64_term: | |||
1825 | // This is only a terminator to get the correct spill code placement during | |||
1826 | // register allocation. | |||
1827 | MI.setDesc(get(AMDGPU::S_AND_B64)); | |||
1828 | break; | |||
1829 | ||||
1830 | case AMDGPU::S_AND_B32_term: | |||
1831 | // This is only a terminator to get the correct spill code placement during | |||
1832 | // register allocation. | |||
1833 | MI.setDesc(get(AMDGPU::S_AND_B32)); | |||
1834 | break; | |||
1835 | ||||
1836 | case AMDGPU::V_MOV_B64_PSEUDO: { | |||
1837 | Register Dst = MI.getOperand(0).getReg(); | |||
1838 | Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); | |||
1839 | Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); | |||
1840 | ||||
1841 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
1842 | // FIXME: Will this work for 64-bit floating point immediates? | |||
1843 | assert(!SrcOp.isFPImm()); | |||
1844 | if (ST.hasMovB64()) { | |||
1845 | MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); | |||
1846 | if (!isLiteralConstant(MI, 1) || isUInt<32>(SrcOp.getImm())) | |||
1847 | break; | |||
1848 | } | |||
1849 | if (SrcOp.isImm()) { | |||
1850 | APInt Imm(64, SrcOp.getImm()); | |||
1851 | APInt Lo(32, Imm.getLoBits(32).getZExtValue()); | |||
1852 | APInt Hi(32, Imm.getHiBits(32).getZExtValue()); | |||
1853 | if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { | |||
1854 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) | |||
1855 | .addImm(SISrcMods::OP_SEL_1) | |||
1856 | .addImm(Lo.getSExtValue()) | |||
1857 | .addImm(SISrcMods::OP_SEL_1) | |||
1858 | .addImm(Lo.getSExtValue()) | |||
1859 | .addImm(0) // op_sel_lo | |||
1860 | .addImm(0) // op_sel_hi | |||
1861 | .addImm(0) // neg_lo | |||
1862 | .addImm(0) // neg_hi | |||
1863 | .addImm(0); // clamp | |||
1864 | } else { | |||
1865 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1866 | .addImm(Lo.getSExtValue()) | |||
1867 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1868 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1869 | .addImm(Hi.getSExtValue()) | |||
1870 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1871 | } | |||
1872 | } else { | |||
1873 | assert(SrcOp.isReg()); | |||
1874 | if (ST.hasPackedFP32Ops() && | |||
1875 | !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { | |||
1876 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) | |||
1877 | .addImm(SISrcMods::OP_SEL_1) // src0_mod | |||
1878 | .addReg(SrcOp.getReg()) | |||
1879 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod | |||
1880 | .addReg(SrcOp.getReg()) | |||
1881 | .addImm(0) // op_sel_lo | |||
1882 | .addImm(0) // op_sel_hi | |||
1883 | .addImm(0) // neg_lo | |||
1884 | .addImm(0) // neg_hi | |||
1885 | .addImm(0); // clamp | |||
1886 | } else { | |||
1887 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1888 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) | |||
1889 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1890 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1891 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) | |||
1892 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1893 | } | |||
1894 | } | |||
1895 | MI.eraseFromParent(); | |||
1896 | break; | |||
1897 | } | |||
1898 | case AMDGPU::V_MOV_B64_DPP_PSEUDO: { | |||
1899 | expandMovDPP64(MI); | |||
1900 | break; | |||
1901 | } | |||
1902 | case AMDGPU::S_MOV_B64_IMM_PSEUDO: { | |||
1903 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
1904 | assert(!SrcOp.isFPImm()); | |||
1905 | APInt Imm(64, SrcOp.getImm()); | |||
1906 | if (Imm.isIntN(32) || isInlineConstant(Imm)) { | |||
1907 | MI.setDesc(get(AMDGPU::S_MOV_B64)); | |||
1908 | break; | |||
1909 | } | |||
1910 | ||||
1911 | Register Dst = MI.getOperand(0).getReg(); | |||
1912 | Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); | |||
1913 | Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); | |||
1914 | ||||
1915 | APInt Lo(32, Imm.getLoBits(32).getZExtValue()); | |||
1916 | APInt Hi(32, Imm.getHiBits(32).getZExtValue()); | |||
1917 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) | |||
1918 | .addImm(Lo.getSExtValue()) | |||
1919 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1920 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) | |||
1921 | .addImm(Hi.getSExtValue()) | |||
1922 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1923 | MI.eraseFromParent(); | |||
1924 | break; | |||
1925 | } | |||
1926 | case AMDGPU::V_SET_INACTIVE_B32: { | |||
1927 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
1928 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1929 | // FIXME: We may possibly optimize the COPY once we find ways to make LLVM | |||
1930 | // optimizations (mainly Register Coalescer) aware of WWM register liveness. | |||
1931 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) | |||
1932 | .add(MI.getOperand(1)); | |||
1933 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
1934 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
1935 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) | |||
1936 | .add(MI.getOperand(2)); | |||
1937 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
1938 | .addReg(Exec); | |||
1939 | MI.eraseFromParent(); | |||
1940 | break; | |||
1941 | } | |||
1942 | case AMDGPU::V_SET_INACTIVE_B64: { | |||
1943 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
1944 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1945 | MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), | |||
1946 | MI.getOperand(0).getReg()) | |||
1947 | .add(MI.getOperand(1)); | |||
1948 | expandPostRAPseudo(*Copy); | |||
1949 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
1950 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
1951 | Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), | |||
1952 | MI.getOperand(0).getReg()) | |||
1953 | .add(MI.getOperand(2)); | |||
1954 | expandPostRAPseudo(*Copy); | |||
1955 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
1956 | .addReg(Exec); | |||
1957 | MI.eraseFromParent(); | |||
1958 | break; | |||
1959 | } | |||
1960 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
1961 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
1962 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
1963 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
1964 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
1965 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
1966 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
1967 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
1968 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
1969 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
1970 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
1971 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
1972 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
1973 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
1974 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
1975 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
1976 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: | |||
1977 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: | |||
1978 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: | |||
1979 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: | |||
1980 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { | |||
1981 | const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); | |||
1982 | ||||
1983 | unsigned Opc; | |||
1984 | if (RI.hasVGPRs(EltRC)) { | |||
1985 | Opc = AMDGPU::V_MOVRELD_B32_e32; | |||
1986 | } else { | |||
1987 | Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 | |||
1988 | : AMDGPU::S_MOVRELD_B32; | |||
1989 | } | |||
1990 | ||||
1991 | const MCInstrDesc &OpDesc = get(Opc); | |||
1992 | Register VecReg = MI.getOperand(0).getReg(); | |||
1993 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1994 | unsigned SubReg = MI.getOperand(3).getImm(); | |||
1995 | assert(VecReg == MI.getOperand(1).getReg()); | |||
1996 | ||||
1997 | MachineInstrBuilder MIB = | |||
1998 | BuildMI(MBB, MI, DL, OpDesc) | |||
1999 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
2000 | .add(MI.getOperand(2)) | |||
2001 | .addReg(VecReg, RegState::ImplicitDefine) | |||
2002 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
2003 | ||||
2004 | const int ImpDefIdx = | |||
2005 | OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); | |||
2006 | const int ImpUseIdx = ImpDefIdx + 1; | |||
2007 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
2008 | MI.eraseFromParent(); | |||
2009 | break; | |||
2010 | } | |||
2011 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: | |||
2012 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: | |||
2013 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: | |||
2014 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: | |||
2015 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: | |||
2016 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: | |||
2017 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: | |||
2018 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { | |||
2019 | assert(ST.useVGPRIndexMode()); | |||
2020 | Register VecReg = MI.getOperand(0).getReg(); | |||
2021 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
2022 | Register Idx = MI.getOperand(3).getReg(); | |||
2023 | Register SubReg = MI.getOperand(4).getImm(); | |||
2024 | ||||
2025 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
2026 | .addReg(Idx) | |||
2027 | .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); | |||
2028 | SetOn->getOperand(3).setIsUndef(); | |||
2029 | ||||
2030 | const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write); | |||
2031 | MachineInstrBuilder MIB = | |||
2032 | BuildMI(MBB, MI, DL, OpDesc) | |||
2033 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
2034 | .add(MI.getOperand(2)) | |||
2035 | .addReg(VecReg, RegState::ImplicitDefine) | |||
2036 | .addReg(VecReg, | |||
2037 | RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
2038 | ||||
2039 | const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); | |||
2040 | const int ImpUseIdx = ImpDefIdx + 1; | |||
2041 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
2042 | ||||
2043 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
2044 | ||||
2045 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
2046 | ||||
2047 | MI.eraseFromParent(); | |||
2048 | break; | |||
2049 | } | |||
2050 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: | |||
2051 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: | |||
2052 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: | |||
2053 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: | |||
2054 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: | |||
2055 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: | |||
2056 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: | |||
2057 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { | |||
2058 | assert(ST.useVGPRIndexMode()); | |||
2059 | Register Dst = MI.getOperand(0).getReg(); | |||
2060 | Register VecReg = MI.getOperand(1).getReg(); | |||
2061 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
2062 | Register Idx = MI.getOperand(2).getReg(); | |||
2063 | Register SubReg = MI.getOperand(3).getImm(); | |||
2064 | ||||
2065 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
2066 | .addReg(Idx) | |||
2067 | .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); | |||
2068 | SetOn->getOperand(3).setIsUndef(); | |||
2069 | ||||
2070 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read)) | |||
2071 | .addDef(Dst) | |||
2072 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
2073 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
2074 | ||||
2075 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
2076 | ||||
2077 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
2078 | ||||
2079 | MI.eraseFromParent(); | |||
2080 | break; | |||
2081 | } | |||
2082 | case AMDGPU::SI_PC_ADD_REL_OFFSET: { | |||
2083 | MachineFunction &MF = *MBB.getParent(); | |||
2084 | Register Reg = MI.getOperand(0).getReg(); | |||
2085 | Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); | |||
2086 | Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); | |||
2087 | ||||
2088 | // Create a bundle so these instructions won't be re-ordered by the | |||
2089 | // post-RA scheduler. | |||
2090 | MIBundleBuilder Bundler(MBB, MI); | |||
2091 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); | |||
2092 | ||||
2093 | // Add 32-bit offset from this instruction to the start of the | |||
2094 | // constant data. | |||
2095 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) | |||
2096 | .addReg(RegLo) | |||
2097 | .add(MI.getOperand(1))); | |||
2098 | ||||
2099 | MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) | |||
2100 | .addReg(RegHi); | |||
2101 | MIB.add(MI.getOperand(2)); | |||
2102 | ||||
2103 | Bundler.append(MIB); | |||
2104 | finalizeBundle(MBB, Bundler.begin()); | |||
2105 | ||||
2106 | MI.eraseFromParent(); | |||
2107 | break; | |||
2108 | } | |||
2109 | case AMDGPU::ENTER_STRICT_WWM: { | |||
2110 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
2111 | // Whole Wave Mode is entered. | |||
2112 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
2113 | : AMDGPU::S_OR_SAVEEXEC_B64)); | |||
2114 | break; | |||
2115 | } | |||
2116 | case AMDGPU::ENTER_STRICT_WQM: { | |||
2117 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
2118 | // STRICT_WQM is entered. | |||
2119 | const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
2120 | const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; | |||
2121 | const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
2122 | BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); | |||
2123 | BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); | |||
2124 | ||||
2125 | MI.eraseFromParent(); | |||
2126 | break; | |||
2127 | } | |||
2128 | case AMDGPU::EXIT_STRICT_WWM: | |||
2129 | case AMDGPU::EXIT_STRICT_WQM: { | |||
2130 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
2131 | // WWM/STRICT_WQM is exited. | |||
2132 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); | |||
2133 | break; | |||
2134 | } | |||
2135 | case AMDGPU::SI_RETURN: { | |||
2136 | const MachineFunction *MF = MBB.getParent(); | |||
2137 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); | |||
2138 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
2139 | // Hiding the return address use with SI_RETURN may lead to extra kills in | |||
2140 | // the function and missing live-ins. We are fine in practice because callee | |||
2141 | // saved register handling ensures the register value is restored before | |||
2142 | // RET, but we need the undef flag here to appease the MachineVerifier | |||
2143 | // liveness checks. | |||
2144 | MachineInstrBuilder MIB = | |||
2145 | BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return)) | |||
2146 | .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef); | |||
2147 | ||||
2148 | MIB.copyImplicitOps(MI); | |||
2149 | MI.eraseFromParent(); | |||
2150 | break; | |||
2151 | } | |||
2152 | } | |||
2153 | return true; | |||
2154 | } | |||
2155 | ||||
2156 | std::pair<MachineInstr*, MachineInstr*> | |||
2157 | SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { | |||
2158 | assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); | |||
2159 | ||||
2160 | MachineBasicBlock &MBB = *MI.getParent(); | |||
2161 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
2162 | MachineFunction *MF = MBB.getParent(); | |||
2163 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2164 | Register Dst = MI.getOperand(0).getReg(); | |||
2165 | unsigned Part = 0; | |||
2166 | MachineInstr *Split[2]; | |||
2167 | ||||
2168 | for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { | |||
2169 | auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); | |||
2170 | if (Dst.isPhysical()) { | |||
2171 | MovDPP.addDef(RI.getSubReg(Dst, Sub)); | |||
2172 | } else { | |||
2173 | assert(MRI.isSSA()); | |||
2174 | auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
2175 | MovDPP.addDef(Tmp); | |||
2176 | } | |||
2177 | ||||
2178 | for (unsigned I = 1; I <= 2; ++I) { // old and src operands. | |||
2179 | const MachineOperand &SrcOp = MI.getOperand(I); | |||
2180 | assert(!SrcOp.isFPImm()); | |||
2181 | if (SrcOp.isImm()) { | |||
2182 | APInt Imm(64, SrcOp.getImm()); | |||
2183 | Imm.ashrInPlace(Part * 32); | |||
2184 | MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); | |||
2185 | } else { | |||
2186 | assert(SrcOp.isReg()); | |||
2187 | Register Src = SrcOp.getReg(); | |||
2188 | if (Src.isPhysical()) | |||
2189 | MovDPP.addReg(RI.getSubReg(Src, Sub)); | |||
2190 | else | |||
2191 | MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); | |||
2192 | } | |||
2193 | } | |||
2194 | ||||
2195 | for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) | |||
2196 | MovDPP.addImm(MI.getOperand(I).getImm()); | |||
2197 | ||||
2198 | Split[Part] = MovDPP; | |||
2199 | ++Part; | |||
2200 | } | |||
2201 | ||||
2202 | if (Dst.isVirtual()) | |||
2203 | BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) | |||
2204 | .addReg(Split[0]->getOperand(0).getReg()) | |||
| ||||
2205 | .addImm(AMDGPU::sub0) | |||
2206 | .addReg(Split[1]->getOperand(0).getReg()) | |||
2207 | .addImm(AMDGPU::sub1); | |||
2208 | ||||
2209 | MI.eraseFromParent(); | |||
2210 | return std::make_pair(Split[0], Split[1]); | |||
2211 | } | |||
2212 | ||||
2213 | bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, | |||
2214 | MachineOperand &Src0, | |||
2215 | unsigned Src0OpName, | |||
2216 | MachineOperand &Src1, | |||
2217 | unsigned Src1OpName) const { | |||
2218 | MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); | |||
2219 | if (!Src0Mods) | |||
2220 | return false; | |||
2221 | ||||
2222 | MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); | |||
2223 | assert(Src1Mods && | |||
2224 | "All commutable instructions have both src0 and src1 modifiers"); | |||
2225 | ||||
2226 | int Src0ModsVal = Src0Mods->getImm(); | |||
2227 | int Src1ModsVal = Src1Mods->getImm(); | |||
2228 | ||||
2229 | Src1Mods->setImm(Src0ModsVal); | |||
2230 | Src0Mods->setImm(Src1ModsVal); | |||
2231 | return true; | |||
2232 | } | |||
2233 | ||||
2234 | static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, | |||
2235 | MachineOperand &RegOp, | |||
2236 | MachineOperand &NonRegOp) { | |||
2237 | Register Reg = RegOp.getReg(); | |||
2238 | unsigned SubReg = RegOp.getSubReg(); | |||
2239 | bool IsKill = RegOp.isKill(); | |||
2240 | bool IsDead = RegOp.isDead(); | |||
2241 | bool IsUndef = RegOp.isUndef(); | |||
2242 | bool IsDebug = RegOp.isDebug(); | |||
2243 | ||||
2244 | if (NonRegOp.isImm()) | |||
2245 | RegOp.ChangeToImmediate(NonRegOp.getImm()); | |||
2246 | else if (NonRegOp.isFI()) | |||
2247 | RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); | |||
2248 | else if (NonRegOp.isGlobal()) { | |||
2249 | RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), | |||
2250 | NonRegOp.getTargetFlags()); | |||
2251 | } else | |||
2252 | return nullptr; | |||
2253 | ||||
2254 | // Make sure we don't reinterpret a subreg index in the target flags. | |||
2255 | RegOp.setTargetFlags(NonRegOp.getTargetFlags()); | |||
2256 | ||||
2257 | NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); | |||
2258 | NonRegOp.setSubReg(SubReg); | |||
2259 | ||||
2260 | return &MI; | |||
2261 | } | |||
2262 | ||||
2263 | MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, | |||
2264 | unsigned Src0Idx, | |||
2265 | unsigned Src1Idx) const { | |||
2266 | assert(!NewMI && "this should never be used"); | |||
2267 | ||||
2268 | unsigned Opc = MI.getOpcode(); | |||
2269 | int CommutedOpcode = commuteOpcode(Opc); | |||
2270 | if (CommutedOpcode == -1) | |||
2271 | return nullptr; | |||
2272 | ||||
2273 | assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == | |||
2274 | static_cast<int>(Src0Idx) && | |||
2275 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == | |||
2276 | static_cast<int>(Src1Idx) && | |||
2277 | "inconsistency with findCommutedOpIndices"); | |||
2278 | ||||
2279 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
2280 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
2281 | ||||
2282 | MachineInstr *CommutedMI = nullptr; | |||
2283 | if (Src0.isReg() && Src1.isReg()) { | |||
2284 | if (isOperandLegal(MI, Src1Idx, &Src0)) { | |||
2285 | // Be sure to copy the source modifiers to the right place. | |||
2286 | CommutedMI | |||
2287 | = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); | |||
2288 | } | |||
2289 | ||||
2290 | } else if (Src0.isReg() && !Src1.isReg()) { | |||
2291 | // src0 should always be able to support any operand type, so no need to | |||
2292 | // check operand legality. | |||
2293 | CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); | |||
2294 | } else if (!Src0.isReg() && Src1.isReg()) { | |||
2295 | if (isOperandLegal(MI, Src1Idx, &Src0)) | |||
2296 | CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); | |||
2297 | } else { | |||
2298 | // FIXME: Found two non registers to commute. This does happen. | |||
2299 | return nullptr; | |||
2300 | } | |||
2301 | ||||
2302 | if (CommutedMI) { | |||
2303 | swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, | |||
2304 | Src1, AMDGPU::OpName::src1_modifiers); | |||
2305 | ||||
2306 | CommutedMI->setDesc(get(CommutedOpcode)); | |||
2307 | } | |||
2308 | ||||
2309 | return CommutedMI; | |||
2310 | } | |||
2311 | ||||
2312 | // This needs to be implemented because the source modifiers may be inserted | |||
2313 | // between the true commutable operands, and the base | |||
2314 | // TargetInstrInfo::commuteInstruction uses it. | |||
2315 | bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, | |||
2316 | unsigned &SrcOpIdx0, | |||
2317 | unsigned &SrcOpIdx1) const { | |||
2318 | return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); | |||
2319 | } | |||
2320 | ||||
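// Look up the named src0/src1 operand indices for a commutable opcode and
// map them onto the caller-provided index pair via fixCommutedOpIndices.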
2321 | bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, | |||
2322 | unsigned &SrcOpIdx1) const { | |||
2323 | if (!Desc.isCommutable()) | |||
2324 | return false; | |||
2325 | ||||
2326 | unsigned Opc = Desc.getOpcode(); | |||
2327 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
2328 | if (Src0Idx == -1) | |||
2329 | return false; | |||
2330 | ||||
2331 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
2332 | if (Src1Idx == -1) | |||
2333 | return false; | |||
2334 | ||||
2335 | return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); | |||
2336 | } | |||
2337 | ||||
2338 | bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, | |||
2339 | int64_t BrOffset) const { | |||
2340 | // BranchRelaxation should never have to check s_setpc_b64 because its dest | |||
2341 | // block is unanalyzable. | |||
2342 | assert(BranchOp != AMDGPU::S_SETPC_B64); | |||
2343 | ||||
2344 | // Convert to dwords. | |||
2345 | BrOffset /= 4; | |||
2346 | ||||
2347 | // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is | |||
2348 | // from the next instruction. | |||
2349 | BrOffset -= 1; | |||
2350 | ||||
2351 | return isIntN(BranchOffsetBits, BrOffset); | |||
2352 | } | |||
2353 | ||||
2354 | MachineBasicBlock *SIInstrInfo::getBranchDestBlock( | |||
2355 | const MachineInstr &MI) const { | |||
2356 | if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { | |||
2357 | // This would be a difficult analysis to perform, but can always be legal so | |||
2358 | // there's no need to analyze it. | |||
2359 | return nullptr; | |||
2360 | } | |||
2361 | ||||
2362 | return MI.getOperand(0).getMBB(); | |||
2363 | } | |||
2364 | ||||
2365 | void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, | |||
2366 | MachineBasicBlock &DestBB, | |||
2367 | MachineBasicBlock &RestoreBB, | |||
2368 | const DebugLoc &DL, int64_t BrOffset, | |||
2369 | RegScavenger *RS) const { | |||
2370 | assert(RS && "RegScavenger required for long branching"); | |||
2371 | assert(MBB.empty() && | |||
2372 | "new block should be inserted for expanding unconditional branch"); | |||
2373 | assert(MBB.pred_size() == 1); | |||
2374 | assert(RestoreBB.empty() && | |||
2375 | "restore block should be inserted for restoring clobbered registers"); | |||
2376 | ||||
2377 | MachineFunction *MF = MBB.getParent(); | |||
2378 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2379 | ||||
2380 | // FIXME: Virtual register workaround for RegScavenger not working with empty | |||
2381 | // blocks. | |||
2382 | Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
2383 | ||||
2384 | auto I = MBB.end(); | |||
2385 | ||||
2386 | // We need to compute the offset relative to the instruction immediately after | |||
2387 | // s_getpc_b64. Insert the pc arithmetic code before the last terminator. | |||
2388 | MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); | |||
2389 | ||||
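// The branch distance is not known yet, so express it with MC symbols: a
// label is attached directly after s_getpc_b64, and the two halves of
// (dest - post_getpc) are added to the PC pair:
//   offset_lo = (dest - post_getpc) & 0xffffffff
//   offset_hi = (dest - post_getpc) >> 32   (arithmetic shift)
// The symbol values are filled in at the end of this function once the
// destination label is known.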
2390 | auto &MCCtx = MF->getContext(); | |||
2391 | MCSymbol *PostGetPCLabel = | |||
2392 | MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); | |||
2393 | GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); | |||
2394 | ||||
2395 | MCSymbol *OffsetLo = | |||
2396 | MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); | |||
2397 | MCSymbol *OffsetHi = | |||
2398 | MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); | |||
2399 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) | |||
2400 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) | |||
2401 | .addReg(PCReg, 0, AMDGPU::sub0) | |||
2402 | .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); | |||
2403 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) | |||
2404 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) | |||
2405 | .addReg(PCReg, 0, AMDGPU::sub1) | |||
2406 | .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); | |||
2407 | ||||
2408 | // Insert the indirect branch after the other terminator. | |||
2409 | BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) | |||
2410 | .addReg(PCReg); | |||
2411 | ||||
2412 | // FIXME: If spilling is necessary, this will fail because this scavenger has | |||
2413 | // no emergency stack slots. It is non-trivial to spill in this situation, | |||
2414 | // because the restore code needs to be specially placed after the | |||
2415 | // jump. BranchRelaxation then needs to be made aware of the newly inserted | |||
2416 | // block. | |||
2417 | // | |||
2418 | // If a spill is needed for the pc register pair, we need to insert a spill | |||
2419 | // restore block right before the destination block, and insert a short branch | |||
2420 | // into the old destination block's fallthrough predecessor. | |||
2421 | // e.g.: | |||
2422 | // | |||
2423 | // s_cbranch_scc0 skip_long_branch: | |||
2424 | // | |||
2425 | // long_branch_bb: | |||
2426 | // spill s[8:9] | |||
2427 | // s_getpc_b64 s[8:9] | |||
2428 | // s_add_u32 s8, s8, restore_bb | |||
2429 | // s_addc_u32 s9, s9, 0 | |||
2430 | // s_setpc_b64 s[8:9] | |||
2431 | // | |||
2432 | // skip_long_branch: | |||
2433 | // foo; | |||
2434 | // | |||
2435 | // ..... | |||
2436 | // | |||
2437 | // dest_bb_fallthrough_predecessor: | |||
2438 | // bar; | |||
2439 | // s_branch dest_bb | |||
2440 | // | |||
2441 | // restore_bb: | |||
2442 | // restore s[8:9] | |||
2443 | // fallthrough dest_bb | |||
2444 | // | |||
2445 | // dest_bb: | |||
2446 | // buzz; | |||
2447 | ||||
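// Try to scavenge an SGPR pair for the PC without spilling. If none is
// available, spill SGPR0_SGPR1 around the branch instead and restore it in
// RestoreBB, which then becomes the branch target in place of the real
// destination.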
2448 | RS->enterBasicBlockEnd(MBB); | |||
2449 | Register Scav = RS->scavengeRegisterBackwards( | |||
2450 | AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), | |||
2451 | /* RestoreAfter */ false, 0, /* AllowSpill */ false); | |||
2452 | if (Scav) { | |||
2453 | RS->setRegUsed(Scav); | |||
2454 | MRI.replaceRegWith(PCReg, Scav); | |||
2455 | MRI.clearVirtRegs(); | |||
2456 | } else { | |||
2457 | // Since spilling an SGPR requires a VGPR, reuse the temporary VGPR's spill | |||
2458 | // slot for the SGPR spill. | |||
2459 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); | |||
2460 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
2461 | TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS); | |||
2462 | MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1); | |||
2463 | MRI.clearVirtRegs(); | |||
2464 | } | |||
2465 | ||||
2466 | MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol(); | |||
2467 | // Now that the destination label is known, the distance can be defined. | |||
2468 | auto *Offset = MCBinaryExpr::createSub( | |||
2469 | MCSymbolRefExpr::create(DestLabel, MCCtx), | |||
2470 | MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); | |||
2471 | // Add offset assignments. | |||
2472 | auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); | |||
2473 | OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); | |||
2474 | auto *ShAmt = MCConstantExpr::create(32, MCCtx); | |||
2475 | OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); | |||
2476 | } | |||
2477 | ||||
2478 | unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { | |||
2479 | switch (Cond) { | |||
2480 | case SIInstrInfo::SCC_TRUE: | |||
2481 | return AMDGPU::S_CBRANCH_SCC1; | |||
2482 | case SIInstrInfo::SCC_FALSE: | |||
2483 | return AMDGPU::S_CBRANCH_SCC0; | |||
2484 | case SIInstrInfo::VCCNZ: | |||
2485 | return AMDGPU::S_CBRANCH_VCCNZ; | |||
2486 | case SIInstrInfo::VCCZ: | |||
2487 | return AMDGPU::S_CBRANCH_VCCZ; | |||
2488 | case SIInstrInfo::EXECNZ: | |||
2489 | return AMDGPU::S_CBRANCH_EXECNZ; | |||
2490 | case SIInstrInfo::EXECZ: | |||
2491 | return AMDGPU::S_CBRANCH_EXECZ; | |||
2492 | default: | |||
2493 | llvm_unreachable("invalid branch predicate")::llvm::llvm_unreachable_internal("invalid branch predicate", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2493); | |||
2494 | } | |||
2495 | } | |||
2496 | ||||
2497 | SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { | |||
2498 | switch (Opcode) { | |||
2499 | case AMDGPU::S_CBRANCH_SCC0: | |||
2500 | return SCC_FALSE; | |||
2501 | case AMDGPU::S_CBRANCH_SCC1: | |||
2502 | return SCC_TRUE; | |||
2503 | case AMDGPU::S_CBRANCH_VCCNZ: | |||
2504 | return VCCNZ; | |||
2505 | case AMDGPU::S_CBRANCH_VCCZ: | |||
2506 | return VCCZ; | |||
2507 | case AMDGPU::S_CBRANCH_EXECNZ: | |||
2508 | return EXECNZ; | |||
2509 | case AMDGPU::S_CBRANCH_EXECZ: | |||
2510 | return EXECZ; | |||
2511 | default: | |||
2512 | return INVALID_BR; | |||
2513 | } | |||
2514 | } | |||
2515 | ||||
2516 | bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, | |||
2517 | MachineBasicBlock::iterator I, | |||
2518 | MachineBasicBlock *&TBB, | |||
2519 | MachineBasicBlock *&FBB, | |||
2520 | SmallVectorImpl<MachineOperand> &Cond, | |||
2521 | bool AllowModify) const { | |||
2522 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2523 | // Unconditional Branch | |||
2524 | TBB = I->getOperand(0).getMBB(); | |||
2525 | return false; | |||
2526 | } | |||
2527 | ||||
2528 | MachineBasicBlock *CondBB = nullptr; | |||
2529 | ||||
2530 | if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
2531 | CondBB = I->getOperand(1).getMBB(); | |||
2532 | Cond.push_back(I->getOperand(0)); | |||
2533 | } else { | |||
2534 | BranchPredicate Pred = getBranchPredicate(I->getOpcode()); | |||
2535 | if (Pred == INVALID_BR) | |||
2536 | return true; | |||
2537 | ||||
2538 | CondBB = I->getOperand(0).getMBB(); | |||
2539 | Cond.push_back(MachineOperand::CreateImm(Pred)); | |||
2540 | Cond.push_back(I->getOperand(1)); // Save the branch register. | |||
2541 | } | |||
2542 | ++I; | |||
2543 | ||||
2544 | if (I == MBB.end()) { | |||
2545 | // Conditional branch followed by fall-through. | |||
2546 | TBB = CondBB; | |||
2547 | return false; | |||
2548 | } | |||
2549 | ||||
2550 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2551 | TBB = CondBB; | |||
2552 | FBB = I->getOperand(0).getMBB(); | |||
2553 | return false; | |||
2554 | } | |||
2555 | ||||
2556 | return true; | |||
2557 | } | |||
2558 | ||||
2559 | bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, | |||
2560 | MachineBasicBlock *&FBB, | |||
2561 | SmallVectorImpl<MachineOperand> &Cond, | |||
2562 | bool AllowModify) const { | |||
2563 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); | |||
2564 | auto E = MBB.end(); | |||
2565 | if (I == E) | |||
2566 | return false; | |||
2567 | ||||
2568 | // Skip over the instructions that are artificial terminators for special | |||
2569 | // exec management. | |||
2570 | while (I != E && !I->isBranch() && !I->isReturn()) { | |||
2571 | switch (I->getOpcode()) { | |||
2572 | case AMDGPU::S_MOV_B64_term: | |||
2573 | case AMDGPU::S_XOR_B64_term: | |||
2574 | case AMDGPU::S_OR_B64_term: | |||
2575 | case AMDGPU::S_ANDN2_B64_term: | |||
2576 | case AMDGPU::S_AND_B64_term: | |||
2577 | case AMDGPU::S_MOV_B32_term: | |||
2578 | case AMDGPU::S_XOR_B32_term: | |||
2579 | case AMDGPU::S_OR_B32_term: | |||
2580 | case AMDGPU::S_ANDN2_B32_term: | |||
2581 | case AMDGPU::S_AND_B32_term: | |||
2582 | break; | |||
2583 | case AMDGPU::SI_IF: | |||
2584 | case AMDGPU::SI_ELSE: | |||
2585 | case AMDGPU::SI_KILL_I1_TERMINATOR: | |||
2586 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: | |||
2587 | // FIXME: It's messy that these need to be considered here at all. | |||
2588 | return true; | |||
2589 | default: | |||
2590 | llvm_unreachable("unexpected non-branch terminator inst")::llvm::llvm_unreachable_internal("unexpected non-branch terminator inst" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 2590); | |||
2591 | } | |||
2592 | ||||
2593 | ++I; | |||
2594 | } | |||
2595 | ||||
2596 | if (I == E) | |||
2597 | return false; | |||
2598 | ||||
2599 | return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); | |||
2600 | } | |||
2601 | ||||
2602 | unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, | |||
2603 | int *BytesRemoved) const { | |||
2604 | unsigned Count = 0; | |||
2605 | unsigned RemovedSize = 0; | |||
2606 | for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) { | |||
2607 | // Skip over artificial terminators when removing instructions. | |||
2608 | if (MI.isBranch() || MI.isReturn()) { | |||
2609 | RemovedSize += getInstSizeInBytes(MI); | |||
2610 | MI.eraseFromParent(); | |||
2611 | ++Count; | |||
2612 | } | |||
2613 | } | |||
2614 | ||||
2615 | if (BytesRemoved) | |||
2616 | *BytesRemoved = RemovedSize; | |||
2617 | ||||
2618 | return Count; | |||
2619 | } | |||
2620 | ||||
2621 | // Copy the flags onto the implicit condition register operand. | |||
2622 | static void preserveCondRegFlags(MachineOperand &CondReg, | |||
2623 | const MachineOperand &OrigCond) { | |||
2624 | CondReg.setIsUndef(OrigCond.isUndef()); | |||
2625 | CondReg.setIsKill(OrigCond.isKill()); | |||
2626 | } | |||
2627 | ||||
2628 | unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, | |||
2629 | MachineBasicBlock *TBB, | |||
2630 | MachineBasicBlock *FBB, | |||
2631 | ArrayRef<MachineOperand> Cond, | |||
2632 | const DebugLoc &DL, | |||
2633 | int *BytesAdded) const { | |||
2634 | if (!FBB && Cond.empty()) { | |||
2635 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2636 | .addMBB(TBB); | |||
2637 | if (BytesAdded) | |||
2638 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2639 | return 1; | |||
2640 | } | |||
2641 | ||||
2642 | if (Cond.size() == 1 && Cond[0].isReg()) { | |||
2643 | BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) | |||
2644 | .add(Cond[0]) | |||
2645 | .addMBB(TBB); | |||
2646 | return 1; | |||
2647 | } | |||
2648 | ||||
2649 | assert(TBB && Cond[0].isImm()); | |||
2650 | ||||
2651 | unsigned Opcode | |||
2652 | = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); | |||
2653 | ||||
2654 | if (!FBB) { | |||
2655 | Cond[1].isUndef(); | |||
2656 | MachineInstr *CondBr = | |||
2657 | BuildMI(&MBB, DL, get(Opcode)) | |||
2658 | .addMBB(TBB); | |||
2659 | ||||
2660 | // Copy the flags onto the implicit condition register operand. | |||
2661 | preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); | |||
2662 | fixImplicitOperands(*CondBr); | |||
2663 | ||||
2664 | if (BytesAdded) | |||
2665 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2666 | return 1; | |||
2667 | } | |||
2668 | ||||
2669 | assert(TBB && FBB); | |||
2670 | ||||
2671 | MachineInstr *CondBr = | |||
2672 | BuildMI(&MBB, DL, get(Opcode)) | |||
2673 | .addMBB(TBB); | |||
2674 | fixImplicitOperands(*CondBr); | |||
2675 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2676 | .addMBB(FBB); | |||
2677 | ||||
2678 | MachineOperand &CondReg = CondBr->getOperand(1); | |||
2679 | CondReg.setIsUndef(Cond[1].isUndef()); | |||
2680 | CondReg.setIsKill(Cond[1].isKill()); | |||
2681 | ||||
2682 | if (BytesAdded) | |||
2683 | *BytesAdded = ST.hasOffset3fBug() ? 16 : 8; | |||
2684 | ||||
2685 | return 2; | |||
2686 | } | |||
2687 | ||||
2688 | bool SIInstrInfo::reverseBranchCondition( | |||
2689 | SmallVectorImpl<MachineOperand> &Cond) const { | |||
2690 | if (Cond.size() != 2) { | |||
2691 | return true; | |||
2692 | } | |||
2693 | ||||
2694 | if (Cond[0].isImm()) { | |||
2695 | Cond[0].setImm(-Cond[0].getImm()); | |||
2696 | return false; | |||
2697 | } | |||
2698 | ||||
2699 | return true; | |||
2700 | } | |||
2701 | ||||
2702 | bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, | |||
2703 | ArrayRef<MachineOperand> Cond, | |||
2704 | Register DstReg, Register TrueReg, | |||
2705 | Register FalseReg, int &CondCycles, | |||
2706 | int &TrueCycles, int &FalseCycles) const { | |||
2707 | switch (Cond[0].getImm()) { | |||
2708 | case VCCNZ: | |||
2709 | case VCCZ: { | |||
2710 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2711 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2712 | if (MRI.getRegClass(FalseReg) != RC) | |||
2713 | return false; | |||
2714 | ||||
2715 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2716 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2717 | ||||
2718 | // Limit to equal cost for branch vs. N v_cndmask_b32s. | |||
2719 | return RI.hasVGPRs(RC) && NumInsts <= 6; | |||
2720 | } | |||
2721 | case SCC_TRUE: | |||
2722 | case SCC_FALSE: { | |||
2723 | // FIXME: We could insert for VGPRs if we could replace the original compare | |||
2724 | // with a vector one. | |||
2725 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2726 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2727 | if (MRI.getRegClass(FalseReg) != RC) | |||
2728 | return false; | |||
2729 | ||||
2730 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2731 | ||||
2732 | // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64. | |||
2733 | if (NumInsts % 2 == 0) | |||
2734 | NumInsts /= 2; | |||
2735 | ||||
2736 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2737 | return RI.isSGPRClass(RC); | |||
2738 | } | |||
2739 | default: | |||
2740 | return false; | |||
2741 | } | |||
2742 | } | |||
2743 | ||||
2744 | void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, | |||
2745 | MachineBasicBlock::iterator I, const DebugLoc &DL, | |||
2746 | Register DstReg, ArrayRef<MachineOperand> Cond, | |||
2747 | Register TrueReg, Register FalseReg) const { | |||
2748 | BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); | |||
2749 | if (Pred == VCCZ || Pred == SCC_FALSE) { | |||
2750 | Pred = static_cast<BranchPredicate>(-Pred); | |||
2751 | std::swap(TrueReg, FalseReg); | |||
2752 | } | |||
2753 | ||||
2754 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2755 | const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); | |||
2756 | unsigned DstSize = RI.getRegSizeInBits(*DstRC); | |||
2757 | ||||
2758 | if (DstSize == 32) { | |||
2759 | MachineInstr *Select; | |||
2760 | if (Pred == SCC_TRUE) { | |||
2761 | Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) | |||
2762 | .addReg(TrueReg) | |||
2763 | .addReg(FalseReg); | |||
2764 | } else { | |||
2765 | // Instruction's operands are backwards from what is expected. | |||
2766 | Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) | |||
2767 | .addReg(FalseReg) | |||
2768 | .addReg(TrueReg); | |||
2769 | } | |||
2770 | ||||
2771 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2772 | return; | |||
2773 | } | |||
2774 | ||||
2775 | if (DstSize == 64 && Pred == SCC_TRUE) { | |||
2776 | MachineInstr *Select = | |||
2777 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) | |||
2778 | .addReg(TrueReg) | |||
2779 | .addReg(FalseReg); | |||
2780 | ||||
2781 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2782 | return; | |||
2783 | } | |||
2784 | ||||
2785 | static const int16_t Sub0_15[] = { | |||
2786 | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, | |||
2787 | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, | |||
2788 | AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, | |||
2789 | AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, | |||
2790 | }; | |||
2791 | ||||
2792 | static const int16_t Sub0_15_64[] = { | |||
2793 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, | |||
2794 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, | |||
2795 | AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, | |||
2796 | AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, | |||
2797 | }; | |||
2798 | ||||
2799 | unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; | |||
2800 | const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; | |||
2801 | const int16_t *SubIndices = Sub0_15; | |||
2802 | int NElts = DstSize / 32; | |||
2803 | ||||
2804 | // 64-bit select is only available for SALU. | |||
2805 | // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. | |||
2806 | if (Pred == SCC_TRUE) { | |||
2807 | if (NElts % 2) { | |||
2808 | SelOp = AMDGPU::S_CSELECT_B32; | |||
2809 | EltRC = &AMDGPU::SGPR_32RegClass; | |||
2810 | } else { | |||
2811 | SelOp = AMDGPU::S_CSELECT_B64; | |||
2812 | EltRC = &AMDGPU::SGPR_64RegClass; | |||
2813 | SubIndices = Sub0_15_64; | |||
2814 | NElts /= 2; | |||
2815 | } | |||
2816 | } | |||
2817 | ||||
2818 | MachineInstrBuilder MIB = BuildMI( | |||
2819 | MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); | |||
2820 | ||||
2821 | I = MIB->getIterator(); | |||
2822 | ||||
2823 | SmallVector<Register, 8> Regs; | |||
2824 | for (int Idx = 0; Idx != NElts; ++Idx) { | |||
2825 | Register DstElt = MRI.createVirtualRegister(EltRC); | |||
2826 | Regs.push_back(DstElt); | |||
2827 | ||||
2828 | unsigned SubIdx = SubIndices[Idx]; | |||
2829 | ||||
2830 | MachineInstr *Select; | |||
2831 | if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { | |||
2832 | Select = | |||
2833 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2834 | .addReg(FalseReg, 0, SubIdx) | |||
2835 | .addReg(TrueReg, 0, SubIdx); | |||
2836 | } else { | |||
2837 | Select = | |||
2838 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2839 | .addReg(TrueReg, 0, SubIdx) | |||
2840 | .addReg(FalseReg, 0, SubIdx); | |||
2841 | } | |||
2842 | ||||
2843 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2844 | fixImplicitOperands(*Select); | |||
2845 | ||||
2846 | MIB.addReg(DstElt) | |||
2847 | .addImm(SubIdx); | |||
2848 | } | |||
2849 | } | |||
2850 | ||||
2851 | bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { | |||
2852 | switch (MI.getOpcode()) { | |||
2853 | case AMDGPU::V_MOV_B32_e32: | |||
2854 | case AMDGPU::V_MOV_B32_e64: | |||
2855 | case AMDGPU::V_MOV_B64_PSEUDO: | |||
2856 | case AMDGPU::V_MOV_B64_e32: | |||
2857 | case AMDGPU::V_MOV_B64_e64: | |||
2858 | case AMDGPU::S_MOV_B32: | |||
2859 | case AMDGPU::S_MOV_B64: | |||
2860 | case AMDGPU::COPY: | |||
2861 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
2862 | case AMDGPU::V_ACCVGPR_READ_B32_e64: | |||
2863 | case AMDGPU::V_ACCVGPR_MOV_B32: | |||
2864 | return true; | |||
2865 | default: | |||
2866 | return false; | |||
2867 | } | |||
2868 | } | |||
2869 | ||||
2870 | unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( | |||
2871 | unsigned Kind) const { | |||
2872 | switch(Kind) { | |||
2873 | case PseudoSourceValue::Stack: | |||
2874 | case PseudoSourceValue::FixedStack: | |||
2875 | return AMDGPUAS::PRIVATE_ADDRESS; | |||
2876 | case PseudoSourceValue::ConstantPool: | |||
2877 | case PseudoSourceValue::GOT: | |||
2878 | case PseudoSourceValue::JumpTable: | |||
2879 | case PseudoSourceValue::GlobalValueCallEntry: | |||
2880 | case PseudoSourceValue::ExternalSymbolCallEntry: | |||
2881 | case PseudoSourceValue::TargetCustom: | |||
2882 | return AMDGPUAS::CONSTANT_ADDRESS; | |||
2883 | } | |||
2884 | return AMDGPUAS::FLAT_ADDRESS; | |||
2885 | } | |||
2886 | ||||
2887 | static void removeModOperands(MachineInstr &MI) { | |||
2888 | unsigned Opc = MI.getOpcode(); | |||
2889 | int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2890 | AMDGPU::OpName::src0_modifiers); | |||
2891 | int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2892 | AMDGPU::OpName::src1_modifiers); | |||
2893 | int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2894 | AMDGPU::OpName::src2_modifiers); | |||
2895 | ||||
2896 | MI.RemoveOperand(Src2ModIdx); | |||
2897 | MI.RemoveOperand(Src1ModIdx); | |||
2898 | MI.RemoveOperand(Src0ModIdx); | |||
2899 | } | |||
2900 | ||||
2901 | bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, | |||
2902 | Register Reg, MachineRegisterInfo *MRI) const { | |||
2903 | if (!MRI->hasOneNonDBGUse(Reg)) | |||
2904 | return false; | |||
2905 | ||||
2906 | switch (DefMI.getOpcode()) { | |||
2907 | default: | |||
2908 | return false; | |||
2909 | case AMDGPU::S_MOV_B64: | |||
2910 | // TODO: We could fold 64-bit immediates, but this gets complicated | |||
2911 | // when there are sub-registers. | |||
2912 | return false; | |||
2913 | ||||
2914 | case AMDGPU::V_MOV_B32_e32: | |||
2915 | case AMDGPU::S_MOV_B32: | |||
2916 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
2917 | break; | |||
2918 | } | |||
2919 | ||||
2920 | const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); | |||
2921 | assert(ImmOp); | |||
2922 | // FIXME: We could handle FrameIndex values here. | |||
2923 | if (!ImmOp->isImm()) | |||
2924 | return false; | |||
2925 | ||||
2926 | unsigned Opc = UseMI.getOpcode(); | |||
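// If the use is a plain COPY, fold the immediate by rewriting the copy into
// a v_mov_b32 / s_mov_b32 (or v_accvgpr_write for AGPR destinations), taking
// care with 16-bit subregister destinations.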
2927 | if (Opc == AMDGPU::COPY) { | |||
2928 | Register DstReg = UseMI.getOperand(0).getReg(); | |||
2929 | bool Is16Bit = getOpSize(UseMI, 0) == 2; | |||
2930 | bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); | |||
2931 | unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; | |||
2932 | APInt Imm(32, ImmOp->getImm()); | |||
2933 | ||||
2934 | if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) | |||
2935 | Imm = Imm.ashr(16); | |||
2936 | ||||
2937 | if (RI.isAGPR(*MRI, DstReg)) { | |||
2938 | if (!isInlineConstant(Imm)) | |||
2939 | return false; | |||
2940 | NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; | |||
2941 | } | |||
2942 | ||||
2943 | if (Is16Bit) { | |||
2944 | if (isVGPRCopy) | |||
2945 | return false; // Do not clobber vgpr_hi16 | |||
2946 | ||||
2947 | if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) | |||
2948 | return false; | |||
2949 | ||||
2950 | UseMI.getOperand(0).setSubReg(0); | |||
2951 | if (DstReg.isPhysical()) { | |||
2952 | DstReg = RI.get32BitRegister(DstReg); | |||
2953 | UseMI.getOperand(0).setReg(DstReg); | |||
2954 | } | |||
2955 | assert(UseMI.getOperand(1).getReg().isVirtual()); | |||
2956 | } | |||
2957 | ||||
2958 | UseMI.setDesc(get(NewOpc)); | |||
2959 | UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); | |||
2960 | UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); | |||
2961 | return true; | |||
2962 | } | |||
2963 | ||||
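// For MAD/MAC/FMA/FMAC uses, fold the immediate by rewriting to the VOP2
// madmk/madak (fmamk/fmaak) forms, which carry the constant as a literal
// operand.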
2964 | if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
2965 | Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
2966 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2967 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) { | |||
2968 | // Don't fold if we are using source or output modifiers. The new VOP2 | |||
2969 | // instructions don't have them. | |||
2970 | if (hasAnyModifiersSet(UseMI)) | |||
2971 | return false; | |||
2972 | ||||
2973 | // If this is a free constant, there's no reason to do this. | |||
2974 | // TODO: We could fold this here instead of letting SIFoldOperands do it | |||
2975 | // later. | |||
2976 | MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); | |||
2977 | ||||
2978 | // Any src operand can be used for the legality check. | |||
2979 | if (isInlineConstant(UseMI, *Src0, *ImmOp)) | |||
2980 | return false; | |||
2981 | ||||
2982 | bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
2983 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; | |||
2984 | bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2985 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64; | |||
2986 | MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); | |||
2987 | MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); | |||
2988 | ||||
2989 | // Multiplied part is the constant: Use v_madmk_{f16, f32}. | |||
2990 | // We should only expect these to be on src0 due to canonicalization. | |||
2991 | if (Src0->isReg() && Src0->getReg() == Reg) { | |||
2992 | if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) | |||
2993 | return false; | |||
2994 | ||||
2995 | if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) | |||
2996 | return false; | |||
2997 | ||||
2998 | unsigned NewOpc = | |||
2999 | IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) | |||
3000 | : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); | |||
3001 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3002 | return false; | |||
3003 | ||||
3004 | // We need to swap operands 0 and 1 since madmk constant is at operand 1. | |||
3005 | ||||
3006 | const int64_t Imm = ImmOp->getImm(); | |||
3007 | ||||
3008 | // FIXME: This would be a lot easier if we could return a new instruction | |||
3009 | // instead of having to modify in place. | |||
3010 | ||||
3011 | // Remove these first since they are at the end. | |||
3012 | UseMI.RemoveOperand( | |||
3013 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); | |||
3014 | UseMI.RemoveOperand( | |||
3015 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); | |||
3016 | ||||
3017 | Register Src1Reg = Src1->getReg(); | |||
3018 | unsigned Src1SubReg = Src1->getSubReg(); | |||
3019 | Src0->setReg(Src1Reg); | |||
3020 | Src0->setSubReg(Src1SubReg); | |||
3021 | Src0->setIsKill(Src1->isKill()); | |||
3022 | ||||
3023 | if (Opc == AMDGPU::V_MAC_F32_e64 || | |||
3024 | Opc == AMDGPU::V_MAC_F16_e64 || | |||
3025 | Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3026 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
3027 | UseMI.untieRegOperand( | |||
3028 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
3029 | ||||
3030 | Src1->ChangeToImmediate(Imm); | |||
3031 | ||||
3032 | removeModOperands(UseMI); | |||
3033 | UseMI.setDesc(get(NewOpc)); | |||
3034 | ||||
3035 | bool DeleteDef = MRI->use_nodbg_empty(Reg); | |||
3036 | if (DeleteDef) | |||
3037 | DefMI.eraseFromParent(); | |||
3038 | ||||
3039 | return true; | |||
3040 | } | |||
3041 | ||||
3042 | // Added part is the constant: Use v_madak_{f16, f32}. | |||
3043 | if (Src2->isReg() && Src2->getReg() == Reg) { | |||
3044 | // Not allowed to use constant bus for another operand. | |||
3045 | // We can however allow an inline immediate as src0. | |||
3046 | bool Src0Inlined = false; | |||
3047 | if (Src0->isReg()) { | |||
3048 | // Try to inline the constant if possible. | |||
3049 | // If the def is a move of an immediate and this is its only use, | |||
3050 | // we save a VGPR here. | |||
3051 | MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); | |||
3052 | if (Def && Def->isMoveImmediate() && | |||
3053 | isInlineConstant(Def->getOperand(1)) && | |||
3054 | MRI->hasOneUse(Src0->getReg())) { | |||
3055 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
3056 | Src0Inlined = true; | |||
3057 | } else if ((Src0->getReg().isPhysical() && | |||
3058 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
3059 | RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || | |||
3060 | (Src0->getReg().isVirtual() && | |||
3061 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
3062 | RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) | |||
3063 | return false; | |||
3064 | // VGPR is okay as Src0 - fallthrough | |||
3065 | } | |||
3066 | ||||
3067 | if (Src1->isReg() && !Src0Inlined) { | |||
3068 | // We have one slot for an inlinable constant so far - try to fill it. | |||
3069 | MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); | |||
3070 | if (Def && Def->isMoveImmediate() && | |||
3071 | isInlineConstant(Def->getOperand(1)) && | |||
3072 | MRI->hasOneUse(Src1->getReg()) && | |||
3073 | commuteInstruction(UseMI)) { | |||
3074 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
3075 | } else if ((Src1->getReg().isPhysical() && | |||
3076 | RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || | |||
3077 | (Src1->getReg().isVirtual() && | |||
3078 | RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) | |||
3079 | return false; | |||
3080 | // VGPR is okay as Src1 - fallthrough | |||
3081 | } | |||
3082 | ||||
3083 | unsigned NewOpc = | |||
3084 | IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) | |||
3085 | : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); | |||
3086 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3087 | return false; | |||
3088 | ||||
3089 | const int64_t Imm = ImmOp->getImm(); | |||
3090 | ||||
3091 | // FIXME: This would be a lot easier if we could return a new instruction | |||
3092 | // instead of having to modify in place. | |||
3093 | ||||
3094 | // Remove these first since they are at the end. | |||
3095 | UseMI.RemoveOperand( | |||
3096 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); | |||
3097 | UseMI.RemoveOperand( | |||
3098 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); | |||
3099 | ||||
3100 | if (Opc == AMDGPU::V_MAC_F32_e64 || | |||
3101 | Opc == AMDGPU::V_MAC_F16_e64 || | |||
3102 | Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3103 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
3104 | UseMI.untieRegOperand( | |||
3105 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
3106 | ||||
3107 | // ChangeToImmediate adds Src2 back to the instruction. | |||
3108 | Src2->ChangeToImmediate(Imm); | |||
3109 | ||||
3110 | // These come before src2. | |||
3111 | removeModOperands(UseMI); | |||
3112 | UseMI.setDesc(get(NewOpc)); | |||
3113 | // It might happen that UseMI was commuted and we now have an SGPR as | |||
3114 | // src1. If so, an inline constant together with an SGPR would be | |||
3115 | // illegal, so legalize the operands. | |||
3116 | legalizeOperands(UseMI); | |||
3117 | ||||
3118 | bool DeleteDef = MRI->use_nodbg_empty(Reg); | |||
3119 | if (DeleteDef) | |||
3120 | DefMI.eraseFromParent(); | |||
3121 | ||||
3122 | return true; | |||
3123 | } | |||
3124 | } | |||
3125 | ||||
3126 | return false; | |||
3127 | } | |||
3128 | ||||
3129 | static bool | |||
3130 | memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, | |||
3131 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
3132 | if (BaseOps1.size() != BaseOps2.size()) | |||
3133 | return false; | |||
3134 | for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { | |||
3135 | if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) | |||
3136 | return false; | |||
3137 | } | |||
3138 | return true; | |||
3139 | } | |||
3140 | ||||
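// Two accesses relative to the same base do not overlap if the access at
// the lower offset ends at or before the higher offset begins.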
3141 | static bool offsetsDoNotOverlap(int WidthA, int OffsetA, | |||
3142 | int WidthB, int OffsetB) { | |||
3143 | int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; | |||
3144 | int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; | |||
3145 | int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; | |||
3146 | return LowOffset + LowWidth <= HighOffset; | |||
3147 | } | |||
3148 | ||||
3149 | bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, | |||
3150 | const MachineInstr &MIb) const { | |||
3151 | SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; | |||
3152 | int64_t Offset0, Offset1; | |||
3153 | unsigned Dummy0, Dummy1; | |||
3154 | bool Offset0IsScalable, Offset1IsScalable; | |||
3155 | if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, | |||
3156 | Dummy0, &RI) || | |||
3157 | !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, | |||
3158 | Dummy1, &RI)) | |||
3159 | return false; | |||
3160 | ||||
3161 | if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) | |||
3162 | return false; | |||
3163 | ||||
3164 | if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { | |||
3165 | // FIXME: Handle ds_read2 / ds_write2. | |||
3166 | return false; | |||
3167 | } | |||
3168 | unsigned Width0 = MIa.memoperands().front()->getSize(); | |||
3169 | unsigned Width1 = MIb.memoperands().front()->getSize(); | |||
3170 | return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); | |||
3171 | } | |||
3172 | ||||
3173 | bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, | |||
3174 | const MachineInstr &MIb) const { | |||
3175 | assert(MIa.mayLoadOrStore() && | |||
3176 | "MIa must load from or modify a memory location"); | |||
3177 | assert(MIb.mayLoadOrStore() && | |||
3178 | "MIb must load from or modify a memory location"); | |||
3179 | ||||
3180 | if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) | |||
3181 | return false; | |||
3182 | ||||
3183 | // XXX - Can we relax this between address spaces? | |||
3184 | if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) | |||
3185 | return false; | |||
3186 | ||||
3187 | // TODO: Should we check the address space from the MachineMemOperand? That | |||
3188 | // would allow us to distinguish objects we know don't alias based on the | |||
3189 | // underlying address space, even if it was lowered to a different one, | |||
3190 | // e.g. private accesses lowered to use MUBUF instructions on a scratch | |||
3191 | // buffer. | |||
3192 | if (isDS(MIa)) { | |||
3193 | if (isDS(MIb)) | |||
3194 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3195 | ||||
3196 | return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); | |||
3197 | } | |||
3198 | ||||
3199 | if (isMUBUF(MIa) || isMTBUF(MIa)) { | |||
3200 | if (isMUBUF(MIb) || isMTBUF(MIb)) | |||
3201 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3202 | ||||
3203 | return !isFLAT(MIb) && !isSMRD(MIb); | |||
3204 | } | |||
3205 | ||||
3206 | if (isSMRD(MIa)) { | |||
3207 | if (isSMRD(MIb)) | |||
3208 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3209 | ||||
3210 | return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); | |||
3211 | } | |||
3212 | ||||
3213 | if (isFLAT(MIa)) { | |||
3214 | if (isFLAT(MIb)) | |||
3215 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3216 | ||||
3217 | return false; | |||
3218 | } | |||
3219 | ||||
3220 | return false; | |||
3221 | } | |||
3222 | ||||
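// If Reg is a virtual register whose unique definition is a foldable copy of
// an immediate, return that immediate and optionally the defining
// instruction.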
3223 | static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, | |||
3224 | int64_t &Imm, MachineInstr **DefMI = nullptr) { | |||
3225 | if (Reg.isPhysical()) | |||
3226 | return false; | |||
3227 | auto *Def = MRI.getUniqueVRegDef(Reg); | |||
3228 | if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { | |||
3229 | Imm = Def->getOperand(1).getImm(); | |||
3230 | if (DefMI) | |||
3231 | *DefMI = Def; | |||
3232 | return true; | |||
3233 | } | |||
3234 | return false; | |||
3235 | } | |||
3236 | ||||
3237 | static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, | |||
3238 | MachineInstr **DefMI = nullptr) { | |||
3239 | if (!MO->isReg()) | |||
3240 | return false; | |||
3241 | const MachineFunction *MF = MO->getParent()->getParent()->getParent(); | |||
3242 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
3243 | return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); | |||
3244 | } | |||
3245 | ||||
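// Keep LiveVariables consistent when MI is replaced by NewMI by transferring
// any kill flags on MI's register uses to NewMI.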
3246 | static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, | |||
3247 | MachineInstr &NewMI) { | |||
3248 | if (LV) { | |||
3249 | unsigned NumOps = MI.getNumOperands(); | |||
3250 | for (unsigned I = 1; I < NumOps; ++I) { | |||
3251 | MachineOperand &Op = MI.getOperand(I); | |||
3252 | if (Op.isReg() && Op.isKill()) | |||
3253 | LV->replaceKillInstruction(Op.getReg(), MI, NewMI); | |||
3254 | } | |||
3255 | } | |||
3256 | } | |||
3257 | ||||
3258 | MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, | |||
3259 | LiveVariables *LV, | |||
3260 | LiveIntervals *LIS) const { | |||
3261 | MachineBasicBlock &MBB = *MI.getParent(); | |||
3262 | unsigned Opc = MI.getOpcode(); | |||
3263 | ||||
3264 | // Handle MFMA. | |||
3265 | int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); | |||
3266 | if (NewMFMAOpc != -1) { | |||
3267 | MachineInstrBuilder MIB = | |||
3268 | BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); | |||
3269 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) | |||
3270 | MIB.add(MI.getOperand(I)); | |||
3271 | updateLiveVariables(LV, MI, *MIB); | |||
3272 | if (LIS) | |||
3273 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3274 | return MIB; | |||
3275 | } | |||
3276 | ||||
3277 | // Handle MAC/FMAC. | |||
3278 | bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
3279 | Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; | |||
3280 | bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3281 | Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || | |||
3282 | Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || | |||
3283 | Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || | |||
3284 | Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; | |||
3285 | bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; | |||
3286 | bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || | |||
3287 | Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || | |||
3288 | Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || | |||
3289 | Opc == AMDGPU::V_FMAC_LEGACY_F32_e64; | |||
3290 | bool Src0Literal = false; | |||
3291 | ||||
3292 | switch (Opc) { | |||
3293 | default: | |||
3294 | return nullptr; | |||
3295 | case AMDGPU::V_MAC_F16_e64: | |||
3296 | case AMDGPU::V_FMAC_F16_e64: | |||
3297 | case AMDGPU::V_MAC_F32_e64: | |||
3298 | case AMDGPU::V_MAC_LEGACY_F32_e64: | |||
3299 | case AMDGPU::V_FMAC_F32_e64: | |||
3300 | case AMDGPU::V_FMAC_LEGACY_F32_e64: | |||
3301 | case AMDGPU::V_FMAC_F64_e64: | |||
3302 | break; | |||
3303 | case AMDGPU::V_MAC_F16_e32: | |||
3304 | case AMDGPU::V_FMAC_F16_e32: | |||
3305 | case AMDGPU::V_MAC_F32_e32: | |||
3306 | case AMDGPU::V_MAC_LEGACY_F32_e32: | |||
3307 | case AMDGPU::V_FMAC_F32_e32: | |||
3308 | case AMDGPU::V_FMAC_LEGACY_F32_e32: | |||
3309 | case AMDGPU::V_FMAC_F64_e32: { | |||
3310 | int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3311 | AMDGPU::OpName::src0); | |||
3312 | const MachineOperand *Src0 = &MI.getOperand(Src0Idx); | |||
3313 | if (!Src0->isReg() && !Src0->isImm()) | |||
3314 | return nullptr; | |||
3315 | ||||
3316 | if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) | |||
3317 | Src0Literal = true; | |||
3318 | ||||
3319 | break; | |||
3320 | } | |||
3321 | } | |||
3322 | ||||
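// Collect the VOP3 operands. When there are no source modifiers, no
// clamp/omod, this is not the f64 or legacy form, and the constant bus limit
// is not violated, first try to fold a foldable immediate source into the
// madak/madmk (fmaak/fmamk) forms; otherwise fall back to the full VOP3
// mad/fma below.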
3323 | MachineInstrBuilder MIB; | |||
3324 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
3325 | const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); | |||
3326 | const MachineOperand *Src0Mods = | |||
3327 | getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); | |||
3328 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3329 | const MachineOperand *Src1Mods = | |||
3330 | getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); | |||
3331 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3332 | const MachineOperand *Src2Mods = | |||
3333 | getNamedOperand(MI, AMDGPU::OpName::src2_modifiers); | |||
3334 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
3335 | const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3336 | ||||
3337 | if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && | |||
3338 | !IsLegacy && | |||
3339 | // If we have an SGPR input, we will violate the constant bus restriction. | |||
3340 | (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || | |||
3341 | !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { | |||
3342 | MachineInstr *DefMI; | |||
3343 | const auto killDef = [&DefMI, &MBB, this]() -> void { | |||
3344 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
3345 | // The only user is the instruction which will be killed. | |||
3346 | if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) | |||
3347 | return; | |||
3348 | // We cannot just remove the DefMI here; the calling pass would crash. | |||
3349 | DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); | |||
3350 | for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) | |||
3351 | DefMI->RemoveOperand(I); | |||
3352 | }; | |||
3353 | ||||
3354 | int64_t Imm; | |||
3355 | if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { | |||
3356 | unsigned NewOpc = | |||
3357 | IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) | |||
3358 | : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); | |||
3359 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3360 | MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3361 | .add(*Dst) | |||
3362 | .add(*Src0) | |||
3363 | .add(*Src1) | |||
3364 | .addImm(Imm); | |||
3365 | updateLiveVariables(LV, MI, *MIB); | |||
3366 | if (LIS) | |||
3367 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3368 | killDef(); | |||
3369 | return MIB; | |||
3370 | } | |||
3371 | } | |||
3372 | unsigned NewOpc = IsFMA | |||
3373 | ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) | |||
3374 | : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); | |||
3375 | if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { | |||
3376 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3377 | MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3378 | .add(*Dst) | |||
3379 | .add(*Src0) | |||
3380 | .addImm(Imm) | |||
3381 | .add(*Src2); | |||
3382 | updateLiveVariables(LV, MI, *MIB); | |||
3383 | if (LIS) | |||
3384 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3385 | killDef(); | |||
3386 | return MIB; | |||
3387 | } | |||
3388 | } | |||
3389 | if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) { | |||
3390 | if (Src0Literal) { | |||
3391 | Imm = Src0->getImm(); | |||
3392 | DefMI = nullptr; | |||
3393 | } | |||
3394 | if (pseudoToMCOpcode(NewOpc) != -1 && | |||
3395 | isOperandLegal( | |||
3396 | MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), | |||
3397 | Src1)) { | |||
3398 | MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3399 | .add(*Dst) | |||
3400 | .add(*Src1) | |||
3401 | .addImm(Imm) | |||
3402 | .add(*Src2); | |||
3403 | updateLiveVariables(LV, MI, *MIB); | |||
3404 | if (LIS) | |||
3405 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3406 | if (DefMI) | |||
3407 | killDef(); | |||
3408 | return MIB; | |||
3409 | } | |||
3410 | } | |||
3411 | } | |||
3412 | ||||
3413 | // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma | |||
3414 | // because VOP3 does not allow a literal operand. | |||
3415 | // TODO: Remove this restriction for GFX10. | |||
3416 | if (Src0Literal) | |||
3417 | return nullptr; | |||
3418 | ||||
3419 | unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 | |||
3420 | : IsF64 ? AMDGPU::V_FMA_F64_e64 | |||
3421 | : IsLegacy | |||
3422 | ? AMDGPU::V_FMA_LEGACY_F32_e64 | |||
3423 | : AMDGPU::V_FMA_F32_e64 | |||
3424 | : IsF16 ? AMDGPU::V_MAD_F16_e64 | |||
3425 | : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 | |||
3426 | : AMDGPU::V_MAD_F32_e64; | |||
3427 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3428 | return nullptr; | |||
3429 | ||||
3430 | MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3431 | .add(*Dst) | |||
3432 | .addImm(Src0Mods ? Src0Mods->getImm() : 0) | |||
3433 | .add(*Src0) | |||
3434 | .addImm(Src1Mods ? Src1Mods->getImm() : 0) | |||
3435 | .add(*Src1) | |||
3436 | .addImm(Src2Mods ? Src2Mods->getImm() : 0) | |||
3437 | .add(*Src2) | |||
3438 | .addImm(Clamp ? Clamp->getImm() : 0) | |||
3439 | .addImm(Omod ? Omod->getImm() : 0); | |||
3440 | updateLiveVariables(LV, MI, *MIB); | |||
3441 | if (LIS) | |||
3442 | LIS->ReplaceMachineInstrInMaps(MI, *MIB); | |||
3443 | return MIB; | |||
3444 | } | |||
3445 | ||||
3446 | // It's not generally safe to move VALU instructions across these since it will | |||
3447 | // start using the register as a base index rather than directly. | |||
3448 | // XXX - Why isn't hasSideEffects sufficient for these? | |||
3449 | static bool changesVGPRIndexingMode(const MachineInstr &MI) { | |||
3450 | switch (MI.getOpcode()) { | |||
3451 | case AMDGPU::S_SET_GPR_IDX_ON: | |||
3452 | case AMDGPU::S_SET_GPR_IDX_MODE: | |||
3453 | case AMDGPU::S_SET_GPR_IDX_OFF: | |||
3454 | return true; | |||
3455 | default: | |||
3456 | return false; | |||
3457 | } | |||
3458 | } | |||
3459 | ||||
3460 | bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, | |||
3461 | const MachineBasicBlock *MBB, | |||
3462 | const MachineFunction &MF) const { | |||
3463 | // Skipping the check for SP writes in the base implementation. It was | |||
3464 | // apparently added due to compile-time concerns. | |||
3465 | // | |||
3466 | // TODO: Do we really want this barrier? It triggers unnecessary hazard nops | |||
3467 | // but is probably avoidable. | |||
3468 | ||||
3469 | // Copied from base implementation. | |||
3470 | // Terminators and labels can't be scheduled around. | |||
3471 | if (MI.isTerminator() || MI.isPosition()) | |||
3472 | return true; | |||
3473 | ||||
3474 | // INLINEASM_BR can jump to another block | |||
3475 | if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) | |||
3476 | return true; | |||
3477 | ||||
3478 | // Target-independent instructions do not have an implicit-use of EXEC, even | |||
3479 | // when they operate on VGPRs. Treating EXEC modifications as scheduling | |||
3480 | // boundaries prevents incorrect movements of such instructions. | |||
3481 | return MI.modifiesRegister(AMDGPU::EXEC, &RI) || | |||
3482 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || | |||
3483 | MI.getOpcode() == AMDGPU::S_SETREG_B32 || | |||
3484 | changesVGPRIndexingMode(MI); | |||
3485 | } | |||
3486 | ||||
3487 | bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { | |||
3488 | return Opcode == AMDGPU::DS_ORDERED_COUNT || | |||
3489 | Opcode == AMDGPU::DS_GWS_INIT || | |||
3490 | Opcode == AMDGPU::DS_GWS_SEMA_V || | |||
3491 | Opcode == AMDGPU::DS_GWS_SEMA_BR || | |||
3492 | Opcode == AMDGPU::DS_GWS_SEMA_P || | |||
3493 | Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || | |||
3494 | Opcode == AMDGPU::DS_GWS_BARRIER; | |||
3495 | } | |||
3496 | ||||
3497 | bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { | |||
3498 | // Skip the full operand and register alias search modifiesRegister | |||
3499 | // does. There are only a handful of instructions that touch this; it's only | |||
3500 | // an implicit def and doesn't alias any other registers. | |||
3501 | if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { | |||
3502 | for (; ImpDef && *ImpDef; ++ImpDef) { | |||
3503 | if (*ImpDef == AMDGPU::MODE) | |||
3504 | return true; | |||
3505 | } | |||
3506 | } | |||
3507 | ||||
3508 | return false; | |||
3509 | } | |||
3510 | ||||
3511 | bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { | |||
3512 | unsigned Opcode = MI.getOpcode(); | |||
3513 | ||||
3514 | if (MI.mayStore() && isSMRD(MI)) | |||
3515 | return true; // scalar store or atomic | |||
3516 | ||||
3517 | // This will terminate the function when other lanes may need to continue. | |||
3518 | if (MI.isReturn()) | |||
3519 | return true; | |||
3520 | ||||
3521 | // These instructions cause shader I/O that may cause hardware lockups | |||
3522 | // when executed with an empty EXEC mask. | |||
3523 | // | |||
3524 | // Note: exp with VM = DONE = 0 is automatically skipped by hardware when | |||
3525 | // EXEC = 0, but checking for that case here seems not worth it | |||
3526 | // given the typical code patterns. | |||
3527 | if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || | |||
3528 | isEXP(Opcode) || | |||
3529 | Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || | |||
3530 | Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) | |||
3531 | return true; | |||
3532 | ||||
3533 | if (MI.isCall() || MI.isInlineAsm()) | |||
3534 | return true; // conservative assumption | |||
3535 | ||||
3536 | // A mode change is a scalar operation that influences vector instructions. | |||
3537 | if (modifiesModeRegister(MI)) | |||
3538 | return true; | |||
3539 | ||||
3540 | // These are like SALU instructions in terms of effects, so it's questionable | |||
3541 | // whether we should return true for those. | |||
3542 | // | |||
3543 | // However, executing them with EXEC = 0 causes them to operate on undefined | |||
3544 | // data, which we avoid by returning true here. | |||
3545 | if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || | |||
3546 | Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) | |||
3547 | return true; | |||
3548 | ||||
3549 | return false; | |||
3550 | } | |||
3551 | ||||
3552 | bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, | |||
3553 | const MachineInstr &MI) const { | |||
3554 | if (MI.isMetaInstruction()) | |||
3555 | return false; | |||
3556 | ||||
3557 | // This won't read exec if this is an SGPR->SGPR copy. | |||
3558 | if (MI.isCopyLike()) { | |||
3559 | if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) | |||
3560 | return true; | |||
3561 | ||||
3562 | // Make sure this isn't copying exec as a normal operand | |||
3563 | return MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3564 | } | |||
3565 | ||||
3566 | // Make a conservative assumption about the callee. | |||
3567 | if (MI.isCall()) | |||
3568 | return true; | |||
3569 | ||||
3570 | // Be conservative with any unhandled generic opcodes. | |||
3571 | if (!isTargetSpecificOpcode(MI.getOpcode())) | |||
3572 | return true; | |||
3573 | ||||
3574 | return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3575 | } | |||
3576 | ||||
3577 | bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { | |||
3578 | switch (Imm.getBitWidth()) { | |||
3579 | case 1: // This likely will be a condition code mask. | |||
3580 | return true; | |||
3581 | ||||
3582 | case 32: | |||
3583 | return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), | |||
3584 | ST.hasInv2PiInlineImm()); | |||
3585 | case 64: | |||
3586 | return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), | |||
3587 | ST.hasInv2PiInlineImm()); | |||
3588 | case 16: | |||
3589 | return ST.has16BitInsts() && | |||
3590 | AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), | |||
3591 | ST.hasInv2PiInlineImm()); | |||
3592 | default: | |||
3593 | llvm_unreachable("invalid bitwidth")::llvm::llvm_unreachable_internal("invalid bitwidth", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3593); | |||
3594 | } | |||
3595 | } | |||
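// Informal example of what the helper above accepts (a sketch, not an
// exhaustive ISA description): for a 32-bit operand, integers in [-16, 64]
// and the bit patterns of 0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0 (plus 1/(2*pi)
// when ST.hasInv2PiInlineImm()) are inline constants; anything else must be
// encoded as a literal, e.g.
//   isInlineConstant(APInt(32, 64))  -> true  (the integer 64 is inlinable)
//   isInlineConstant(APInt(32, 65))  -> false (needs a 32-bit literal)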
3596 | ||||
3597 | bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, | |||
3598 | uint8_t OperandType) const { | |||
3599 | if (!MO.isImm() || | |||
3600 | OperandType < AMDGPU::OPERAND_SRC_FIRST || | |||
3601 | OperandType > AMDGPU::OPERAND_SRC_LAST) | |||
3602 | return false; | |||
3603 | ||||
3604 | // MachineOperand provides no way to tell the true operand size, since it only | |||
3605 | // records a 64-bit value. We need to know the size to determine if a 32-bit | |||
3606 | // floating point immediate bit pattern is legal for an integer immediate. It | |||
3607 | // would be for any 32-bit integer operand, but would not be for a 64-bit one. | |||
3608 | ||||
3609 | int64_t Imm = MO.getImm(); | |||
3610 | switch (OperandType) { | |||
3611 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
3612 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
3613 | case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: | |||
3614 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
3615 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
3616 | case AMDGPU::OPERAND_REG_IMM_V2FP32: | |||
3617 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: | |||
3618 | case AMDGPU::OPERAND_REG_IMM_V2INT32: | |||
3619 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: | |||
3620 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
3621 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { | |||
3622 | int32_t Trunc = static_cast<int32_t>(Imm); | |||
3623 | return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); | |||
3624 | } | |||
3625 | case AMDGPU::OPERAND_REG_IMM_INT64: | |||
3626 | case AMDGPU::OPERAND_REG_IMM_FP64: | |||
3627 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
3628 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
3629 | case AMDGPU::OPERAND_REG_INLINE_AC_FP64: | |||
3630 | return AMDGPU::isInlinableLiteral64(MO.getImm(), | |||
3631 | ST.hasInv2PiInlineImm()); | |||
3632 | case AMDGPU::OPERAND_REG_IMM_INT16: | |||
3633 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
3634 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
3635 | // We would expect inline immediates to not be concerned with an integer/fp | |||
3636 | // distinction. However, in the case of 16-bit integer operations, the | |||
3637 | // "floating point" values appear to not work. It seems read the low 16-bits | |||
3638 | // of 32-bit immediates, which happens to always work for the integer | |||
3639 | // values. | |||
3640 | // | |||
3641 | // See llvm bugzilla 46302. | |||
3642 | // | |||
3643 | // TODO: Theoretically we could use op-sel to use the high bits of the | |||
3644 | // 32-bit FP values. | |||
3645 | return AMDGPU::isInlinableIntLiteral(Imm); | |||
3646 | case AMDGPU::OPERAND_REG_IMM_V2INT16: | |||
3647 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: | |||
3648 | case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: | |||
3649 | // This suffers the same problem as the scalar 16-bit cases. | |||
3650 | return AMDGPU::isInlinableIntLiteralV216(Imm); | |||
3651 | case AMDGPU::OPERAND_REG_IMM_FP16: | |||
3652 | case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: | |||
3653 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
3654 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { | |||
3655 | if (isInt<16>(Imm) || isUInt<16>(Imm)) { | |||
3656 | // A few special case instructions have 16-bit operands on subtargets | |||
3657 | // where 16-bit instructions are not legal. | |||
3658 | // TODO: Do the 32-bit immediates work? We shouldn't really need to handle | |||
3659 | // constants in these cases | |||
3660 | int16_t Trunc = static_cast<int16_t>(Imm); | |||
3661 | return ST.has16BitInsts() && | |||
3662 | AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); | |||
3663 | } | |||
3664 | ||||
3665 | return false; | |||
3666 | } | |||
3667 | case AMDGPU::OPERAND_REG_IMM_V2FP16: | |||
3668 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: | |||
3669 | case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { | |||
3670 | uint32_t Trunc = static_cast<uint32_t>(Imm); | |||
3671 | return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); | |||
3672 | } | |||
3673 | case AMDGPU::OPERAND_KIMM32: | |||
3674 | case AMDGPU::OPERAND_KIMM16: | |||
3675 | return false; | |||
3676 | default: | |||
3677 | llvm_unreachable("invalid bitwidth")::llvm::llvm_unreachable_internal("invalid bitwidth", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3677); | |||
3678 | } | |||
3679 | } | |||
3680 | ||||
3681 | bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, | |||
3682 | const MCOperandInfo &OpInfo) const { | |||
3683 | switch (MO.getType()) { | |||
3684 | case MachineOperand::MO_Register: | |||
3685 | return false; | |||
3686 | case MachineOperand::MO_Immediate: | |||
3687 | return !isInlineConstant(MO, OpInfo); | |||
3688 | case MachineOperand::MO_FrameIndex: | |||
3689 | case MachineOperand::MO_MachineBasicBlock: | |||
3690 | case MachineOperand::MO_ExternalSymbol: | |||
3691 | case MachineOperand::MO_GlobalAddress: | |||
3692 | case MachineOperand::MO_MCSymbol: | |||
3693 | return true; | |||
3694 | default: | |||
3695 | llvm_unreachable("unexpected operand type")::llvm::llvm_unreachable_internal("unexpected operand type", "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3695); | |||
3696 | } | |||
3697 | } | |||
3698 | ||||
3699 | static bool compareMachineOp(const MachineOperand &Op0, | |||
3700 | const MachineOperand &Op1) { | |||
3701 | if (Op0.getType() != Op1.getType()) | |||
3702 | return false; | |||
3703 | ||||
3704 | switch (Op0.getType()) { | |||
3705 | case MachineOperand::MO_Register: | |||
3706 | return Op0.getReg() == Op1.getReg(); | |||
3707 | case MachineOperand::MO_Immediate: | |||
3708 | return Op0.getImm() == Op1.getImm(); | |||
3709 | default: | |||
3710 | llvm_unreachable("Didn't expect to be comparing these operand types")::llvm::llvm_unreachable_internal("Didn't expect to be comparing these operand types" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 3710); | |||
3711 | } | |||
3712 | } | |||
3713 | ||||
3714 | bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, | |||
3715 | const MachineOperand &MO) const { | |||
3716 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
3717 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; | |||
3718 | ||||
3719 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); | |||
3720 | ||||
3721 | if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) | |||
3722 | return true; | |||
3723 | ||||
3724 | if (OpInfo.RegClass < 0) | |||
3725 | return false; | |||
3726 | ||||
3727 | if (MO.isImm() && isInlineConstant(MO, OpInfo)) { | |||
3728 | if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && | |||
3729 | OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3730 | AMDGPU::OpName::src2)) | |||
3731 | return false; | |||
3732 | return RI.opCanUseInlineConstant(OpInfo.OperandType); | |||
3733 | } | |||
3734 | ||||
3735 | if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) | |||
3736 | return false; | |||
3737 | ||||
3738 | if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) | |||
3739 | return true; | |||
3740 | ||||
3741 | return ST.hasVOP3Literal(); | |||
3742 | } | |||
3743 | ||||
3744 | bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { | |||
3745 | // GFX90A does not have V_MUL_LEGACY_F32_e32. | |||
3746 | if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) | |||
3747 | return false; | |||
3748 | ||||
3749 | int Op32 = AMDGPU::getVOPe32(Opcode); | |||
3750 | if (Op32 == -1) | |||
3751 | return false; | |||
3752 | ||||
3753 | return pseudoToMCOpcode(Op32) != -1; | |||
3754 | } | |||
3755 | ||||
3756 | bool SIInstrInfo::hasModifiers(unsigned Opcode) const { | |||
3757 | // The src0_modifier operand is present on all instructions | |||
3758 | // that have modifiers. | |||
3759 | ||||
3760 | return AMDGPU::getNamedOperandIdx(Opcode, | |||
3761 | AMDGPU::OpName::src0_modifiers) != -1; | |||
3762 | } | |||
3763 | ||||
3764 | bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, | |||
3765 | unsigned OpName) const { | |||
3766 | const MachineOperand *Mods = getNamedOperand(MI, OpName); | |||
3767 | return Mods && Mods->getImm(); | |||
3768 | } | |||
3769 | ||||
3770 | bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { | |||
3771 | return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || | |||
3772 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || | |||
3773 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || | |||
3774 | hasModifiersSet(MI, AMDGPU::OpName::clamp) || | |||
3775 | hasModifiersSet(MI, AMDGPU::OpName::omod); | |||
3776 | } | |||
3777 | ||||
3778 | bool SIInstrInfo::canShrink(const MachineInstr &MI, | |||
3779 | const MachineRegisterInfo &MRI) const { | |||
3780 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3781 | // Can't shrink instruction with three operands. | |||
3782 | if (Src2) { | |||
3783 | switch (MI.getOpcode()) { | |||
3784 | default: return false; | |||
3785 | ||||
3786 | case AMDGPU::V_ADDC_U32_e64: | |||
3787 | case AMDGPU::V_SUBB_U32_e64: | |||
3788 | case AMDGPU::V_SUBBREV_U32_e64: { | |||
3789 | const MachineOperand *Src1 | |||
3790 | = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3791 | if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) | |||
3792 | return false; | |||
3793 | // Additional verification is needed for sdst/src2. | |||
3794 | return true; | |||
3795 | } | |||
3796 | case AMDGPU::V_MAC_F16_e64: | |||
3797 | case AMDGPU::V_MAC_F32_e64: | |||
3798 | case AMDGPU::V_MAC_LEGACY_F32_e64: | |||
3799 | case AMDGPU::V_FMAC_F16_e64: | |||
3800 | case AMDGPU::V_FMAC_F32_e64: | |||
3801 | case AMDGPU::V_FMAC_F64_e64: | |||
3802 | case AMDGPU::V_FMAC_LEGACY_F32_e64: | |||
3803 | if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || | |||
3804 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) | |||
3805 | return false; | |||
3806 | break; | |||
3807 | ||||
3808 | case AMDGPU::V_CNDMASK_B32_e64: | |||
3809 | break; | |||
3810 | } | |||
3811 | } | |||
3812 | ||||
3813 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3814 | if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || | |||
3815 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) | |||
3816 | return false; | |||
3817 | ||||
3818 | // We don't need to check src0; all input types are legal, so just make sure | |||
3819 | // src0 isn't using any modifiers. | |||
3820 | if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) | |||
3821 | return false; | |||
3822 | ||||
3823 | // Can it be shrunk to a valid 32 bit opcode? | |||
3824 | if (!hasVALU32BitEncoding(MI.getOpcode())) | |||
3825 | return false; | |||
3826 | ||||
3827 | // Check output modifiers | |||
3828 | return !hasModifiersSet(MI, AMDGPU::OpName::omod) && | |||
3829 | !hasModifiersSet(MI, AMDGPU::OpName::clamp); | |||
3830 | } | |||
3831 | ||||
3832 | // Set VCC operand with all flags from \p Orig, except for setting it as | |||
3833 | // implicit. | |||
3834 | static void copyFlagsToImplicitVCC(MachineInstr &MI, | |||
3835 | const MachineOperand &Orig) { | |||
3836 | ||||
3837 | for (MachineOperand &Use : MI.implicit_operands()) { | |||
3838 | if (Use.isUse() && | |||
3839 | (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { | |||
3840 | Use.setIsUndef(Orig.isUndef()); | |||
3841 | Use.setIsKill(Orig.isKill()); | |||
3842 | return; | |||
3843 | } | |||
3844 | } | |||
3845 | } | |||
3846 | ||||
3847 | MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, | |||
3848 | unsigned Op32) const { | |||
3849 | MachineBasicBlock *MBB = MI.getParent(); | |||
3850 | MachineInstrBuilder Inst32 = | |||
3851 | BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) | |||
3852 | .setMIFlags(MI.getFlags()); | |||
3853 | ||||
3854 | // Add the dst operand if the 32-bit encoding also has an explicit $vdst. | |||
3855 | // For VOPC instructions, this is replaced by an implicit def of vcc. | |||
3856 | int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); | |||
3857 | if (Op32DstIdx != -1) { | |||
3858 | // dst | |||
3859 | Inst32.add(MI.getOperand(0)); | |||
3860 | } else { | |||
3861 | assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || | |||
3862 | (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && | |||
3863 | "Unexpected case"); | |||
3864 | } | |||
3865 | ||||
3866 | Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); | |||
3867 | ||||
3868 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3869 | if (Src1) | |||
3870 | Inst32.add(*Src1); | |||
3871 | ||||
3872 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3873 | ||||
3874 | if (Src2) { | |||
3875 | int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); | |||
3876 | if (Op32Src2Idx != -1) { | |||
3877 | Inst32.add(*Src2); | |||
3878 | } else { | |||
3879 | // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is | |||
3880 | // replaced with an implicit read of vcc or vcc_lo. The implicit read | |||
3881 | // of vcc was already added during the initial BuildMI, but we | |||
3882 | // 1) may need to change vcc to vcc_lo to preserve the original register | |||
3883 | // 2) have to preserve the original flags. | |||
3884 | fixImplicitOperands(*Inst32); | |||
3885 | copyFlagsToImplicitVCC(*Inst32, *Src2); | |||
3886 | } | |||
3887 | } | |||
3888 | ||||
3889 | return Inst32; | |||
3890 | } | |||
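// Illustrative shrink, in MIR-like notation (a sketch assuming no source or
// output modifiers are set, so canShrink() holds and hasVALU32BitEncoding()
// finds a VOP2 form):
//   %2:vgpr_32 = V_ADD_F32_e64 0, %0:vgpr_32, 0, %1:vgpr_32, 0, 0, implicit $mode, implicit $exec
// is rebuilt by buildShrunkInst() with Op32 = V_ADD_F32_e32 as:
//   %2:vgpr_32 = V_ADD_F32_e32 %0:vgpr_32, %1:vgpr_32, implicit $mode, implicit $exec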
3891 | ||||
3892 | bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, | |||
3893 | const MachineOperand &MO, | |||
3894 | const MCOperandInfo &OpInfo) const { | |||
3895 | // Literal constants use the constant bus. | |||
3896 | //if (isLiteralConstantLike(MO, OpInfo)) | |||
3897 | // return true; | |||
3898 | if (MO.isImm()) | |||
3899 | return !isInlineConstant(MO, OpInfo); | |||
3900 | ||||
3901 | if (!MO.isReg()) | |||
3902 | return true; // Misc other operands like FrameIndex | |||
3903 | ||||
3904 | if (!MO.isUse()) | |||
3905 | return false; | |||
3906 | ||||
3907 | if (MO.getReg().isVirtual()) | |||
3908 | return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); | |||
3909 | ||||
3910 | // Null is free | |||
3911 | if (MO.getReg() == AMDGPU::SGPR_NULL) | |||
3912 | return false; | |||
3913 | ||||
3914 | // SGPRs use the constant bus | |||
3915 | if (MO.isImplicit()) { | |||
3916 | return MO.getReg() == AMDGPU::M0 || | |||
3917 | MO.getReg() == AMDGPU::VCC || | |||
3918 | MO.getReg() == AMDGPU::VCC_LO; | |||
3919 | } else { | |||
3920 | return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || | |||
3921 | AMDGPU::SReg_64RegClass.contains(MO.getReg()); | |||
3922 | } | |||
3923 | } | |||
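// Example of the distinction drawn above (a sketch): in
//   V_ADD_F32_e32 %vdst, %sgpr_src, %vgpr_src
// the SGPR source occupies the single constant bus slot available to VOP2; in
//   V_ADD_F32_e32 %vdst, 1.0, %vgpr_src
// the inline constant 1.0 is free, while a non-inline literal such as 1.5
// would use the bus just like the SGPR.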
3924 | ||||
3925 | static Register findImplicitSGPRRead(const MachineInstr &MI) { | |||
3926 | for (const MachineOperand &MO : MI.implicit_operands()) { | |||
3927 | // We only care about reads. | |||
3928 | if (MO.isDef()) | |||
3929 | continue; | |||
3930 | ||||
3931 | switch (MO.getReg()) { | |||
3932 | case AMDGPU::VCC: | |||
3933 | case AMDGPU::VCC_LO: | |||
3934 | case AMDGPU::VCC_HI: | |||
3935 | case AMDGPU::M0: | |||
3936 | case AMDGPU::FLAT_SCR: | |||
3937 | return MO.getReg(); | |||
3938 | ||||
3939 | default: | |||
3940 | break; | |||
3941 | } | |||
3942 | } | |||
3943 | ||||
3944 | return AMDGPU::NoRegister; | |||
3945 | } | |||
3946 | ||||
3947 | static bool shouldReadExec(const MachineInstr &MI) { | |||
3948 | if (SIInstrInfo::isVALU(MI)) { | |||
3949 | switch (MI.getOpcode()) { | |||
3950 | case AMDGPU::V_READLANE_B32: | |||
3951 | case AMDGPU::V_WRITELANE_B32: | |||
3952 | return false; | |||
3953 | } | |||
3954 | ||||
3955 | return true; | |||
3956 | } | |||
3957 | ||||
3958 | if (MI.isPreISelOpcode() || | |||
3959 | SIInstrInfo::isGenericOpcode(MI.getOpcode()) || | |||
3960 | SIInstrInfo::isSALU(MI) || | |||
3961 | SIInstrInfo::isSMRD(MI)) | |||
3962 | return false; | |||
3963 | ||||
3964 | return true; | |||
3965 | } | |||
3966 | ||||
3967 | static bool isSubRegOf(const SIRegisterInfo &TRI, | |||
3968 | const MachineOperand &SuperVec, | |||
3969 | const MachineOperand &SubReg) { | |||
3970 | if (SubReg.getReg().isPhysical()) | |||
3971 | return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); | |||
3972 | ||||
3973 | return SubReg.getSubReg() != AMDGPU::NoSubRegister && | |||
3974 | SubReg.getReg() == SuperVec.getReg(); | |||
3975 | } | |||
3976 | ||||
3977 | bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, | |||
3978 | StringRef &ErrInfo) const { | |||
3979 | uint16_t Opcode = MI.getOpcode(); | |||
3980 | if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) | |||
3981 | return true; | |||
3982 | ||||
3983 | const MachineFunction *MF = MI.getParent()->getParent(); | |||
3984 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
3985 | ||||
3986 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); | |||
3987 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); | |||
3988 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); | |||
3989 | ||||
3990 | // Make sure the number of operands is correct. | |||
3991 | const MCInstrDesc &Desc = get(Opcode); | |||
3992 | if (!Desc.isVariadic() && | |||
3993 | Desc.getNumOperands() != MI.getNumExplicitOperands()) { | |||
3994 | ErrInfo = "Instruction has wrong number of operands."; | |||
3995 | return false; | |||
3996 | } | |||
3997 | ||||
3998 | if (MI.isInlineAsm()) { | |||
3999 | // Verify register classes for inlineasm constraints. | |||
4000 | for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); | |||
4001 | I != E; ++I) { | |||
4002 | const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); | |||
4003 | if (!RC) | |||
4004 | continue; | |||
4005 | ||||
4006 | const MachineOperand &Op = MI.getOperand(I); | |||
4007 | if (!Op.isReg()) | |||
4008 | continue; | |||
4009 | ||||
4010 | Register Reg = Op.getReg(); | |||
4011 | if (!Reg.isVirtual() && !RC->contains(Reg)) { | |||
4012 | ErrInfo = "inlineasm operand has incorrect register class."; | |||
4013 | return false; | |||
4014 | } | |||
4015 | } | |||
4016 | ||||
4017 | return true; | |||
4018 | } | |||
4019 | ||||
4020 | if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { | |||
4021 | ErrInfo = "missing memory operand from MIMG instruction."; | |||
4022 | return false; | |||
4023 | } | |||
4024 | ||||
4025 | // Make sure the register classes are correct. | |||
4026 | for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { | |||
4027 | const MachineOperand &MO = MI.getOperand(i); | |||
4028 | if (MO.isFPImm()) { | |||
4029 | ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " | |||
4030 | "all fp values to integers."; | |||
4031 | return false; | |||
4032 | } | |||
4033 | ||||
4034 | int RegClass = Desc.OpInfo[i].RegClass; | |||
4035 | ||||
4036 | switch (Desc.OpInfo[i].OperandType) { | |||
4037 | case MCOI::OPERAND_REGISTER: | |||
4038 | if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { | |||
4039 | ErrInfo = "Illegal immediate value for operand."; | |||
4040 | return false; | |||
4041 | } | |||
4042 | break; | |||
4043 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
4044 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
4045 | case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: | |||
4046 | break; | |||
4047 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
4048 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
4049 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
4050 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
4051 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
4052 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
4053 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
4054 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: | |||
4055 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
4056 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: | |||
4057 | case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { | |||
4058 | if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { | |||
4059 | ErrInfo = "Illegal immediate value for operand."; | |||
4060 | return false; | |||
4061 | } | |||
4062 | break; | |||
4063 | } | |||
4064 | case MCOI::OPERAND_IMMEDIATE: | |||
4065 | case AMDGPU::OPERAND_KIMM32: | |||
4066 | // Check if this operand is an immediate. | |||
4067 | // FrameIndex operands will be replaced by immediates, so they are | |||
4068 | // allowed. | |||
4069 | if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { | |||
4070 | ErrInfo = "Expected immediate, but got non-immediate"; | |||
4071 | return false; | |||
4072 | } | |||
4073 | LLVM_FALLTHROUGH; | |||
4074 | default: | |||
4075 | continue; | |||
4076 | } | |||
4077 | ||||
4078 | if (!MO.isReg()) | |||
4079 | continue; | |||
4080 | Register Reg = MO.getReg(); | |||
4081 | if (!Reg) | |||
4082 | continue; | |||
4083 | ||||
4084 | // FIXME: Ideally we would have separate instruction definitions with the | |||
4085 | // aligned register constraint. | |||
4086 | // FIXME: We do not verify inline asm operands, but custom inline asm | |||
4087 | // verification is broken anyway | |||
4088 | if (ST.needsAlignedVGPRs()) { | |||
4089 | const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); | |||
4090 | if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { | |||
4091 | const TargetRegisterClass *SubRC = | |||
4092 | RI.getSubRegClass(RC, MO.getSubReg()); | |||
4093 | RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); | |||
4094 | if (RC) | |||
4095 | RC = SubRC; | |||
4096 | } | |||
4097 | ||||
4098 | // Check that this is the aligned version of the class. | |||
4099 | if (!RC || !RI.isProperlyAlignedRC(*RC)) { | |||
4100 | ErrInfo = "Subtarget requires even aligned vector registers"; | |||
4101 | return false; | |||
4102 | } | |||
4103 | } | |||
4104 | ||||
4105 | if (RegClass != -1) { | |||
4106 | if (Reg.isVirtual()) | |||
4107 | continue; | |||
4108 | ||||
4109 | const TargetRegisterClass *RC = RI.getRegClass(RegClass); | |||
4110 | if (!RC->contains(Reg)) { | |||
4111 | ErrInfo = "Operand has incorrect register class."; | |||
4112 | return false; | |||
4113 | } | |||
4114 | } | |||
4115 | } | |||
4116 | ||||
4117 | // Verify SDWA | |||
4118 | if (isSDWA(MI)) { | |||
4119 | if (!ST.hasSDWA()) { | |||
4120 | ErrInfo = "SDWA is not supported on this target"; | |||
4121 | return false; | |||
4122 | } | |||
4123 | ||||
4124 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
4125 | ||||
4126 | const int OpIndices[] = {DstIdx, Src0Idx, Src1Idx, Src2Idx}; | |||
4127 | ||||
4128 | for (int OpIdx : OpIndices) { | |||
4129 | if (OpIdx == -1) | |||
4130 | continue; | |||
4131 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4132 | ||||
4133 | if (!ST.hasSDWAScalar()) { | |||
4134 | // Only VGPRs on VI | |||
4135 | if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { | |||
4136 | ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; | |||
4137 | return false; | |||
4138 | } | |||
4139 | } else { | |||
4140 | // No immediates on GFX9 | |||
4141 | if (!MO.isReg()) { | |||
4142 | ErrInfo = | |||
4143 | "Only reg allowed as operands in SDWA instructions on GFX9+"; | |||
4144 | return false; | |||
4145 | } | |||
4146 | } | |||
4147 | } | |||
4148 | ||||
4149 | if (!ST.hasSDWAOmod()) { | |||
4150 | // No omod allowed on VI | |||
4151 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
4152 | if (OMod != nullptr && | |||
4153 | (!OMod->isImm() || OMod->getImm() != 0)) { | |||
4154 | ErrInfo = "OMod not allowed in SDWA instructions on VI"; | |||
4155 | return false; | |||
4156 | } | |||
4157 | } | |||
4158 | ||||
4159 | uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); | |||
4160 | if (isVOPC(BasicOpcode)) { | |||
4161 | if (!ST.hasSDWASdst() && DstIdx != -1) { | |||
4162 | // Only vcc allowed as dst on VI for VOPC | |||
4163 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
4164 | if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { | |||
4165 | ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; | |||
4166 | return false; | |||
4167 | } | |||
4168 | } else if (!ST.hasSDWAOutModsVOPC()) { | |||
4169 | // No clamp allowed on GFX9 for VOPC | |||
4170 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
4171 | if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { | |||
4172 | ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; | |||
4173 | return false; | |||
4174 | } | |||
4175 | ||||
4176 | // No omod allowed on GFX9 for VOPC | |||
4177 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
4178 | if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { | |||
4179 | ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; | |||
4180 | return false; | |||
4181 | } | |||
4182 | } | |||
4183 | } | |||
4184 | ||||
4185 | const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
4186 | if (DstUnused && DstUnused->isImm() && | |||
4187 | DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { | |||
4188 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
4189 | if (!Dst.isReg() || !Dst.isTied()) { | |||
4190 | ErrInfo = "Dst register should have tied register"; | |||
4191 | return false; | |||
4192 | } | |||
4193 | ||||
4194 | const MachineOperand &TiedMO = | |||
4195 | MI.getOperand(MI.findTiedOperandIdx(DstIdx)); | |||
4196 | if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { | |||
4197 | ErrInfo = | |||
4198 | "Dst register should be tied to implicit use of preserved register"; | |||
4199 | return false; | |||
4200 | } else if (TiedMO.getReg().isPhysical() && | |||
4201 | Dst.getReg() != TiedMO.getReg()) { | |||
4202 | ErrInfo = "Dst register should use same physical register as preserved"; | |||
4203 | return false; | |||
4204 | } | |||
4205 | } | |||
4206 | } | |||
4207 | ||||
4208 | // Verify MIMG | |||
4209 | if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { | |||
4210 | // Ensure that the return type used is large enough for all the options | |||
4211 | // being used. TFE/LWE require an extra result register. | |||
4212 | const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); | |||
4213 | if (DMask) { | |||
4214 | uint64_t DMaskImm = DMask->getImm(); | |||
4215 | uint32_t RegCount = | |||
4216 | isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); | |||
4217 | const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); | |||
4218 | const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); | |||
4219 | const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); | |||
4220 | ||||
4221 | // Adjust for packed 16 bit values | |||
4222 | if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) | |||
4223 | RegCount >>= 1; | |||
4224 | ||||
4225 | // Adjust if using LWE or TFE | |||
4226 | if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) | |||
4227 | RegCount += 1; | |||
4228 | ||||
4229 | const uint32_t DstIdx = | |||
4230 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); | |||
4231 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
4232 | if (Dst.isReg()) { | |||
4233 | const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); | |||
4234 | uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; | |||
4235 | if (RegCount > DstSize) { | |||
4236 | ErrInfo = "MIMG instruction returns too many registers for dst " | |||
4237 | "register class"; | |||
4238 | return false; | |||
4239 | } | |||
4240 | } | |||
4241 | } | |||
4242 | } | |||
4243 | ||||
4244 | // Verify VOP*. Ignore multiple sgpr operands on writelane. | |||
4245 | if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 | |||
4246 | && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { | |||
4247 | // Only look at the true operands. Only a real operand can use the constant | |||
4248 | // bus, and we don't want to check pseudo-operands like the source modifier | |||
4249 | // flags. | |||
4250 | const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; | |||
4251 | ||||
4252 | unsigned ConstantBusCount = 0; | |||
4253 | bool UsesLiteral = false; | |||
4254 | const MachineOperand *LiteralVal = nullptr; | |||
4255 | ||||
4256 | if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) | |||
4257 | ++ConstantBusCount; | |||
4258 | ||||
4259 | SmallVector<Register, 2> SGPRsUsed; | |||
4260 | Register SGPRUsed; | |||
4261 | ||||
4262 | for (int OpIdx : OpIndices) { | |||
4263 | if (OpIdx == -1) | |||
4264 | break; | |||
4265 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4266 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { | |||
4267 | if (MO.isReg()) { | |||
4268 | SGPRUsed = MO.getReg(); | |||
4269 | if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) { | |||
4270 | return SGPRUsed != SGPR; | |||
4271 | })) { | |||
4272 | ++ConstantBusCount; | |||
4273 | SGPRsUsed.push_back(SGPRUsed); | |||
4274 | } | |||
4275 | } else { | |||
4276 | if (!UsesLiteral) { | |||
4277 | ++ConstantBusCount; | |||
4278 | UsesLiteral = true; | |||
4279 | LiteralVal = &MO; | |||
4280 | } else if (!MO.isIdenticalTo(*LiteralVal)) { | |||
4281 | assert(isVOP3(MI)); | |||
4282 | ErrInfo = "VOP3 instruction uses more than one literal"; | |||
4283 | return false; | |||
4284 | } | |||
4285 | } | |||
4286 | } | |||
4287 | } | |||
4288 | ||||
4289 | SGPRUsed = findImplicitSGPRRead(MI); | |||
4290 | if (SGPRUsed != AMDGPU::NoRegister) { | |||
4291 | // Implicit uses may safely overlap true operands | |||
4292 | if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { | |||
4293 | return !RI.regsOverlap(SGPRUsed, SGPR); | |||
4294 | })) { | |||
4295 | ++ConstantBusCount; | |||
4296 | SGPRsUsed.push_back(SGPRUsed); | |||
4297 | } | |||
4298 | } | |||
4299 | ||||
4300 | // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 may
4301 | // be an SGPR, a constant, or m0; the lane select may be an SGPR, m0, or an inline constant. | |||
4302 | if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && | |||
4303 | Opcode != AMDGPU::V_WRITELANE_B32) { | |||
4304 | ErrInfo = "VOP* instruction violates constant bus restriction"; | |||
4305 | return false; | |||
4306 | } | |||
4307 | ||||
4308 | if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) { | |||
4309 | ErrInfo = "VOP3 instruction uses literal"; | |||
4310 | return false; | |||
4311 | } | |||
4312 | } | |||
4313 | ||||
4314 | // Special case for writelane - this can break the multiple constant bus rule, | |||
4315 | // but still can't use more than one SGPR register | |||
4316 | if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { | |||
4317 | unsigned SGPRCount = 0; | |||
4318 | Register SGPRUsed = AMDGPU::NoRegister; | |||
4319 | ||||
4320 | for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { | |||
4321 | if (OpIdx == -1) | |||
4322 | break; | |||
4323 | ||||
4324 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4325 | ||||
4326 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { | |||
4327 | if (MO.isReg() && MO.getReg() != AMDGPU::M0) { | |||
4328 | if (MO.getReg() != SGPRUsed) | |||
4329 | ++SGPRCount; | |||
4330 | SGPRUsed = MO.getReg(); | |||
4331 | } | |||
4332 | } | |||
4333 | if (SGPRCount > ST.getConstantBusLimit(Opcode)) { | |||
4334 | ErrInfo = "WRITELANE instruction violates constant bus restriction"; | |||
4335 | return false; | |||
4336 | } | |||
4337 | } | |||
4338 | } | |||
4339 | ||||
4340 | // Verify misc. restrictions on specific instructions. | |||
4341 | if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || | |||
4342 | Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { | |||
4343 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4344 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4345 | const MachineOperand &Src2 = MI.getOperand(Src2Idx); | |||
4346 | if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { | |||
4347 | if (!compareMachineOp(Src0, Src1) && | |||
4348 | !compareMachineOp(Src0, Src2)) { | |||
4349 | ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; | |||
4350 | return false; | |||
4351 | } | |||
4352 | } | |||
4353 | if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & | |||
4354 | SISrcMods::ABS) || | |||
4355 | (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & | |||
4356 | SISrcMods::ABS) || | |||
4357 | (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & | |||
4358 | SISrcMods::ABS)) { | |||
4359 | ErrInfo = "ABS not allowed in VOP3B instructions"; | |||
4360 | return false; | |||
4361 | } | |||
4362 | } | |||
4363 | ||||
4364 | if (isSOP2(MI) || isSOPC(MI)) { | |||
4365 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4366 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4367 | unsigned Immediates = 0; | |||
4368 | ||||
4369 | if (!Src0.isReg() && | |||
4370 | !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) | |||
4371 | Immediates++; | |||
4372 | if (!Src1.isReg() && | |||
4373 | !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) | |||
4374 | Immediates++; | |||
4375 | ||||
4376 | if (Immediates > 1) { | |||
4377 | ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; | |||
4378 | return false; | |||
4379 | } | |||
4380 | } | |||
4381 | ||||
4382 | if (isSOPK(MI)) { | |||
4383 | auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); | |||
4384 | if (Desc.isBranch()) { | |||
4385 | if (!Op->isMBB()) { | |||
4386 | ErrInfo = "invalid branch target for SOPK instruction"; | |||
4387 | return false; | |||
4388 | } | |||
4389 | } else { | |||
4390 | uint64_t Imm = Op->getImm(); | |||
4391 | if (sopkIsZext(MI)) { | |||
4392 | if (!isUInt<16>(Imm)) { | |||
4393 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4394 | return false; | |||
4395 | } | |||
4396 | } else { | |||
4397 | if (!isInt<16>(Imm)) { | |||
4398 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4399 | return false; | |||
4400 | } | |||
4401 | } | |||
4402 | } | |||
4403 | } | |||
4404 | ||||
4405 | if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || | |||
4406 | Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || | |||
4407 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4408 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { | |||
4409 | const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4410 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; | |||
4411 | ||||
4412 | const unsigned StaticNumOps = Desc.getNumOperands() + | |||
4413 | Desc.getNumImplicitUses(); | |||
4414 | const unsigned NumImplicitOps = IsDst ? 2 : 1; | |||
4415 | ||||
4416 | // Allow additional implicit operands. This allows a fixup done by the post | |||
4417 | // RA scheduler where the main implicit operand is killed and implicit-defs | |||
4418 | // are added for sub-registers that remain live after this instruction. | |||
4419 | if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { | |||
4420 | ErrInfo = "missing implicit register operands"; | |||
4421 | return false; | |||
4422 | } | |||
4423 | ||||
4424 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4425 | if (IsDst) { | |||
4426 | if (!Dst->isUse()) { | |||
4427 | ErrInfo = "v_movreld_b32 vdst should be a use operand"; | |||
4428 | return false; | |||
4429 | } | |||
4430 | ||||
4431 | unsigned UseOpIdx; | |||
4432 | if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || | |||
4433 | UseOpIdx != StaticNumOps + 1) { | |||
4434 | ErrInfo = "movrel implicit operands should be tied"; | |||
4435 | return false; | |||
4436 | } | |||
4437 | } | |||
4438 | ||||
4439 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4440 | const MachineOperand &ImpUse | |||
4441 | = MI.getOperand(StaticNumOps + NumImplicitOps - 1); | |||
4442 | if (!ImpUse.isReg() || !ImpUse.isUse() || | |||
4443 | !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { | |||
4444 | ErrInfo = "src0 should be subreg of implicit vector use"; | |||
4445 | return false; | |||
4446 | } | |||
4447 | } | |||
4448 | ||||
4449 | // Make sure we aren't losing exec uses in the td files. This mostly requires | |||
4450 | // being careful when using let Uses to try to add other use registers. | |||
4451 | if (shouldReadExec(MI)) { | |||
4452 | if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { | |||
4453 | ErrInfo = "VALU instruction does not implicitly read exec mask"; | |||
4454 | return false; | |||
4455 | } | |||
4456 | } | |||
4457 | ||||
4458 | if (isSMRD(MI)) { | |||
4459 | if (MI.mayStore()) { | |||
4460 | // The register offset form of scalar stores may only use m0 as the | |||
4461 | // soffset register. | |||
4462 | const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); | |||
4463 | if (Soff && Soff->getReg() != AMDGPU::M0) { | |||
4464 | ErrInfo = "scalar stores must use m0 as offset register"; | |||
4465 | return false; | |||
4466 | } | |||
4467 | } | |||
4468 | } | |||
4469 | ||||
4470 | if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { | |||
4471 | const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
4472 | if (Offset->getImm() != 0) { | |||
4473 | ErrInfo = "subtarget does not support offsets in flat instructions"; | |||
4474 | return false; | |||
4475 | } | |||
4476 | } | |||
4477 | ||||
4478 | if (isMIMG(MI)) { | |||
4479 | const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); | |||
4480 | if (DimOp) { | |||
4481 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, | |||
4482 | AMDGPU::OpName::vaddr0); | |||
4483 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); | |||
4484 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); | |||
4485 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | |||
4486 | AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); | |||
4487 | const AMDGPU::MIMGDimInfo *Dim = | |||
4488 | AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); | |||
4489 | ||||
4490 | if (!Dim) { | |||
4491 | ErrInfo = "dim is out of range"; | |||
4492 | return false; | |||
4493 | } | |||
4494 | ||||
4495 | bool IsA16 = false; | |||
4496 | if (ST.hasR128A16()) { | |||
4497 | const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); | |||
4498 | IsA16 = R128A16->getImm() != 0; | |||
4499 | } else if (ST.hasGFX10A16()) { | |||
4500 | const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); | |||
4501 | IsA16 = A16->getImm() != 0; | |||
4502 | } | |||
4503 | ||||
4504 | bool IsNSA = SRsrcIdx - VAddr0Idx > 1; | |||
4505 | ||||
4506 | unsigned AddrWords = | |||
4507 | AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); | |||
4508 | ||||
4509 | unsigned VAddrWords; | |||
4510 | if (IsNSA) { | |||
4511 | VAddrWords = SRsrcIdx - VAddr0Idx; | |||
4512 | } else { | |||
4513 | const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); | |||
4514 | VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; | |||
4515 | if (AddrWords > 8) | |||
4516 | AddrWords = 16; | |||
4517 | } | |||
4518 | ||||
4519 | if (VAddrWords != AddrWords) { | |||
4520 | LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords | |||
4521 | << " but got " << VAddrWords << "\n"); | |||
4522 | ErrInfo = "bad vaddr size"; | |||
4523 | return false; | |||
4524 | } | |||
4525 | } | |||
4526 | } | |||
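// Example of the vaddr check above (a sketch): a non-NSA IMAGE_SAMPLE with a
// 2D dim and A16 disabled needs two address words (s, t), so vaddr0 must be a
// 64-bit VGPR tuple; supplying a single 32-bit VGPR would be reported as
// "bad vaddr size".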
4527 | ||||
4528 | const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); | |||
4529 | if (DppCt) { | |||
4530 | using namespace AMDGPU::DPP; | |||
4531 | ||||
4532 | unsigned DC = DppCt->getImm(); | |||
4533 | if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || | |||
4534 | DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || | |||
4535 | (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || | |||
4536 | (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || | |||
4537 | (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || | |||
4538 | (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || | |||
4539 | (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { | |||
4540 | ErrInfo = "Invalid dpp_ctrl value"; | |||
4541 | return false; | |||
4542 | } | |||
4543 | if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && | |||
4544 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4545 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4546 | "wavefront shifts are not supported on GFX10+"; | |||
4547 | return false; | |||
4548 | } | |||
4549 | if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && | |||
4550 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4551 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4552 | "broadcasts are not supported on GFX10+"; | |||
4553 | return false; | |||
4554 | } | |||
4555 | if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && | |||
4556 | ST.getGeneration() < AMDGPUSubtarget::GFX10) { | |||
4557 | if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && | |||
4558 | DC <= DppCtrl::ROW_NEWBCAST_LAST && | |||
4559 | !ST.hasGFX90AInsts()) { | |||
4560 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4561 | "row_newbroadcast/row_share is not supported before " | |||
4562 | "GFX90A/GFX10"; | |||
4563 | return false; | |||
4564 | } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { | |||
4565 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4566 | "row_share and row_xmask are not supported before GFX10"; | |||
4567 | return false; | |||
4568 | } | |||
4569 | } | |||
4570 | ||||
4571 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
4572 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); | |||
4573 | ||||
4574 | if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && | |||
4575 | ((DstIdx >= 0 && | |||
4576 | (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || | |||
4577 | Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) || | |||
4578 | ((Src0Idx >= 0 && | |||
4579 | (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || | |||
4580 | Desc.OpInfo[Src0Idx].RegClass == | |||
4581 | AMDGPU::VReg_64_Align2RegClassID)))) && | |||
4582 | !AMDGPU::isLegal64BitDPPControl(DC)) { | |||
4583 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4584 | "64 bit dpp only support row_newbcast"; | |||
4585 | return false; | |||
4586 | } | |||
4587 | } | |||
4588 | ||||
4589 | if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { | |||
4590 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4591 | uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0 | |||
4592 | : AMDGPU::OpName::vdata; | |||
4593 | const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); | |||
4594 | const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); | |||
4595 | if (Data && !Data->isReg()) | |||
4596 | Data = nullptr; | |||
4597 | ||||
4598 | if (ST.hasGFX90AInsts()) { | |||
4599 | if (Dst && Data && | |||
4600 | (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { | |||
4601 | ErrInfo = "Invalid register class: " | |||
4602 | "vdata and vdst should be both VGPR or AGPR"; | |||
4603 | return false; | |||
4604 | } | |||
4605 | if (Data && Data2 && | |||
4606 | (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { | |||
4607 | ErrInfo = "Invalid register class: " | |||
4608 | "both data operands should be VGPR or AGPR"; | |||
4609 | return false; | |||
4610 | } | |||
4611 | } else { | |||
4612 | if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || | |||
4613 | (Data && RI.isAGPR(MRI, Data->getReg())) || | |||
4614 | (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { | |||
4615 | ErrInfo = "Invalid register class: " | |||
4616 | "agpr loads and stores not supported on this GPU"; | |||
4617 | return false; | |||
4618 | } | |||
4619 | } | |||
4620 | } | |||
4621 | ||||
4622 | if (ST.needsAlignedVGPRs() && | |||
4623 | (MI.getOpcode() == AMDGPU::DS_GWS_INIT || | |||
4624 | MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || | |||
4625 | MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { | |||
4626 | const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); | |||
4627 | Register Reg = Op->getReg(); | |||
4628 | bool Aligned = true; | |||
4629 | if (Reg.isPhysical()) { | |||
4630 | Aligned = !(RI.getHWRegIndex(Reg) & 1); | |||
4631 | } else { | |||
4632 | const TargetRegisterClass &RC = *MRI.getRegClass(Reg); | |||
4633 | Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && | |||
4634 | !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); | |||
4635 | } | |||
4636 | ||||
4637 | if (!Aligned) { | |||
4638 | ErrInfo = "Subtarget requires even aligned vector registers " | |||
4639 | "for DS_GWS instructions"; | |||
4640 | return false; | |||
4641 | } | |||
4642 | } | |||
4643 | ||||
4644 | if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) { | |||
4645 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
4646 | if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) { | |||
4647 | ErrInfo = "pseudo expects only physical SGPRs"; | |||
4648 | return false; | |||
4649 | } | |||
4650 | } | |||
4651 | ||||
4652 | return true; | |||
4653 | } | |||
4654 | ||||
4655 | unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { | |||
4656 | switch (MI.getOpcode()) { | |||
4657 | default: return AMDGPU::INSTRUCTION_LIST_END; | |||
4658 | case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; | |||
4659 | case AMDGPU::COPY: return AMDGPU::COPY; | |||
4660 | case AMDGPU::PHI: return AMDGPU::PHI; | |||
4661 | case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; | |||
4662 | case AMDGPU::WQM: return AMDGPU::WQM; | |||
4663 | case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; | |||
4664 | case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; | |||
4665 | case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; | |||
4666 | case AMDGPU::S_MOV_B32: { | |||
4667 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4668 | return MI.getOperand(1).isReg() || | |||
4669 | RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? | |||
4670 | AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; | |||
4671 | } | |||
4672 | case AMDGPU::S_ADD_I32: | |||
4673 | return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; | |||
4674 | case AMDGPU::S_ADDC_U32: | |||
4675 | return AMDGPU::V_ADDC_U32_e32; | |||
4676 | case AMDGPU::S_SUB_I32: | |||
4677 | return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; | |||
4678 | // FIXME: These are not consistently handled, and selected when the carry is | |||
4679 | // used. | |||
4680 | case AMDGPU::S_ADD_U32: | |||
4681 | return AMDGPU::V_ADD_CO_U32_e32; | |||
4682 | case AMDGPU::S_SUB_U32: | |||
4683 | return AMDGPU::V_SUB_CO_U32_e32; | |||
4684 | case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; | |||
4685 | case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; | |||
4686 | case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; | |||
4687 | case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; | |||
4688 | case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; | |||
4689 | case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; | |||
4690 | case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; | |||
4691 | case AMDGPU::S_XNOR_B32: | |||
4692 | return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; | |||
4693 | case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; | |||
4694 | case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; | |||
4695 | case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; | |||
4696 | case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; | |||
4697 | case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; | |||
4698 | case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; | |||
4699 | case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; | |||
4700 | case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; | |||
4701 | case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; | |||
4702 | case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; | |||
4703 | case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; | |||
4704 | case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; | |||
4705 | case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; | |||
4706 | case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; | |||
4707 | case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; | |||
4708 | case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; | |||
4709 | case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; | |||
4710 | case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; | |||
4711 | case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64; | |||
4712 | case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64; | |||
4713 | case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64; | |||
4714 | case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64; | |||
4715 | case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64; | |||
4716 | case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64; | |||
4717 | case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64; | |||
4718 | case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64; | |||
4719 | case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64; | |||
4720 | case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64; | |||
4721 | case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64; | |||
4722 | case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64; | |||
4723 | case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64; | |||
4724 | case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64; | |||
4725 | case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; | |||
4726 | case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; | |||
4727 | case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; | |||
4728 | case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; | |||
4729 | case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; | |||
4730 | case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; | |||
4731 | } | |||
4732 | llvm_unreachable( | |||
4733 | "Unexpected scalar opcode without corresponding vector one!"); | |||
4734 | } | |||
4735 | ||||
4736 | static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, | |||
4737 | const MachineRegisterInfo &MRI, | |||
4738 | const MCInstrDesc &TID, | |||
4739 | unsigned RCID, | |||
4740 | bool IsAllocatable) { | |||
4741 | if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && | |||
4742 | (((TID.mayLoad() || TID.mayStore()) && | |||
4743 | !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || | |||
4744 | (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { | |||
4745 | switch (RCID) { | |||
4746 | case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; | |||
4747 | case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; | |||
4748 | case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID; | |||
4749 | case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID; | |||
4750 | case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID; | |||
4751 | default: | |||
4752 | break; | |||
4753 | } | |||
4754 | } | |||
4755 | return RCID; | |||
4756 | } | |||
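The switch above narrows combined VGPR/AGPR ("AV") register classes to plain VGPR classes for memory-class instructions unless AGPR allocation is actually intended. The following is a minimal, self-contained sketch of that decision; the enum values and function name are stand-ins invented for illustration and are not AMDGPU's generated register-class IDs.

#include <cstdio>

// Illustrative stand-ins for register-class IDs; the real values come from
// AMDGPU's generated register information, not from this sketch.
enum RCID { AV_32, AV_64, VGPR_32, VReg_64, OtherRC };

// Mirrors the narrowing rule above: AV ("either VGPR or AGPR") operands of
// memory-class instructions are narrowed to plain VGPR classes unless AGPR
// allocation is actually intended.
static RCID narrowAVClass(RCID RC, bool IsMemOp, bool IsAllocatable,
                          bool HasGFX90AInsts, bool RegsFrozen) {
  if ((IsAllocatable || !HasGFX90AInsts || !RegsFrozen) && IsMemOp) {
    if (RC == AV_32) return VGPR_32;
    if (RC == AV_64) return VReg_64;
  }
  return RC;
}

int main() {
  // A load operand in AV_64 is narrowed to VReg_64 on a non-GFX90A target.
  std::printf("%d\n", narrowAVClass(AV_64, true, false, false, false) == VReg_64);
}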
4757 | ||||
4758 | const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, | |||
4759 | unsigned OpNum, const TargetRegisterInfo *TRI, | |||
4760 | const MachineFunction &MF) | |||
4761 | const { | |||
4762 | if (OpNum >= TID.getNumOperands()) | |||
4763 | return nullptr; | |||
4764 | auto RegClass = TID.OpInfo[OpNum].RegClass; | |||
4765 | bool IsAllocatable = false; | |||
4766 | if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { | |||
4767 | // vdst and vdata should both be VGPR or AGPR; the same holds for DS | |||
4768 | // instructions with two data operands. Request a register class constrained | |||
4769 | // to VGPR only if both operands are present, as Machine Copy Propagation | |||
4770 | // cannot check this constraint, and possibly other passes cannot either. | |||
4771 | // | |||
4772 | // The check is limited to FLAT and DS because atomics in non-flat encoding | |||
4773 | // have their vdst and vdata tied to be the same register. | |||
4774 | const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4775 | AMDGPU::OpName::vdst); | |||
4776 | const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4777 | (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 | |||
4778 | : AMDGPU::OpName::vdata); | |||
4779 | if (DataIdx != -1) { | |||
4780 | IsAllocatable = VDstIdx != -1 || | |||
4781 | AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4782 | AMDGPU::OpName::data1) != -1; | |||
4783 | } | |||
4784 | } | |||
4785 | RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass, | |||
4786 | IsAllocatable); | |||
4787 | return RI.getRegClass(RegClass); | |||
4788 | } | |||
4789 | ||||
4790 | const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, | |||
4791 | unsigned OpNo) const { | |||
4792 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4793 | const MCInstrDesc &Desc = get(MI.getOpcode()); | |||
4794 | if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || | |||
4795 | Desc.OpInfo[OpNo].RegClass == -1) { | |||
4796 | Register Reg = MI.getOperand(OpNo).getReg(); | |||
4797 | ||||
4798 | if (Reg.isVirtual()) | |||
4799 | return MRI.getRegClass(Reg); | |||
4800 | return RI.getPhysRegClass(Reg); | |||
4801 | } | |||
4802 | ||||
4803 | unsigned RCID = Desc.OpInfo[OpNo].RegClass; | |||
4804 | RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true); | |||
4805 | return RI.getRegClass(RCID); | |||
4806 | } | |||
4807 | ||||
4808 | void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { | |||
4809 | MachineBasicBlock::iterator I = MI; | |||
4810 | MachineBasicBlock *MBB = MI.getParent(); | |||
4811 | MachineOperand &MO = MI.getOperand(OpIdx); | |||
4812 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
4813 | unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; | |||
4814 | const TargetRegisterClass *RC = RI.getRegClass(RCID); | |||
4815 | unsigned Size = RI.getRegSizeInBits(*RC); | |||
4816 | unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; | |||
4817 | if (MO.isReg()) | |||
4818 | Opcode = AMDGPU::COPY; | |||
4819 | else if (RI.isSGPRClass(RC)) | |||
4820 | Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; | |||
4821 | ||||
4822 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); | |||
4823 | const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); | |||
4824 | if (RI.getCommonSubClass(VRC64, VRC)) | |||
4825 | VRC = VRC64; | |||
4826 | else | |||
4827 | VRC = &AMDGPU::VGPR_32RegClass; | |||
4828 | ||||
4829 | Register Reg = MRI.createVirtualRegister(VRC); | |||
4830 | DebugLoc DL = MBB->findDebugLoc(I); | |||
4831 | BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); | |||
4832 | MO.ChangeToRegister(Reg, false); | |||
4833 | } | |||
4834 | ||||
4835 | unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, | |||
4836 | MachineRegisterInfo &MRI, | |||
4837 | MachineOperand &SuperReg, | |||
4838 | const TargetRegisterClass *SuperRC, | |||
4839 | unsigned SubIdx, | |||
4840 | const TargetRegisterClass *SubRC) | |||
4841 | const { | |||
4842 | MachineBasicBlock *MBB = MI->getParent(); | |||
4843 | DebugLoc DL = MI->getDebugLoc(); | |||
4844 | Register SubReg = MRI.createVirtualRegister(SubRC); | |||
4845 | ||||
4846 | if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { | |||
4847 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
4848 | .addReg(SuperReg.getReg(), 0, SubIdx); | |||
4849 | return SubReg; | |||
4850 | } | |||
4851 | ||||
4852 | // Just in case the super register is itself a sub-register, copy it to a new | |||
4853 | // value so we don't need to worry about merging its subreg index with the | |||
4854 | // SubIdx passed to this function. The register coalescer should be able to | |||
4855 | // eliminate this extra copy. | |||
4856 | Register NewSuperReg = MRI.createVirtualRegister(SuperRC); | |||
4857 | ||||
4858 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) | |||
4859 | .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); | |||
4860 | ||||
4861 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
4862 | .addReg(NewSuperReg, 0, SubIdx); | |||
4863 | ||||
4864 | return SubReg; | |||
4865 | } | |||
4866 | ||||
4867 | MachineOperand SIInstrInfo::buildExtractSubRegOrImm( | |||
4868 | MachineBasicBlock::iterator MII, | |||
4869 | MachineRegisterInfo &MRI, | |||
4870 | MachineOperand &Op, | |||
4871 | const TargetRegisterClass *SuperRC, | |||
4872 | unsigned SubIdx, | |||
4873 | const TargetRegisterClass *SubRC) const { | |||
4874 | if (Op.isImm()) { | |||
4875 | if (SubIdx == AMDGPU::sub0) | |||
4876 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); | |||
4877 | if (SubIdx == AMDGPU::sub1) | |||
4878 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); | |||
4879 | ||||
4880 | llvm_unreachable("Unhandled register index for immediate")::llvm::llvm_unreachable_internal("Unhandled register index for immediate" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 4880); | |||
4881 | } | |||
4882 | ||||
4883 | unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, | |||
4884 | SubIdx, SubRC); | |||
4885 | return MachineOperand::CreateReg(SubReg, false); | |||
4886 | } | |||
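For an immediate operand, the sub0/sub1 cases above amount to taking the low and high 32-bit halves of a 64-bit value. A minimal sketch of that split, using plain integers rather than MachineOperand; the helper name is invented for illustration.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Takes the 32-bit half of a 64-bit immediate selected by SubIdx
// (0 = low half / "sub0", 1 = high half / "sub1").
static int32_t extractImmHalf(int64_t Imm, unsigned SubIdx) {
  assert(SubIdx < 2 && "Unhandled register index for immediate");
  return SubIdx == 0 ? static_cast<int32_t>(Imm)
                     : static_cast<int32_t>(Imm >> 32);
}

int main() {
  int64_t Imm = 0x123456789ABCDEF0LL;
  std::printf("sub0 = 0x%08x, sub1 = 0x%08x\n",
              static_cast<uint32_t>(extractImmHalf(Imm, 0)),
              static_cast<uint32_t>(extractImmHalf(Imm, 1)));
  // Prints: sub0 = 0x9abcdef0, sub1 = 0x12345678
}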
4887 | ||||
4888 | // Change the order of operands from (0, 1, 2) to (0, 2, 1) | |||
4889 | void SIInstrInfo::swapOperands(MachineInstr &Inst) const { | |||
4890 | assert(Inst.getNumExplicitOperands() == 3); | |||
4891 | MachineOperand Op1 = Inst.getOperand(1); | |||
4892 | Inst.RemoveOperand(1); | |||
4893 | Inst.addOperand(Op1); | |||
4894 | } | |||
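The swap works because removing operand 1 and appending it again turns an operand order of (0, 1, 2) into (0, 2, 1). The same effect on a plain vector, as a small illustrative sketch:

#include <cassert>
#include <vector>

// Move element 1 to the back: (0, 1, 2) becomes (0, 2, 1).
static void swapLastTwo(std::vector<int> &Ops) {
  assert(Ops.size() == 3);
  int Op1 = Ops[1];
  Ops.erase(Ops.begin() + 1);
  Ops.push_back(Op1);
}

int main() {
  std::vector<int> Ops = {10, 11, 12};
  swapLastTwo(Ops);
  assert(Ops[0] == 10 && Ops[1] == 12 && Ops[2] == 11);
}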
4895 | ||||
4896 | bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, | |||
4897 | const MCOperandInfo &OpInfo, | |||
4898 | const MachineOperand &MO) const { | |||
4899 | if (!MO.isReg()) | |||
4900 | return false; | |||
4901 | ||||
4902 | Register Reg = MO.getReg(); | |||
4903 | ||||
4904 | const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); | |||
4905 | if (Reg.isPhysical()) | |||
4906 | return DRC->contains(Reg); | |||
4907 | ||||
4908 | const TargetRegisterClass *RC = MRI.getRegClass(Reg); | |||
4909 | ||||
4910 | if (MO.getSubReg()) { | |||
4911 | const MachineFunction *MF = MO.getParent()->getParent()->getParent(); | |||
4912 | const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); | |||
4913 | if (!SuperRC) | |||
4914 | return false; | |||
4915 | ||||
4916 | DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); | |||
4917 | if (!DRC) | |||
4918 | return false; | |||
4919 | } | |||
4920 | return RC->hasSuperClassEq(DRC); | |||
4921 | } | |||
4922 | ||||
4923 | bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, | |||
4924 | const MCOperandInfo &OpInfo, | |||
4925 | const MachineOperand &MO) const { | |||
4926 | if (MO.isReg()) | |||
4927 | return isLegalRegOperand(MRI, OpInfo, MO); | |||
4928 | ||||
4929 | // Handle non-register types that are treated like immediates. | |||
4930 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); | |||
4931 | return true; | |||
4932 | } | |||
4933 | ||||
4934 | bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, | |||
4935 | const MachineOperand *MO) const { | |||
4936 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
4937 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
4938 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
4939 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; | |||
4940 | const TargetRegisterClass *DefinedRC = | |||
4941 | OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; | |||
4942 | if (!MO) | |||
4943 | MO = &MI.getOperand(OpIdx); | |||
4944 | ||||
4945 | int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); | |||
4946 | int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
4947 | if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { | |||
4948 | if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) | |||
4949 | return false; | |||
4950 | ||||
4951 | SmallDenseSet<RegSubRegPair> SGPRsUsed; | |||
4952 | if (MO->isReg()) | |||
4953 | SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); | |||
4954 | ||||
4955 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { | |||
4956 | if (i == OpIdx) | |||
4957 | continue; | |||
4958 | const MachineOperand &Op = MI.getOperand(i); | |||
4959 | if (Op.isReg()) { | |||
4960 | RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); | |||
4961 | if (!SGPRsUsed.count(SGPR) && | |||
4962 | usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { | |||
4963 | if (--ConstantBusLimit <= 0) | |||
4964 | return false; | |||
4965 | SGPRsUsed.insert(SGPR); | |||
4966 | } | |||
4967 | } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { | |||
4968 | if (--ConstantBusLimit <= 0) | |||
4969 | return false; | |||
4970 | } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && | |||
4971 | isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { | |||
4972 | if (!VOP3LiteralLimit--) | |||
4973 | return false; | |||
4974 | if (--ConstantBusLimit <= 0) | |||
4975 | return false; | |||
4976 | } | |||
4977 | } | |||
4978 | } | |||
4979 | ||||
4980 | if (MO->isReg()) { | |||
4981 | assert(DefinedRC); | |||
4982 | if (!isLegalRegOperand(MRI, OpInfo, *MO)) | |||
4983 | return false; | |||
4984 | bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); | |||
4985 | if (IsAGPR && !ST.hasMAIInsts()) | |||
4986 | return false; | |||
4987 | unsigned Opc = MI.getOpcode(); | |||
4988 | if (IsAGPR && | |||
4989 | (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && | |||
4990 | (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) | |||
4991 | return false; | |||
4992 | // Atomics should have both vdst and vdata either vgpr or agpr. | |||
4993 | const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
4994 | const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
4995 | isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); | |||
4996 | if ((int)OpIdx == VDstIdx && DataIdx != -1 && | |||
4997 | MI.getOperand(DataIdx).isReg() && | |||
4998 | RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) | |||
4999 | return false; | |||
5000 | if ((int)OpIdx == DataIdx) { | |||
5001 | if (VDstIdx != -1 && | |||
5002 | RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) | |||
5003 | return false; | |||
5004 | // DS instructions with 2 src operands also must have tied RC. | |||
5005 | const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, | |||
5006 | AMDGPU::OpName::data1); | |||
5007 | if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && | |||
5008 | RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) | |||
5009 | return false; | |||
5010 | } | |||
5011 | if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && | |||
5012 | (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && | |||
5013 | RI.isSGPRReg(MRI, MO->getReg())) | |||
5014 | return false; | |||
5015 | return true; | |||
5016 | } | |||
5017 | ||||
5018 | // Handle non-register types that are treated like immediates. | |||
5019 | assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal())(static_cast <bool> (MO->isImm() || MO->isTargetIndex () || MO->isFI() || MO->isGlobal()) ? void (0) : __assert_fail ("MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 5019, __extension__ __PRETTY_FUNCTION__)); | |||
5020 | ||||
5021 | if (!DefinedRC) { | |||
5022 | // This operand expects an immediate. | |||
5023 | return true; | |||
5024 | } | |||
5025 | ||||
5026 | return isImmOperandLegal(MI, OpIdx, *MO); | |||
5027 | } | |||
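The loop above enforces the constant-bus budget: each distinct SGPR and each literal-like operand consumes one slot, with the limit depending on the subtarget, plus a separate cap on VOP3 literals. Below is a simplified standalone model of that bookkeeping; the Operand struct and its classification flags are assumptions made for illustration and do not use the LLVM operand API.

#include <cstdio>
#include <set>

struct Operand {
  bool IsSGPR;     // counts against the constant bus once per unique register
  bool IsLiteral;  // literal-like immediate
  int  Reg;        // identity used to de-duplicate SGPR uses
};

// Returns true if the operand list fits the constant-bus and literal budgets.
static bool fitsConstantBus(const Operand *Ops, int N, int BusLimit,
                            int VOP3LiteralLimit) {
  std::set<int> SGPRsUsed;
  for (int I = 0; I < N; ++I) {
    if (Ops[I].IsSGPR) {
      // Only a newly seen SGPR consumes a constant-bus slot.
      if (SGPRsUsed.insert(Ops[I].Reg).second && --BusLimit < 0)
        return false;
    } else if (Ops[I].IsLiteral) {
      if (--VOP3LiteralLimit < 0 || --BusLimit < 0)
        return false;
    }
  }
  return true;
}

int main() {
  // Two uses of the same SGPR count once; a second distinct SGPR does not
  // fit a budget of one constant-bus slot.
  Operand Ops[] = {{true, false, 4}, {true, false, 4}, {true, false, 5}};
  std::printf("%d\n", fitsConstantBus(Ops, 3, /*BusLimit=*/1,
                                      /*VOP3LiteralLimit=*/0)); // prints 0
}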
5028 | ||||
5029 | void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, | |||
5030 | MachineInstr &MI) const { | |||
5031 | unsigned Opc = MI.getOpcode(); | |||
5032 | const MCInstrDesc &InstrDesc = get(Opc); | |||
5033 | ||||
5034 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
5035 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
5036 | ||||
5037 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
5038 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
5039 | ||||
5040 | // If there is an implicit SGPR use, such as the VCC use for | |||
5041 | // v_addc_u32/v_subb_u32, we are only allowed one constant bus use before GFX10. | |||
5042 | bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; | |||
5043 | if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && | |||
5044 | Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || | |||
5045 | isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) | |||
5046 | legalizeOpWithMove(MI, Src0Idx); | |||
5047 | ||||
5048 | // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for | |||
5049 | // both the value to write (src0) and lane select (src1). Fix up non-SGPR | |||
5050 | // src0/src1 with V_READFIRSTLANE. | |||
5051 | if (Opc == AMDGPU::V_WRITELANE_B32) { | |||
5052 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5053 | if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { | |||
5054 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5055 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5056 | .add(Src0); | |||
5057 | Src0.ChangeToRegister(Reg, false); | |||
5058 | } | |||
5059 | if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { | |||
5060 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5061 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5062 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5063 | .add(Src1); | |||
5064 | Src1.ChangeToRegister(Reg, false); | |||
5065 | } | |||
5066 | return; | |||
5067 | } | |||
5068 | ||||
5069 | // No VOP2 instructions support AGPRs. | |||
5070 | if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) | |||
5071 | legalizeOpWithMove(MI, Src0Idx); | |||
5072 | ||||
5073 | if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) | |||
5074 | legalizeOpWithMove(MI, Src1Idx); | |||
5075 | ||||
5076 | // VOP2 src0 instructions support all operand types, so we don't need to check | |||
5077 | // their legality. If src1 is already legal, we don't need to do anything. | |||
5078 | if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) | |||
5079 | return; | |||
5080 | ||||
5081 | // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for | |||
5082 | // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane | |||
5083 | // select is uniform. | |||
5084 | if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && | |||
5085 | RI.isVGPR(MRI, Src1.getReg())) { | |||
5086 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5087 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5088 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5089 | .add(Src1); | |||
5090 | Src1.ChangeToRegister(Reg, false); | |||
5091 | return; | |||
5092 | } | |||
5093 | ||||
5094 | // We do not use commuteInstruction here because it is too aggressive and will | |||
5095 | // commute whenever possible. We only want to commute here if it improves | |||
5096 | // legality. This can be called a fairly large number of times so don't waste | |||
5097 | // compile time pointlessly swapping and checking legality again. | |||
5098 | if (HasImplicitSGPR || !MI.isCommutable()) { | |||
5099 | legalizeOpWithMove(MI, Src1Idx); | |||
5100 | return; | |||
5101 | } | |||
5102 | ||||
5103 | // If src0 can be used as src1, commuting will make the operands legal. | |||
5104 | // Otherwise we have to give up and insert a move. | |||
5105 | // | |||
5106 | // TODO: Other immediate-like operand kinds could be commuted if there was a | |||
5107 | // MachineOperand::ChangeTo* for them. | |||
5108 | if ((!Src1.isImm() && !Src1.isReg()) || | |||
5109 | !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { | |||
5110 | legalizeOpWithMove(MI, Src1Idx); | |||
5111 | return; | |||
5112 | } | |||
5113 | ||||
5114 | int CommutedOpc = commuteOpcode(MI); | |||
5115 | if (CommutedOpc == -1) { | |||
5116 | legalizeOpWithMove(MI, Src1Idx); | |||
5117 | return; | |||
5118 | } | |||
5119 | ||||
5120 | MI.setDesc(get(CommutedOpc)); | |||
5121 | ||||
5122 | Register Src0Reg = Src0.getReg(); | |||
5123 | unsigned Src0SubReg = Src0.getSubReg(); | |||
5124 | bool Src0Kill = Src0.isKill(); | |||
5125 | ||||
5126 | if (Src1.isImm()) | |||
5127 | Src0.ChangeToImmediate(Src1.getImm()); | |||
5128 | else if (Src1.isReg()) { | |||
5129 | Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); | |||
5130 | Src0.setSubReg(Src1.getSubReg()); | |||
5131 | } else | |||
5132 | llvm_unreachable("Should only have register or immediate operands")::llvm::llvm_unreachable_internal("Should only have register or immediate operands" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 5132); | |||
5133 | ||||
5134 | Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); | |||
5135 | Src1.setSubReg(Src0SubReg); | |||
5136 | fixImplicitOperands(MI); | |||
5137 | } | |||
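The tail of this function illustrates the general policy: commute only when the swap itself makes the operands legal, and otherwise fall back to materializing a copy. A small sketch of that decision, with a "src1 must be a VGPR" rule standing in for the real operand checks; all names here are illustrative.

#include <cstdio>
#include <string>
#include <utility>

struct Op { bool IsVGPR; std::string Name; };

// Legalize a two-source instruction whose src1 must be a VGPR.
// Returns a description of the action taken.
static std::string legalizeSrc1(Op &Src0, Op &Src1, bool Commutable) {
  if (Src1.IsVGPR)
    return "already legal";
  if (Commutable && Src0.IsVGPR) {   // swapping the operands fixes it
    std::swap(Src0, Src1);
    return "commuted";
  }
  // Otherwise model copying src1 into a fresh VGPR.
  Src1 = {true, "copy_of_" + Src1.Name};
  return "inserted move";
}

int main() {
  Op A{true, "v0"}, B{false, "s0"};
  std::printf("%s\n", legalizeSrc1(A, B, /*Commutable=*/true).c_str()); // commuted
}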
5138 | ||||
5139 | // Legalize VOP3 operands. All operand types are supported for any operand, | |||
5140 | // but only one literal constant is allowed, and only starting from GFX10. | |||
5141 | void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, | |||
5142 | MachineInstr &MI) const { | |||
5143 | unsigned Opc = MI.getOpcode(); | |||
5144 | ||||
5145 | int VOP3Idx[3] = { | |||
5146 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), | |||
5147 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), | |||
5148 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) | |||
5149 | }; | |||
5150 | ||||
5151 | if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || | |||
5152 | Opc == AMDGPU::V_PERMLANEX16_B32_e64) { | |||
5153 | // src1 and src2 must be scalar | |||
5154 | MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); | |||
5155 | MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); | |||
5156 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5157 | if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { | |||
5158 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5159 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5160 | .add(Src1); | |||
5161 | Src1.ChangeToRegister(Reg, false); | |||
5162 | } | |||
5163 | if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { | |||
5164 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
5165 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
5166 | .add(Src2); | |||
5167 | Src2.ChangeToRegister(Reg, false); | |||
5168 | } | |||
5169 | } | |||
5170 | ||||
5171 | // Find the one SGPR operand we are allowed to use. | |||
5172 | int ConstantBusLimit = ST.getConstantBusLimit(Opc); | |||
5173 | int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
5174 | SmallDenseSet<unsigned> SGPRsUsed; | |||
5175 | Register SGPRReg = findUsedSGPR(MI, VOP3Idx); | |||
5176 | if (SGPRReg != AMDGPU::NoRegister) { | |||
5177 | SGPRsUsed.insert(SGPRReg); | |||
5178 | --ConstantBusLimit; | |||
5179 | } | |||
5180 | ||||
5181 | for (int Idx : VOP3Idx) { | |||
5182 | if (Idx == -1) | |||
5183 | break; | |||
5184 | MachineOperand &MO = MI.getOperand(Idx); | |||
5185 | ||||
5186 | if (!MO.isReg()) { | |||
5187 | if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) | |||
5188 | continue; | |||
5189 | ||||
5190 | if (LiteralLimit > 0 && ConstantBusLimit > 0) { | |||
5191 | --LiteralLimit; | |||
5192 | --ConstantBusLimit; | |||
5193 | continue; | |||
5194 | } | |||
5195 | ||||
5196 | --LiteralLimit; | |||
5197 | --ConstantBusLimit; | |||
5198 | legalizeOpWithMove(MI, Idx); | |||
5199 | continue; | |||
5200 | } | |||
5201 | ||||
5202 | if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) && | |||
5203 | !isOperandLegal(MI, Idx, &MO)) { | |||
5204 | legalizeOpWithMove(MI, Idx); | |||
5205 | continue; | |||
5206 | } | |||
5207 | ||||
5208 | if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg()))) | |||
5209 | continue; // VGPRs are legal | |||
5210 | ||||
5211 | // We can use one SGPR in each VOP3 instruction prior to GFX10 | |||
5212 | // and two starting from GFX10. | |||
5213 | if (SGPRsUsed.count(MO.getReg())) | |||
5214 | continue; | |||
5215 | if (ConstantBusLimit > 0) { | |||
5216 | SGPRsUsed.insert(MO.getReg()); | |||
5217 | --ConstantBusLimit; | |||
5218 | continue; | |||
5219 | } | |||
5220 | ||||
5221 | // If we make it this far, then the operand is not legal and we must | |||
5222 | // legalize it. | |||
5223 | legalizeOpWithMove(MI, Idx); | |||
5224 | } | |||
5225 | } | |||
5226 | ||||
5227 | Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, | |||
5228 | MachineRegisterInfo &MRI) const { | |||
5229 | const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); | |||
5230 | const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); | |||
5231 | Register DstReg = MRI.createVirtualRegister(SRC); | |||
5232 | unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; | |||
5233 | ||||
5234 | if (RI.hasAGPRs(VRC)) { | |||
5235 | VRC = RI.getEquivalentVGPRClass(VRC); | |||
5236 | Register NewSrcReg = MRI.createVirtualRegister(VRC); | |||
5237 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5238 | get(TargetOpcode::COPY), NewSrcReg) | |||
5239 | .addReg(SrcReg); | |||
5240 | SrcReg = NewSrcReg; | |||
5241 | } | |||
5242 | ||||
5243 | if (SubRegs == 1) { | |||
5244 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5245 | get(AMDGPU::V_READFIRSTLANE_B32), DstReg) | |||
5246 | .addReg(SrcReg); | |||
5247 | return DstReg; | |||
5248 | } | |||
5249 | ||||
5250 | SmallVector<unsigned, 8> SRegs; | |||
5251 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
5252 | Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5253 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5254 | get(AMDGPU::V_READFIRSTLANE_B32), SGPR) | |||
5255 | .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); | |||
5256 | SRegs.push_back(SGPR); | |||
5257 | } | |||
5258 | ||||
5259 | MachineInstrBuilder MIB = | |||
5260 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5261 | get(AMDGPU::REG_SEQUENCE), DstReg); | |||
5262 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
5263 | MIB.addReg(SRegs[i]); | |||
5264 | MIB.addImm(RI.getSubRegFromChannel(i)); | |||
5265 | } | |||
5266 | return DstReg; | |||
5267 | } | |||
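For registers wider than 32 bits, the function reads each 32-bit channel with V_READFIRSTLANE_B32 and reassembles the pieces with REG_SEQUENCE. A host-side sketch of the same per-channel split, where "read first lane" is modeled as simply taking lane 0's value; that simplification is an assumption for illustration, since the hardware reads the first active lane.

#include <cstdint>
#include <cstdio>
#include <vector>

// Model a wide "register" as one 32-bit value per channel, per lane.
// readFirstLanePerChannel picks lane 0's value for each channel and re-packs
// the pieces in channel order, mirroring READFIRSTLANE + REG_SEQUENCE.
static std::vector<uint32_t>
readFirstLanePerChannel(const std::vector<std::vector<uint32_t>> &Lanes) {
  std::vector<uint32_t> Scalar;
  for (size_t Channel = 0; Channel < Lanes[0].size(); ++Channel)
    Scalar.push_back(Lanes[0][Channel]); // value held by the first lane
  return Scalar;
}

int main() {
  // Two lanes, each holding a uniform 64-bit pointer as two 32-bit channels.
  std::vector<std::vector<uint32_t>> Lanes = {{0xDEADBEEF, 0x00000001},
                                              {0xDEADBEEF, 0x00000001}};
  auto S = readFirstLanePerChannel(Lanes);
  std::printf("sub0=0x%08x sub1=0x%08x\n", S[0], S[1]);
}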
5268 | ||||
5269 | void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, | |||
5270 | MachineInstr &MI) const { | |||
5271 | ||||
5272 | // If the pointer is stored in VGPRs, then we need to move it to | |||
5273 | // SGPRs using v_readfirstlane. This is safe because we only select | |||
5274 | // loads with uniform pointers to SMRD instructions, so we know the | |||
5275 | // pointer value is uniform. | |||
5276 | MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); | |||
5277 | if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { | |||
5278 | Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); | |||
5279 | SBase->setReg(SGPR); | |||
5280 | } | |||
5281 | MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); | |||
5282 | if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { | |||
5283 | Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); | |||
5284 | SOff->setReg(SGPR); | |||
5285 | } | |||
5286 | } | |||
5287 | ||||
5288 | bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { | |||
5289 | unsigned Opc = Inst.getOpcode(); | |||
5290 | int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); | |||
5291 | if (OldSAddrIdx < 0) | |||
5292 | return false; | |||
5293 | ||||
5294 | assert(isSegmentSpecificFLAT(Inst)); | |||
5295 | ||||
5296 | int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); | |||
5297 | if (NewOpc < 0) | |||
5298 | NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc); | |||
5299 | if (NewOpc < 0) | |||
5300 | return false; | |||
5301 | ||||
5302 | MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo(); | |||
5303 | MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx); | |||
5304 | if (RI.isSGPRReg(MRI, SAddr.getReg())) | |||
5305 | return false; | |||
5306 | ||||
5307 | int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr); | |||
5308 | if (NewVAddrIdx < 0) | |||
5309 | return false; | |||
5310 | ||||
5311 | int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); | |||
5312 | ||||
5313 | // Check vaddr, it shall be zero or absent. | |||
5314 | MachineInstr *VAddrDef = nullptr; | |||
5315 | if (OldVAddrIdx >= 0) { | |||
5316 | MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); | |||
5317 | VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); | |||
5318 | if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || | |||
5319 | !VAddrDef->getOperand(1).isImm() || | |||
5320 | VAddrDef->getOperand(1).getImm() != 0) | |||
5321 | return false; | |||
5322 | } | |||
5323 | ||||
5324 | const MCInstrDesc &NewDesc = get(NewOpc); | |||
5325 | Inst.setDesc(NewDesc); | |||
5326 | ||||
5327 | // Callers expect iterator to be valid after this call, so modify the | |||
5328 | // instruction in place. | |||
5329 | if (OldVAddrIdx == NewVAddrIdx) { | |||
5330 | MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); | |||
5331 | // Clear use list from the old vaddr holding a zero register. | |||
5332 | MRI.removeRegOperandFromUseList(&NewVAddr); | |||
5333 | MRI.moveOperands(&NewVAddr, &SAddr, 1); | |||
5334 | Inst.RemoveOperand(OldSAddrIdx); | |||
5335 | // Update the use list with the pointer we have just moved from vaddr to | |||
5336 | // saddr position. Otherwise new vaddr will be missing from the use list. | |||
5337 | MRI.removeRegOperandFromUseList(&NewVAddr); | |||
5338 | MRI.addRegOperandToUseList(&NewVAddr); | |||
5339 | } else { | |||
5340 | assert(OldSAddrIdx == NewVAddrIdx); | |||
5341 | ||||
5342 | if (OldVAddrIdx >= 0) { | |||
5343 | int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, | |||
5344 | AMDGPU::OpName::vdst_in); | |||
5345 | ||||
5346 | // RemoveOperand doesn't try to fix up tied operand indexes as it goes, so | |||
5347 | // it asserts. Untie the operands for now and retie them afterwards. | |||
5348 | if (NewVDstIn != -1) { | |||
5349 | int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); | |||
5350 | Inst.untieRegOperand(OldVDstIn); | |||
5351 | } | |||
5352 | ||||
5353 | Inst.RemoveOperand(OldVAddrIdx); | |||
5354 | ||||
5355 | if (NewVDstIn != -1) { | |||
5356 | int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); | |||
5357 | Inst.tieOperands(NewVDst, NewVDstIn); | |||
5358 | } | |||
5359 | } | |||
5360 | } | |||
5361 | ||||
5362 | if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg())) | |||
5363 | VAddrDef->eraseFromParent(); | |||
5364 | ||||
5365 | return true; | |||
5366 | } | |||
5367 | ||||
5368 | // FIXME: Remove this when SelectionDAG is obsoleted. | |||
5369 | void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, | |||
5370 | MachineInstr &MI) const { | |||
5371 | if (!isSegmentSpecificFLAT(MI)) | |||
5372 | return; | |||
5373 | ||||
5374 | // Fix up SGPR operands in VGPRs. We only select these when DAG divergence | |||
5375 | // analysis thinks they are uniform, so a readfirstlane should be valid. | |||
5376 | MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); | |||
5377 | if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) | |||
5378 | return; | |||
5379 | ||||
5380 | if (moveFlatAddrToVGPR(MI)) | |||
5381 | return; | |||
5382 | ||||
5383 | Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); | |||
5384 | SAddr->setReg(ToSGPR); | |||
5385 | } | |||
5386 | ||||
5387 | void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, | |||
5388 | MachineBasicBlock::iterator I, | |||
5389 | const TargetRegisterClass *DstRC, | |||
5390 | MachineOperand &Op, | |||
5391 | MachineRegisterInfo &MRI, | |||
5392 | const DebugLoc &DL) const { | |||
5393 | Register OpReg = Op.getReg(); | |||
5394 | unsigned OpSubReg = Op.getSubReg(); | |||
5395 | ||||
5396 | const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( | |||
5397 | RI.getRegClassForReg(MRI, OpReg), OpSubReg); | |||
5398 | ||||
5399 | // Check if operand is already the correct register class. | |||
5400 | if (DstRC == OpRC) | |||
5401 | return; | |||
5402 | ||||
5403 | Register DstReg = MRI.createVirtualRegister(DstRC); | |||
5404 | auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); | |||
5405 | ||||
5406 | Op.setReg(DstReg); | |||
5407 | Op.setSubReg(0); | |||
5408 | ||||
5409 | MachineInstr *Def = MRI.getVRegDef(OpReg); | |||
5410 | if (!Def) | |||
5411 | return; | |||
5412 | ||||
5413 | // Try to eliminate the copy if it is copying an immediate value. | |||
5414 | if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) | |||
5415 | FoldImmediate(*Copy, *Def, OpReg, &MRI); | |||
5416 | ||||
5417 | bool ImpDef = Def->isImplicitDef(); | |||
5418 | while (!ImpDef && Def && Def->isCopy()) { | |||
5419 | if (Def->getOperand(1).getReg().isPhysical()) | |||
5420 | break; | |||
5421 | Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); | |||
5422 | ImpDef = Def && Def->isImplicitDef(); | |||
5423 | } | |||
5424 | if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && | |||
5425 | !ImpDef) | |||
5426 | Copy.addReg(AMDGPU::EXEC, RegState::Implicit); | |||
5427 | } | |||
5428 | ||||
5429 | // Emit the actual waterfall loop, executing the wrapped instruction for each | |||
5430 | // unique value of \p Rsrc across all lanes. In the best case we execute 1 | |||
5431 | // iteration, in the worst case we execute 64 (once per lane). | |||
5432 | static void | |||
5433 | emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, | |||
5434 | MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, | |||
5435 | const DebugLoc &DL, MachineOperand &Rsrc) { | |||
5436 | MachineFunction &MF = *OrigBB.getParent(); | |||
5437 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
5438 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
5439 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5440 | unsigned SaveExecOpc = | |||
5441 | ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; | |||
5442 | unsigned XorTermOpc = | |||
5443 | ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; | |||
5444 | unsigned AndOpc = | |||
5445 | ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; | |||
5446 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5447 | ||||
5448 | MachineBasicBlock::iterator I = LoopBB.begin(); | |||
5449 | ||||
5450 | SmallVector<Register, 8> ReadlanePieces; | |||
5451 | Register CondReg = AMDGPU::NoRegister; | |||
5452 | ||||
5453 | Register VRsrc = Rsrc.getReg(); | |||
5454 | unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); | |||
5455 | ||||
5456 | unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI); | |||
5457 | unsigned NumSubRegs = RegSize / 32; | |||
5458 | assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); | |||
5459 | ||||
5460 | for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { | |||
5461 | ||||
5462 | Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5463 | Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5464 | ||||
5465 | // Read the next variant <- also loop target. | |||
5466 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) | |||
5467 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx)); | |||
5468 | ||||
5469 | // Read the next variant <- also loop target. | |||
5470 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) | |||
5471 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1)); | |||
5472 | ||||
5473 | ReadlanePieces.push_back(CurRegLo); | |||
5474 | ReadlanePieces.push_back(CurRegHi); | |||
5475 | ||||
5476 | // Comparison is to be done as 64-bit. | |||
5477 | Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); | |||
5478 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) | |||
5479 | .addReg(CurRegLo) | |||
5480 | .addImm(AMDGPU::sub0) | |||
5481 | .addReg(CurRegHi) | |||
5482 | .addImm(AMDGPU::sub1); | |||
5483 | ||||
5484 | Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); | |||
5485 | auto Cmp = | |||
5486 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) | |||
5487 | .addReg(CurReg); | |||
5488 | if (NumSubRegs <= 2) | |||
5489 | Cmp.addReg(VRsrc); | |||
5490 | else | |||
5491 | Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); | |||
5492 | ||||
5493 | // Combine the comparison results with AND. | |||
5494 | if (CondReg == AMDGPU::NoRegister) // First. | |||
5495 | CondReg = NewCondReg; | |||
5496 | else { // If not the first, we create an AND. | |||
5497 | Register AndReg = MRI.createVirtualRegister(BoolXExecRC); | |||
5498 | BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) | |||
5499 | .addReg(CondReg) | |||
5500 | .addReg(NewCondReg); | |||
5501 | CondReg = AndReg; | |||
5502 | } | |||
5503 | } // End for loop. | |||
5504 | ||||
5505 | auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc)); | |||
5506 | Register SRsrc = MRI.createVirtualRegister(SRsrcRC); | |||
5507 | ||||
5508 | // Build scalar Rsrc. | |||
5509 | auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc); | |||
5510 | unsigned Channel = 0; | |||
5511 | for (Register Piece : ReadlanePieces) { | |||
5512 | Merge.addReg(Piece) | |||
5513 | .addImm(TRI->getSubRegFromChannel(Channel++)); | |||
5514 | } | |||
5515 | ||||
5516 | // Update Rsrc operand to use the SGPR Rsrc. | |||
5517 | Rsrc.setReg(SRsrc); | |||
5518 | Rsrc.setIsKill(true); | |||
5519 | ||||
5520 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
5521 | MRI.setSimpleHint(SaveExec, CondReg); | |||
5522 | ||||
5523 | // Update EXEC to matching lanes, saving original to SaveExec. | |||
5524 | BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) | |||
5525 | .addReg(CondReg, RegState::Kill); | |||
5526 | ||||
5527 | // The original instruction is here; we insert the terminators after it. | |||
5528 | I = LoopBB.end(); | |||
5529 | ||||
5530 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. | |||
5531 | BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) | |||
5532 | .addReg(Exec) | |||
5533 | .addReg(SaveExec); | |||
5534 | ||||
5535 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); | |||
5536 | } | |||
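Conceptually, each iteration reads the descriptor held by the first active lane, enables exactly the lanes whose descriptor matches, runs the wrapped instruction once for them, and removes those lanes from EXEC until none remain. A host-side simulation of that control flow over a small lane mask; the per-lane values and the "instruction" are stand-ins, purely for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

// Simulate a waterfall loop: for each unique per-lane value, "execute" once
// with exactly the lanes holding that value enabled, then retire them.
static int waterfall(const std::vector<uint64_t> &LaneValues) {
  uint64_t Exec = (1ull << LaneValues.size()) - 1; // all lanes active
  int Iterations = 0;
  while (Exec) {
    // V_READFIRSTLANE: take the value of the lowest active lane.
    unsigned First = 0;
    while (!((Exec >> First) & 1))
      ++First;
    uint64_t Cur = LaneValues[First];
    // V_CMP + S_AND_SAVEEXEC: enable only the lanes with a matching value.
    uint64_t Match = 0;
    for (unsigned L = 0; L < LaneValues.size(); ++L)
      if (((Exec >> L) & 1) && LaneValues[L] == Cur)
        Match |= 1ull << L;
    ++Iterations;   // the wrapped instruction would run here
    Exec &= ~Match; // S_XOR_term: switch the finished lanes off
  }
  return Iterations;
}

int main() {
  // Three distinct descriptors across four lanes -> three iterations.
  std::printf("%d\n", waterfall({7, 7, 9, 3}));
}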
5537 | ||||
5538 | // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register | |||
5539 | // with SGPRs by iterating over all unique values across all lanes. | |||
5540 | // Returns the loop basic block that now contains \p MI. | |||
5541 | static MachineBasicBlock * | |||
5542 | loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, | |||
5543 | MachineOperand &Rsrc, MachineDominatorTree *MDT, | |||
5544 | MachineBasicBlock::iterator Begin = nullptr, | |||
5545 | MachineBasicBlock::iterator End = nullptr) { | |||
5546 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5547 | MachineFunction &MF = *MBB.getParent(); | |||
5548 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
5549 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
5550 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5551 | if (!Begin.isValid()) | |||
5552 | Begin = &MI; | |||
5553 | if (!End.isValid()) { | |||
5554 | End = &MI; | |||
5555 | ++End; | |||
5556 | } | |||
5557 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5558 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5559 | unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
5560 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5561 | ||||
5562 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
5563 | ||||
5564 | // Save the EXEC mask | |||
5565 | BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); | |||
5566 | ||||
5567 | // Killed uses in the instruction we are waterfalling around will be | |||
5568 | // incorrect due to the added control-flow. | |||
5569 | MachineBasicBlock::iterator AfterMI = MI; | |||
5570 | ++AfterMI; | |||
5571 | for (auto I = Begin; I != AfterMI; I++) { | |||
5572 | for (auto &MO : I->uses()) { | |||
5573 | if (MO.isReg() && MO.isUse()) { | |||
5574 | MRI.clearKillFlags(MO.getReg()); | |||
5575 | } | |||
5576 | } | |||
5577 | } | |||
5578 | ||||
5579 | // To insert the loop we need to split the block. Move everything after this | |||
5580 | // point to a new block, and insert a new empty block between the two. | |||
5581 | MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); | |||
5582 | MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); | |||
5583 | MachineFunction::iterator MBBI(MBB); | |||
5584 | ++MBBI; | |||
5585 | ||||
5586 | MF.insert(MBBI, LoopBB); | |||
5587 | MF.insert(MBBI, RemainderBB); | |||
5588 | ||||
5589 | LoopBB->addSuccessor(LoopBB); | |||
5590 | LoopBB->addSuccessor(RemainderBB); | |||
5591 | ||||
5592 | // Move Begin to MI to the LoopBB, and the remainder of the block to | |||
5593 | // RemainderBB. | |||
5594 | RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); | |||
5595 | RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); | |||
5596 | LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); | |||
5597 | ||||
5598 | MBB.addSuccessor(LoopBB); | |||
5599 | ||||
5600 | // Update dominators. We know that MBB immediately dominates LoopBB, that | |||
5601 | // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately | |||
5602 | // dominates all of the successors transferred to it from MBB that MBB used | |||
5603 | // to properly dominate. | |||
5604 | if (MDT) { | |||
5605 | MDT->addNewBlock(LoopBB, &MBB); | |||
5606 | MDT->addNewBlock(RemainderBB, LoopBB); | |||
5607 | for (auto &Succ : RemainderBB->successors()) { | |||
5608 | if (MDT->properlyDominates(&MBB, Succ)) { | |||
5609 | MDT->changeImmediateDominator(Succ, RemainderBB); | |||
5610 | } | |||
5611 | } | |||
5612 | } | |||
5613 | ||||
5614 | emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); | |||
5615 | ||||
5616 | // Restore the EXEC mask | |||
5617 | MachineBasicBlock::iterator First = RemainderBB->begin(); | |||
5618 | BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); | |||
5619 | return LoopBB; | |||
5620 | } | |||
5621 | ||||
5622 | // Extract pointer from Rsrc and return a zero-value Rsrc replacement. | |||
5623 | static std::tuple<unsigned, unsigned> | |||
5624 | extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { | |||
5625 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5626 | MachineFunction &MF = *MBB.getParent(); | |||
5627 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5628 | ||||
5629 | // Extract the ptr from the resource descriptor. | |||
5630 | unsigned RsrcPtr = | |||
5631 | TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, | |||
5632 | AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); | |||
5633 | ||||
5634 | // Create an empty resource descriptor | |||
5635 | Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
5636 | Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5637 | Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5638 | Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); | |||
5639 | uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); | |||
5640 | ||||
5641 | // Zero64 = 0 | |||
5642 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) | |||
5643 | .addImm(0); | |||
5644 | ||||
5645 | // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} | |||
5646 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) | |||
5647 | .addImm(RsrcDataFormat & 0xFFFFFFFF); | |||
5648 | ||||
5649 | // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} | |||
5650 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) | |||
5651 | .addImm(RsrcDataFormat >> 32); | |||
5652 | ||||
5653 | // NewSRsrc = {Zero64, SRsrcFormat} | |||
5654 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) | |||
5655 | .addReg(Zero64) | |||
5656 | .addImm(AMDGPU::sub0_sub1) | |||
5657 | .addReg(SRsrcFormatLo) | |||
5658 | .addImm(AMDGPU::sub2) | |||
5659 | .addReg(SRsrcFormatHi) | |||
5660 | .addImm(AMDGPU::sub3); | |||
5661 | ||||
5662 | return std::make_tuple(RsrcPtr, NewSRsrc); | |||
5663 | } | |||
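The replacement descriptor keeps a null base pointer and carries only the default data format, a 64-bit constant split into two 32-bit words for sub2 and sub3. A small sketch of that packing follows; the format constant used here is a placeholder, not the value returned by getDefaultRsrcDataFormat.

#include <cstdint>
#include <cstdio>

struct Rsrc128 { uint32_t Word[4]; }; // sub0..sub3 of a 128-bit descriptor

// Build a descriptor with a zero base pointer and the given data format.
static Rsrc128 makeZeroBaseRsrc(uint64_t RsrcDataFormat) {
  Rsrc128 R;
  R.Word[0] = 0;                                            // Zero64, low half
  R.Word[1] = 0;                                            // Zero64, high half
  R.Word[2] = uint32_t(RsrcDataFormat & 0xFFFFFFFF);        // format, bits 31-0
  R.Word[3] = uint32_t(RsrcDataFormat >> 32);               // format, bits 63-32
  return R;
}

int main() {
  // Placeholder format value, for illustration only.
  Rsrc128 R = makeZeroBaseRsrc(0x00FFFFFF00000000ull);
  std::printf("%08x %08x %08x %08x\n", R.Word[0], R.Word[1], R.Word[2], R.Word[3]);
}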
5664 | ||||
5665 | MachineBasicBlock * | |||
5666 | SIInstrInfo::legalizeOperands(MachineInstr &MI, | |||
5667 | MachineDominatorTree *MDT) const { | |||
5668 | MachineFunction &MF = *MI.getParent()->getParent(); | |||
5669 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5670 | MachineBasicBlock *CreatedBB = nullptr; | |||
5671 | ||||
5672 | // Legalize VOP2 | |||
5673 | if (isVOP2(MI) || isVOPC(MI)) { | |||
5674 | legalizeOperandsVOP2(MRI, MI); | |||
5675 | return CreatedBB; | |||
5676 | } | |||
5677 | ||||
5678 | // Legalize VOP3 | |||
5679 | if (isVOP3(MI)) { | |||
5680 | legalizeOperandsVOP3(MRI, MI); | |||
5681 | return CreatedBB; | |||
5682 | } | |||
5683 | ||||
5684 | // Legalize SMRD | |||
5685 | if (isSMRD(MI)) { | |||
5686 | legalizeOperandsSMRD(MRI, MI); | |||
5687 | return CreatedBB; | |||
5688 | } | |||
5689 | ||||
5690 | // Legalize FLAT | |||
5691 | if (isFLAT(MI)) { | |||
5692 | legalizeOperandsFLAT(MRI, MI); | |||
5693 | return CreatedBB; | |||
5694 | } | |||
5695 | ||||
5696 | // Legalize REG_SEQUENCE and PHI | |||
5697 | // The register class of the operands must be the same type as the register | |||
5698 | // class of the output. | |||
5699 | if (MI.getOpcode() == AMDGPU::PHI) { | |||
5700 | const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; | |||
5701 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { | |||
5702 | if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) | |||
5703 | continue; | |||
5704 | const TargetRegisterClass *OpRC = | |||
5705 | MRI.getRegClass(MI.getOperand(i).getReg()); | |||
5706 | if (RI.hasVectorRegisters(OpRC)) { | |||
5707 | VRC = OpRC; | |||
5708 | } else { | |||
5709 | SRC = OpRC; | |||
5710 | } | |||
5711 | } | |||
5712 | ||||
5713 | // If any of the operands are VGPR registers, then they all must be, | |||
5714 | // otherwise we will create illegal VGPR->SGPR copies when legalizing | |||
5715 | // them. | |||
5716 | if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { | |||
5717 | if (!VRC) { | |||
5718 | assert(SRC); | |||
5719 | if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { | |||
5720 | VRC = &AMDGPU::VReg_1RegClass; | |||
5721 | } else | |||
5722 | VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) | |||
5723 | ? RI.getEquivalentAGPRClass(SRC) | |||
5724 | : RI.getEquivalentVGPRClass(SRC); | |||
5725 | } else { | |||
5726 | VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) | |||
5727 | ? RI.getEquivalentAGPRClass(VRC) | |||
5728 | : RI.getEquivalentVGPRClass(VRC); | |||
5729 | } | |||
5730 | RC = VRC; | |||
5731 | } else { | |||
5732 | RC = SRC; | |||
5733 | } | |||
5734 | ||||
5735 | // Update all the operands so they have the same type. | |||
5736 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5737 | MachineOperand &Op = MI.getOperand(I); | |||
5738 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5739 | continue; | |||
5740 | ||||
5741 | // MI is a PHI instruction. | |||
5742 | MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); | |||
5743 | MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); | |||
5744 | ||||
5745 | // Avoid creating no-op copies with the same src and dst reg class. These | |||
5746 | // confuse some of the machine passes. | |||
5747 | legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); | |||
5748 | } | |||
5749 | } | |||
5750 | ||||
5751 | // REG_SEQUENCE doesn't really require operand legalization, but if one has a | |||
5752 | // VGPR dest type and SGPR sources, insert copies so all operands are | |||
5753 | // VGPRs. This seems to help operand folding / the register coalescer. | |||
5754 | if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { | |||
5755 | MachineBasicBlock *MBB = MI.getParent(); | |||
5756 | const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); | |||
5757 | if (RI.hasVGPRs(DstRC)) { | |||
5758 | // Update all the operands so they are VGPR register classes. These may | |||
5759 | // not be the same register class because REG_SEQUENCE supports mixing | |||
5760 | // subregister index types e.g. sub0_sub1 + sub2 + sub3 | |||
5761 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5762 | MachineOperand &Op = MI.getOperand(I); | |||
5763 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5764 | continue; | |||
5765 | ||||
5766 | const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); | |||
5767 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); | |||
5768 | if (VRC == OpRC) | |||
5769 | continue; | |||
5770 | ||||
5771 | legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); | |||
5772 | Op.setIsKill(); | |||
5773 | } | |||
5774 | } | |||
5775 | ||||
5776 | return CreatedBB; | |||
5777 | } | |||
5778 | ||||
5779 | // Legalize INSERT_SUBREG | |||
5780 | // src0 must have the same register class as dst | |||
5781 | if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { | |||
5782 | Register Dst = MI.getOperand(0).getReg(); | |||
5783 | Register Src0 = MI.getOperand(1).getReg(); | |||
5784 | const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); | |||
5785 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); | |||
5786 | if (DstRC != Src0RC) { | |||
5787 | MachineBasicBlock *MBB = MI.getParent(); | |||
5788 | MachineOperand &Op = MI.getOperand(1); | |||
5789 | legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); | |||
5790 | } | |||
5791 | return CreatedBB; | |||
5792 | } | |||
5793 | ||||
5794 | // Legalize SI_INIT_M0 | |||
5795 | if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { | |||
5796 | MachineOperand &Src = MI.getOperand(0); | |||
5797 | if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) | |||
5798 | Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); | |||
5799 | return CreatedBB; | |||
5800 | } | |||
5801 | ||||
5802 | // Legalize MIMG and MUBUF/MTBUF for shaders. | |||
5803 | // | |||
5804 | // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via | |||
5805 | // scratch memory access. In both cases, the legalization never involves | |||
5806 | // conversion to the addr64 form. | |||
5807 | if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && | |||
5808 | (isMUBUF(MI) || isMTBUF(MI)))) { | |||
5809 | MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); | |||
5810 | if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) | |||
5811 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); | |||
5812 | ||||
5813 | MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); | |||
5814 | if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) | |||
5815 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); | |||
5816 | ||||
5817 | return CreatedBB; | |||
5818 | } | |||
5819 | ||||
5820 | // Legalize SI_CALL | |||
5821 | if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { | |||
5822 | MachineOperand *Dest = &MI.getOperand(0); | |||
5823 | if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { | |||
5824 | // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and | |||
5825 | // following copies, we also need to move copies from and to physical | |||
5826 | // registers into the loop block. | |||
5827 | unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); | |||
5828 | unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); | |||
5829 | ||||
5830 | // Also move the copies to physical registers into the loop block | |||
5831 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5832 | MachineBasicBlock::iterator Start(&MI); | |||
5833 | while (Start->getOpcode() != FrameSetupOpcode) | |||
5834 | --Start; | |||
5835 | MachineBasicBlock::iterator End(&MI); | |||
5836 | while (End->getOpcode() != FrameDestroyOpcode) | |||
5837 | ++End; | |||
5838 | // Also include following copies of the return value | |||
5839 | ++End; | |||
5840 | while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && | |||
5841 | MI.definesRegister(End->getOperand(1).getReg())) | |||
5842 | ++End; | |||
5843 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End); | |||
5844 | } | |||
5845 | } | |||
5846 | ||||
5847 | // Legalize MUBUF* instructions. | |||
5848 | int RsrcIdx = | |||
5849 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); | |||
5850 | if (RsrcIdx != -1) { | |||
5851 | // We have an MUBUF instruction | |||
5852 | MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); | |||
5853 | unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; | |||
5854 | if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), | |||
5855 | RI.getRegClass(RsrcRC))) { | |||
5856 | // The operands are legal. | |||
5857 | // FIXME: We may need to legalize operands besides srsrc. | |||
5858 | return CreatedBB; | |||
5859 | } | |||
5860 | ||||
5861 | // Legalize a VGPR Rsrc. | |||
5862 | // | |||
5863 | // If the instruction is _ADDR64, we can avoid a waterfall by extracting | |||
5864 | // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using | |||
5865 | // a zero-value SRsrc. | |||
5866 | // | |||
5867 | // If the instruction is _OFFSET (both idxen and offen disabled), and we | |||
5868 | // support ADDR64 instructions, we can convert to ADDR64 and do the same as | |||
5869 | // above. | |||
5870 | // | |||
5871 | // Otherwise we are on non-ADDR64 hardware, and/or we have | |||
5872 | // idxen/offen/bothen and we fall back to a waterfall loop. | |||
5873 | ||||
5874 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5875 | ||||
5876 | MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); | |||
5877 | if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { | |||
5878 | // This is already an ADDR64 instruction so we need to add the pointer | |||
5879 | // extracted from the resource descriptor to the current value of VAddr. | |||
5880 | Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5881 | Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5882 | Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
5883 | ||||
5884 | const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5885 | Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); | |||
5886 | Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); | |||
5887 | ||||
5888 | unsigned RsrcPtr, NewSRsrc; | |||
5889 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); | |||
5890 | ||||
5891 | // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 | |||
5892 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5893 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) | |||
5894 | .addDef(CondReg0) | |||
5895 | .addReg(RsrcPtr, 0, AMDGPU::sub0) | |||
5896 | .addReg(VAddr->getReg(), 0, AMDGPU::sub0) | |||
5897 | .addImm(0); | |||
5898 | ||||
5899 | // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 | |||
5900 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) | |||
5901 | .addDef(CondReg1, RegState::Dead) | |||
5902 | .addReg(RsrcPtr, 0, AMDGPU::sub1) | |||
5903 | .addReg(VAddr->getReg(), 0, AMDGPU::sub1) | |||
5904 | .addReg(CondReg0, RegState::Kill) | |||
5905 | .addImm(0); | |||
5906 | ||||
5907 | // NewVaddr = {NewVaddrHi, NewVaddrLo} | |||
5908 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) | |||
5909 | .addReg(NewVAddrLo) | |||
5910 | .addImm(AMDGPU::sub0) | |||
5911 | .addReg(NewVAddrHi) | |||
5912 | .addImm(AMDGPU::sub1); | |||
5913 | ||||
5914 | VAddr->setReg(NewVAddr); | |||
5915 | Rsrc->setReg(NewSRsrc); | |||
5916 | } else if (!VAddr && ST.hasAddr64()) { | |||
5917 | // This instruction is the _OFFSET variant, so we need to convert it to | |||
5918 | // ADDR64. | |||
5919 | assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && | |||
5920 | "FIXME: Need to emit flat atomics here"); | |||
5921 | ||||
5922 | unsigned RsrcPtr, NewSRsrc; | |||
5923 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); | |||
5924 | ||||
5925 | Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
5926 | MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); | |||
5927 | MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
5928 | MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); | |||
5929 | unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); | |||
5930 | ||||
5931 | // Atomics with return have an additional tied operand and are | |||
5932 | // missing some of the special bits. | |||
5933 | MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); | |||
5934 | MachineInstr *Addr64; | |||
5935 | ||||
5936 | if (!VDataIn) { | |||
5937 | // Regular buffer load / store. | |||
5938 | MachineInstrBuilder MIB = | |||
5939 | BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) | |||
5940 | .add(*VData) | |||
5941 | .addReg(NewVAddr) | |||
5942 | .addReg(NewSRsrc) | |||
5943 | .add(*SOffset) | |||
5944 | .add(*Offset); | |||
5945 | ||||
5946 | if (const MachineOperand *CPol = | |||
5947 | getNamedOperand(MI, AMDGPU::OpName::cpol)) { | |||
5948 | MIB.addImm(CPol->getImm()); | |||
5949 | } | |||
5950 | ||||
5951 | if (const MachineOperand *TFE = | |||
5952 | getNamedOperand(MI, AMDGPU::OpName::tfe)) { | |||
5953 | MIB.addImm(TFE->getImm()); | |||
5954 | } | |||
5955 | ||||
5956 | MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); | |||
5957 | ||||
5958 | MIB.cloneMemRefs(MI); | |||
5959 | Addr64 = MIB; | |||
5960 | } else { | |||
5961 | // Atomics with return. | |||
5962 | Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) | |||
5963 | .add(*VData) | |||
5964 | .add(*VDataIn) | |||
5965 | .addReg(NewVAddr) | |||
5966 | .addReg(NewSRsrc) | |||
5967 | .add(*SOffset) | |||
5968 | .add(*Offset) | |||
5969 | .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol)) | |||
5970 | .cloneMemRefs(MI); | |||
5971 | } | |||
5972 | ||||
5973 | MI.removeFromParent(); | |||
5974 | ||||
5975 | // NewVaddr = {NewVaddrHi, NewVaddrLo} | |||
5976 | BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), | |||
5977 | NewVAddr) | |||
5978 | .addReg(RsrcPtr, 0, AMDGPU::sub0) | |||
5979 | .addImm(AMDGPU::sub0) | |||
5980 | .addReg(RsrcPtr, 0, AMDGPU::sub1) | |||
5981 | .addImm(AMDGPU::sub1); | |||
5982 | } else { | |||
5983 | // This is another variant; legalize Rsrc with waterfall loop from VGPRs | |||
5984 | // to SGPRs. | |||
5985 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); | |||
5986 | return CreatedBB; | |||
5987 | } | |||
5988 | } | |||
5989 | return CreatedBB; | |||
5990 | } | |||
5991 | ||||
5992 | MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, | |||
5993 | MachineDominatorTree *MDT) const { | |||
5994 | SetVectorType Worklist; | |||
5995 | Worklist.insert(&TopInst); | |||
5996 | MachineBasicBlock *CreatedBB = nullptr; | |||
5997 | MachineBasicBlock *CreatedBBTmp = nullptr; | |||
5998 | ||||
5999 | while (!Worklist.empty()) { | |||
6000 | MachineInstr &Inst = *Worklist.pop_back_val(); | |||
6001 | MachineBasicBlock *MBB = Inst.getParent(); | |||
6002 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
6003 | ||||
6004 | unsigned Opcode = Inst.getOpcode(); | |||
6005 | unsigned NewOpcode = getVALUOp(Inst); | |||
6006 | ||||
6007 | // Handle some special cases | |||
6008 | switch (Opcode) { | |||
6009 | default: | |||
6010 | break; | |||
6011 | case AMDGPU::S_ADD_U64_PSEUDO: | |||
6012 | case AMDGPU::S_SUB_U64_PSEUDO: | |||
6013 | splitScalar64BitAddSub(Worklist, Inst, MDT); | |||
6014 | Inst.eraseFromParent(); | |||
6015 | continue; | |||
6016 | case AMDGPU::S_ADD_I32: | |||
6017 | case AMDGPU::S_SUB_I32: { | |||
6018 | // FIXME: The u32 versions currently selected use the carry. | |||
6019 | bool Changed; | |||
6020 | std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); | |||
6021 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6022 | CreatedBB = CreatedBBTmp; | |||
6023 | if (Changed) | |||
6024 | continue; | |||
6025 | ||||
6026 | // Default handling | |||
6027 | break; | |||
6028 | } | |||
6029 | case AMDGPU::S_AND_B64: | |||
6030 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); | |||
6031 | Inst.eraseFromParent(); | |||
6032 | continue; | |||
6033 | ||||
6034 | case AMDGPU::S_OR_B64: | |||
6035 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); | |||
6036 | Inst.eraseFromParent(); | |||
6037 | continue; | |||
6038 | ||||
6039 | case AMDGPU::S_XOR_B64: | |||
6040 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); | |||
6041 | Inst.eraseFromParent(); | |||
6042 | continue; | |||
6043 | ||||
6044 | case AMDGPU::S_NAND_B64: | |||
6045 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); | |||
6046 | Inst.eraseFromParent(); | |||
6047 | continue; | |||
6048 | ||||
6049 | case AMDGPU::S_NOR_B64: | |||
6050 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); | |||
6051 | Inst.eraseFromParent(); | |||
6052 | continue; | |||
6053 | ||||
6054 | case AMDGPU::S_XNOR_B64: | |||
6055 | if (ST.hasDLInsts()) | |||
6056 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); | |||
6057 | else | |||
6058 | splitScalar64BitXnor(Worklist, Inst, MDT); | |||
6059 | Inst.eraseFromParent(); | |||
6060 | continue; | |||
6061 | ||||
6062 | case AMDGPU::S_ANDN2_B64: | |||
6063 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); | |||
6064 | Inst.eraseFromParent(); | |||
6065 | continue; | |||
6066 | ||||
6067 | case AMDGPU::S_ORN2_B64: | |||
6068 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); | |||
6069 | Inst.eraseFromParent(); | |||
6070 | continue; | |||
6071 | ||||
6072 | case AMDGPU::S_BREV_B64: | |||
6073 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); | |||
6074 | Inst.eraseFromParent(); | |||
6075 | continue; | |||
6076 | ||||
6077 | case AMDGPU::S_NOT_B64: | |||
6078 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); | |||
6079 | Inst.eraseFromParent(); | |||
6080 | continue; | |||
6081 | ||||
6082 | case AMDGPU::S_BCNT1_I32_B64: | |||
6083 | splitScalar64BitBCNT(Worklist, Inst); | |||
6084 | Inst.eraseFromParent(); | |||
6085 | continue; | |||
6086 | ||||
6087 | case AMDGPU::S_BFE_I64: | |||
6088 | splitScalar64BitBFE(Worklist, Inst); | |||
6089 | Inst.eraseFromParent(); | |||
6090 | continue; | |||
6091 | ||||
6092 | case AMDGPU::S_LSHL_B32: | |||
6093 | if (ST.hasOnlyRevVALUShifts()) { | |||
6094 | NewOpcode = AMDGPU::V_LSHLREV_B32_e64; | |||
6095 | swapOperands(Inst); | |||
6096 | } | |||
6097 | break; | |||
6098 | case AMDGPU::S_ASHR_I32: | |||
6099 | if (ST.hasOnlyRevVALUShifts()) { | |||
6100 | NewOpcode = AMDGPU::V_ASHRREV_I32_e64; | |||
6101 | swapOperands(Inst); | |||
6102 | } | |||
6103 | break; | |||
6104 | case AMDGPU::S_LSHR_B32: | |||
6105 | if (ST.hasOnlyRevVALUShifts()) { | |||
6106 | NewOpcode = AMDGPU::V_LSHRREV_B32_e64; | |||
6107 | swapOperands(Inst); | |||
6108 | } | |||
6109 | break; | |||
6110 | case AMDGPU::S_LSHL_B64: | |||
6111 | if (ST.hasOnlyRevVALUShifts()) { | |||
6112 | NewOpcode = AMDGPU::V_LSHLREV_B64_e64; | |||
6113 | swapOperands(Inst); | |||
6114 | } | |||
6115 | break; | |||
6116 | case AMDGPU::S_ASHR_I64: | |||
6117 | if (ST.hasOnlyRevVALUShifts()) { | |||
6118 | NewOpcode = AMDGPU::V_ASHRREV_I64_e64; | |||
6119 | swapOperands(Inst); | |||
6120 | } | |||
6121 | break; | |||
6122 | case AMDGPU::S_LSHR_B64: | |||
6123 | if (ST.hasOnlyRevVALUShifts()) { | |||
6124 | NewOpcode = AMDGPU::V_LSHRREV_B64_e64; | |||
6125 | swapOperands(Inst); | |||
6126 | } | |||
6127 | break; | |||
6128 | ||||
6129 | case AMDGPU::S_ABS_I32: | |||
6130 | lowerScalarAbs(Worklist, Inst); | |||
6131 | Inst.eraseFromParent(); | |||
6132 | continue; | |||
6133 | ||||
6134 | case AMDGPU::S_CBRANCH_SCC0: | |||
6135 | case AMDGPU::S_CBRANCH_SCC1: { | |||
6136 | // Clear unused bits of vcc | |||
6137 | Register CondReg = Inst.getOperand(1).getReg(); | |||
6138 | bool IsSCC = CondReg == AMDGPU::SCC; | |||
6139 | Register VCC = RI.getVCC(); | |||
6140 | Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
6141 | unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; | |||
6142 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) | |||
6143 | .addReg(EXEC) | |||
6144 | .addReg(IsSCC ? VCC : CondReg); | |||
6145 | Inst.RemoveOperand(1); | |||
6146 | } | |||
6147 | break; | |||
6148 | ||||
6149 | case AMDGPU::S_BFE_U64: | |||
6150 | case AMDGPU::S_BFM_B64: | |||
6151 | llvm_unreachable("Moving this op to VALU not implemented")::llvm::llvm_unreachable_internal("Moving this op to VALU not implemented" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 6151); | |||
6152 | ||||
6153 | case AMDGPU::S_PACK_LL_B32_B16: | |||
6154 | case AMDGPU::S_PACK_LH_B32_B16: | |||
6155 | case AMDGPU::S_PACK_HH_B32_B16: | |||
6156 | movePackToVALU(Worklist, MRI, Inst); | |||
6157 | Inst.eraseFromParent(); | |||
6158 | continue; | |||
6159 | ||||
6160 | case AMDGPU::S_XNOR_B32: | |||
6161 | lowerScalarXnor(Worklist, Inst); | |||
6162 | Inst.eraseFromParent(); | |||
6163 | continue; | |||
6164 | ||||
6165 | case AMDGPU::S_NAND_B32: | |||
6166 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); | |||
6167 | Inst.eraseFromParent(); | |||
6168 | continue; | |||
6169 | ||||
6170 | case AMDGPU::S_NOR_B32: | |||
6171 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); | |||
6172 | Inst.eraseFromParent(); | |||
6173 | continue; | |||
6174 | ||||
6175 | case AMDGPU::S_ANDN2_B32: | |||
6176 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); | |||
6177 | Inst.eraseFromParent(); | |||
6178 | continue; | |||
6179 | ||||
6180 | case AMDGPU::S_ORN2_B32: | |||
6181 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); | |||
6182 | Inst.eraseFromParent(); | |||
6183 | continue; | |||
6184 | ||||
6185 | // TODO: remove as soon as everything is ready | |||
6186 | // to replace VGPR to SGPR copy with V_READFIRSTLANEs. | |||
6187 | // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO | |||
6188 | // can only be selected from the uniform SDNode. | |||
6189 | case AMDGPU::S_ADD_CO_PSEUDO: | |||
6190 | case AMDGPU::S_SUB_CO_PSEUDO: { | |||
6191 | unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) | |||
6192 | ? AMDGPU::V_ADDC_U32_e64 | |||
6193 | : AMDGPU::V_SUBB_U32_e64; | |||
6194 | const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
6195 | ||||
6196 | Register CarryInReg = Inst.getOperand(4).getReg(); | |||
6197 | if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { | |||
6198 | Register NewCarryReg = MRI.createVirtualRegister(CarryRC); | |||
6199 | BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) | |||
6200 | .addReg(CarryInReg); | |||
6201 | } | |||
6202 | ||||
6203 | Register CarryOutReg = Inst.getOperand(1).getReg(); | |||
6204 | ||||
6205 | Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( | |||
6206 | MRI.getRegClass(Inst.getOperand(0).getReg()))); | |||
6207 | MachineInstr *CarryOp = | |||
6208 | BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) | |||
6209 | .addReg(CarryOutReg, RegState::Define) | |||
6210 | .add(Inst.getOperand(2)) | |||
6211 | .add(Inst.getOperand(3)) | |||
6212 | .addReg(CarryInReg) | |||
6213 | .addImm(0); | |||
6214 | CreatedBBTmp = legalizeOperands(*CarryOp); | |||
6215 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6216 | CreatedBB = CreatedBBTmp; | |||
6217 | MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); | |||
6218 | addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); | |||
6219 | Inst.eraseFromParent(); | |||
6220 | } | |||
6221 | continue; | |||
6222 | case AMDGPU::S_UADDO_PSEUDO: | |||
6223 | case AMDGPU::S_USUBO_PSEUDO: { | |||
6224 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6225 | MachineOperand &Dest0 = Inst.getOperand(0); | |||
6226 | MachineOperand &Dest1 = Inst.getOperand(1); | |||
6227 | MachineOperand &Src0 = Inst.getOperand(2); | |||
6228 | MachineOperand &Src1 = Inst.getOperand(3); | |||
6229 | ||||
6230 | unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) | |||
6231 | ? AMDGPU::V_ADD_CO_U32_e64 | |||
6232 | : AMDGPU::V_SUB_CO_U32_e64; | |||
6233 | const TargetRegisterClass *NewRC = | |||
6234 | RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); | |||
6235 | Register DestReg = MRI.createVirtualRegister(NewRC); | |||
6236 | MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) | |||
6237 | .addReg(Dest1.getReg(), RegState::Define) | |||
6238 | .add(Src0) | |||
6239 | .add(Src1) | |||
6240 | .addImm(0); // clamp bit | |||
6241 | ||||
6242 | CreatedBBTmp = legalizeOperands(*NewInstr, MDT); | |||
6243 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6244 | CreatedBB = CreatedBBTmp; | |||
6245 | ||||
6246 | MRI.replaceRegWith(Dest0.getReg(), DestReg); | |||
6247 | addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, | |||
6248 | Worklist); | |||
6249 | Inst.eraseFromParent(); | |||
6250 | } | |||
6251 | continue; | |||
6252 | ||||
6253 | case AMDGPU::S_CSELECT_B32: | |||
6254 | case AMDGPU::S_CSELECT_B64: | |||
6255 | lowerSelect(Worklist, Inst, MDT); | |||
6256 | Inst.eraseFromParent(); | |||
6257 | continue; | |||
6258 | case AMDGPU::S_CMP_EQ_I32: | |||
6259 | case AMDGPU::S_CMP_LG_I32: | |||
6260 | case AMDGPU::S_CMP_GT_I32: | |||
6261 | case AMDGPU::S_CMP_GE_I32: | |||
6262 | case AMDGPU::S_CMP_LT_I32: | |||
6263 | case AMDGPU::S_CMP_LE_I32: | |||
6264 | case AMDGPU::S_CMP_EQ_U32: | |||
6265 | case AMDGPU::S_CMP_LG_U32: | |||
6266 | case AMDGPU::S_CMP_GT_U32: | |||
6267 | case AMDGPU::S_CMP_GE_U32: | |||
6268 | case AMDGPU::S_CMP_LT_U32: | |||
6269 | case AMDGPU::S_CMP_LE_U32: | |||
6270 | case AMDGPU::S_CMP_EQ_U64: | |||
6271 | case AMDGPU::S_CMP_LG_U64: { | |||
6272 | const MCInstrDesc &NewDesc = get(NewOpcode); | |||
6273 | Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); | |||
6274 | MachineInstr *NewInstr = | |||
6275 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) | |||
6276 | .add(Inst.getOperand(0)) | |||
6277 | .add(Inst.getOperand(1)); | |||
6278 | legalizeOperands(*NewInstr, MDT); | |||
6279 | int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); | |||
6280 | MachineOperand SCCOp = Inst.getOperand(SCCIdx); | |||
6281 | addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); | |||
6282 | Inst.eraseFromParent(); | |||
6283 | } | |||
6284 | continue; | |||
6285 | } | |||
6286 | ||||
6287 | ||||
6288 | if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
6289 | // We cannot move this instruction to the VALU, so we should try to | |||
6290 | // legalize its operands instead. | |||
6291 | CreatedBBTmp = legalizeOperands(Inst, MDT); | |||
6292 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6293 | CreatedBB = CreatedBBTmp; | |||
6294 | continue; | |||
6295 | } | |||
6296 | ||||
6297 | // Use the new VALU Opcode. | |||
6298 | const MCInstrDesc &NewDesc = get(NewOpcode); | |||
6299 | Inst.setDesc(NewDesc); | |||
6300 | ||||
6301 | // Remove any references to SCC. Vector instructions can't read from it, and | |||
6302 | // we're just about to add the implicit use / defs of VCC, and we don't want | |||
6303 | // both. | |||
6304 | for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { | |||
6305 | MachineOperand &Op = Inst.getOperand(i); | |||
6306 | if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { | |||
6307 | // Only propagate through live-def of SCC. | |||
6308 | if (Op.isDef() && !Op.isDead()) | |||
6309 | addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); | |||
6310 | if (Op.isUse()) | |||
6311 | addSCCDefsToVALUWorklist(Op, Worklist); | |||
6312 | Inst.RemoveOperand(i); | |||
6313 | } | |||
6314 | } | |||
6315 | ||||
6316 | if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { | |||
6317 | // We are converting these to a BFE, so we need to add the missing | |||
6318 | // operands for the size and offset. | |||
6319 | unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; | |||
6320 | Inst.addOperand(MachineOperand::CreateImm(0)); | |||
6321 | Inst.addOperand(MachineOperand::CreateImm(Size)); | |||
6322 | ||||
6323 | } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { | |||
6324 | // The VALU version adds the second operand to the result, so insert an | |||
6325 | // extra 0 operand. | |||
6326 | Inst.addOperand(MachineOperand::CreateImm(0)); | |||
6327 | } | |||
6328 | ||||
6329 | Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); | |||
6330 | fixImplicitOperands(Inst); | |||
6331 | ||||
6332 | if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { | |||
6333 | const MachineOperand &OffsetWidthOp = Inst.getOperand(2); | |||
6334 | // If we need to move this to VGPRs, we need to unpack the second operand | |||
6335 | // back into the 2 separate ones for bit offset and width. | |||
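// For example (illustrative values), an S_BFE immediate of 0x100010 encodes
// offset = 16 (bits [5:0]) and width = 16 (bits [22:16]), so the VALU BFE
// below would receive the two separate operands 16 and 16.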
6336 | assert(OffsetWidthOp.isImm() && | |||
6337 | "Scalar BFE is only implemented for constant width and offset"); | |||
6338 | uint32_t Imm = OffsetWidthOp.getImm(); | |||
6339 | ||||
6340 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. | |||
6341 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. | |||
6342 | Inst.RemoveOperand(2); // Remove old immediate. | |||
6343 | Inst.addOperand(MachineOperand::CreateImm(Offset)); | |||
6344 | Inst.addOperand(MachineOperand::CreateImm(BitWidth)); | |||
6345 | } | |||
6346 | ||||
6347 | bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); | |||
6348 | unsigned NewDstReg = AMDGPU::NoRegister; | |||
6349 | if (HasDst) { | |||
6350 | Register DstReg = Inst.getOperand(0).getReg(); | |||
6351 | if (DstReg.isPhysical()) | |||
6352 | continue; | |||
6353 | ||||
6354 | // Update the destination register class. | |||
6355 | const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); | |||
6356 | if (!NewDstRC) | |||
6357 | continue; | |||
6358 | ||||
6359 | if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && | |||
6360 | NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { | |||
6361 | // Instead of creating a copy where src and dst are the same register | |||
6362 | // class, we just replace all uses of dst with src. These kinds of | |||
6363 | // copies interfere with the heuristics MachineSink uses to decide | |||
6364 | // whether or not to split a critical edge, since the pass assumes | |||
6365 | // that copies will end up as machine instructions and not be | |||
6366 | // eliminated. | |||
6367 | addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); | |||
6368 | MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); | |||
6369 | MRI.clearKillFlags(Inst.getOperand(1).getReg()); | |||
6370 | Inst.getOperand(0).setReg(DstReg); | |||
6371 | ||||
6372 | // Make sure we don't leave around a dead VGPR->SGPR copy. Normally | |||
6373 | // these are deleted later, but at -O0 it would leave a suspicious | |||
6374 | // looking illegal copy of an undef register. | |||
6375 | for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) | |||
6376 | Inst.RemoveOperand(I); | |||
6377 | Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); | |||
6378 | continue; | |||
6379 | } | |||
6380 | ||||
6381 | NewDstReg = MRI.createVirtualRegister(NewDstRC); | |||
6382 | MRI.replaceRegWith(DstReg, NewDstReg); | |||
6383 | } | |||
6384 | ||||
6385 | // Legalize the operands | |||
6386 | CreatedBBTmp = legalizeOperands(Inst, MDT); | |||
6387 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6388 | CreatedBB = CreatedBBTmp; | |||
6389 | ||||
6390 | if (HasDst) | |||
6391 | addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); | |||
6392 | } | |||
6393 | return CreatedBB; | |||
6394 | } | |||
6395 | ||||
6396 | // Add/sub require special handling to deal with carry outs. | |||
6397 | std::pair<bool, MachineBasicBlock *> | |||
6398 | SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, | |||
6399 | MachineDominatorTree *MDT) const { | |||
6400 | if (ST.hasAddNoCarry()) { | |||
6401 | // Assume there is no user of scc since we don't select this in that case. | |||
6402 | // Since scc isn't used, it doesn't really matter if the i32 or u32 variant | |||
6403 | // is used. | |||
6404 | ||||
6405 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6406 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6407 | ||||
6408 | Register OldDstReg = Inst.getOperand(0).getReg(); | |||
6409 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6410 | ||||
6411 | unsigned Opc = Inst.getOpcode(); | |||
6412 | assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); | |||
6413 | ||||
6414 | unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? | |||
6415 | AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; | |||
6416 | ||||
6417 | assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); | |||
6418 | Inst.RemoveOperand(3); | |||
6419 | ||||
6420 | Inst.setDesc(get(NewOpc)); | |||
6421 | Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit | |||
6422 | Inst.addImplicitDefUseOperands(*MBB.getParent()); | |||
6423 | MRI.replaceRegWith(OldDstReg, ResultReg); | |||
6424 | MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT); | |||
6425 | ||||
6426 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6427 | return std::make_pair(true, NewBB); | |||
6428 | } | |||
6429 | ||||
6430 | return std::make_pair(false, nullptr); | |||
6431 | } | |||
6432 | ||||
6433 | void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, | |||
6434 | MachineDominatorTree *MDT) const { | |||
6435 | ||||
6436 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6437 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6438 | MachineBasicBlock::iterator MII = Inst; | |||
6439 | DebugLoc DL = Inst.getDebugLoc(); | |||
6440 | ||||
6441 | MachineOperand &Dest = Inst.getOperand(0); | |||
6442 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6443 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6444 | MachineOperand &Cond = Inst.getOperand(3); | |||
6445 | ||||
6446 | Register SCCSource = Cond.getReg(); | |||
6447 | bool IsSCC = (SCCSource == AMDGPU::SCC); | |||
6448 | ||||
6449 | // If this is a trivial select where the condition is effectively not SCC | |||
6450 | // (SCCSource is a source of copy to SCC), then the select is semantically | |||
6451 | // equivalent to copying SCCSource. Hence, there is no need to create a | |||
6452 | // V_CNDMASK; we can just use SCCSource directly and bail out. | |||
6453 | if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() && | |||
6454 | (Src1.getImm() == 0)) { | |||
6455 | MRI.replaceRegWith(Dest.getReg(), SCCSource); | |||
6456 | return; | |||
6457 | } | |||
6458 | ||||
6459 | const TargetRegisterClass *TC = | |||
6460 | RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
6461 | ||||
6462 | Register CopySCC = MRI.createVirtualRegister(TC); | |||
6463 | ||||
6464 | if (IsSCC) { | |||
6465 | // Now look for the closest SCC def; if it is a copy to SCC, replace the | |||
6466 | // SCC source with the copy's source register. | |||
6467 | bool CopyFound = false; | |||
6468 | for (MachineInstr &CandI : | |||
6469 | make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), | |||
6470 | Inst.getParent()->rend())) { | |||
6471 | if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != | |||
6472 | -1) { | |||
6473 | if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { | |||
6474 | BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC) | |||
6475 | .addReg(CandI.getOperand(1).getReg()); | |||
6476 | CopyFound = true; | |||
6477 | } | |||
6478 | break; | |||
6479 | } | |||
6480 | } | |||
6481 | if (!CopyFound) { | |||
6482 | // SCC def is not a copy | |||
6483 | // Insert a trivial select instead of creating a copy, because a copy from | |||
6484 | // SCC would semantically mean just copying a single bit, but we may need | |||
6485 | // the result to be a vector condition mask that needs preserving. | |||
6486 | unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64 | |||
6487 | : AMDGPU::S_CSELECT_B32; | |||
6488 | auto NewSelect = | |||
6489 | BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); | |||
6490 | NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); | |||
6491 | } | |||
6492 | } | |||
6493 | ||||
6494 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6495 | ||||
6496 | auto UpdatedInst = | |||
6497 | BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) | |||
6498 | .addImm(0) | |||
6499 | .add(Src1) // False | |||
6500 | .addImm(0) | |||
6501 | .add(Src0) // True | |||
6502 | .addReg(IsSCC ? CopySCC : SCCSource); | |||
6503 | ||||
6504 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6505 | legalizeOperands(*UpdatedInst, MDT); | |||
6506 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6507 | } | |||
6508 | ||||
6509 | void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, | |||
6510 | MachineInstr &Inst) const { | |||
6511 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6512 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6513 | MachineBasicBlock::iterator MII = Inst; | |||
6514 | DebugLoc DL = Inst.getDebugLoc(); | |||
6515 | ||||
6516 | MachineOperand &Dest = Inst.getOperand(0); | |||
6517 | MachineOperand &Src = Inst.getOperand(1); | |||
6518 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6519 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6520 | ||||
6521 | unsigned SubOp = ST.hasAddNoCarry() ? | |||
6522 | AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; | |||
6523 | ||||
6524 | BuildMI(MBB, MII, DL, get(SubOp), TmpReg) | |||
6525 | .addImm(0) | |||
6526 | .addReg(Src.getReg()); | |||
6527 | ||||
6528 | BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) | |||
6529 | .addReg(Src.getReg()) | |||
6530 | .addReg(TmpReg); | |||
6531 | ||||
6532 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6533 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6534 | } | |||
6535 | ||||
6536 | void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, | |||
6537 | MachineInstr &Inst) const { | |||
6538 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6539 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6540 | MachineBasicBlock::iterator MII = Inst; | |||
6541 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6542 | ||||
6543 | MachineOperand &Dest = Inst.getOperand(0); | |||
6544 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6545 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6546 | ||||
6547 | if (ST.hasDLInsts()) { | |||
6548 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6549 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); | |||
6550 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); | |||
6551 | ||||
6552 | BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) | |||
6553 | .add(Src0) | |||
6554 | .add(Src1); | |||
6555 | ||||
6556 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6557 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6558 | } else { | |||
6559 | // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can | |||
6560 | // invert either source and then perform the XOR. If either source is a | |||
6561 | // scalar register, then we can leave the inversion on the scalar unit to | |||
6562 | // achieve a better distribution of scalar and vector instructions. | |||
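// Illustrative check of the identity with 4-bit values: for x = 0b1010 and
// y = 0b0110, x ^ y = 0b1100 and ~(x ^ y) = 0b0011 in the low bits, which
// matches (~x) ^ y = 0b0101 ^ 0b0110 = 0b0011.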
6563 | bool Src0IsSGPR = Src0.isReg() && | |||
6564 | RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); | |||
6565 | bool Src1IsSGPR = Src1.isReg() && | |||
6566 | RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); | |||
6567 | MachineInstr *Xor; | |||
6568 | Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
6569 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
6570 | ||||
6571 | // Build a pair of scalar instructions and add them to the work list. | |||
6572 | // The next iteration over the work list will lower these to the vector | |||
6573 | // unit as necessary. | |||
6574 | if (Src0IsSGPR) { | |||
6575 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); | |||
6576 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) | |||
6577 | .addReg(Temp) | |||
6578 | .add(Src1); | |||
6579 | } else if (Src1IsSGPR) { | |||
6580 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); | |||
6581 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) | |||
6582 | .add(Src0) | |||
6583 | .addReg(Temp); | |||
6584 | } else { | |||
6585 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) | |||
6586 | .add(Src0) | |||
6587 | .add(Src1); | |||
6588 | MachineInstr *Not = | |||
6589 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); | |||
6590 | Worklist.insert(Not); | |||
6591 | } | |||
6592 | ||||
6593 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6594 | ||||
6595 | Worklist.insert(Xor); | |||
6596 | ||||
6597 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6598 | } | |||
6599 | } | |||
6600 | ||||
6601 | void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, | |||
6602 | MachineInstr &Inst, | |||
6603 | unsigned Opcode) const { | |||
6604 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6605 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6606 | MachineBasicBlock::iterator MII = Inst; | |||
6607 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6608 | ||||
6609 | MachineOperand &Dest = Inst.getOperand(0); | |||
6610 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6611 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6612 | ||||
6613 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
6614 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
6615 | ||||
6616 | MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) | |||
6617 | .add(Src0) | |||
6618 | .add(Src1); | |||
6619 | ||||
6620 | MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) | |||
6621 | .addReg(Interm); | |||
6622 | ||||
6623 | Worklist.insert(&Op); | |||
6624 | Worklist.insert(&Not); | |||
6625 | ||||
6626 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6627 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6628 | } | |||
6629 | ||||
6630 | void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, | |||
6631 | MachineInstr &Inst, | |||
6632 | unsigned Opcode) const { | |||
6633 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6634 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6635 | MachineBasicBlock::iterator MII = Inst; | |||
6636 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6637 | ||||
6638 | MachineOperand &Dest = Inst.getOperand(0); | |||
6639 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6640 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6641 | ||||
6642 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
6643 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
6644 | ||||
6645 | MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) | |||
6646 | .add(Src1); | |||
6647 | ||||
6648 | MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) | |||
6649 | .add(Src0) | |||
6650 | .addReg(Interm); | |||
6651 | ||||
6652 | Worklist.insert(&Not); | |||
6653 | Worklist.insert(&Op); | |||
6654 | ||||
6655 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6656 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6657 | } | |||
6658 | ||||
6659 | void SIInstrInfo::splitScalar64BitUnaryOp( | |||
6660 | SetVectorType &Worklist, MachineInstr &Inst, | |||
6661 | unsigned Opcode, bool Swap) const { | |||
6662 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6663 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6664 | ||||
6665 | MachineOperand &Dest = Inst.getOperand(0); | |||
6666 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6667 | DebugLoc DL = Inst.getDebugLoc(); | |||
6668 | ||||
6669 | MachineBasicBlock::iterator MII = Inst; | |||
6670 | ||||
6671 | const MCInstrDesc &InstDesc = get(Opcode); | |||
6672 | const TargetRegisterClass *Src0RC = Src0.isReg() ? | |||
6673 | MRI.getRegClass(Src0.getReg()) : | |||
6674 | &AMDGPU::SGPR_32RegClass; | |||
6675 | ||||
6676 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6677 | ||||
6678 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6679 | AMDGPU::sub0, Src0SubRC); | |||
6680 | ||||
6681 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6682 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); | |||
6683 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); | |||
6684 | ||||
6685 | Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); | |||
6686 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); | |||
6687 | ||||
6688 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6689 | AMDGPU::sub1, Src0SubRC); | |||
6690 | ||||
6691 | Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); | |||
6692 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); | |||
6693 | ||||
6694 | if (Swap) | |||
6695 | std::swap(DestSub0, DestSub1); | |||
6696 | ||||
6697 | Register FullDestReg = MRI.createVirtualRegister(NewDestRC); | |||
6698 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6699 | .addReg(DestSub0) | |||
6700 | .addImm(AMDGPU::sub0) | |||
6701 | .addReg(DestSub1) | |||
6702 | .addImm(AMDGPU::sub1); | |||
6703 | ||||
6704 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6705 | ||||
6706 | Worklist.insert(&LoHalf); | |||
6707 | Worklist.insert(&HiHalf); | |||
6708 | ||||
6709 | // We don't need to legalizeOperands here because for a single operand, src0 | |||
6710 | // will support any kind of input. | |||
6711 | ||||
6712 | // Move all users of this moved value. | |||
6713 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6714 | } | |||
6715 | ||||
6716 | void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, | |||
6717 | MachineInstr &Inst, | |||
6718 | MachineDominatorTree *MDT) const { | |||
6719 | bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); | |||
6720 | ||||
6721 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6722 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6723 | const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
6724 | ||||
6725 | Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6726 | Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6727 | Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6728 | ||||
6729 | Register CarryReg = MRI.createVirtualRegister(CarryRC); | |||
6730 | Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); | |||
6731 | ||||
6732 | MachineOperand &Dest = Inst.getOperand(0); | |||
6733 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6734 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6735 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6736 | MachineBasicBlock::iterator MII = Inst; | |||
6737 | ||||
6738 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); | |||
6739 | const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); | |||
6740 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6741 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); | |||
6742 | ||||
6743 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6744 | AMDGPU::sub0, Src0SubRC); | |||
6745 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6746 | AMDGPU::sub0, Src1SubRC); | |||
6747 | ||||
6748 | ||||
6749 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6750 | AMDGPU::sub1, Src0SubRC); | |||
6751 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6752 | AMDGPU::sub1, Src1SubRC); | |||
6753 | ||||
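// The 64-bit operation is split into a low-half add/sub that defines a carry
// and a high-half add/sub-with-carry that consumes (and kills) that carry.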
6754 | unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; | |||
6755 | MachineInstr *LoHalf = | |||
6756 | BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) | |||
6757 | .addReg(CarryReg, RegState::Define) | |||
6758 | .add(SrcReg0Sub0) | |||
6759 | .add(SrcReg1Sub0) | |||
6760 | .addImm(0); // clamp bit | |||
6761 | ||||
6762 | unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; | |||
6763 | MachineInstr *HiHalf = | |||
6764 | BuildMI(MBB, MII, DL, get(HiOpc), DestSub1) | |||
6765 | .addReg(DeadCarryReg, RegState::Define | RegState::Dead) | |||
6766 | .add(SrcReg0Sub1) | |||
6767 | .add(SrcReg1Sub1) | |||
6768 | .addReg(CarryReg, RegState::Kill) | |||
6769 | .addImm(0); // clamp bit | |||
6770 | ||||
6771 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6772 | .addReg(DestSub0) | |||
6773 | .addImm(AMDGPU::sub0) | |||
6774 | .addReg(DestSub1) | |||
6775 | .addImm(AMDGPU::sub1); | |||
6776 | ||||
6777 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6778 | ||||
6779 | // Try to legalize the operands in case we need to swap the order to keep it | |||
6780 | // valid. | |||
6781 | legalizeOperands(*LoHalf, MDT); | |||
6782 | legalizeOperands(*HiHalf, MDT); | |||
6783 | ||||
6784 | // Move all users of this moved value. | |||
6785 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6786 | } | |||
6787 | ||||
6788 | void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, | |||
6789 | MachineInstr &Inst, unsigned Opcode, | |||
6790 | MachineDominatorTree *MDT) const { | |||
6791 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6792 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6793 | ||||
6794 | MachineOperand &Dest = Inst.getOperand(0); | |||
6795 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6796 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6797 | DebugLoc DL = Inst.getDebugLoc(); | |||
6798 | ||||
6799 | MachineBasicBlock::iterator MII = Inst; | |||
6800 | ||||
6801 | const MCInstrDesc &InstDesc = get(Opcode); | |||
6802 | const TargetRegisterClass *Src0RC = Src0.isReg() ? | |||
6803 | MRI.getRegClass(Src0.getReg()) : | |||
6804 | &AMDGPU::SGPR_32RegClass; | |||
6805 | ||||
6806 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6807 | const TargetRegisterClass *Src1RC = Src1.isReg() ? | |||
6808 | MRI.getRegClass(Src1.getReg()) : | |||
6809 | &AMDGPU::SGPR_32RegClass; | |||
6810 | ||||
6811 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); | |||
6812 | ||||
6813 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6814 | AMDGPU::sub0, Src0SubRC); | |||
6815 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6816 | AMDGPU::sub0, Src1SubRC); | |||
6817 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6818 | AMDGPU::sub1, Src0SubRC); | |||
6819 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6820 | AMDGPU::sub1, Src1SubRC); | |||
6821 | ||||
6822 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6823 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); | |||
6824 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); | |||
6825 | ||||
6826 | Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); | |||
6827 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) | |||
6828 | .add(SrcReg0Sub0) | |||
6829 | .add(SrcReg1Sub0); | |||
6830 | ||||
6831 | Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); | |||
6832 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) | |||
6833 | .add(SrcReg0Sub1) | |||
6834 | .add(SrcReg1Sub1); | |||
6835 | ||||
6836 | Register FullDestReg = MRI.createVirtualRegister(NewDestRC); | |||
6837 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6838 | .addReg(DestSub0) | |||
6839 | .addImm(AMDGPU::sub0) | |||
6840 | .addReg(DestSub1) | |||
6841 | .addImm(AMDGPU::sub1); | |||
6842 | ||||
6843 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6844 | ||||
6845 | Worklist.insert(&LoHalf); | |||
6846 | Worklist.insert(&HiHalf); | |||
6847 | ||||
6848 | // Move all users of this moved value. | |||
6849 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6850 | } | |||
6851 | ||||
6852 | void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, | |||
6853 | MachineInstr &Inst, | |||
6854 | MachineDominatorTree *MDT) const { | |||
6855 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6856 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6857 | ||||
6858 | MachineOperand &Dest = Inst.getOperand(0); | |||
6859 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6860 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6861 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6862 | ||||
6863 | MachineBasicBlock::iterator MII = Inst; | |||
6864 | ||||
6865 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6866 | ||||
6867 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
6868 | ||||
6869 | MachineOperand* Op0; | |||
6870 | MachineOperand* Op1; | |||
6871 | ||||
6872 | if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { | |||
6873 | Op0 = &Src0; | |||
6874 | Op1 = &Src1; | |||
6875 | } else { | |||
6876 | Op0 = &Src1; | |||
6877 | Op1 = &Src0; | |||
6878 | } | |||
6879 | ||||
6880 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) | |||
6881 | .add(*Op0); | |||
6882 | ||||
6883 | Register NewDest = MRI.createVirtualRegister(DestRC); | |||
6884 | ||||
6885 | MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) | |||
6886 | .addReg(Interm) | |||
6887 | .add(*Op1); | |||
6888 | ||||
6889 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6890 | ||||
6891 | Worklist.insert(&Xor); | |||
6892 | } | |||
6893 | ||||
6894 | void SIInstrInfo::splitScalar64BitBCNT( | |||
6895 | SetVectorType &Worklist, MachineInstr &Inst) const { | |||
6896 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6897 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6898 | ||||
6899 | MachineBasicBlock::iterator MII = Inst; | |||
6900 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6901 | ||||
6902 | MachineOperand &Dest = Inst.getOperand(0); | |||
6903 | MachineOperand &Src = Inst.getOperand(1); | |||
6904 | ||||
6905 | const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); | |||
6906 | const TargetRegisterClass *SrcRC = Src.isReg() ? | |||
6907 | MRI.getRegClass(Src.getReg()) : | |||
6908 | &AMDGPU::SGPR_32RegClass; | |||
6909 | ||||
6910 | Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6911 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6912 | ||||
6913 | const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); | |||
6914 | ||||
6915 | MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, | |||
6916 | AMDGPU::sub0, SrcSubRC); | |||
6917 | MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, | |||
6918 | AMDGPU::sub1, SrcSubRC); | |||
6919 | ||||
6920 | BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); | |||
6921 | ||||
6922 | BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); | |||
6923 | ||||
6924 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6925 | ||||
6926 | // We don't need to legalize operands here. src0 for either instruction can be | |||
6927 | // an SGPR, and the second input is unused or determined here. | |||
6928 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6929 | } | |||
6930 | ||||
6931 | void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, | |||
6932 | MachineInstr &Inst) const { | |||
6933 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6934 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6935 | MachineBasicBlock::iterator MII = Inst; | |||
6936 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6937 | ||||
6938 | MachineOperand &Dest = Inst.getOperand(0); | |||
6939 | uint32_t Imm = Inst.getOperand(2).getImm(); | |||
6940 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. | |||
6941 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. | |||
6942 | ||||
6943 | (void) Offset; | |||
6944 | ||||
6945 | // Only sext_inreg cases handled. | |||
6946 | assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && | |||
6947 | Offset == 0 && "Not implemented"); | |||
6948 | ||||
6949 | if (BitWidth < 32) { | |||
6950 | Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6951 | Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6952 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6953 | ||||
6954 | BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo) | |||
6955 | .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) | |||
6956 | .addImm(0) | |||
6957 | .addImm(BitWidth); | |||
6958 | ||||
6959 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) | |||
6960 | .addImm(31) | |||
6961 | .addReg(MidRegLo); | |||
6962 | ||||
6963 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) | |||
6964 | .addReg(MidRegLo) | |||
6965 | .addImm(AMDGPU::sub0) | |||
6966 | .addReg(MidRegHi) | |||
6967 | .addImm(AMDGPU::sub1); | |||
6968 | ||||
6969 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6970 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6971 | return; | |||
6972 | } | |||
6973 | ||||
6974 | MachineOperand &Src = Inst.getOperand(1); | |||
6975 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6976 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6977 | ||||
6978 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) | |||
6979 | .addImm(31) | |||
6980 | .addReg(Src.getReg(), 0, AMDGPU::sub0); | |||
6981 | ||||
6982 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) | |||
6983 | .addReg(Src.getReg(), 0, AMDGPU::sub0) | |||
6984 | .addImm(AMDGPU::sub0) | |||
6985 | .addReg(TmpReg) | |||
6986 | .addImm(AMDGPU::sub1); | |||
6987 | ||||
6988 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6989 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6990 | } | |||
6991 | ||||
6992 | void SIInstrInfo::addUsersToMoveToVALUWorklist( | |||
6993 | Register DstReg, | |||
6994 | MachineRegisterInfo &MRI, | |||
6995 | SetVectorType &Worklist) const { | |||
6996 | for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), | |||
6997 | E = MRI.use_end(); I != E;) { | |||
6998 | MachineInstr &UseMI = *I->getParent(); | |||
6999 | ||||
7000 | unsigned OpNo = 0; | |||
7001 | ||||
7002 | switch (UseMI.getOpcode()) { | |||
7003 | case AMDGPU::COPY: | |||
7004 | case AMDGPU::WQM: | |||
7005 | case AMDGPU::SOFT_WQM: | |||
7006 | case AMDGPU::STRICT_WWM: | |||
7007 | case AMDGPU::STRICT_WQM: | |||
7008 | case AMDGPU::REG_SEQUENCE: | |||
7009 | case AMDGPU::PHI: | |||
7010 | case AMDGPU::INSERT_SUBREG: | |||
7011 | break; | |||
7012 | default: | |||
7013 | OpNo = I.getOperandNo(); | |||
7014 | break; | |||
7015 | } | |||
7016 | ||||
7017 | if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { | |||
7018 | Worklist.insert(&UseMI); | |||
7019 | ||||
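// Advance past any further uses of DstReg within this same instruction so
// the instruction is only queued once.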
7020 | do { | |||
7021 | ++I; | |||
7022 | } while (I != E && I->getParent() == &UseMI); | |||
7023 | } else { | |||
7024 | ++I; | |||
7025 | } | |||
7026 | } | |||
7027 | } | |||
7028 | ||||
7029 | void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, | |||
7030 | MachineRegisterInfo &MRI, | |||
7031 | MachineInstr &Inst) const { | |||
7032 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
7033 | MachineBasicBlock *MBB = Inst.getParent(); | |||
7034 | MachineOperand &Src0 = Inst.getOperand(1); | |||
7035 | MachineOperand &Src1 = Inst.getOperand(2); | |||
7036 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
7037 | ||||
7038 | switch (Inst.getOpcode()) { | |||
7039 | case AMDGPU::S_PACK_LL_B32_B16: { | |||
7040 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
7041 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
7042 | ||||
7043 | // FIXME: Can do a lot better if we know the high bits of src0 or src1 are | |||
7044 | // 0. | |||
7045 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
7046 | .addImm(0xffff); | |||
7047 | ||||
7048 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) | |||
7049 | .addReg(ImmReg, RegState::Kill) | |||
7050 | .add(Src0); | |||
7051 | ||||
7052 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) | |||
7053 | .add(Src1) | |||
7054 | .addImm(16) | |||
7055 | .addReg(TmpReg, RegState::Kill); | |||
7056 | break; | |||
7057 | } | |||
7058 | case AMDGPU::S_PACK_LH_B32_B16: { | |||
7059 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
7060 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
7061 | .addImm(0xffff); | |||
7062 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg) | |||
7063 | .addReg(ImmReg, RegState::Kill) | |||
7064 | .add(Src0) | |||
7065 | .add(Src1); | |||
7066 | break; | |||
7067 | } | |||
7068 | case AMDGPU::S_PACK_HH_B32_B16: { | |||
7069 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
7070 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
7071 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) | |||
7072 | .addImm(16) | |||
7073 | .add(Src0); | |||
7074 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
7075 | .addImm(0xffff0000); | |||
7076 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg) | |||
7077 | .add(Src1) | |||
7078 | .addReg(ImmReg, RegState::Kill) | |||
7079 | .addReg(TmpReg, RegState::Kill); | |||
7080 | break; | |||
7081 | } | |||
7082 | default: | |||
7083 | llvm_unreachable("unhandled s_pack_* instruction")::llvm::llvm_unreachable_internal("unhandled s_pack_* instruction" , "llvm/lib/Target/AMDGPU/SIInstrInfo.cpp", 7083); | |||
7084 | } | |||
7085 | ||||
7086 | MachineOperand &Dest = Inst.getOperand(0); | |||
7087 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
7088 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
7089 | } | |||
7090 | ||||
7091 | void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, | |||
7092 | MachineInstr &SCCDefInst, | |||
7093 | SetVectorType &Worklist, | |||
7094 | Register NewCond) const { | |||
7095 | ||||
7096 | // Ensure that def inst defines SCC, which is still live. | |||
7097 | assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && | |||
7098 | !Op.isDead() && Op.getParent() == &SCCDefInst); | |||
7099 | SmallVector<MachineInstr *, 4> CopyToDelete; | |||
7100 | // This assumes that all the users of SCC are in the same block | |||
7101 | // as the SCC def. | |||
7102 | for (MachineInstr &MI : // Skip the def inst itself. | |||
7103 | make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), | |||
7104 | SCCDefInst.getParent()->end())) { | |||
7105 | // Check if SCC is used first. | |||
7106 | int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI); | |||
7107 | if (SCCIdx != -1) { | |||
7108 | if (MI.isCopy()) { | |||
7109 | MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
7110 | Register DestReg = MI.getOperand(0).getReg(); | |||
7111 | ||||
7112 | MRI.replaceRegWith(DestReg, NewCond); | |||
7113 | CopyToDelete.push_back(&MI); | |||
7114 | } else { | |||
7115 | ||||
7116 | if (NewCond.isValid()) | |||
7117 | MI.getOperand(SCCIdx).setReg(NewCond); | |||
7118 | ||||
7119 | Worklist.insert(&MI); | |||
7120 | } | |||
7121 | } | |||
7122 | // Exit if we find another SCC def. | |||
7123 | if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) | |||
7124 | break; | |||
7125 | } | |||
7126 | for (auto &Copy : CopyToDelete) | |||
7127 | Copy->eraseFromParent(); | |||
7128 | } | |||
7129 | ||||
7130 | // Instructions that use SCC may be converted to VALU instructions. When that | |||
7131 | // happens, the SCC register is changed to VCC_LO. The instruction that defines | |||
7132 | // SCC must be changed to an instruction that defines VCC. This function makes | |||
7133 | // sure that the instruction that defines SCC is added to the moveToVALU | |||
7134 | // worklist. | |||
7135 | void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op, | |||
7136 | SetVectorType &Worklist) const { | |||
7137 | assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()); | |||
7138 | ||||
7139 | MachineInstr *SCCUseInst = Op.getParent(); | |||
7140 | // Look for a preceding instruction that either defines VCC or SCC. If VCC | |||
7141 | // then there is nothing to do because the defining instruction has been | |||
7142 | // converted to a VALU already. If SCC then that instruction needs to be | |||
7143 | // converted to a VALU. | |||
7144 | for (MachineInstr &MI : | |||
7145 | make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)), | |||
7146 | SCCUseInst->getParent()->rend())) { | |||
7147 | if (MI.modifiesRegister(AMDGPU::VCC, &RI)) | |||
7148 | break; | |||
7149 | if (MI.definesRegister(AMDGPU::SCC, &RI)) { | |||
7150 | Worklist.insert(&MI); | |||
7151 | break; | |||
7152 | } | |||
7153 | } | |||
7154 | } | |||
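// Illustrative sketch (assumed opcodes, not from the original source): if an
// S_ADD_U32 defines SCC and a later S_CSELECT_B32 reads that SCC, converting
// the S_CSELECT to its VALU form means the scan above must also queue the
// S_ADD, so it can become an instruction that defines VCC instead of SCC.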
7155 | ||||
7156 | const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( | |||
7157 | const MachineInstr &Inst) const { | |||
7158 | const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); | |||
7159 | ||||
7160 | switch (Inst.getOpcode()) { | |||
7161 | // For target instructions, getOpRegClass just returns the virtual register | |||
7162 | // class associated with the operand, so we need to find an equivalent VGPR | |||
7163 | // register class in order to move the instruction to the VALU. | |||
7164 | case AMDGPU::COPY: | |||
7165 | case AMDGPU::PHI: | |||
7166 | case AMDGPU::REG_SEQUENCE: | |||
7167 | case AMDGPU::INSERT_SUBREG: | |||
7168 | case AMDGPU::WQM: | |||
7169 | case AMDGPU::SOFT_WQM: | |||
7170 | case AMDGPU::STRICT_WWM: | |||
7171 | case AMDGPU::STRICT_WQM: { | |||
7172 | const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); | |||
7173 | if (RI.isAGPRClass(SrcRC)) { | |||
7174 | if (RI.isAGPRClass(NewDstRC)) | |||
7175 | return nullptr; | |||
7176 | ||||
7177 | switch (Inst.getOpcode()) { | |||
7178 | case AMDGPU::PHI: | |||
7179 | case AMDGPU::REG_SEQUENCE: | |||
7180 | case AMDGPU::INSERT_SUBREG: | |||
7181 | NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); | |||
7182 | break; | |||
7183 | default: | |||
7184 | NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); | |||
7185 | } | |||
7186 | ||||
7187 | if (!NewDstRC) | |||
7188 | return nullptr; | |||
7189 | } else { | |||
7190 | if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) | |||
7191 | return nullptr; | |||
7192 | ||||
7193 | NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); | |||
7194 | if (!NewDstRC) | |||
7195 | return nullptr; | |||
7196 | } | |||
7197 | ||||
7198 | return NewDstRC; | |||
7199 | } | |||
7200 | default: | |||
7201 | return NewDstRC; | |||
7202 | } | |||
7203 | } | |||
7204 | ||||
7205 | // Find the one SGPR operand we are allowed to use. | |||
7206 | Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, | |||
7207 | int OpIndices[3]) const { | |||
7208 | const MCInstrDesc &Desc = MI.getDesc(); | |||
7209 | ||||
7210 | // Find the one SGPR operand we are allowed to use. | |||
7211 | // | |||
7212 | // First we need to consider the instruction's operand requirements before | |||
7213 | // legalizing. Some operands are required to be SGPRs, such as implicit uses | |||
7214 | // of VCC, but we are still bound by the constant bus requirement to only use | |||
7215 | // one. | |||
7216 | // | |||
7217 | // If the operand's class is an SGPR, we can never move it. | |||
7218 | ||||
7219 | Register SGPRReg = findImplicitSGPRRead(MI); | |||
7220 | if (SGPRReg != AMDGPU::NoRegister) | |||
7221 | return SGPRReg; | |||
7222 | ||||
7223 | Register UsedSGPRs[3] = { AMDGPU::NoRegister }; | |||
7224 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
7225 | ||||
7226 | for (unsigned i = 0; i < 3; ++i) { | |||
7227 | int Idx = OpIndices[i]; | |||
7228 | if (Idx == -1) | |||
7229 | break; | |||
7230 | ||||
7231 | const MachineOperand &MO = MI.getOperand(Idx); | |||
7232 | if (!MO.isReg()) | |||
7233 | continue; | |||
7234 | ||||
7235 | // Is this operand statically required to be an SGPR based on the operand | |||
7236 | // constraints? | |||
7237 | const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); | |||
7238 | bool IsRequiredSGPR = RI.isSGPRClass(OpRC); | |||
7239 | if (IsRequiredSGPR) | |||
7240 | return MO.getReg(); | |||
7241 | ||||
7242 | // If this could be a VGPR or an SGPR, check the dynamic register class. | |||
7243 | Register Reg = MO.getReg(); | |||
7244 | const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); | |||
7245 | if (RI.isSGPRClass(RegRC)) | |||
7246 | UsedSGPRs[i] = Reg; | |||
7247 | } | |||
7248 | ||||
7249 | // We don't have a required SGPR operand, so we have a bit more freedom in | |||
7250 | // selecting operands to move. | |||
7251 | ||||
7252 | // Try to select the most used SGPR. If an SGPR is equal to one of the | |||
7253 | // others, we choose that. | |||
7254 | // | |||
7255 | // e.g. | |||
7256 | // V_FMA_F32 v0, s0, s0, s0 -> No moves | |||
7257 | // V_FMA_F32 v0, s0, s1, s0 -> Move s1 | |||
7258 | ||||
7259 | // TODO: If some of the operands are 64-bit SGPRs and some 32, we should | |||
7260 | // prefer those. | |||
7261 | ||||
7262 | if (UsedSGPRs[0] != AMDGPU::NoRegister) { | |||
7263 | if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) | |||
7264 | SGPRReg = UsedSGPRs[0]; | |||
7265 | } | |||
7266 | ||||
7267 | if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { | |||
7268 | if (UsedSGPRs[1] == UsedSGPRs[2]) | |||
7269 | SGPRReg = UsedSGPRs[1]; | |||
7270 | } | |||
7271 | ||||
7272 | return SGPRReg; | |||
7273 | } | |||
7274 | ||||
7275 | MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, | |||
7276 | unsigned OperandName) const { | |||
7277 | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); | |||
7278 | if (Idx == -1) | |||
7279 | return nullptr; | |||
7280 | ||||
7281 | return &MI.getOperand(Idx); | |||
7282 | } | |||
7283 | ||||
7284 | uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { | |||
7285 | if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
7286 | return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) | | |||
7287 | (1ULL << 56) | // RESOURCE_LEVEL = 1 | |||
7288 | (3ULL << 60); // OOB_SELECT = 3 | |||
7289 | } | |||
7290 | ||||
7291 | uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; | |||
7292 | if (ST.isAmdHsaOS()) { | |||
7293 | // Set ATC = 1. GFX9 doesn't have this bit. | |||
7294 | if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) | |||
7295 | RsrcDataFormat |= (1ULL << 56); | |||
7296 | ||||
7297 | // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. | |||
7298 | // Note that it disables TC L2 and therefore decreases performance. | |||
7299 | if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) | |||
7300 | RsrcDataFormat |= (2ULL << 59); | |||
7301 | } | |||
7302 | ||||
7303 | return RsrcDataFormat; | |||
7304 | } | |||
7305 | ||||
7306 | uint64_t SIInstrInfo::getScratchRsrcWords23() const { | |||
7307 | uint64_t Rsrc23 = getDefaultRsrcDataFormat() | | |||
7308 | AMDGPU::RSRC_TID_ENABLE | | |||
7309 | 0xffffffff; // Size; | |||
7310 | ||||
7311 | // GFX9 doesn't have ELEMENT_SIZE. | |||
7312 | if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { | |||
7313 | uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1; | |||
7314 | Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; | |||
7315 | } | |||
7316 | ||||
7317 | // IndexStride field: encode a stride of 64 for wave64, 32 for wave32. | |||
7318 | uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; | |||
7319 | Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; | |||
7320 | ||||
7321 | // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. | |||
7322 | // Clear them unless we want a huge stride. | |||
7323 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && | |||
7324 | ST.getGeneration() <= AMDGPUSubtarget::GFX9) | |||
7325 | Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; | |||
7326 | ||||
7327 | return Rsrc23; | |||
7328 | } | |||
7329 | ||||
7330 | bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { | |||
7331 | unsigned Opc = MI.getOpcode(); | |||
7332 | ||||
7333 | return isSMRD(Opc); | |||
7334 | } | |||
7335 | ||||
7336 | bool SIInstrInfo::isHighLatencyDef(int Opc) const { | |||
7337 | return get(Opc).mayLoad() && | |||
7338 | (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); | |||
7339 | } | |||
7340 | ||||
7341 | unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, | |||
7342 | int &FrameIndex) const { | |||
7343 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); | |||
7344 | if (!Addr || !Addr->isFI()) | |||
7345 | return AMDGPU::NoRegister; | |||
7346 | ||||
7347 | assert(!MI.memoperands_empty() && | |||
7348 | (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); | |||
7349 | ||||
7350 | FrameIndex = Addr->getIndex(); | |||
7351 | return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); | |||
7352 | } | |||
7353 | ||||
7354 | unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, | |||
7355 | int &FrameIndex) const { | |||
7356 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); | |||
7357 | assert(Addr && Addr->isFI()); | |||
7358 | FrameIndex = Addr->getIndex(); | |||
7359 | return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); | |||
7360 | } | |||
7361 | ||||
7362 | unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, | |||
7363 | int &FrameIndex) const { | |||
7364 | if (!MI.mayLoad()) | |||
7365 | return AMDGPU::NoRegister; | |||
7366 | ||||
7367 | if (isMUBUF(MI) || isVGPRSpill(MI)) | |||
7368 | return isStackAccess(MI, FrameIndex); | |||
7369 | ||||
7370 | if (isSGPRSpill(MI)) | |||
7371 | return isSGPRStackAccess(MI, FrameIndex); | |||
7372 | ||||
7373 | return AMDGPU::NoRegister; | |||
7374 | } | |||
7375 | ||||
7376 | unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, | |||
7377 | int &FrameIndex) const { | |||
7378 | if (!MI.mayStore()) | |||
7379 | return AMDGPU::NoRegister; | |||
7380 | ||||
7381 | if (isMUBUF(MI) || isVGPRSpill(MI)) | |||
7382 | return isStackAccess(MI, FrameIndex); | |||
7383 | ||||
7384 | if (isSGPRSpill(MI)) | |||
7385 | return isSGPRStackAccess(MI, FrameIndex); | |||
7386 | ||||
7387 | return AMDGPU::NoRegister; | |||
7388 | } | |||
7389 | ||||
7390 | unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { | |||
7391 | unsigned Size = 0; | |||
7392 | MachineBasicBlock::const_instr_iterator I = MI.getIterator(); | |||
7393 | MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); | |||
7394 | while (++I != E && I->isInsideBundle()) { | |||
7395 | assert(!I->isBundle() && "No nested bundle!"); | |||
7396 | Size += getInstSizeInBytes(*I); | |||
7397 | } | |||
7398 | ||||
7399 | return Size; | |||
7400 | } | |||
7401 | ||||
7402 | unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { | |||
7403 | unsigned Opc = MI.getOpcode(); | |||
7404 | const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); | |||
7405 | unsigned DescSize = Desc.getSize(); | |||
7406 | ||||
7407 | // If we have a definitive size, we can use it. Otherwise we need to inspect | |||
7408 | // the operands to know the size. | |||
7409 | if (isFixedSize(MI)) { | |||
7410 | unsigned Size = DescSize; | |||
7411 | ||||
7412 | // If we hit the buggy offset, an extra nop will be inserted in MC so | |||
7413 | // estimate the worst case. | |||
7414 | if (MI.isBranch() && ST.hasOffset3fBug()) | |||
7415 | Size += 4; | |||
7416 | ||||
7417 | return Size; | |||
7418 | } | |||
7419 | ||||
7420 | // Instructions may have a 32-bit literal encoded after them. Check | |||
7421 | // operands that could ever be literals. | |||
7422 | if (isVALU(MI) || isSALU(MI)) { | |||
7423 | if (isDPP(MI)) | |||
7424 | return DescSize; | |||
7425 | bool HasLiteral = false; | |||
7426 | for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { | |||
7427 | if (isLiteralConstant(MI, I)) { | |||
7428 | HasLiteral = true; | |||
7429 | break; | |||
7430 | } | |||
7431 | } | |||
7432 | return HasLiteral ? DescSize + 4 : DescSize; | |||
7433 | } | |||
7434 | ||||
7435 | // Check whether we have extra NSA words. | |||
7436 | if (isMIMG(MI)) { | |||
7437 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); | |||
7438 | if (VAddr0Idx < 0) | |||
7439 | return 8; | |||
7440 | ||||
7441 | int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); | |||
7442 | return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); | |||
7443 | } | |||
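// Worked example of the NSA size computation above (illustrative): the first
// address lives in the base 8-byte MIMG encoding, and each extra NSA dword
// packs up to four more addresses, so N = RSrcIdx - VAddr0Idx address
// operands need (N + 2) / 4 extra dwords. E.g. five addresses give
// 8 + 4 * 1 = 12 bytes, six addresses give 8 + 4 * 2 = 16 bytes.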
7444 | ||||
7445 | switch (Opc) { | |||
7446 | case TargetOpcode::BUNDLE: | |||
7447 | return getInstBundleSize(MI); | |||
7448 | case TargetOpcode::INLINEASM: | |||
7449 | case TargetOpcode::INLINEASM_BR: { | |||
7450 | const MachineFunction *MF = MI.getParent()->getParent(); | |||
7451 | const char *AsmStr = MI.getOperand(0).getSymbolName(); | |||
7452 | return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); | |||
7453 | } | |||
7454 | default: | |||
7455 | if (MI.isMetaInstruction()) | |||
7456 | return 0; | |||
7457 | return DescSize; | |||
7458 | } | |||
7459 | } | |||
7460 | ||||
7461 | bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { | |||
7462 | if (!isFLAT(MI)) | |||
7463 | return false; | |||
7464 | ||||
7465 | if (MI.memoperands_empty()) | |||
7466 | return true; | |||
7467 | ||||
7468 | for (const MachineMemOperand *MMO : MI.memoperands()) { | |||
7469 | if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) | |||
7470 | return true; | |||
7471 | } | |||
7472 | return false; | |||
7473 | } | |||
7474 | ||||
7475 | bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { | |||
7476 | return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; | |||
7477 | } | |||
7478 | ||||
7479 | void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, | |||
7480 | MachineBasicBlock *IfEnd) const { | |||
7481 | MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); | |||
7482 | assert(TI != IfEntry->end()); | |||
7483 | ||||
7484 | MachineInstr *Branch = &(*TI); | |||
7485 | MachineFunction *MF = IfEntry->getParent(); | |||
7486 | MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); | |||
7487 | ||||
7488 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
7489 | Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7490 | MachineInstr *SIIF = | |||
7491 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) | |||
7492 | .add(Branch->getOperand(0)) | |||
7493 | .add(Branch->getOperand(1)); | |||
7494 | MachineInstr *SIEND = | |||
7495 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) | |||
7496 | .addReg(DstReg); | |||
7497 | ||||
7498 | IfEntry->erase(TI); | |||
7499 | IfEntry->insert(IfEntry->end(), SIIF); | |||
7500 | IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); | |||
7501 | } | |||
7502 | } | |||
7503 | ||||
7504 | void SIInstrInfo::convertNonUniformLoopRegion( | |||
7505 | MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { | |||
7506 | MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); | |||
7507 | // We expect 2 terminators, one conditional and one unconditional. | |||
7508 | assert(TI != LoopEnd->end()); | |||
7509 | ||||
7510 | MachineInstr *Branch = &(*TI); | |||
7511 | MachineFunction *MF = LoopEnd->getParent(); | |||
7512 | MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); | |||
7513 | ||||
7514 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
7515 | ||||
7516 | Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7517 | Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7518 | MachineInstrBuilder HeaderPHIBuilder = | |||
7519 | BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); | |||
7520 | for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) { | |||
7521 | if (PMBB == LoopEnd) { | |||
7522 | HeaderPHIBuilder.addReg(BackEdgeReg); | |||
7523 | } else { | |||
7524 | Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7525 | materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), | |||
7526 | ZeroReg, 0); | |||
7527 | HeaderPHIBuilder.addReg(ZeroReg); | |||
7528 | } | |||
7529 | HeaderPHIBuilder.addMBB(PMBB); | |||
7530 | } | |||
7531 | MachineInstr *HeaderPhi = HeaderPHIBuilder; | |||
7532 | MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), | |||
7533 | get(AMDGPU::SI_IF_BREAK), BackEdgeReg) | |||
7534 | .addReg(DstReg) | |||
7535 | .add(Branch->getOperand(0)); | |||
7536 | MachineInstr *SILOOP = | |||
7537 | BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) | |||
7538 | .addReg(BackEdgeReg) | |||
7539 | .addMBB(LoopEntry); | |||
7540 | ||||
7541 | LoopEntry->insert(LoopEntry->begin(), HeaderPhi); | |||
7542 | LoopEnd->erase(TI); | |||
7543 | LoopEnd->insert(LoopEnd->end(), SIIFBREAK); | |||
7544 | LoopEnd->insert(LoopEnd->end(), SILOOP); | |||
7545 | } | |||
7546 | } | |||
7547 | ||||
7548 | ArrayRef<std::pair<int, const char *>> | |||
7549 | SIInstrInfo::getSerializableTargetIndices() const { | |||
7550 | static const std::pair<int, const char *> TargetIndices[] = { | |||
7551 | {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, | |||
7552 | {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, | |||
7553 | {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, | |||
7554 | {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, | |||
7555 | {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; | |||
7556 | return makeArrayRef(TargetIndices); | |||
7557 | } | |||
7558 | ||||
7559 | /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The | |||
7560 | /// post-RA version of misched uses CreateTargetMIHazardRecognizer. | |||
7561 | ScheduleHazardRecognizer * | |||
7562 | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, | |||
7563 | const ScheduleDAG *DAG) const { | |||
7564 | return new GCNHazardRecognizer(DAG->MF); | |||
7565 | } | |||
7566 | ||||
7567 | /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer | |||
7568 | /// pass. | |||
7569 | ScheduleHazardRecognizer * | |||
7570 | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { | |||
7571 | return new GCNHazardRecognizer(MF); | |||
7572 | } | |||
7573 | ||||
7574 | // Called during: | |||
7575 | // - pre-RA scheduling and post-RA scheduling | |||
7576 | ScheduleHazardRecognizer * | |||
7577 | SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II, | |||
7578 | const ScheduleDAGMI *DAG) const { | |||
7579 | // Borrowed from Arm Target | |||
7580 | // We would like to restrict this hazard recognizer to only | |||
7581 | // post-RA scheduling; we can tell that we're post-RA because we don't | |||
7582 | // track VRegLiveness. | |||
7583 | if (!DAG->hasVRegLiveness()) | |||
7584 | return new GCNHazardRecognizer(DAG->MF); | |||
7585 | return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG); | |||
7586 | } | |||
7587 | ||||
7588 | std::pair<unsigned, unsigned> | |||
7589 | SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { | |||
7590 | return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); | |||
7591 | } | |||
7592 | ||||
7593 | ArrayRef<std::pair<unsigned, const char *>> | |||
7594 | SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { | |||
7595 | static const std::pair<unsigned, const char *> TargetFlags[] = { | |||
7596 | { MO_GOTPCREL, "amdgpu-gotprel" }, | |||
7597 | { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, | |||
7598 | { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, | |||
7599 | { MO_REL32_LO, "amdgpu-rel32-lo" }, | |||
7600 | { MO_REL32_HI, "amdgpu-rel32-hi" }, | |||
7601 | { MO_ABS32_LO, "amdgpu-abs32-lo" }, | |||
7602 | { MO_ABS32_HI, "amdgpu-abs32-hi" }, | |||
7603 | }; | |||
7604 | ||||
7605 | return makeArrayRef(TargetFlags); | |||
7606 | } | |||
7607 | ||||
7608 | ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> | |||
7609 | SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { | |||
7610 | static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = | |||
7611 | { | |||
7612 | {MONoClobber, "amdgpu-noclobber"}, | |||
7613 | }; | |||
7614 | ||||
7615 | return makeArrayRef(TargetFlags); | |||
7616 | } | |||
7617 | ||||
7618 | bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { | |||
7619 | return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && | |||
7620 | MI.modifiesRegister(AMDGPU::EXEC, &RI); | |||
7621 | } | |||
7622 | ||||
7623 | MachineInstrBuilder | |||
7624 | SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, | |||
7625 | MachineBasicBlock::iterator I, | |||
7626 | const DebugLoc &DL, | |||
7627 | Register DestReg) const { | |||
7628 | if (ST.hasAddNoCarry()) | |||
7629 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); | |||
7630 | ||||
7631 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
7632 | Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7633 | MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); | |||
7634 | ||||
7635 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) | |||
7636 | .addReg(UnusedCarry, RegState::Define | RegState::Dead); | |||
7637 | } | |||
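// Usage note (descriptive, not in the original source): callers append the
// two source operands to the returned builder; on subtargets without
// add-no-carry support the dead carry-out def has already been attached
// above, so the result is a complete V_ADD_CO_U32_e64 once the sources are
// added.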
7638 | ||||
7639 | MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, | |||
7640 | MachineBasicBlock::iterator I, | |||
7641 | const DebugLoc &DL, | |||
7642 | Register DestReg, | |||
7643 | RegScavenger &RS) const { | |||
7644 | if (ST.hasAddNoCarry()) | |||
7645 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); | |||
7646 | ||||
7647 | // If available, prefer to use vcc. | |||
7648 | Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) | |||
7649 | ? Register(RI.getVCC()) | |||
7650 | : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); | |||
7651 | ||||
7652 | // TODO: Users need to deal with this. | |||
7653 | if (!UnusedCarry.isValid()) | |||
7654 | return MachineInstrBuilder(); | |||
7655 | ||||
7656 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) | |||
7657 | .addReg(UnusedCarry, RegState::Define | RegState::Dead); | |||
7658 | } | |||
7659 | ||||
7660 | bool SIInstrInfo::isKillTerminator(unsigned Opcode) { | |||
7661 | switch (Opcode) { | |||
7662 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: | |||
7663 | case AMDGPU::SI_KILL_I1_TERMINATOR: | |||
7664 | return true; | |||
7665 | default: | |||
7666 | return false; | |||
7667 | } | |||
7668 | } | |||
7669 | ||||
7670 | const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { | |||
7671 | switch (Opcode) { | |||
7672 | case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: | |||
7673 | return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); | |||
7674 | case AMDGPU::SI_KILL_I1_PSEUDO: | |||
7675 | return get(AMDGPU::SI_KILL_I1_TERMINATOR); | |||
7676 | default: | |||
7677 | llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); | |||
7678 | } | |||
7679 | } | |||
7680 | ||||
7681 | void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { | |||
7682 | if (!ST.isWave32()) | |||
7683 | return; | |||
7684 | ||||
7685 | for (auto &Op : MI.implicit_operands()) { | |||
7686 | if (Op.isReg() && Op.getReg() == AMDGPU::VCC) | |||
7687 | Op.setReg(AMDGPU::VCC_LO); | |||
7688 | } | |||
7689 | } | |||
7690 | ||||
7691 | bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { | |||
7692 | if (!isSMRD(MI)) | |||
7693 | return false; | |||
7694 | ||||
7695 | // Check that it is using a buffer resource. | |||
7696 | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); | |||
7697 | if (Idx == -1) // e.g. s_memtime | |||
7698 | return false; | |||
7699 | ||||
7700 | const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; | |||
7701 | return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); | |||
7702 | } | |||
7703 | ||||
7704 | // Depending on the used address space and instructions, some immediate offsets | |||
7705 | // are allowed and some are not. | |||
7706 | // In general, flat instruction offsets can only be non-negative, global and | |||
7707 | // scratch instruction offsets can also be negative. | |||
7708 | // | |||
7709 | // There are several bugs related to these offsets: | |||
7710 | // On gfx10.1, flat instructions that go into the global address space cannot | |||
7711 | // use an offset. | |||
7712 | // | |||
7713 | // For scratch instructions, the address can be either an SGPR or a VGPR. | |||
7714 | // The following offsets can be used, depending on the architecture (x means | |||
7715 | // cannot be used): | |||
7716 | // +----------------------------+------+------+ | |||
7717 | // | Address-Mode | SGPR | VGPR | | |||
7718 | // +----------------------------+------+------+ | |||
7719 | // | gfx9 | | | | |||
7720 | // | negative, 4-aligned offset | x | ok | | |||
7721 | // | negative, unaligned offset | x | ok | | |||
7722 | // +----------------------------+------+------+ | |||
7723 | // | gfx10 | | | | |||
7724 | // | negative, 4-aligned offset | ok | ok | | |||
7725 | // | negative, unaligned offset | ok | x | | |||
7726 | // +----------------------------+------+------+ | |||
7727 | // | gfx10.3 | | | | |||
7728 | // | negative, 4-aligned offset | ok | ok | | |||
7729 | // | negative, unaligned offset | ok | ok | | |||
7730 | // +----------------------------+------+------+ | |||
7731 | // | |||
7732 | // This function ignores the addressing mode, so if an offset cannot be used in | |||
7733 | // one addressing mode, it is considered illegal. | |||
7734 | bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, | |||
7735 | uint64_t FlatVariant) const { | |||
7736 | // TODO: Should 0 be special cased? | |||
7737 | if (!ST.hasFlatInstOffsets()) | |||
7738 | return false; | |||
7739 | ||||
7740 | if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT && | |||
7741 | (AddrSpace == AMDGPUAS::FLAT_ADDRESS || | |||
7742 | AddrSpace == AMDGPUAS::GLOBAL_ADDRESS)) | |||
7743 | return false; | |||
7744 | ||||
7745 | bool Signed = FlatVariant != SIInstrFlags::FLAT; | |||
7746 | if (ST.hasNegativeScratchOffsetBug() && | |||
7747 | FlatVariant == SIInstrFlags::FlatScratch) | |||
7748 | Signed = false; | |||
7749 | if (ST.hasNegativeUnalignedScratchOffsetBug() && | |||
7750 | FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && | |||
7751 | (Offset % 4) != 0) { | |||
7752 | return false; | |||
7753 | } | |||
7754 | ||||
7755 | unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed); | |||
7756 | return Signed ? isIntN(N, Offset) : isUIntN(N, Offset); | |||
7757 | } | |||
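// Example (bit widths assumed for illustration; the real values come from
// AMDGPU::getNumFlatOffsetBits): with an unsigned 12-bit field, offsets
// 0..4095 are legal; with a signed 13-bit field, -4096..4095 are legal,
// subject to the hardware-bug checks above.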
7758 | ||||
7759 | // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not. | |||
7760 | std::pair<int64_t, int64_t> | |||
7761 | SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, | |||
7762 | uint64_t FlatVariant) const { | |||
7763 | int64_t RemainderOffset = COffsetVal; | |||
7764 | int64_t ImmField = 0; | |||
7765 | bool Signed = FlatVariant != SIInstrFlags::FLAT; | |||
7766 | if (ST.hasNegativeScratchOffsetBug() && | |||
7767 | FlatVariant == SIInstrFlags::FlatScratch) | |||
7768 | Signed = false; | |||
7769 | ||||
7770 | const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed); | |||
7771 | if (Signed) { | |||
7772 | // Use signed division by a power of two to truncate towards 0. | |||
7773 | int64_t D = 1LL << (NumBits - 1); | |||
7774 | RemainderOffset = (COffsetVal / D) * D; | |||
7775 | ImmField = COffsetVal - RemainderOffset; | |||
7776 | ||||
7777 | if (ST.hasNegativeUnalignedScratchOffsetBug() && | |||
7778 | FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 && | |||
7779 | (ImmField % 4) != 0) { | |||
7780 | // Make ImmField a multiple of 4 | |||
7781 | RemainderOffset += ImmField % 4; | |||
7782 | ImmField -= ImmField % 4; | |||
7783 | } | |||
7784 | } else if (COffsetVal >= 0) { | |||
7785 | ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); | |||
7786 | RemainderOffset = COffsetVal - ImmField; | |||
7787 | } | |||
7788 | ||||
7789 | assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant)); | |||
7790 | assert(RemainderOffset + ImmField == COffsetVal); | |||
7791 | return {ImmField, RemainderOffset}; | |||
7792 | } | |||
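// Worked example (field width assumed for illustration): with a signed
// 13-bit immediate field, D = 1 << 12 = 4096. Splitting COffsetVal = 9000
// yields RemainderOffset = (9000 / 4096) * 4096 = 8192 and ImmField = 808;
// the caller materializes 8192 into the address and folds 808 into the
// instruction's offset field.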
7793 | ||||
7794 | // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td | |||
7795 | enum SIEncodingFamily { | |||
7796 | SI = 0, | |||
7797 | VI = 1, | |||
7798 | SDWA = 2, | |||
7799 | SDWA9 = 3, | |||
7800 | GFX80 = 4, | |||
7801 | GFX9 = 5, | |||
7802 | GFX10 = 6, | |||
7803 | SDWA10 = 7, | |||
7804 | GFX90A = 8, | |||
7805 | GFX940 = 9 | |||
7806 | }; | |||
7807 | ||||
7808 | static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { | |||
7809 | switch (ST.getGeneration()) { | |||
7810 | default: | |||
7811 | break; | |||
7812 | case AMDGPUSubtarget::SOUTHERN_ISLANDS: | |||
7813 | case AMDGPUSubtarget::SEA_ISLANDS: | |||
7814 | return SIEncodingFamily::SI; | |||
7815 | case AMDGPUSubtarget::VOLCANIC_ISLANDS: | |||
7816 | case AMDGPUSubtarget::GFX9: | |||
7817 | return SIEncodingFamily::VI; | |||
7818 | case AMDGPUSubtarget::GFX10: | |||
7819 | return SIEncodingFamily::GFX10; | |||
7820 | } | |||
7821 | llvm_unreachable("Unknown subtarget generation!"); | |||
7822 | } | |||
7823 | ||||
7824 | bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { | |||
7825 | switch(MCOp) { | |||
7826 | // These opcodes use indirect register addressing so | |||
7827 | // they need special handling by codegen (currently missing). | |||
7828 | // Therefore it is too risky to allow these opcodes | |||
7829 | // to be selected by dpp combiner or sdwa peepholer. | |||
7830 | case AMDGPU::V_MOVRELS_B32_dpp_gfx10: | |||
7831 | case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: | |||
7832 | case AMDGPU::V_MOVRELD_B32_dpp_gfx10: | |||
7833 | case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: | |||
7834 | case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: | |||
7835 | case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: | |||
7836 | case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: | |||
7837 | case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: | |||
7838 | return true; | |||
7839 | default: | |||
7840 | return false; | |||
7841 | } | |||
7842 | } | |||
7843 | ||||
7844 | int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { | |||
7845 | SIEncodingFamily Gen = subtargetEncodingFamily(ST); | |||
7846 | ||||
7847 | if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && | |||
7848 | ST.getGeneration() == AMDGPUSubtarget::GFX9) | |||
7849 | Gen = SIEncodingFamily::GFX9; | |||
7850 | ||||
7851 | // Adjust the encoding family to GFX80 for D16 buffer instructions when the | |||
7852 | // subtarget has UnpackedD16VMem feature. | |||
7853 | // TODO: remove this when we discard GFX80 encoding. | |||
7854 | if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) | |||
7855 | Gen = SIEncodingFamily::GFX80; | |||
7856 | ||||
7857 | if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { | |||
7858 | switch (ST.getGeneration()) { | |||
7859 | default: | |||
7860 | Gen = SIEncodingFamily::SDWA; | |||
7861 | break; | |||
7862 | case AMDGPUSubtarget::GFX9: | |||
7863 | Gen = SIEncodingFamily::SDWA9; | |||
7864 | break; | |||
7865 | case AMDGPUSubtarget::GFX10: | |||
7866 | Gen = SIEncodingFamily::SDWA10; | |||
7867 | break; | |||
7868 | } | |||
7869 | } | |||
7870 | ||||
7871 | if (isMAI(Opcode)) { | |||
7872 | int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode); | |||
7873 | if (MFMAOp != -1) | |||
7874 | Opcode = MFMAOp; | |||
7875 | } | |||
7876 | ||||
7877 | int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); | |||
7878 | ||||
7879 | // -1 means that Opcode is already a native instruction. | |||
7880 | if (MCOp == -1) | |||
7881 | return Opcode; | |||
7882 | ||||
7883 | if (ST.hasGFX90AInsts()) { | |||
7884 | uint16_t NMCOp = (uint16_t)-1; | |||
7885 | if (ST.hasGFX940Insts()) | |||
7886 | NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940); | |||
7887 | if (NMCOp == (uint16_t)-1) | |||
7888 | NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); | |||
7889 | if (NMCOp == (uint16_t)-1) | |||
7890 | NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); | |||
7891 | if (NMCOp != (uint16_t)-1) | |||
7892 | MCOp = NMCOp; | |||
7893 | } | |||
7894 | ||||
7895 | // (uint16_t)-1 means that Opcode is a pseudo instruction that has | |||
7896 | // no encoding in the given subtarget generation. | |||
7897 | if (MCOp == (uint16_t)-1) | |||
7898 | return -1; | |||
7899 | ||||
7900 | if (isAsmOnlyOpcode(MCOp)) | |||
7901 | return -1; | |||
7902 | ||||
7903 | return MCOp; | |||
7904 | } | |||
7905 | ||||
7906 | static | |||
7907 | TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { | |||
7908 | assert(RegOpnd.isReg()); | |||
7909 | return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : | |||
7910 | getRegSubRegPair(RegOpnd); | |||
7911 | } | |||
7912 | ||||
7913 | TargetInstrInfo::RegSubRegPair | |||
7914 | llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { | |||
7915 | assert(MI.isRegSequence()); | |||
7916 | for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) | |||
7917 | if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { | |||
7918 | auto &RegOp = MI.getOperand(1 + 2 * I); | |||
7919 | return getRegOrUndef(RegOp); | |||
7920 | } | |||
7921 | return TargetInstrInfo::RegSubRegPair(); | |||
7922 | } | |||
7923 | ||||
7924 | // Try to find the definition of reg:subreg in subreg-manipulation pseudos | |||
7925 | // Following a subreg of reg:subreg isn't supported | |||
7926 | static bool followSubRegDef(MachineInstr &MI, | |||
7927 | TargetInstrInfo::RegSubRegPair &RSR) { | |||
7928 | if (!RSR.SubReg) | |||
7929 | return false; | |||
7930 | switch (MI.getOpcode()) { | |||
7931 | default: break; | |||
7932 | case AMDGPU::REG_SEQUENCE: | |||
7933 | RSR = getRegSequenceSubReg(MI, RSR.SubReg); | |||
7934 | return true; | |||
7935 | // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg | |||
7936 | case AMDGPU::INSERT_SUBREG: | |||
7937 | if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) | |||
7938 | // inserted the subreg we're looking for | |||
7939 | RSR = getRegOrUndef(MI.getOperand(2)); | |||
7940 | else { // the subreg in the rest of the reg | |||
7941 | auto R1 = getRegOrUndef(MI.getOperand(1)); | |||
7942 | if (R1.SubReg) // subreg of subreg isn't supported | |||
7943 | return false; | |||
7944 | RSR.Reg = R1.Reg; | |||
7945 | } | |||
7946 | return true; | |||
7947 | } | |||
7948 | return false; | |||
7949 | } | |||
7950 | ||||
7951 | MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, | |||
7952 | MachineRegisterInfo &MRI) { | |||
7953 | assert(MRI.isSSA()); | |||
7954 | if (!P.Reg.isVirtual()) | |||
7955 | return nullptr; | |||
7956 | ||||
7957 | auto RSR = P; | |||
7958 | auto *DefInst = MRI.getVRegDef(RSR.Reg); | |||
7959 | while (auto *MI = DefInst) { | |||
7960 | DefInst = nullptr; | |||
7961 | switch (MI->getOpcode()) { | |||
7962 | case AMDGPU::COPY: | |||
7963 | case AMDGPU::V_MOV_B32_e32: { | |||
7964 | auto &Op1 = MI->getOperand(1); | |||
7965 | if (Op1.isReg() && Op1.getReg().isVirtual()) { | |||
7966 | if (Op1.isUndef()) | |||
7967 | return nullptr; | |||
7968 | RSR = getRegSubRegPair(Op1); | |||
7969 | DefInst = MRI.getVRegDef(RSR.Reg); | |||
7970 | } | |||
7971 | break; | |||
7972 | } | |||
7973 | default: | |||
7974 | if (followSubRegDef(*MI, RSR)) { | |||
7975 | if (!RSR.Reg) | |||
7976 | return nullptr; | |||
7977 | DefInst = MRI.getVRegDef(RSR.Reg); | |||
7978 | } | |||
7979 | } | |||
7980 | if (!DefInst) | |||
7981 | return MI; | |||
7982 | } | |||
7983 | return nullptr; | |||
7984 | } | |||
7985 | ||||
7986 | bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, | |||
7987 | Register VReg, | |||
7988 | const MachineInstr &DefMI, | |||
7989 | const MachineInstr &UseMI) { | |||
7990 | assert(MRI.isSSA() && "Must be run on SSA"); | |||
7991 | ||||
7992 | auto *TRI = MRI.getTargetRegisterInfo(); | |||
7993 | auto *DefBB = DefMI.getParent(); | |||
7994 | ||||
7995 | // Don't bother searching between blocks, although it is possible this block | |||
7996 | // doesn't modify exec. | |||
7997 | if (UseMI.getParent() != DefBB) | |||
7998 | return true; | |||
7999 | ||||
8000 | const int MaxInstScan = 20; | |||
8001 | int NumInst = 0; | |||
8002 | ||||
8003 | // Stop scan at the use. | |||
8004 | auto E = UseMI.getIterator(); | |||
8005 | for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { | |||
8006 | if (I->isDebugInstr()) | |||
8007 | continue; | |||
8008 | ||||
8009 | if (++NumInst > MaxInstScan) | |||
8010 | return true; | |||
8011 | ||||
8012 | if (I->modifiesRegister(AMDGPU::EXEC, TRI)) | |||
8013 | return true; | |||
8014 | } | |||
8015 | ||||
8016 | return false; | |||
8017 | } | |||
8018 | ||||
8019 | bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, | |||
8020 | Register VReg, | |||
8021 | const MachineInstr &DefMI) { | |||
8022 | assert(MRI.isSSA() && "Must be run on SSA"); | |||
8023 | ||||
8024 | auto *TRI = MRI.getTargetRegisterInfo(); | |||
8025 | auto *DefBB = DefMI.getParent(); | |||
8026 | ||||
8027 | const int MaxUseScan = 10; | |||
8028 | int NumUse = 0; | |||
8029 | ||||
8030 | for (auto &Use : MRI.use_nodbg_operands(VReg)) { | |||
8031 | auto &UseInst = *Use.getParent(); | |||
8032 | // Don't bother searching between blocks, although it is possible this block | |||
8033 | // doesn't modify exec. | |||
8034 | if (UseInst.getParent() != DefBB) | |||
8035 | return true; | |||
8036 | ||||
8037 | if (++NumUse > MaxUseScan) | |||
8038 | return true; | |||
8039 | } | |||
8040 | ||||
8041 | if (NumUse == 0) | |||
8042 | return false; | |||
8043 | ||||
8044 | const int MaxInstScan = 20; | |||
8045 | int NumInst = 0; | |||
8046 | ||||
8047 | // Stop scan when we have seen all the uses. | |||
8048 | for (auto I = std::next(DefMI.getIterator()); ; ++I) { | |||
8049 | assert(I != DefBB->end()); | |||
8050 | ||||
8051 | if (I->isDebugInstr()) | |||
8052 | continue; | |||
8053 | ||||
8054 | if (++NumInst > MaxInstScan) | |||
8055 | return true; | |||
8056 | ||||
8057 | for (const MachineOperand &Op : I->operands()) { | |||
8058 | // We don't check reg masks here as they're used only on calls: | |||
8059 | // 1. EXEC is only considered const within one BB | |||
8060 | // 2. Call should be a terminator instruction if present in a BB | |||
8061 | ||||
8062 | if (!Op.isReg()) | |||
8063 | continue; | |||
8064 | ||||
8065 | Register Reg = Op.getReg(); | |||
8066 | if (Op.isUse()) { | |||
8067 | if (Reg == VReg && --NumUse == 0) | |||
8068 | return false; | |||
8069 | } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC)) | |||
8070 | return true; | |||
8071 | } | |||
8072 | } | |||
8073 | } | |||
8074 | ||||
8075 | MachineInstr *SIInstrInfo::createPHIDestinationCopy( | |||
8076 | MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, | |||
8077 | const DebugLoc &DL, Register Src, Register Dst) const { | |||
8078 | auto Cur = MBB.begin(); | |||
8079 | if (Cur != MBB.end()) | |||
8080 | do { | |||
8081 | if (!Cur->isPHI() && Cur->readsRegister(Dst)) | |||
8082 | return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); | |||
8083 | ++Cur; | |||
8084 | } while (Cur != MBB.end() && Cur != LastPHIIt); | |||
8085 | ||||
8086 | return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, | |||
8087 | Dst); | |||
8088 | } | |||
8089 | ||||
8090 | MachineInstr *SIInstrInfo::createPHISourceCopy( | |||
8091 | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, | |||
8092 | const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { | |||
8093 | if (InsPt != MBB.end() && | |||
8094 | (InsPt->getOpcode() == AMDGPU::SI_IF || | |||
8095 | InsPt->getOpcode() == AMDGPU::SI_ELSE || | |||
8096 | InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && | |||
8097 | InsPt->definesRegister(Src)) { | |||
8098 | InsPt++; | |||
8099 | return BuildMI(MBB, InsPt, DL, | |||
8100 | get(ST.isWave32() ? AMDGPU::S_MOV_B32_term | |||
8101 | : AMDGPU::S_MOV_B64_term), | |||
8102 | Dst) | |||
8103 | .addReg(Src, 0, SrcSubReg) | |||
8104 | .addReg(AMDGPU::EXEC, RegState::Implicit); | |||
8105 | } | |||
8106 | return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, | |||
8107 | Dst); | |||
8108 | } | |||
8109 | ||||
8110 | bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } | |||
8111 | ||||
8112 | MachineInstr *SIInstrInfo::foldMemoryOperandImpl( | |||
8113 | MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, | |||
8114 | MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, | |||
8115 | VirtRegMap *VRM) const { | |||
8116 | // This is a bit of a hack (copied from AArch64). Consider this instruction: | |||
8117 | // | |||
8118 | // %0:sreg_32 = COPY $m0 | |||
8119 | // | |||
8120 | // We explicitly chose SReg_32 for the virtual register so such a copy might | |||
8121 | // be eliminated by RegisterCoalescer. However, that may not be possible, and | |||
8122 | // %0 may even spill. We can't spill $m0 normally (it would require copying to | |||
8123 | // a numbered SGPR anyway), and since it is in the SReg_32 register class, | |||
8124 | // TargetInstrInfo::foldMemoryOperand() is going to try. | |||
8125 | // A similar issue also exists with spilling and reloading $exec registers. | |||
8126 | // | |||
8127 | // To prevent that, constrain the %0 register class here. | |||
8128 | if (MI.isFullCopy()) { | |||
8129 | Register DstReg = MI.getOperand(0).getReg(); | |||
8130 | Register SrcReg = MI.getOperand(1).getReg(); | |||
8131 | if ((DstReg.isVirtual() || SrcReg.isVirtual()) && | |||
8132 | (DstReg.isVirtual() != SrcReg.isVirtual())) { | |||
8133 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
8134 | Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg; | |||
8135 | const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); | |||
8136 | if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { | |||
8137 | MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
8138 | return nullptr; | |||
8139 | } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { | |||
8140 | MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); | |||
8141 | return nullptr; | |||
8142 | } | |||
8143 | } | |||
8144 | } | |||
8145 | ||||
8146 | return nullptr; | |||
8147 | } | |||
8148 | ||||
8149 | unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, | |||
8150 | const MachineInstr &MI, | |||
8151 | unsigned *PredCost) const { | |||
8152 | if (MI.isBundle()) { | |||
8153 | MachineBasicBlock::const_instr_iterator I(MI.getIterator()); | |||
8154 | MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); | |||
8155 | unsigned Lat = 0, Count = 0; | |||
8156 | for (++I; I != E && I->isBundledWithPred(); ++I) { | |||
8157 | ++Count; | |||
8158 | Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); | |||
8159 | } | |||
8160 | return Lat + Count - 1; | |||
8161 | } | |||
8162 | ||||
8163 | return SchedModel.computeInstrLatency(&MI); | |||
8164 | } | |||
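// Example (illustrative): a BUNDLE containing two instructions with
// latencies 4 and 2 reports max(4, 2) + 2 - 1 = 5, i.e. the longest member
// plus one cycle per additional bundled instruction.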
8165 | ||||
8166 | unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) { | |||
8167 | switch (MF.getFunction().getCallingConv()) { | |||
8168 | case CallingConv::AMDGPU_PS: | |||
8169 | return 1; | |||
8170 | case CallingConv::AMDGPU_VS: | |||
8171 | return 2; | |||
8172 | case CallingConv::AMDGPU_GS: | |||
8173 | return 3; | |||
8174 | case CallingConv::AMDGPU_HS: | |||
8175 | case CallingConv::AMDGPU_LS: | |||
8176 | case CallingConv::AMDGPU_ES: | |||
8177 | report_fatal_error("ds_ordered_count unsupported for this calling conv"); | |||
8178 | case CallingConv::AMDGPU_CS: | |||
8179 | case CallingConv::AMDGPU_KERNEL: | |||
8180 | case CallingConv::C: | |||
8181 | case CallingConv::Fast: | |||
8182 | default: | |||
8183 | // Assume other calling conventions are various compute callable functions | |||
8184 | return 0; | |||
8185 | } | |||
8186 | } | |||
8187 | ||||
8188 | bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, | |||
8189 | Register &SrcReg2, int64_t &CmpMask, | |||
8190 | int64_t &CmpValue) const { | |||
8191 | if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg()) | |||
8192 | return false; | |||
8193 | ||||
8194 | switch (MI.getOpcode()) { | |||
8195 | default: | |||
8196 | break; | |||
8197 | case AMDGPU::S_CMP_EQ_U32: | |||
8198 | case AMDGPU::S_CMP_EQ_I32: | |||
8199 | case AMDGPU::S_CMP_LG_U32: | |||
8200 | case AMDGPU::S_CMP_LG_I32: | |||
8201 | case AMDGPU::S_CMP_LT_U32: | |||
8202 | case AMDGPU::S_CMP_LT_I32: | |||
8203 | case AMDGPU::S_CMP_GT_U32: | |||
8204 | case AMDGPU::S_CMP_GT_I32: | |||
8205 | case AMDGPU::S_CMP_LE_U32: | |||
8206 | case AMDGPU::S_CMP_LE_I32: | |||
8207 | case AMDGPU::S_CMP_GE_U32: | |||
8208 | case AMDGPU::S_CMP_GE_I32: | |||
8209 | case AMDGPU::S_CMP_EQ_U64: | |||
8210 | case AMDGPU::S_CMP_LG_U64: | |||
8211 | SrcReg = MI.getOperand(0).getReg(); | |||
8212 | if (MI.getOperand(1).isReg()) { | |||
8213 | if (MI.getOperand(1).getSubReg()) | |||
8214 | return false; | |||
8215 | SrcReg2 = MI.getOperand(1).getReg(); | |||
8216 | CmpValue = 0; | |||
8217 | } else if (MI.getOperand(1).isImm()) { | |||
8218 | SrcReg2 = Register(); | |||
8219 | CmpValue = MI.getOperand(1).getImm(); | |||
8220 | } else { | |||
8221 | return false; | |||
8222 | } | |||
8223 | CmpMask = ~0; | |||
8224 | return true; | |||
8225 | case AMDGPU::S_CMPK_EQ_U32: | |||
8226 | case AMDGPU::S_CMPK_EQ_I32: | |||
8227 | case AMDGPU::S_CMPK_LG_U32: | |||
8228 | case AMDGPU::S_CMPK_LG_I32: | |||
8229 | case AMDGPU::S_CMPK_LT_U32: | |||
8230 | case AMDGPU::S_CMPK_LT_I32: | |||
8231 | case AMDGPU::S_CMPK_GT_U32: | |||
8232 | case AMDGPU::S_CMPK_GT_I32: | |||
8233 | case AMDGPU::S_CMPK_LE_U32: | |||
8234 | case AMDGPU::S_CMPK_LE_I32: | |||
8235 | case AMDGPU::S_CMPK_GE_U32: | |||
8236 | case AMDGPU::S_CMPK_GE_I32: | |||
8237 | SrcReg = MI.getOperand(0).getReg(); | |||
8238 | SrcReg2 = Register(); | |||
8239 | CmpValue = MI.getOperand(1).getImm(); | |||
8240 | CmpMask = ~0; | |||
8241 | return true; | |||
8242 | } | |||
8243 | ||||
8244 | return false; | |||
8245 | } | |||
8246 | ||||
8247 | bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, | |||
8248 | Register SrcReg2, int64_t CmpMask, | |||
8249 | int64_t CmpValue, | |||
8250 | const MachineRegisterInfo *MRI) const { | |||
8251 | if (!SrcReg || SrcReg.isPhysical()) | |||
8252 | return false; | |||
8253 | ||||
8254 | if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) | |||
8255 | return false; | |||
8256 | ||||
8257 | const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, | |||
8258 | this](int64_t ExpectedValue, unsigned SrcSize, | |||
8259 | bool IsReversible, bool IsSigned) -> bool { | |||
8260 | // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n | |||
8261 | // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n | |||
8262 | // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n | |||
8263 | // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n | |||
8264 | // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n | |||
8265 | // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n | |||
8266 | // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n | |||
8267 | // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n | |||
8268 | // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n | |||
8269 | // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n | |||
8270 | // | |||
8271 | // Signed ge/gt are not used for the sign bit. | |||
8272 | // | |||
8273 | // If result of the AND is unused except in the compare: | |||
8274 | // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n | |||
8275 | // | |||
8276 | // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n | |||
8277 | // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n | |||
8278 | // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n | |||
8279 | // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n | |||
8280 | // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n | |||
8281 | // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n | |||
8282 | ||||
8283 | MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); | |||
8284 | if (!Def || Def->getParent() != CmpInstr.getParent()) | |||
8285 | return false; | |||
8286 | ||||
8287 | if (Def->getOpcode() != AMDGPU::S_AND_B32 && | |||
8288 | Def->getOpcode() != AMDGPU::S_AND_B64) | |||
8289 | return false; | |||
8290 | ||||
8291 | int64_t Mask; | |||
8292 | const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool { | |||
8293 | if (MO->isImm()) | |||
8294 | Mask = MO->getImm(); | |||
8295 | else if (!getFoldableImm(MO, Mask)) | |||
8296 | return false; | |||
8297 | Mask &= maxUIntN(SrcSize); | |||
8298 | return isPowerOf2_64(Mask); | |||
8299 | }; | |||
8300 | ||||
8301 | MachineOperand *SrcOp = &Def->getOperand(1); | |||
8302 | if (isMask(SrcOp)) | |||
8303 | SrcOp = &Def->getOperand(2); | |||
8304 | else if (isMask(&Def->getOperand(2))) | |||
8305 | SrcOp = &Def->getOperand(1); | |||
8306 | else | |||
8307 | return false; | |||
8308 | ||||
8309 | unsigned BitNo = countTrailingZeros((uint64_t)Mask); | |||
8310 | if (IsSigned && BitNo == SrcSize - 1) | |||
8311 | return false; | |||
8312 | ||||
8313 | ExpectedValue <<= BitNo; | |||
8314 | ||||
8315 | bool IsReversedCC = false; | |||
8316 | if (CmpValue != ExpectedValue) { | |||
8317 | if (!IsReversible) | |||
8318 | return false; | |||
8319 | IsReversedCC = CmpValue == (ExpectedValue ^ Mask); | |||
8320 | if (!IsReversedCC) | |||
8321 | return false; | |||
8322 | } | |||
8323 | ||||
8324 | Register DefReg = Def->getOperand(0).getReg(); | |||
8325 | if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) | |||
8326 | return false; | |||
8327 | ||||
8328 | for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); | |||
8329 | I != E; ++I) { | |||
8330 | if (I->modifiesRegister(AMDGPU::SCC, &RI) || | |||
8331 | I->killsRegister(AMDGPU::SCC, &RI)) | |||
8332 | return false; | |||
8333 | } | |||
8334 | ||||
8335 | MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC); | |||
8336 | SccDef->setIsDead(false); | |||
8337 | CmpInstr.eraseFromParent(); | |||
8338 | ||||
8339 | if (!MRI->use_nodbg_empty(DefReg)) { | |||
8340 | assert(!IsReversedCC); | |||
8341 | return true; | |||
8342 | } | |||
8343 | ||||
8344 | // Replace AND with unused result with a S_BITCMP. | |||
8345 | MachineBasicBlock *MBB = Def->getParent(); | |||
8346 | ||||
8347 | unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32 | |||
8348 | : AMDGPU::S_BITCMP1_B32 | |||
8349 | : IsReversedCC ? AMDGPU::S_BITCMP0_B64 | |||
8350 | : AMDGPU::S_BITCMP1_B64; | |||
8351 | ||||
8352 | BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc)) | |||
8353 | .add(*SrcOp) | |||
8354 | .addImm(BitNo); | |||
8355 | Def->eraseFromParent(); | |||
8356 | ||||
8357 | return true; | |||
8358 | }; | |||
8359 | ||||
8360 | switch (CmpInstr.getOpcode()) { | |||
8361 | default: | |||
8362 | break; | |||
8363 | case AMDGPU::S_CMP_EQ_U32: | |||
8364 | case AMDGPU::S_CMP_EQ_I32: | |||
8365 | case AMDGPU::S_CMPK_EQ_U32: | |||
8366 | case AMDGPU::S_CMPK_EQ_I32: | |||
8367 | return optimizeCmpAnd(1, 32, true, false); | |||
8368 | case AMDGPU::S_CMP_GE_U32: | |||
8369 | case AMDGPU::S_CMPK_GE_U32: | |||
8370 | return optimizeCmpAnd(1, 32, false, false); | |||
8371 | case AMDGPU::S_CMP_GE_I32: | |||
8372 | case AMDGPU::S_CMPK_GE_I32: | |||
8373 | return optimizeCmpAnd(1, 32, false, true); | |||
8374 | case AMDGPU::S_CMP_EQ_U64: | |||
8375 | return optimizeCmpAnd(1, 64, true, false); | |||
8376 | case AMDGPU::S_CMP_LG_U32: | |||
8377 | case AMDGPU::S_CMP_LG_I32: | |||
8378 | case AMDGPU::S_CMPK_LG_U32: | |||
8379 | case AMDGPU::S_CMPK_LG_I32: | |||
8380 | return optimizeCmpAnd(0, 32, true, false); | |||
8381 | case AMDGPU::S_CMP_GT_U32: | |||
8382 | case AMDGPU::S_CMPK_GT_U32: | |||
8383 | return optimizeCmpAnd(0, 32, false, false); | |||
8384 | case AMDGPU::S_CMP_GT_I32: | |||
8385 | case AMDGPU::S_CMPK_GT_I32: | |||
8386 | return optimizeCmpAnd(0, 32, false, true); | |||
8387 | case AMDGPU::S_CMP_LG_U64: | |||
8388 | return optimizeCmpAnd(0, 64, true, false); | |||
8389 | } | |||
8390 | ||||
8391 | return false; | |||
8392 | } |
1 | //===-- llvm/CodeGen/Register.h ---------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_CODEGEN_REGISTER_H |
10 | #define LLVM_CODEGEN_REGISTER_H |
11 | |
12 | #include "llvm/MC/MCRegister.h" |
13 | #include <cassert> |
14 | |
15 | namespace llvm { |
16 | |
17 | /// Wrapper class representing virtual and physical registers. Should be passed |
18 | /// by value. |
19 | class Register { |
20 | unsigned Reg; |
21 | |
22 | public: |
23 | constexpr Register(unsigned Val = 0): Reg(Val) {} |
24 | constexpr Register(MCRegister Val): Reg(Val) {} |
25 | |
26 | // Register numbers can represent physical registers, virtual registers, and |
27 | // sometimes stack slots. The unsigned values are divided into these ranges: |
28 | // |
29 | // 0 Not a register, can be used as a sentinel. |
30 | // [1;2^30) Physical registers assigned by TableGen. |
31 | // [2^30;2^31) Stack slots. (Rarely used.) |
32 | // [2^31;2^32) Virtual registers assigned by MachineRegisterInfo. |
33 | // |
34 | // Further sentinels can be allocated from the small negative integers. |
35 | // DenseMapInfo<unsigned> uses -1u and -2u. |
36 | static_assert(std::numeric_limits<decltype(Reg)>::max() >= 0xFFFFFFFF, |
37 | "Reg isn't large enough to hold full range."); |
38 | |
39 | /// isStackSlot - Sometimes it is useful to be able to store a non-negative
40 | /// frame index in a variable that normally holds a register. isStackSlot() |
41 | /// returns true if Reg is in the range used for stack slots. |
42 | /// |
43 | /// FIXME: remove in favor of member. |
44 | static bool isStackSlot(unsigned Reg) { |
45 | return MCRegister::isStackSlot(Reg); |
46 | } |
47 | |
48 | /// Return true if this is a stack slot. |
49 | bool isStack() const { return MCRegister::isStackSlot(Reg); } |
50 | |
51 | /// Compute the frame index from a register value representing a stack slot. |
52 | static int stackSlot2Index(Register Reg) { |
53 | assert(Reg.isStack() && "Not a stack slot");
54 | return int(Reg - MCRegister::FirstStackSlot); |
55 | } |
56 | |
57 | /// Convert a non-negative frame index to a stack slot register value. |
58 | static Register index2StackSlot(int FI) { |
59 | assert(FI >= 0 && "Cannot hold a negative frame index.");
60 | return Register(FI + MCRegister::FirstStackSlot); |
61 | } |
62 | |
63 | /// Return true if the specified register number is in |
64 | /// the physical register namespace. |
65 | static bool isPhysicalRegister(unsigned Reg) { |
66 | return MCRegister::isPhysicalRegister(Reg); |
67 | } |
68 | |
69 | /// Return true if the specified register number is in |
70 | /// the virtual register namespace. |
71 | static bool isVirtualRegister(unsigned Reg) { |
72 | return Reg & MCRegister::VirtualRegFlag && !isStackSlot(Reg); |
73 | } |
74 | |
75 | /// Convert a virtual register number to a 0-based index. |
76 | /// The first virtual register in a function will get the index 0. |
77 | static unsigned virtReg2Index(Register Reg) { |
78 | assert(isVirtualRegister(Reg) && "Not a virtual register");
79 | return Reg & ~MCRegister::VirtualRegFlag; |
80 | } |
81 | |
82 | /// Convert a 0-based index to a virtual register number. |
83 | /// This is the inverse operation of virtReg2Index() above.
84 | static Register index2VirtReg(unsigned Index) { |
85 | assert(Index < (1u << 31) && "Index too large for virtual register range.");
86 | return Index | MCRegister::VirtualRegFlag; |
87 | } |
88 | |
89 | /// Return true if the specified register number is in the virtual register |
90 | /// namespace. |
91 | bool isVirtual() const { |
92 | return isVirtualRegister(Reg); |
93 | } |
94 | |
95 | /// Return true if the specified register number is in the physical register |
96 | /// namespace. |
97 | bool isPhysical() const { |
98 | return isPhysicalRegister(Reg); |
99 | } |
100 | |
101 | /// Convert a virtual register number to a 0-based index. The first virtual |
102 | /// register in a function will get the index 0. |
103 | unsigned virtRegIndex() const { |
104 | return virtReg2Index(Reg); |
105 | } |
106 | |
107 | constexpr operator unsigned() const { |
108 | return Reg; |
109 | } |
110 | |
111 | unsigned id() const { return Reg; } |
112 | |
113 | operator MCRegister() const { |
114 | return MCRegister(Reg); |
115 | } |
116 | |
117 | /// Utility to check-convert this value to a MCRegister. The caller is |
118 | /// expected to have already validated that this Register is, indeed, |
119 | /// physical. |
120 | MCRegister asMCReg() const { |
121 | assert(Reg == MCRegister::NoRegister ||
122 |        MCRegister::isPhysicalRegister(Reg));
123 | return MCRegister(Reg); |
124 | } |
125 | |
126 | bool isValid() const { return Reg != MCRegister::NoRegister; } |
127 | |
128 | /// Comparisons between register objects |
129 | bool operator==(const Register &Other) const { return Reg == Other.Reg; } |
130 | bool operator!=(const Register &Other) const { return Reg != Other.Reg; } |
131 | bool operator==(const MCRegister &Other) const { return Reg == Other.id(); } |
132 | bool operator!=(const MCRegister &Other) const { return Reg != Other.id(); } |
133 | |
134 | /// Comparisons against register constants. E.g. |
135 | /// * R == AArch64::WZR |
136 | /// * R == 0 |
137 | /// * R == VirtRegMap::NO_PHYS_REG |
138 | bool operator==(unsigned Other) const { return Reg == Other; } |
139 | bool operator!=(unsigned Other) const { return Reg != Other; } |
140 | bool operator==(int Other) const { return Reg == unsigned(Other); } |
141 | bool operator!=(int Other) const { return Reg != unsigned(Other); } |
142 | // MSVC requires that we explicitly declare these two as well. |
143 | bool operator==(MCPhysReg Other) const { return Reg == unsigned(Other); } |
144 | bool operator!=(MCPhysReg Other) const { return Reg != unsigned(Other); } |
145 | }; |
146 | |
147 | // Provide DenseMapInfo for Register |
148 | template<> struct DenseMapInfo<Register> { |
149 | static inline unsigned getEmptyKey() { |
150 | return DenseMapInfo<unsigned>::getEmptyKey(); |
151 | } |
152 | static inline unsigned getTombstoneKey() { |
153 | return DenseMapInfo<unsigned>::getTombstoneKey(); |
154 | } |
155 | static unsigned getHashValue(const Register &Val) { |
156 | return DenseMapInfo<unsigned>::getHashValue(Val.id()); |
157 | } |
158 | static bool isEqual(const Register &LHS, const Register &RHS) { |
159 | return DenseMapInfo<unsigned>::isEqual(LHS.id(), RHS.id()); |
160 | } |
161 | }; |
162 | |
163 | } |
164 | |
165 | #endif // LLVM_CODEGEN_REGISTER_H |
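
As a small usage sketch of the interface above (a hedged example: it assumes an LLVM build is available so that llvm/CodeGen/Register.h and llvm/ADT/DenseMap.h can be included and LLVMSupport linked; the names V, NoReg and UseCount are illustrative only):

// Exercises the virtual-register encoding and the DenseMapInfo<Register>
// specialization documented above.
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/Register.h"
#include <cassert>

using namespace llvm;

int main() {
  // Virtual registers occupy [2^31; 2^32): index2VirtReg sets VirtualRegFlag.
  Register V = Register::index2VirtReg(0); // first vreg in a function
  assert(V.isVirtual() && !V.isPhysical() && !V.isStack());
  assert(Register::virtReg2Index(V) == 0);

  // 0 is the "not a register" sentinel, so a default Register is invalid.
  Register NoReg;
  assert(!NoReg.isValid());

  // DenseMapInfo<Register> lets Register be used directly as a map key.
  DenseMap<Register, unsigned> UseCount;
  ++UseCount[V];
  assert(UseCount.lookup(V) == 1);
  return 0;
}

Because Register converts implicitly to unsigned and to MCRegister, it can flow through code that still traffics in raw register numbers, while the isVirtual()/isPhysical()/isStack() predicates keep the intended namespace explicit at the use site.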