File: | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp |
Warning: | line 2060, column 15 Called C++ object pointer is uninitialized |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | /// \file | |||
10 | /// SI Implementation of TargetInstrInfo. | |||
11 | // | |||
12 | //===----------------------------------------------------------------------===// | |||
13 | ||||
14 | #include "SIInstrInfo.h" | |||
15 | #include "AMDGPU.h" | |||
16 | #include "AMDGPUInstrInfo.h" | |||
17 | #include "GCNHazardRecognizer.h" | |||
18 | #include "GCNSubtarget.h" | |||
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
20 | #include "SIMachineFunctionInfo.h" | |||
21 | #include "llvm/Analysis/ValueTracking.h" | |||
22 | #include "llvm/CodeGen/LiveVariables.h" | |||
23 | #include "llvm/CodeGen/MachineDominators.h" | |||
24 | #include "llvm/CodeGen/RegisterScavenging.h" | |||
25 | #include "llvm/CodeGen/ScheduleDAG.h" | |||
26 | #include "llvm/IR/DiagnosticInfo.h" | |||
27 | #include "llvm/IR/IntrinsicsAMDGPU.h" | |||
28 | #include "llvm/MC/MCContext.h" | |||
29 | #include "llvm/Support/CommandLine.h" | |||
30 | #include "llvm/Target/TargetMachine.h" | |||
31 | ||||
32 | using namespace llvm; | |||
33 | ||||
// Debug-output tag for this file (presumably consumed by LLVM's debug macros).
#define DEBUG_TYPE "si-instr-info"
35 | ||||
36 | #define GET_INSTRINFO_CTOR_DTOR | |||
37 | #include "AMDGPUGenInstrInfo.inc" | |||
38 | ||||
39 | namespace llvm { | |||
40 | ||||
41 | class AAResults; | |||
42 | ||||
43 | namespace AMDGPU { | |||
44 | #define GET_D16ImageDimIntrinsics_IMPL | |||
45 | #define GET_ImageDimIntrinsicTable_IMPL | |||
46 | #define GET_RsrcIntrinsics_IMPL | |||
47 | #include "AMDGPUGenSearchableTables.inc" | |||
48 | } | |||
49 | } | |||
50 | ||||
51 | ||||
52 | // Must be at least 4 to be able to branch over minimum unconditional branch | |||
53 | // code. This is only for making it possible to write reasonably small tests for | |||
54 | // long branches. | |||
55 | static cl::opt<unsigned> | |||
56 | BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), | |||
57 | cl::desc("Restrict range of branch instructions (DEBUG)")); | |||
58 | ||||
59 | static cl::opt<bool> Fix16BitCopies( | |||
60 | "amdgpu-fix-16-bit-physreg-copies", | |||
61 | cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), | |||
62 | cl::init(true), | |||
63 | cl::ReallyHidden); | |||
64 | ||||
65 | SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) | |||
66 | : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), | |||
67 | RI(ST), ST(ST) { | |||
68 | SchedModel.init(&ST); | |||
69 | } | |||
70 | ||||
71 | //===----------------------------------------------------------------------===// | |||
72 | // TargetInstrInfo callbacks | |||
73 | //===----------------------------------------------------------------------===// | |||
74 | ||||
75 | static unsigned getNumOperandsNoGlue(SDNode *Node) { | |||
76 | unsigned N = Node->getNumOperands(); | |||
77 | while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) | |||
78 | --N; | |||
79 | return N; | |||
80 | } | |||
81 | ||||
82 | /// Returns true if both nodes have the same value for the given | |||
83 | /// operand \p Op, or if both nodes do not have this operand. | |||
84 | static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { | |||
85 | unsigned Opc0 = N0->getMachineOpcode(); | |||
86 | unsigned Opc1 = N1->getMachineOpcode(); | |||
87 | ||||
88 | int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); | |||
89 | int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); | |||
90 | ||||
91 | if (Op0Idx == -1 && Op1Idx == -1) | |||
92 | return true; | |||
93 | ||||
94 | ||||
95 | if ((Op0Idx == -1 && Op1Idx != -1) || | |||
96 | (Op1Idx == -1 && Op0Idx != -1)) | |||
97 | return false; | |||
98 | ||||
99 | // getNamedOperandIdx returns the index for the MachineInstr's operands, | |||
100 | // which includes the result as the first operand. We are indexing into the | |||
101 | // MachineSDNode's operands, so we need to skip the result operand to get | |||
102 | // the real index. | |||
103 | --Op0Idx; | |||
104 | --Op1Idx; | |||
105 | ||||
106 | return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); | |||
107 | } | |||
108 | ||||
109 | bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, | |||
110 | AAResults *AA) const { | |||
111 | if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI)) { | |||
112 | // Normally VALU use of exec would block the rematerialization, but that | |||
113 | // is OK in this case to have an implicit exec read as all VALU do. | |||
114 | // We really want all of the generic logic for this except for this. | |||
115 | ||||
116 | // Another potential implicit use is mode register. The core logic of | |||
117 | // the RA will not attempt rematerialization if mode is set anywhere | |||
118 | // in the function, otherwise it is safe since mode is not changed. | |||
119 | return !MI.hasImplicitDef() && | |||
120 | MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() && | |||
121 | !MI.mayRaiseFPException(); | |||
122 | } | |||
123 | ||||
124 | return false; | |||
125 | } | |||
126 | ||||
127 | bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { | |||
128 | // Any implicit use of exec by VALU is not a real register read. | |||
129 | return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && | |||
130 | isVALU(*MO.getParent()); | |||
131 | } | |||
132 | ||||
133 | bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, | |||
134 | int64_t &Offset0, | |||
135 | int64_t &Offset1) const { | |||
136 | if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) | |||
137 | return false; | |||
138 | ||||
139 | unsigned Opc0 = Load0->getMachineOpcode(); | |||
140 | unsigned Opc1 = Load1->getMachineOpcode(); | |||
141 | ||||
142 | // Make sure both are actually loads. | |||
143 | if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) | |||
144 | return false; | |||
145 | ||||
146 | if (isDS(Opc0) && isDS(Opc1)) { | |||
147 | ||||
148 | // FIXME: Handle this case: | |||
149 | if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) | |||
150 | return false; | |||
151 | ||||
152 | // Check base reg. | |||
153 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
154 | return false; | |||
155 | ||||
156 | // Skip read2 / write2 variants for simplicity. | |||
157 | // TODO: We should report true if the used offsets are adjacent (excluded | |||
158 | // st64 versions). | |||
159 | int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
160 | int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
161 | if (Offset0Idx == -1 || Offset1Idx == -1) | |||
162 | return false; | |||
163 | ||||
164 | // XXX - be careful of datalesss loads | |||
165 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
166 | // include the output in the operand list, but SDNodes don't, we need to | |||
167 | // subtract the index by one. | |||
168 | Offset0Idx -= get(Opc0).NumDefs; | |||
169 | Offset1Idx -= get(Opc1).NumDefs; | |||
170 | Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); | |||
171 | Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); | |||
172 | return true; | |||
173 | } | |||
174 | ||||
175 | if (isSMRD(Opc0) && isSMRD(Opc1)) { | |||
176 | // Skip time and cache invalidation instructions. | |||
177 | if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || | |||
178 | AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) | |||
179 | return false; | |||
180 | ||||
181 | assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1))(static_cast <bool> (getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue (Load1)) ? void (0) : __assert_fail ("getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 181, __extension__ __PRETTY_FUNCTION__)); | |||
182 | ||||
183 | // Check base reg. | |||
184 | if (Load0->getOperand(0) != Load1->getOperand(0)) | |||
185 | return false; | |||
186 | ||||
187 | const ConstantSDNode *Load0Offset = | |||
188 | dyn_cast<ConstantSDNode>(Load0->getOperand(1)); | |||
189 | const ConstantSDNode *Load1Offset = | |||
190 | dyn_cast<ConstantSDNode>(Load1->getOperand(1)); | |||
191 | ||||
192 | if (!Load0Offset || !Load1Offset) | |||
193 | return false; | |||
194 | ||||
195 | Offset0 = Load0Offset->getZExtValue(); | |||
196 | Offset1 = Load1Offset->getZExtValue(); | |||
197 | return true; | |||
198 | } | |||
199 | ||||
200 | // MUBUF and MTBUF can access the same addresses. | |||
201 | if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { | |||
202 | ||||
203 | // MUBUF and MTBUF have vaddr at different indices. | |||
204 | if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || | |||
205 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || | |||
206 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) | |||
207 | return false; | |||
208 | ||||
209 | int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); | |||
210 | int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); | |||
211 | ||||
212 | if (OffIdx0 == -1 || OffIdx1 == -1) | |||
213 | return false; | |||
214 | ||||
215 | // getNamedOperandIdx returns the index for MachineInstrs. Since they | |||
216 | // include the output in the operand list, but SDNodes don't, we need to | |||
217 | // subtract the index by one. | |||
218 | OffIdx0 -= get(Opc0).NumDefs; | |||
219 | OffIdx1 -= get(Opc1).NumDefs; | |||
220 | ||||
221 | SDValue Off0 = Load0->getOperand(OffIdx0); | |||
222 | SDValue Off1 = Load1->getOperand(OffIdx1); | |||
223 | ||||
224 | // The offset might be a FrameIndexSDNode. | |||
225 | if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) | |||
226 | return false; | |||
227 | ||||
228 | Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); | |||
229 | Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); | |||
230 | return true; | |||
231 | } | |||
232 | ||||
233 | return false; | |||
234 | } | |||
235 | ||||
236 | static bool isStride64(unsigned Opc) { | |||
237 | switch (Opc) { | |||
238 | case AMDGPU::DS_READ2ST64_B32: | |||
239 | case AMDGPU::DS_READ2ST64_B64: | |||
240 | case AMDGPU::DS_WRITE2ST64_B32: | |||
241 | case AMDGPU::DS_WRITE2ST64_B64: | |||
242 | return true; | |||
243 | default: | |||
244 | return false; | |||
245 | } | |||
246 | } | |||
247 | ||||
248 | bool SIInstrInfo::getMemOperandsWithOffsetWidth( | |||
249 | const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, | |||
250 | int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, | |||
251 | const TargetRegisterInfo *TRI) const { | |||
252 | if (!LdSt.mayLoadOrStore()) | |||
253 | return false; | |||
254 | ||||
255 | unsigned Opc = LdSt.getOpcode(); | |||
256 | OffsetIsScalable = false; | |||
257 | const MachineOperand *BaseOp, *OffsetOp; | |||
258 | int DataOpIdx; | |||
259 | ||||
260 | if (isDS(LdSt)) { | |||
261 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); | |||
262 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
263 | if (OffsetOp) { | |||
264 | // Normal, single offset LDS instruction. | |||
265 | if (!BaseOp) { | |||
266 | // DS_CONSUME/DS_APPEND use M0 for the base address. | |||
267 | // TODO: find the implicit use operand for M0 and use that as BaseOp? | |||
268 | return false; | |||
269 | } | |||
270 | BaseOps.push_back(BaseOp); | |||
271 | Offset = OffsetOp->getImm(); | |||
272 | // Get appropriate operand, and compute width accordingly. | |||
273 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
274 | if (DataOpIdx == -1) | |||
275 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
276 | Width = getOpSize(LdSt, DataOpIdx); | |||
277 | } else { | |||
278 | // The 2 offset instructions use offset0 and offset1 instead. We can treat | |||
279 | // these as a load with a single offset if the 2 offsets are consecutive. | |||
280 | // We will use this for some partially aligned loads. | |||
281 | const MachineOperand *Offset0Op = | |||
282 | getNamedOperand(LdSt, AMDGPU::OpName::offset0); | |||
283 | const MachineOperand *Offset1Op = | |||
284 | getNamedOperand(LdSt, AMDGPU::OpName::offset1); | |||
285 | ||||
286 | unsigned Offset0 = Offset0Op->getImm(); | |||
287 | unsigned Offset1 = Offset1Op->getImm(); | |||
288 | if (Offset0 + 1 != Offset1) | |||
289 | return false; | |||
290 | ||||
291 | // Each of these offsets is in element sized units, so we need to convert | |||
292 | // to bytes of the individual reads. | |||
293 | ||||
294 | unsigned EltSize; | |||
295 | if (LdSt.mayLoad()) | |||
296 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; | |||
297 | else { | |||
298 | assert(LdSt.mayStore())(static_cast <bool> (LdSt.mayStore()) ? void (0) : __assert_fail ("LdSt.mayStore()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 298, __extension__ __PRETTY_FUNCTION__)); | |||
299 | int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
300 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; | |||
301 | } | |||
302 | ||||
303 | if (isStride64(Opc)) | |||
304 | EltSize *= 64; | |||
305 | ||||
306 | BaseOps.push_back(BaseOp); | |||
307 | Offset = EltSize * Offset0; | |||
308 | // Get appropriate operand(s), and compute width accordingly. | |||
309 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
310 | if (DataOpIdx == -1) { | |||
311 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); | |||
312 | Width = getOpSize(LdSt, DataOpIdx); | |||
313 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); | |||
314 | Width += getOpSize(LdSt, DataOpIdx); | |||
315 | } else { | |||
316 | Width = getOpSize(LdSt, DataOpIdx); | |||
317 | } | |||
318 | } | |||
319 | return true; | |||
320 | } | |||
321 | ||||
322 | if (isMUBUF(LdSt) || isMTBUF(LdSt)) { | |||
323 | const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); | |||
324 | if (!RSrc) // e.g. BUFFER_WBINVL1_VOL | |||
325 | return false; | |||
326 | BaseOps.push_back(RSrc); | |||
327 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
328 | if (BaseOp && !BaseOp->isFI()) | |||
329 | BaseOps.push_back(BaseOp); | |||
330 | const MachineOperand *OffsetImm = | |||
331 | getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
332 | Offset = OffsetImm->getImm(); | |||
333 | const MachineOperand *SOffset = | |||
334 | getNamedOperand(LdSt, AMDGPU::OpName::soffset); | |||
335 | if (SOffset) { | |||
336 | if (SOffset->isReg()) | |||
337 | BaseOps.push_back(SOffset); | |||
338 | else | |||
339 | Offset += SOffset->getImm(); | |||
340 | } | |||
341 | // Get appropriate operand, and compute width accordingly. | |||
342 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
343 | if (DataOpIdx == -1) | |||
344 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
345 | Width = getOpSize(LdSt, DataOpIdx); | |||
346 | return true; | |||
347 | } | |||
348 | ||||
349 | if (isMIMG(LdSt)) { | |||
350 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); | |||
351 | BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); | |||
352 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); | |||
353 | if (VAddr0Idx >= 0) { | |||
354 | // GFX10 possible NSA encoding. | |||
355 | for (int I = VAddr0Idx; I < SRsrcIdx; ++I) | |||
356 | BaseOps.push_back(&LdSt.getOperand(I)); | |||
357 | } else { | |||
358 | BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); | |||
359 | } | |||
360 | Offset = 0; | |||
361 | // Get appropriate operand, and compute width accordingly. | |||
362 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
363 | Width = getOpSize(LdSt, DataOpIdx); | |||
364 | return true; | |||
365 | } | |||
366 | ||||
367 | if (isSMRD(LdSt)) { | |||
368 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); | |||
369 | if (!BaseOp) // e.g. S_MEMTIME | |||
370 | return false; | |||
371 | BaseOps.push_back(BaseOp); | |||
372 | OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); | |||
373 | Offset = OffsetOp ? OffsetOp->getImm() : 0; | |||
374 | // Get appropriate operand, and compute width accordingly. | |||
375 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); | |||
376 | Width = getOpSize(LdSt, DataOpIdx); | |||
377 | return true; | |||
378 | } | |||
379 | ||||
380 | if (isFLAT(LdSt)) { | |||
381 | // Instructions have either vaddr or saddr or both or none. | |||
382 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); | |||
383 | if (BaseOp) | |||
384 | BaseOps.push_back(BaseOp); | |||
385 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); | |||
386 | if (BaseOp) | |||
387 | BaseOps.push_back(BaseOp); | |||
388 | Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); | |||
389 | // Get appropriate operand, and compute width accordingly. | |||
390 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
391 | if (DataOpIdx == -1) | |||
392 | DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); | |||
393 | Width = getOpSize(LdSt, DataOpIdx); | |||
394 | return true; | |||
395 | } | |||
396 | ||||
397 | return false; | |||
398 | } | |||
399 | ||||
400 | static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, | |||
401 | ArrayRef<const MachineOperand *> BaseOps1, | |||
402 | const MachineInstr &MI2, | |||
403 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
404 | // Only examine the first "base" operand of each instruction, on the | |||
405 | // assumption that it represents the real base address of the memory access. | |||
406 | // Other operands are typically offsets or indices from this base address. | |||
407 | if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) | |||
408 | return true; | |||
409 | ||||
410 | if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) | |||
411 | return false; | |||
412 | ||||
413 | auto MO1 = *MI1.memoperands_begin(); | |||
414 | auto MO2 = *MI2.memoperands_begin(); | |||
415 | if (MO1->getAddrSpace() != MO2->getAddrSpace()) | |||
416 | return false; | |||
417 | ||||
418 | auto Base1 = MO1->getValue(); | |||
419 | auto Base2 = MO2->getValue(); | |||
420 | if (!Base1 || !Base2) | |||
421 | return false; | |||
422 | Base1 = getUnderlyingObject(Base1); | |||
423 | Base2 = getUnderlyingObject(Base2); | |||
424 | ||||
425 | if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) | |||
426 | return false; | |||
427 | ||||
428 | return Base1 == Base2; | |||
429 | } | |||
430 | ||||
431 | bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, | |||
432 | ArrayRef<const MachineOperand *> BaseOps2, | |||
433 | unsigned NumLoads, | |||
434 | unsigned NumBytes) const { | |||
435 | // If the mem ops (to be clustered) do not have the same base ptr, then they | |||
436 | // should not be clustered | |||
437 | if (!BaseOps1.empty() && !BaseOps2.empty()) { | |||
438 | const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); | |||
439 | const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); | |||
440 | if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) | |||
441 | return false; | |||
442 | } else if (!BaseOps1.empty() || !BaseOps2.empty()) { | |||
443 | // If only one base op is empty, they do not have the same base ptr | |||
444 | return false; | |||
445 | } | |||
446 | ||||
447 | // In order to avoid regester pressure, on an average, the number of DWORDS | |||
448 | // loaded together by all clustered mem ops should not exceed 8. This is an | |||
449 | // empirical value based on certain observations and performance related | |||
450 | // experiments. | |||
451 | // The good thing about this heuristic is - it avoids clustering of too many | |||
452 | // sub-word loads, and also avoids clustering of wide loads. Below is the | |||
453 | // brief summary of how the heuristic behaves for various `LoadSize`. | |||
454 | // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops | |||
455 | // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops | |||
456 | // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops | |||
457 | // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops | |||
458 | // (5) LoadSize >= 17: do not cluster | |||
459 | const unsigned LoadSize = NumBytes / NumLoads; | |||
460 | const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; | |||
461 | return NumDWORDs <= 8; | |||
462 | } | |||
463 | ||||
464 | // FIXME: This behaves strangely. If, for example, you have 32 load + stores, | |||
465 | // the first 16 loads will be interleaved with the stores, and the next 16 will | |||
466 | // be clustered as expected. It should really split into 2 16 store batches. | |||
467 | // | |||
468 | // Loads are clustered until this returns false, rather than trying to schedule | |||
469 | // groups of stores. This also means we have to deal with saying different | |||
470 | // address space loads should be clustered, and ones which might cause bank | |||
471 | // conflicts. | |||
472 | // | |||
473 | // This might be deprecated so it might not be worth that much effort to fix. | |||
474 | bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, | |||
475 | int64_t Offset0, int64_t Offset1, | |||
476 | unsigned NumLoads) const { | |||
477 | assert(Offset1 > Offset0 &&(static_cast <bool> (Offset1 > Offset0 && "Second offset should be larger than first offset!" ) ? void (0) : __assert_fail ("Offset1 > Offset0 && \"Second offset should be larger than first offset!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 478, __extension__ __PRETTY_FUNCTION__)) | |||
478 | "Second offset should be larger than first offset!")(static_cast <bool> (Offset1 > Offset0 && "Second offset should be larger than first offset!" ) ? void (0) : __assert_fail ("Offset1 > Offset0 && \"Second offset should be larger than first offset!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 478, __extension__ __PRETTY_FUNCTION__)); | |||
479 | // If we have less than 16 loads in a row, and the offsets are within 64 | |||
480 | // bytes, then schedule together. | |||
481 | ||||
482 | // A cacheline is 64 bytes (for global memory). | |||
483 | return (NumLoads <= 16 && (Offset1 - Offset0) < 64); | |||
484 | } | |||
485 | ||||
486 | static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, | |||
487 | MachineBasicBlock::iterator MI, | |||
488 | const DebugLoc &DL, MCRegister DestReg, | |||
489 | MCRegister SrcReg, bool KillSrc, | |||
490 | const char *Msg = "illegal SGPR to VGPR copy") { | |||
491 | MachineFunction *MF = MBB.getParent(); | |||
492 | DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); | |||
493 | LLVMContext &C = MF->getFunction().getContext(); | |||
494 | C.diagnose(IllegalCopy); | |||
495 | ||||
496 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) | |||
497 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
498 | } | |||
499 | ||||
500 | /// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible | |||
501 | /// to directly copy, so an intermediate VGPR needs to be used. | |||
502 | static void indirectCopyToAGPR(const SIInstrInfo &TII, | |||
503 | MachineBasicBlock &MBB, | |||
504 | MachineBasicBlock::iterator MI, | |||
505 | const DebugLoc &DL, MCRegister DestReg, | |||
506 | MCRegister SrcReg, bool KillSrc, | |||
507 | RegScavenger &RS, | |||
508 | Register ImpDefSuperReg = Register(), | |||
509 | Register ImpUseSuperReg = Register()) { | |||
510 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
511 | ||||
512 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||(static_cast <bool> (AMDGPU::SReg_32RegClass.contains(SrcReg ) || AMDGPU::AGPR_32RegClass.contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::SReg_32RegClass.contains(SrcReg) || AMDGPU::AGPR_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 513, __extension__ __PRETTY_FUNCTION__)) | |||
513 | AMDGPU::AGPR_32RegClass.contains(SrcReg))(static_cast <bool> (AMDGPU::SReg_32RegClass.contains(SrcReg ) || AMDGPU::AGPR_32RegClass.contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::SReg_32RegClass.contains(SrcReg) || AMDGPU::AGPR_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 513, __extension__ __PRETTY_FUNCTION__)); | |||
514 | ||||
515 | // First try to find defining accvgpr_write to avoid temporary registers. | |||
516 | for (auto Def = MI, E = MBB.begin(); Def != E; ) { | |||
517 | --Def; | |||
518 | if (!Def->definesRegister(SrcReg, &RI)) | |||
519 | continue; | |||
520 | if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) | |||
521 | break; | |||
522 | ||||
523 | MachineOperand &DefOp = Def->getOperand(1); | |||
524 | assert(DefOp.isReg() || DefOp.isImm())(static_cast <bool> (DefOp.isReg() || DefOp.isImm()) ? void (0) : __assert_fail ("DefOp.isReg() || DefOp.isImm()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 524, __extension__ __PRETTY_FUNCTION__)); | |||
525 | ||||
526 | if (DefOp.isReg()) { | |||
527 | // Check that register source operand if not clobbered before MI. | |||
528 | // Immediate operands are always safe to propagate. | |||
529 | bool SafeToPropagate = true; | |||
530 | for (auto I = Def; I != MI && SafeToPropagate; ++I) | |||
531 | if (I->modifiesRegister(DefOp.getReg(), &RI)) | |||
532 | SafeToPropagate = false; | |||
533 | ||||
534 | if (!SafeToPropagate) | |||
535 | break; | |||
536 | ||||
537 | DefOp.setIsKill(false); | |||
538 | } | |||
539 | ||||
540 | MachineInstrBuilder Builder = | |||
541 | BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
542 | .add(DefOp); | |||
543 | if (ImpDefSuperReg) | |||
544 | Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
545 | ||||
546 | if (ImpUseSuperReg) { | |||
547 | Builder.addReg(ImpUseSuperReg, | |||
548 | getKillRegState(KillSrc) | RegState::Implicit); | |||
549 | } | |||
550 | ||||
551 | return; | |||
552 | } | |||
553 | ||||
554 | RS.enterBasicBlock(MBB); | |||
555 | RS.forward(MI); | |||
556 | ||||
557 | // Ideally we want to have three registers for a long reg_sequence copy | |||
558 | // to hide 2 waitstates between v_mov_b32 and accvgpr_write. | |||
559 | unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, | |||
560 | *MBB.getParent()); | |||
561 | ||||
562 | // Registers in the sequence are allocated contiguously so we can just | |||
563 | // use register number to pick one of three round-robin temps. | |||
564 | unsigned RegNo = DestReg % 3; | |||
565 | Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
566 | if (!Tmp) | |||
567 | report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); | |||
568 | RS.setRegUsed(Tmp); | |||
569 | ||||
570 | if (!TII.getSubtarget().hasGFX90AInsts()) { | |||
571 | // Only loop through if there are any free registers left, otherwise | |||
572 | // scavenger may report a fatal error without emergency spill slot | |||
573 | // or spill with the slot. | |||
574 | while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { | |||
575 | Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); | |||
576 | if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) | |||
577 | break; | |||
578 | Tmp = Tmp2; | |||
579 | RS.setRegUsed(Tmp); | |||
580 | } | |||
581 | } | |||
582 | ||||
583 | // Insert copy to temporary VGPR. | |||
584 | unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; | |||
585 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { | |||
586 | TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
587 | } else { | |||
588 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg))(static_cast <bool> (AMDGPU::SReg_32RegClass.contains(SrcReg )) ? void (0) : __assert_fail ("AMDGPU::SReg_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 588, __extension__ __PRETTY_FUNCTION__)); | |||
589 | } | |||
590 | ||||
591 | MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) | |||
592 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
593 | if (ImpUseSuperReg) { | |||
594 | UseBuilder.addReg(ImpUseSuperReg, | |||
595 | getKillRegState(KillSrc) | RegState::Implicit); | |||
596 | } | |||
597 | ||||
598 | MachineInstrBuilder DefBuilder | |||
599 | = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
600 | .addReg(Tmp, RegState::Kill); | |||
601 | ||||
602 | if (ImpDefSuperReg) | |||
603 | DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); | |||
604 | } | |||
605 | ||||
606 | static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, | |||
607 | MachineBasicBlock::iterator MI, const DebugLoc &DL, | |||
608 | MCRegister DestReg, MCRegister SrcReg, bool KillSrc, | |||
609 | const TargetRegisterClass *RC, bool Forward) { | |||
610 | const SIRegisterInfo &RI = TII.getRegisterInfo(); | |||
611 | ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4); | |||
612 | MachineBasicBlock::iterator I = MI; | |||
613 | MachineInstr *FirstMI = nullptr, *LastMI = nullptr; | |||
614 | ||||
615 | for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { | |||
616 | int16_t SubIdx = BaseIndices[Idx]; | |||
617 | Register Reg = RI.getSubReg(DestReg, SubIdx); | |||
618 | unsigned Opcode = AMDGPU::S_MOV_B32; | |||
619 | ||||
620 | // Is SGPR aligned? If so try to combine with next. | |||
621 | Register Src = RI.getSubReg(SrcReg, SubIdx); | |||
622 | bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; | |||
623 | bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; | |||
624 | if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { | |||
625 | // Can use SGPR64 copy | |||
626 | unsigned Channel = RI.getChannelFromSubReg(SubIdx); | |||
627 | SubIdx = RI.getSubRegFromChannel(Channel, 2); | |||
628 | Opcode = AMDGPU::S_MOV_B64; | |||
629 | Idx++; | |||
630 | } | |||
631 | ||||
632 | LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
633 | .addReg(RI.getSubReg(SrcReg, SubIdx)) | |||
634 | .addReg(SrcReg, RegState::Implicit); | |||
635 | ||||
636 | if (!FirstMI) | |||
637 | FirstMI = LastMI; | |||
638 | ||||
639 | if (!Forward) | |||
640 | I--; | |||
641 | } | |||
642 | ||||
643 | assert(FirstMI && LastMI)(static_cast <bool> (FirstMI && LastMI) ? void ( 0) : __assert_fail ("FirstMI && LastMI", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 643, __extension__ __PRETTY_FUNCTION__)); | |||
644 | if (!Forward) | |||
645 | std::swap(FirstMI, LastMI); | |||
646 | ||||
647 | FirstMI->addOperand( | |||
648 | MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); | |||
649 | ||||
650 | if (KillSrc) | |||
651 | LastMI->addRegisterKilled(SrcReg, &RI); | |||
652 | } | |||
653 | ||||
654 | void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, | |||
655 | MachineBasicBlock::iterator MI, | |||
656 | const DebugLoc &DL, MCRegister DestReg, | |||
657 | MCRegister SrcReg, bool KillSrc) const { | |||
658 | const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); | |||
659 | ||||
660 | // FIXME: This is hack to resolve copies between 16 bit and 32 bit | |||
661 | // registers until all patterns are fixed. | |||
662 | if (Fix16BitCopies && | |||
663 | ((RI.getRegSizeInBits(*RC) == 16) ^ | |||
664 | (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { | |||
665 | MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; | |||
666 | MCRegister Super = RI.get32BitRegister(RegToFix); | |||
667 | assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix)(static_cast <bool> (RI.getSubReg(Super, AMDGPU::lo16) == RegToFix) ? void (0) : __assert_fail ("RI.getSubReg(Super, AMDGPU::lo16) == RegToFix" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 667, __extension__ __PRETTY_FUNCTION__)); | |||
668 | RegToFix = Super; | |||
669 | ||||
670 | if (DestReg == SrcReg) { | |||
671 | // Insert empty bundle since ExpandPostRA expects an instruction here. | |||
672 | BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); | |||
673 | return; | |||
674 | } | |||
675 | ||||
676 | RC = RI.getPhysRegClass(DestReg); | |||
677 | } | |||
678 | ||||
679 | if (RC == &AMDGPU::VGPR_32RegClass) { | |||
680 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||(static_cast <bool> (AMDGPU::VGPR_32RegClass.contains(SrcReg ) || AMDGPU::SReg_32RegClass.contains(SrcReg) || AMDGPU::AGPR_32RegClass .contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg) || AMDGPU::AGPR_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 682, __extension__ __PRETTY_FUNCTION__)) | |||
681 | AMDGPU::SReg_32RegClass.contains(SrcReg) ||(static_cast <bool> (AMDGPU::VGPR_32RegClass.contains(SrcReg ) || AMDGPU::SReg_32RegClass.contains(SrcReg) || AMDGPU::AGPR_32RegClass .contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg) || AMDGPU::AGPR_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 682, __extension__ __PRETTY_FUNCTION__)) | |||
682 | AMDGPU::AGPR_32RegClass.contains(SrcReg))(static_cast <bool> (AMDGPU::VGPR_32RegClass.contains(SrcReg ) || AMDGPU::SReg_32RegClass.contains(SrcReg) || AMDGPU::AGPR_32RegClass .contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg) || AMDGPU::AGPR_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 682, __extension__ __PRETTY_FUNCTION__)); | |||
683 | unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? | |||
684 | AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32; | |||
685 | BuildMI(MBB, MI, DL, get(Opc), DestReg) | |||
686 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
687 | return; | |||
688 | } | |||
689 | ||||
690 | if (RC == &AMDGPU::SReg_32_XM0RegClass || | |||
691 | RC == &AMDGPU::SReg_32RegClass) { | |||
692 | if (SrcReg == AMDGPU::SCC) { | |||
693 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) | |||
694 | .addImm(1) | |||
695 | .addImm(0); | |||
696 | return; | |||
697 | } | |||
698 | ||||
699 | if (DestReg == AMDGPU::VCC_LO) { | |||
700 | if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
701 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) | |||
702 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
703 | } else { | |||
704 | // FIXME: Hack until VReg_1 removed. | |||
705 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg))(static_cast <bool> (AMDGPU::VGPR_32RegClass.contains(SrcReg )) ? void (0) : __assert_fail ("AMDGPU::VGPR_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 705, __extension__ __PRETTY_FUNCTION__)); | |||
706 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
707 | .addImm(0) | |||
708 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
709 | } | |||
710 | ||||
711 | return; | |||
712 | } | |||
713 | ||||
714 | if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { | |||
715 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
716 | return; | |||
717 | } | |||
718 | ||||
719 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
720 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
721 | return; | |||
722 | } | |||
723 | ||||
724 | if (RC == &AMDGPU::SReg_64RegClass) { | |||
725 | if (SrcReg == AMDGPU::SCC) { | |||
726 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) | |||
727 | .addImm(1) | |||
728 | .addImm(0); | |||
729 | return; | |||
730 | } | |||
731 | ||||
732 | if (DestReg == AMDGPU::VCC) { | |||
733 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
734 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) | |||
735 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
736 | } else { | |||
737 | // FIXME: Hack until VReg_1 removed. | |||
738 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg))(static_cast <bool> (AMDGPU::VGPR_32RegClass.contains(SrcReg )) ? void (0) : __assert_fail ("AMDGPU::VGPR_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 738, __extension__ __PRETTY_FUNCTION__)); | |||
739 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) | |||
740 | .addImm(0) | |||
741 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
742 | } | |||
743 | ||||
744 | return; | |||
745 | } | |||
746 | ||||
747 | if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
748 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
749 | return; | |||
750 | } | |||
751 | ||||
752 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
753 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
754 | return; | |||
755 | } | |||
756 | ||||
757 | if (DestReg == AMDGPU::SCC) { | |||
758 | // Copying 64-bit or 32-bit sources to SCC barely makes sense, | |||
759 | // but SelectionDAG emits such copies for i1 sources. | |||
760 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { | |||
761 | // This copy can only be produced by patterns | |||
762 | // with explicit SCC, which are known to be enabled | |||
763 | // only for subtargets with S_CMP_LG_U64 present. | |||
764 | assert(ST.hasScalarCompareEq64())(static_cast <bool> (ST.hasScalarCompareEq64()) ? void ( 0) : __assert_fail ("ST.hasScalarCompareEq64()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 764, __extension__ __PRETTY_FUNCTION__)); | |||
765 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64)) | |||
766 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
767 | .addImm(0); | |||
768 | } else { | |||
769 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg))(static_cast <bool> (AMDGPU::SReg_32RegClass.contains(SrcReg )) ? void (0) : __assert_fail ("AMDGPU::SReg_32RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 769, __extension__ __PRETTY_FUNCTION__)); | |||
770 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) | |||
771 | .addReg(SrcReg, getKillRegState(KillSrc)) | |||
772 | .addImm(0); | |||
773 | } | |||
774 | ||||
775 | return; | |||
776 | } | |||
777 | ||||
778 | if (RC == &AMDGPU::AGPR_32RegClass) { | |||
779 | if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { | |||
780 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) | |||
781 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
782 | return; | |||
783 | } | |||
784 | ||||
785 | if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { | |||
786 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) | |||
787 | .addReg(SrcReg, getKillRegState(KillSrc)); | |||
788 | return; | |||
789 | } | |||
790 | ||||
791 | // FIXME: Pass should maintain scavenger to avoid scan through the block on | |||
792 | // every AGPR spill. | |||
793 | RegScavenger RS; | |||
794 | indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS); | |||
795 | return; | |||
796 | } | |||
797 | ||||
798 | const unsigned Size = RI.getRegSizeInBits(*RC); | |||
799 | if (Size == 16) { | |||
800 | assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||(static_cast <bool> (AMDGPU::VGPR_LO16RegClass.contains (SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU ::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass .contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 803, __extension__ __PRETTY_FUNCTION__)) | |||
801 | AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||(static_cast <bool> (AMDGPU::VGPR_LO16RegClass.contains (SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU ::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass .contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 803, __extension__ __PRETTY_FUNCTION__)) | |||
802 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||(static_cast <bool> (AMDGPU::VGPR_LO16RegClass.contains (SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU ::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass .contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 803, __extension__ __PRETTY_FUNCTION__)) | |||
803 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg))(static_cast <bool> (AMDGPU::VGPR_LO16RegClass.contains (SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU ::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass .contains(SrcReg)) ? void (0) : __assert_fail ("AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass.contains(SrcReg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 803, __extension__ __PRETTY_FUNCTION__)); | |||
804 | ||||
805 | bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); | |||
806 | bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); | |||
807 | bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
808 | bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
809 | bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || | |||
810 | AMDGPU::SReg_LO16RegClass.contains(DestReg) || | |||
811 | AMDGPU::AGPR_LO16RegClass.contains(DestReg); | |||
812 | bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || | |||
813 | AMDGPU::SReg_LO16RegClass.contains(SrcReg) || | |||
814 | AMDGPU::AGPR_LO16RegClass.contains(SrcReg); | |||
815 | MCRegister NewDestReg = RI.get32BitRegister(DestReg); | |||
816 | MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); | |||
817 | ||||
818 | if (IsSGPRDst) { | |||
819 | if (!IsSGPRSrc) { | |||
820 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
821 | return; | |||
822 | } | |||
823 | ||||
824 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) | |||
825 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
826 | return; | |||
827 | } | |||
828 | ||||
829 | if (IsAGPRDst || IsAGPRSrc) { | |||
830 | if (!DstLow || !SrcLow) { | |||
831 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
832 | "Cannot use hi16 subreg with an AGPR!"); | |||
833 | } | |||
834 | ||||
835 | copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); | |||
836 | return; | |||
837 | } | |||
838 | ||||
839 | if (IsSGPRSrc && !ST.hasSDWAScalar()) { | |||
840 | if (!DstLow || !SrcLow) { | |||
841 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, | |||
842 | "Cannot use hi16 subreg on VI!"); | |||
843 | } | |||
844 | ||||
845 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) | |||
846 | .addReg(NewSrcReg, getKillRegState(KillSrc)); | |||
847 | return; | |||
848 | } | |||
849 | ||||
850 | auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) | |||
851 | .addImm(0) // src0_modifiers | |||
852 | .addReg(NewSrcReg) | |||
853 | .addImm(0) // clamp | |||
854 | .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
855 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
856 | .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) | |||
857 | .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 | |||
858 | : AMDGPU::SDWA::SdwaSel::WORD_1) | |||
859 | .addReg(NewDestReg, RegState::Implicit | RegState::Undef); | |||
860 | // First implicit operand is $exec. | |||
861 | MIB->tieOperands(0, MIB->getNumOperands() - 1); | |||
862 | return; | |||
863 | } | |||
864 | ||||
865 | const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg); | |||
866 | if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { | |||
867 | if (ST.hasPackedFP32Ops()) { | |||
868 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) | |||
869 | .addImm(SISrcMods::OP_SEL_1) | |||
870 | .addReg(SrcReg) | |||
871 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) | |||
872 | .addReg(SrcReg) | |||
873 | .addImm(0) // op_sel_lo | |||
874 | .addImm(0) // op_sel_hi | |||
875 | .addImm(0) // neg_lo | |||
876 | .addImm(0) // neg_hi | |||
877 | .addImm(0) // clamp | |||
878 | .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); | |||
879 | return; | |||
880 | } | |||
881 | } | |||
882 | ||||
883 | const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); | |||
884 | if (RI.isSGPRClass(RC)) { | |||
885 | if (!RI.isSGPRClass(SrcRC)) { | |||
886 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); | |||
887 | return; | |||
888 | } | |||
889 | expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward); | |||
890 | return; | |||
891 | } | |||
892 | ||||
893 | unsigned EltSize = 4; | |||
894 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
895 | if (RI.hasAGPRs(RC)) { | |||
896 | Opcode = (RI.hasVGPRs(SrcRC)) ? | |||
897 | AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; | |||
898 | } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) { | |||
899 | Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; | |||
900 | } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && | |||
901 | (RI.isProperlyAlignedRC(*RC) && | |||
902 | (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { | |||
903 | // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. | |||
904 | if (ST.hasPackedFP32Ops()) { | |||
905 | Opcode = AMDGPU::V_PK_MOV_B32; | |||
906 | EltSize = 8; | |||
907 | } | |||
908 | } | |||
909 | ||||
910 | // For the cases where we need an intermediate instruction/temporary register | |||
911 | // (destination is an AGPR), we need a scavenger. | |||
912 | // | |||
913 | // FIXME: The pass should maintain this for us so we don't have to re-scan the | |||
914 | // whole block for every handled copy. | |||
915 | std::unique_ptr<RegScavenger> RS; | |||
916 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) | |||
917 | RS.reset(new RegScavenger()); | |||
918 | ||||
919 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); | |||
920 | ||||
921 | // If there is an overlap, we can't kill the super-register on the last | |||
922 | // instruction, since it will also kill the components made live by this def. | |||
923 | const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); | |||
924 | ||||
925 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
926 | unsigned SubIdx; | |||
927 | if (Forward) | |||
928 | SubIdx = SubIndices[Idx]; | |||
929 | else | |||
930 | SubIdx = SubIndices[SubIndices.size() - Idx - 1]; | |||
931 | ||||
932 | bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; | |||
933 | ||||
934 | if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
935 | Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register(); | |||
936 | Register ImpUseSuper = SrcReg; | |||
937 | indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), | |||
938 | RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, | |||
939 | ImpDefSuper, ImpUseSuper); | |||
940 | } else if (Opcode == AMDGPU::V_PK_MOV_B32) { | |||
941 | Register DstSubReg = RI.getSubReg(DestReg, SubIdx); | |||
942 | Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); | |||
943 | MachineInstrBuilder MIB = | |||
944 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) | |||
945 | .addImm(SISrcMods::OP_SEL_1) | |||
946 | .addReg(SrcSubReg) | |||
947 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) | |||
948 | .addReg(SrcSubReg) | |||
949 | .addImm(0) // op_sel_lo | |||
950 | .addImm(0) // op_sel_hi | |||
951 | .addImm(0) // neg_lo | |||
952 | .addImm(0) // neg_hi | |||
953 | .addImm(0) // clamp | |||
954 | .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
955 | if (Idx == 0) | |||
956 | MIB.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
957 | } else { | |||
958 | MachineInstrBuilder Builder = | |||
959 | BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) | |||
960 | .addReg(RI.getSubReg(SrcReg, SubIdx)); | |||
961 | if (Idx == 0) | |||
962 | Builder.addReg(DestReg, RegState::Define | RegState::Implicit); | |||
963 | ||||
964 | Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); | |||
965 | } | |||
966 | } | |||
967 | } | |||
968 | ||||
969 | int SIInstrInfo::commuteOpcode(unsigned Opcode) const { | |||
970 | int NewOpc; | |||
971 | ||||
972 | // Try to map original to commuted opcode | |||
973 | NewOpc = AMDGPU::getCommuteRev(Opcode); | |||
974 | if (NewOpc != -1) | |||
975 | // Check if the commuted (REV) opcode exists on the target. | |||
976 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
977 | ||||
978 | // Try to map commuted to original opcode | |||
979 | NewOpc = AMDGPU::getCommuteOrig(Opcode); | |||
980 | if (NewOpc != -1) | |||
981 | // Check if the original (non-REV) opcode exists on the target. | |||
982 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; | |||
983 | ||||
984 | return Opcode; | |||
985 | } | |||
986 | ||||
987 | void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, | |||
988 | MachineBasicBlock::iterator MI, | |||
989 | const DebugLoc &DL, unsigned DestReg, | |||
990 | int64_t Value) const { | |||
991 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
992 | const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); | |||
993 | if (RegClass == &AMDGPU::SReg_32RegClass || | |||
994 | RegClass == &AMDGPU::SGPR_32RegClass || | |||
995 | RegClass == &AMDGPU::SReg_32_XM0RegClass || | |||
996 | RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { | |||
997 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) | |||
998 | .addImm(Value); | |||
999 | return; | |||
1000 | } | |||
1001 | ||||
1002 | if (RegClass == &AMDGPU::SReg_64RegClass || | |||
1003 | RegClass == &AMDGPU::SGPR_64RegClass || | |||
1004 | RegClass == &AMDGPU::SReg_64_XEXECRegClass) { | |||
1005 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) | |||
1006 | .addImm(Value); | |||
1007 | return; | |||
1008 | } | |||
1009 | ||||
1010 | if (RegClass == &AMDGPU::VGPR_32RegClass) { | |||
1011 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) | |||
1012 | .addImm(Value); | |||
1013 | return; | |||
1014 | } | |||
1015 | if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { | |||
1016 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) | |||
1017 | .addImm(Value); | |||
1018 | return; | |||
1019 | } | |||
1020 | ||||
1021 | unsigned EltSize = 4; | |||
1022 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; | |||
1023 | if (RI.isSGPRClass(RegClass)) { | |||
1024 | if (RI.getRegSizeInBits(*RegClass) > 32) { | |||
1025 | Opcode = AMDGPU::S_MOV_B64; | |||
1026 | EltSize = 8; | |||
1027 | } else { | |||
1028 | Opcode = AMDGPU::S_MOV_B32; | |||
1029 | EltSize = 4; | |||
1030 | } | |||
1031 | } | |||
1032 | ||||
1033 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); | |||
1034 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { | |||
1035 | int64_t IdxValue = Idx == 0 ? Value : 0; | |||
1036 | ||||
1037 | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, | |||
1038 | get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); | |||
1039 | Builder.addImm(IdxValue); | |||
1040 | } | |||
1041 | } | |||
1042 | ||||
1043 | const TargetRegisterClass * | |||
1044 | SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { | |||
1045 | return &AMDGPU::VGPR_32RegClass; | |||
1046 | } | |||
1047 | ||||
1048 | void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, | |||
1049 | MachineBasicBlock::iterator I, | |||
1050 | const DebugLoc &DL, Register DstReg, | |||
1051 | ArrayRef<MachineOperand> Cond, | |||
1052 | Register TrueReg, | |||
1053 | Register FalseReg) const { | |||
1054 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
1055 | const TargetRegisterClass *BoolXExecRC = | |||
1056 | RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
1057 | assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&(static_cast <bool> (MRI.getRegClass(DstReg) == &AMDGPU ::VGPR_32RegClass && "Not a VGPR32 reg") ? void (0) : __assert_fail ("MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && \"Not a VGPR32 reg\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1058, __extension__ __PRETTY_FUNCTION__)) | |||
1058 | "Not a VGPR32 reg")(static_cast <bool> (MRI.getRegClass(DstReg) == &AMDGPU ::VGPR_32RegClass && "Not a VGPR32 reg") ? void (0) : __assert_fail ("MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && \"Not a VGPR32 reg\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1058, __extension__ __PRETTY_FUNCTION__)); | |||
1059 | ||||
1060 | if (Cond.size() == 1) { | |||
1061 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1062 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1063 | .add(Cond[0]); | |||
1064 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1065 | .addImm(0) | |||
1066 | .addReg(FalseReg) | |||
1067 | .addImm(0) | |||
1068 | .addReg(TrueReg) | |||
1069 | .addReg(SReg); | |||
1070 | } else if (Cond.size() == 2) { | |||
1071 | assert(Cond[0].isImm() && "Cond[0] is not an immediate")(static_cast <bool> (Cond[0].isImm() && "Cond[0] is not an immediate" ) ? void (0) : __assert_fail ("Cond[0].isImm() && \"Cond[0] is not an immediate\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1071, __extension__ __PRETTY_FUNCTION__)); | |||
1072 | switch (Cond[0].getImm()) { | |||
1073 | case SIInstrInfo::SCC_TRUE: { | |||
1074 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1075 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1076 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1077 | .addImm(1) | |||
1078 | .addImm(0); | |||
1079 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1080 | .addImm(0) | |||
1081 | .addReg(FalseReg) | |||
1082 | .addImm(0) | |||
1083 | .addReg(TrueReg) | |||
1084 | .addReg(SReg); | |||
1085 | break; | |||
1086 | } | |||
1087 | case SIInstrInfo::SCC_FALSE: { | |||
1088 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1089 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1090 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1091 | .addImm(0) | |||
1092 | .addImm(1); | |||
1093 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1094 | .addImm(0) | |||
1095 | .addReg(FalseReg) | |||
1096 | .addImm(0) | |||
1097 | .addReg(TrueReg) | |||
1098 | .addReg(SReg); | |||
1099 | break; | |||
1100 | } | |||
1101 | case SIInstrInfo::VCCNZ: { | |||
1102 | MachineOperand RegOp = Cond[1]; | |||
1103 | RegOp.setImplicit(false); | |||
1104 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1105 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1106 | .add(RegOp); | |||
1107 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1108 | .addImm(0) | |||
1109 | .addReg(FalseReg) | |||
1110 | .addImm(0) | |||
1111 | .addReg(TrueReg) | |||
1112 | .addReg(SReg); | |||
1113 | break; | |||
1114 | } | |||
1115 | case SIInstrInfo::VCCZ: { | |||
1116 | MachineOperand RegOp = Cond[1]; | |||
1117 | RegOp.setImplicit(false); | |||
1118 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1119 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) | |||
1120 | .add(RegOp); | |||
1121 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1122 | .addImm(0) | |||
1123 | .addReg(TrueReg) | |||
1124 | .addImm(0) | |||
1125 | .addReg(FalseReg) | |||
1126 | .addReg(SReg); | |||
1127 | break; | |||
1128 | } | |||
1129 | case SIInstrInfo::EXECNZ: { | |||
1130 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1131 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1132 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1133 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1134 | .addImm(0); | |||
1135 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1136 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1137 | .addImm(1) | |||
1138 | .addImm(0); | |||
1139 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1140 | .addImm(0) | |||
1141 | .addReg(FalseReg) | |||
1142 | .addImm(0) | |||
1143 | .addReg(TrueReg) | |||
1144 | .addReg(SReg); | |||
1145 | break; | |||
1146 | } | |||
1147 | case SIInstrInfo::EXECZ: { | |||
1148 | Register SReg = MRI.createVirtualRegister(BoolXExecRC); | |||
1149 | Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1150 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1151 | : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) | |||
1152 | .addImm(0); | |||
1153 | BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 | |||
1154 | : AMDGPU::S_CSELECT_B64), SReg) | |||
1155 | .addImm(0) | |||
1156 | .addImm(1); | |||
1157 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
1158 | .addImm(0) | |||
1159 | .addReg(FalseReg) | |||
1160 | .addImm(0) | |||
1161 | .addReg(TrueReg) | |||
1162 | .addReg(SReg); | |||
1163 | llvm_unreachable("Unhandled branch predicate EXECZ")::llvm::llvm_unreachable_internal("Unhandled branch predicate EXECZ" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1163); | |||
1164 | break; | |||
1165 | } | |||
1166 | default: | |||
1167 | llvm_unreachable("invalid branch predicate")::llvm::llvm_unreachable_internal("invalid branch predicate", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1167); | |||
1168 | } | |||
1169 | } else { | |||
1170 | llvm_unreachable("Can only handle Cond size 1 or 2")::llvm::llvm_unreachable_internal("Can only handle Cond size 1 or 2" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1170); | |||
1171 | } | |||
1172 | } | |||
1173 | ||||
1174 | Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, | |||
1175 | MachineBasicBlock::iterator I, | |||
1176 | const DebugLoc &DL, | |||
1177 | Register SrcReg, int Value) const { | |||
1178 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1179 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1180 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) | |||
1181 | .addImm(Value) | |||
1182 | .addReg(SrcReg); | |||
1183 | ||||
1184 | return Reg; | |||
1185 | } | |||
1186 | ||||
1187 | Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, | |||
1188 | MachineBasicBlock::iterator I, | |||
1189 | const DebugLoc &DL, | |||
1190 | Register SrcReg, int Value) const { | |||
1191 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
1192 | Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
1193 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) | |||
1194 | .addImm(Value) | |||
1195 | .addReg(SrcReg); | |||
1196 | ||||
1197 | return Reg; | |||
1198 | } | |||
1199 | ||||
1200 | unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { | |||
1201 | ||||
1202 | if (RI.hasAGPRs(DstRC)) | |||
1203 | return AMDGPU::COPY; | |||
1204 | if (RI.getRegSizeInBits(*DstRC) == 32) { | |||
1205 | return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; | |||
1206 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { | |||
1207 | return AMDGPU::S_MOV_B64; | |||
1208 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { | |||
1209 | return AMDGPU::V_MOV_B64_PSEUDO; | |||
1210 | } | |||
1211 | return AMDGPU::COPY; | |||
1212 | } | |||
1213 | ||||
1214 | const MCInstrDesc & | |||
1215 | SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, | |||
1216 | bool IsIndirectSrc) const { | |||
1217 | if (IsIndirectSrc) { | |||
1218 | if (VecSize <= 32) // 4 bytes | |||
1219 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); | |||
1220 | if (VecSize <= 64) // 8 bytes | |||
1221 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); | |||
1222 | if (VecSize <= 96) // 12 bytes | |||
1223 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); | |||
1224 | if (VecSize <= 128) // 16 bytes | |||
1225 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); | |||
1226 | if (VecSize <= 160) // 20 bytes | |||
1227 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); | |||
1228 | if (VecSize <= 256) // 32 bytes | |||
1229 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); | |||
1230 | if (VecSize <= 512) // 64 bytes | |||
1231 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); | |||
1232 | if (VecSize <= 1024) // 128 bytes | |||
1233 | return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); | |||
1234 | ||||
1235 | llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegReadGPRIDX pseudos" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1235); | |||
1236 | } | |||
1237 | ||||
1238 | if (VecSize <= 32) // 4 bytes | |||
1239 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); | |||
1240 | if (VecSize <= 64) // 8 bytes | |||
1241 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); | |||
1242 | if (VecSize <= 96) // 12 bytes | |||
1243 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); | |||
1244 | if (VecSize <= 128) // 16 bytes | |||
1245 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); | |||
1246 | if (VecSize <= 160) // 20 bytes | |||
1247 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); | |||
1248 | if (VecSize <= 256) // 32 bytes | |||
1249 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); | |||
1250 | if (VecSize <= 512) // 64 bytes | |||
1251 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); | |||
1252 | if (VecSize <= 1024) // 128 bytes | |||
1253 | return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); | |||
1254 | ||||
1255 | llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWriteGPRIDX pseudos" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1255); | |||
1256 | } | |||
1257 | ||||
1258 | static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { | |||
1259 | if (VecSize <= 32) // 4 bytes | |||
1260 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1261 | if (VecSize <= 64) // 8 bytes | |||
1262 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1263 | if (VecSize <= 96) // 12 bytes | |||
1264 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1265 | if (VecSize <= 128) // 16 bytes | |||
1266 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1267 | if (VecSize <= 160) // 20 bytes | |||
1268 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1269 | if (VecSize <= 256) // 32 bytes | |||
1270 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1271 | if (VecSize <= 512) // 64 bytes | |||
1272 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1273 | if (VecSize <= 1024) // 128 bytes | |||
1274 | return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1275 | ||||
1276 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1276); | |||
1277 | } | |||
1278 | ||||
1279 | static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { | |||
1280 | if (VecSize <= 32) // 4 bytes | |||
1281 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; | |||
1282 | if (VecSize <= 64) // 8 bytes | |||
1283 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; | |||
1284 | if (VecSize <= 96) // 12 bytes | |||
1285 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; | |||
1286 | if (VecSize <= 128) // 16 bytes | |||
1287 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; | |||
1288 | if (VecSize <= 160) // 20 bytes | |||
1289 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; | |||
1290 | if (VecSize <= 256) // 32 bytes | |||
1291 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; | |||
1292 | if (VecSize <= 512) // 64 bytes | |||
1293 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; | |||
1294 | if (VecSize <= 1024) // 128 bytes | |||
1295 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; | |||
1296 | ||||
1297 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1297); | |||
1298 | } | |||
1299 | ||||
1300 | static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { | |||
1301 | if (VecSize <= 64) // 8 bytes | |||
1302 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; | |||
1303 | if (VecSize <= 128) // 16 bytes | |||
1304 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; | |||
1305 | if (VecSize <= 256) // 32 bytes | |||
1306 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; | |||
1307 | if (VecSize <= 512) // 64 bytes | |||
1308 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; | |||
1309 | if (VecSize <= 1024) // 128 bytes | |||
1310 | return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; | |||
1311 | ||||
1312 | llvm_unreachable("unsupported size for IndirectRegWrite pseudos")::llvm::llvm_unreachable_internal("unsupported size for IndirectRegWrite pseudos" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1312); | |||
1313 | } | |||
1314 | ||||
1315 | const MCInstrDesc & | |||
1316 | SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, | |||
1317 | bool IsSGPR) const { | |||
1318 | if (IsSGPR) { | |||
1319 | switch (EltSize) { | |||
1320 | case 32: | |||
1321 | return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); | |||
1322 | case 64: | |||
1323 | return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); | |||
1324 | default: | |||
1325 | llvm_unreachable("invalid reg indexing elt size")::llvm::llvm_unreachable_internal("invalid reg indexing elt size" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1325); | |||
1326 | } | |||
1327 | } | |||
1328 | ||||
1329 | assert(EltSize == 32 && "invalid reg indexing elt size")(static_cast <bool> (EltSize == 32 && "invalid reg indexing elt size" ) ? void (0) : __assert_fail ("EltSize == 32 && \"invalid reg indexing elt size\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1329, __extension__ __PRETTY_FUNCTION__)); | |||
1330 | return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); | |||
1331 | } | |||
1332 | ||||
1333 | static unsigned getSGPRSpillSaveOpcode(unsigned Size) { | |||
1334 | switch (Size) { | |||
1335 | case 4: | |||
1336 | return AMDGPU::SI_SPILL_S32_SAVE; | |||
1337 | case 8: | |||
1338 | return AMDGPU::SI_SPILL_S64_SAVE; | |||
1339 | case 12: | |||
1340 | return AMDGPU::SI_SPILL_S96_SAVE; | |||
1341 | case 16: | |||
1342 | return AMDGPU::SI_SPILL_S128_SAVE; | |||
1343 | case 20: | |||
1344 | return AMDGPU::SI_SPILL_S160_SAVE; | |||
1345 | case 24: | |||
1346 | return AMDGPU::SI_SPILL_S192_SAVE; | |||
1347 | case 28: | |||
1348 | return AMDGPU::SI_SPILL_S224_SAVE; | |||
1349 | case 32: | |||
1350 | return AMDGPU::SI_SPILL_S256_SAVE; | |||
1351 | case 64: | |||
1352 | return AMDGPU::SI_SPILL_S512_SAVE; | |||
1353 | case 128: | |||
1354 | return AMDGPU::SI_SPILL_S1024_SAVE; | |||
1355 | default: | |||
1356 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1356); | |||
1357 | } | |||
1358 | } | |||
1359 | ||||
1360 | static unsigned getVGPRSpillSaveOpcode(unsigned Size) { | |||
1361 | switch (Size) { | |||
1362 | case 4: | |||
1363 | return AMDGPU::SI_SPILL_V32_SAVE; | |||
1364 | case 8: | |||
1365 | return AMDGPU::SI_SPILL_V64_SAVE; | |||
1366 | case 12: | |||
1367 | return AMDGPU::SI_SPILL_V96_SAVE; | |||
1368 | case 16: | |||
1369 | return AMDGPU::SI_SPILL_V128_SAVE; | |||
1370 | case 20: | |||
1371 | return AMDGPU::SI_SPILL_V160_SAVE; | |||
1372 | case 24: | |||
1373 | return AMDGPU::SI_SPILL_V192_SAVE; | |||
1374 | case 28: | |||
1375 | return AMDGPU::SI_SPILL_V224_SAVE; | |||
1376 | case 32: | |||
1377 | return AMDGPU::SI_SPILL_V256_SAVE; | |||
1378 | case 64: | |||
1379 | return AMDGPU::SI_SPILL_V512_SAVE; | |||
1380 | case 128: | |||
1381 | return AMDGPU::SI_SPILL_V1024_SAVE; | |||
1382 | default: | |||
1383 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1383); | |||
1384 | } | |||
1385 | } | |||
1386 | ||||
1387 | static unsigned getAGPRSpillSaveOpcode(unsigned Size) { | |||
1388 | switch (Size) { | |||
1389 | case 4: | |||
1390 | return AMDGPU::SI_SPILL_A32_SAVE; | |||
1391 | case 8: | |||
1392 | return AMDGPU::SI_SPILL_A64_SAVE; | |||
1393 | case 12: | |||
1394 | return AMDGPU::SI_SPILL_A96_SAVE; | |||
1395 | case 16: | |||
1396 | return AMDGPU::SI_SPILL_A128_SAVE; | |||
1397 | case 20: | |||
1398 | return AMDGPU::SI_SPILL_A160_SAVE; | |||
1399 | case 24: | |||
1400 | return AMDGPU::SI_SPILL_A192_SAVE; | |||
1401 | case 28: | |||
1402 | return AMDGPU::SI_SPILL_A224_SAVE; | |||
1403 | case 32: | |||
1404 | return AMDGPU::SI_SPILL_A256_SAVE; | |||
1405 | case 64: | |||
1406 | return AMDGPU::SI_SPILL_A512_SAVE; | |||
1407 | case 128: | |||
1408 | return AMDGPU::SI_SPILL_A1024_SAVE; | |||
1409 | default: | |||
1410 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1410); | |||
1411 | } | |||
1412 | } | |||
1413 | ||||
1414 | void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, | |||
1415 | MachineBasicBlock::iterator MI, | |||
1416 | Register SrcReg, bool isKill, | |||
1417 | int FrameIndex, | |||
1418 | const TargetRegisterClass *RC, | |||
1419 | const TargetRegisterInfo *TRI) const { | |||
1420 | MachineFunction *MF = MBB.getParent(); | |||
1421 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1422 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1423 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1424 | ||||
1425 | MachinePointerInfo PtrInfo | |||
1426 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1427 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1428 | PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), | |||
1429 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1430 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1431 | ||||
1432 | if (RI.isSGPRClass(RC)) { | |||
1433 | MFI->setHasSpilledSGPRs(); | |||
1434 | assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled")(static_cast <bool> (SrcReg != AMDGPU::M0 && "m0 should not be spilled" ) ? void (0) : __assert_fail ("SrcReg != AMDGPU::M0 && \"m0 should not be spilled\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1434, __extension__ __PRETTY_FUNCTION__)); | |||
1435 | assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&(static_cast <bool> (SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && SrcReg != AMDGPU::EXEC && "exec should not be spilled") ? void (0) : __assert_fail ("SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && SrcReg != AMDGPU::EXEC && \"exec should not be spilled\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1436, __extension__ __PRETTY_FUNCTION__)) | |||
1436 | SrcReg != AMDGPU::EXEC && "exec should not be spilled")(static_cast <bool> (SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && SrcReg != AMDGPU::EXEC && "exec should not be spilled") ? void (0) : __assert_fail ("SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && SrcReg != AMDGPU::EXEC && \"exec should not be spilled\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1436, __extension__ __PRETTY_FUNCTION__)); | |||
1437 | ||||
1438 | // We are only allowed to create one new instruction when spilling | |||
1439 | // registers, so we need to use pseudo instruction for spilling SGPRs. | |||
1440 | const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); | |||
1441 | ||||
1442 | // The SGPR spill/restore instructions only work on number sgprs, so we need | |||
1443 | // to make sure we are using the correct register class. | |||
1444 | if (SrcReg.isVirtual() && SpillSize == 4) { | |||
1445 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1446 | MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1447 | } | |||
1448 | ||||
1449 | BuildMI(MBB, MI, DL, OpDesc) | |||
1450 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1451 | .addFrameIndex(FrameIndex) // addr | |||
1452 | .addMemOperand(MMO) | |||
1453 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1454 | ||||
1455 | if (RI.spillSGPRToVGPR()) | |||
1456 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1457 | return; | |||
1458 | } | |||
1459 | ||||
1460 | unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize) | |||
1461 | : getVGPRSpillSaveOpcode(SpillSize); | |||
1462 | MFI->setHasSpilledVGPRs(); | |||
1463 | ||||
1464 | BuildMI(MBB, MI, DL, get(Opcode)) | |||
1465 | .addReg(SrcReg, getKillRegState(isKill)) // data | |||
1466 | .addFrameIndex(FrameIndex) // addr | |||
1467 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1468 | .addImm(0) // offset | |||
1469 | .addMemOperand(MMO); | |||
1470 | } | |||
1471 | ||||
1472 | static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { | |||
1473 | switch (Size) { | |||
1474 | case 4: | |||
1475 | return AMDGPU::SI_SPILL_S32_RESTORE; | |||
1476 | case 8: | |||
1477 | return AMDGPU::SI_SPILL_S64_RESTORE; | |||
1478 | case 12: | |||
1479 | return AMDGPU::SI_SPILL_S96_RESTORE; | |||
1480 | case 16: | |||
1481 | return AMDGPU::SI_SPILL_S128_RESTORE; | |||
1482 | case 20: | |||
1483 | return AMDGPU::SI_SPILL_S160_RESTORE; | |||
1484 | case 24: | |||
1485 | return AMDGPU::SI_SPILL_S192_RESTORE; | |||
1486 | case 28: | |||
1487 | return AMDGPU::SI_SPILL_S224_RESTORE; | |||
1488 | case 32: | |||
1489 | return AMDGPU::SI_SPILL_S256_RESTORE; | |||
1490 | case 64: | |||
1491 | return AMDGPU::SI_SPILL_S512_RESTORE; | |||
1492 | case 128: | |||
1493 | return AMDGPU::SI_SPILL_S1024_RESTORE; | |||
1494 | default: | |||
1495 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1495); | |||
1496 | } | |||
1497 | } | |||
1498 | ||||
1499 | static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { | |||
1500 | switch (Size) { | |||
1501 | case 4: | |||
1502 | return AMDGPU::SI_SPILL_V32_RESTORE; | |||
1503 | case 8: | |||
1504 | return AMDGPU::SI_SPILL_V64_RESTORE; | |||
1505 | case 12: | |||
1506 | return AMDGPU::SI_SPILL_V96_RESTORE; | |||
1507 | case 16: | |||
1508 | return AMDGPU::SI_SPILL_V128_RESTORE; | |||
1509 | case 20: | |||
1510 | return AMDGPU::SI_SPILL_V160_RESTORE; | |||
1511 | case 24: | |||
1512 | return AMDGPU::SI_SPILL_V192_RESTORE; | |||
1513 | case 28: | |||
1514 | return AMDGPU::SI_SPILL_V224_RESTORE; | |||
1515 | case 32: | |||
1516 | return AMDGPU::SI_SPILL_V256_RESTORE; | |||
1517 | case 64: | |||
1518 | return AMDGPU::SI_SPILL_V512_RESTORE; | |||
1519 | case 128: | |||
1520 | return AMDGPU::SI_SPILL_V1024_RESTORE; | |||
1521 | default: | |||
1522 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1522); | |||
1523 | } | |||
1524 | } | |||
1525 | ||||
1526 | static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { | |||
1527 | switch (Size) { | |||
1528 | case 4: | |||
1529 | return AMDGPU::SI_SPILL_A32_RESTORE; | |||
1530 | case 8: | |||
1531 | return AMDGPU::SI_SPILL_A64_RESTORE; | |||
1532 | case 12: | |||
1533 | return AMDGPU::SI_SPILL_A96_RESTORE; | |||
1534 | case 16: | |||
1535 | return AMDGPU::SI_SPILL_A128_RESTORE; | |||
1536 | case 20: | |||
1537 | return AMDGPU::SI_SPILL_A160_RESTORE; | |||
1538 | case 24: | |||
1539 | return AMDGPU::SI_SPILL_A192_RESTORE; | |||
1540 | case 28: | |||
1541 | return AMDGPU::SI_SPILL_A224_RESTORE; | |||
1542 | case 32: | |||
1543 | return AMDGPU::SI_SPILL_A256_RESTORE; | |||
1544 | case 64: | |||
1545 | return AMDGPU::SI_SPILL_A512_RESTORE; | |||
1546 | case 128: | |||
1547 | return AMDGPU::SI_SPILL_A1024_RESTORE; | |||
1548 | default: | |||
1549 | llvm_unreachable("unknown register size")::llvm::llvm_unreachable_internal("unknown register size", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1549); | |||
1550 | } | |||
1551 | } | |||
1552 | ||||
1553 | void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, | |||
1554 | MachineBasicBlock::iterator MI, | |||
1555 | Register DestReg, int FrameIndex, | |||
1556 | const TargetRegisterClass *RC, | |||
1557 | const TargetRegisterInfo *TRI) const { | |||
1558 | MachineFunction *MF = MBB.getParent(); | |||
1559 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | |||
1560 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | |||
1561 | const DebugLoc &DL = MBB.findDebugLoc(MI); | |||
1562 | unsigned SpillSize = TRI->getSpillSize(*RC); | |||
1563 | ||||
1564 | MachinePointerInfo PtrInfo | |||
1565 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); | |||
1566 | ||||
1567 | MachineMemOperand *MMO = MF->getMachineMemOperand( | |||
1568 | PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), | |||
1569 | FrameInfo.getObjectAlign(FrameIndex)); | |||
1570 | ||||
1571 | if (RI.isSGPRClass(RC)) { | |||
1572 | MFI->setHasSpilledSGPRs(); | |||
1573 | assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into")(static_cast <bool> (DestReg != AMDGPU::M0 && "m0 should not be reloaded into" ) ? void (0) : __assert_fail ("DestReg != AMDGPU::M0 && \"m0 should not be reloaded into\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1573, __extension__ __PRETTY_FUNCTION__)); | |||
1574 | assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&(static_cast <bool> (DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && DestReg != AMDGPU::EXEC && "exec should not be spilled") ? void (0) : __assert_fail ("DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && DestReg != AMDGPU::EXEC && \"exec should not be spilled\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1575, __extension__ __PRETTY_FUNCTION__)) | |||
1575 | DestReg != AMDGPU::EXEC && "exec should not be spilled")(static_cast <bool> (DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && DestReg != AMDGPU::EXEC && "exec should not be spilled") ? void (0) : __assert_fail ("DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && DestReg != AMDGPU::EXEC && \"exec should not be spilled\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1575, __extension__ __PRETTY_FUNCTION__)); | |||
1576 | ||||
1577 | // FIXME: Maybe this should not include a memoperand because it will be | |||
1578 | // lowered to non-memory instructions. | |||
1579 | const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); | |||
1580 | if (DestReg.isVirtual() && SpillSize == 4) { | |||
1581 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
1582 | MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
1583 | } | |||
1584 | ||||
1585 | if (RI.spillSGPRToVGPR()) | |||
1586 | FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); | |||
1587 | BuildMI(MBB, MI, DL, OpDesc, DestReg) | |||
1588 | .addFrameIndex(FrameIndex) // addr | |||
1589 | .addMemOperand(MMO) | |||
1590 | .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); | |||
1591 | ||||
1592 | return; | |||
1593 | } | |||
1594 | ||||
1595 | unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) | |||
1596 | : getVGPRSpillRestoreOpcode(SpillSize); | |||
1597 | BuildMI(MBB, MI, DL, get(Opcode), DestReg) | |||
1598 | .addFrameIndex(FrameIndex) // vaddr | |||
1599 | .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset | |||
1600 | .addImm(0) // offset | |||
1601 | .addMemOperand(MMO); | |||
1602 | } | |||
1603 | ||||
1604 | void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, | |||
1605 | MachineBasicBlock::iterator MI) const { | |||
1606 | insertNoops(MBB, MI, 1); | |||
1607 | } | |||
1608 | ||||
1609 | void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, | |||
1610 | MachineBasicBlock::iterator MI, | |||
1611 | unsigned Quantity) const { | |||
1612 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1613 | while (Quantity > 0) { | |||
1614 | unsigned Arg = std::min(Quantity, 8u); | |||
1615 | Quantity -= Arg; | |||
1616 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); | |||
1617 | } | |||
1618 | } | |||
1619 | ||||
1620 | void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { | |||
1621 | auto MF = MBB.getParent(); | |||
1622 | SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); | |||
1623 | ||||
1624 | assert(Info->isEntryFunction())(static_cast <bool> (Info->isEntryFunction()) ? void (0) : __assert_fail ("Info->isEntryFunction()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1624, __extension__ __PRETTY_FUNCTION__)); | |||
1625 | ||||
1626 | if (MBB.succ_empty()) { | |||
1627 | bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); | |||
1628 | if (HasNoTerminator) { | |||
1629 | if (Info->returnsVoid()) { | |||
1630 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); | |||
1631 | } else { | |||
1632 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); | |||
1633 | } | |||
1634 | } | |||
1635 | } | |||
1636 | } | |||
1637 | ||||
1638 | unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { | |||
1639 | switch (MI.getOpcode()) { | |||
1640 | default: | |||
1641 | if (MI.isMetaInstruction()) | |||
1642 | return 0; | |||
1643 | return 1; // FIXME: Do wait states equal cycles? | |||
1644 | ||||
1645 | case AMDGPU::S_NOP: | |||
1646 | return MI.getOperand(0).getImm() + 1; | |||
1647 | ||||
1648 | // FIXME: Any other pseudo instruction? | |||
1649 | // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The | |||
1650 | // hazard, even if one exist, won't really be visible. Should we handle it? | |||
1651 | case AMDGPU::SI_MASKED_UNREACHABLE: | |||
1652 | case AMDGPU::WAVE_BARRIER: | |||
1653 | return 0; | |||
1654 | } | |||
1655 | } | |||
1656 | ||||
1657 | bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { | |||
1658 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
1659 | MachineBasicBlock &MBB = *MI.getParent(); | |||
1660 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
1661 | switch (MI.getOpcode()) { | |||
| ||||
1662 | default: return TargetInstrInfo::expandPostRAPseudo(MI); | |||
1663 | case AMDGPU::S_MOV_B64_term: | |||
1664 | // This is only a terminator to get the correct spill code placement during | |||
1665 | // register allocation. | |||
1666 | MI.setDesc(get(AMDGPU::S_MOV_B64)); | |||
1667 | break; | |||
1668 | ||||
1669 | case AMDGPU::S_MOV_B32_term: | |||
1670 | // This is only a terminator to get the correct spill code placement during | |||
1671 | // register allocation. | |||
1672 | MI.setDesc(get(AMDGPU::S_MOV_B32)); | |||
1673 | break; | |||
1674 | ||||
1675 | case AMDGPU::S_XOR_B64_term: | |||
1676 | // This is only a terminator to get the correct spill code placement during | |||
1677 | // register allocation. | |||
1678 | MI.setDesc(get(AMDGPU::S_XOR_B64)); | |||
1679 | break; | |||
1680 | ||||
1681 | case AMDGPU::S_XOR_B32_term: | |||
1682 | // This is only a terminator to get the correct spill code placement during | |||
1683 | // register allocation. | |||
1684 | MI.setDesc(get(AMDGPU::S_XOR_B32)); | |||
1685 | break; | |||
1686 | case AMDGPU::S_OR_B64_term: | |||
1687 | // This is only a terminator to get the correct spill code placement during | |||
1688 | // register allocation. | |||
1689 | MI.setDesc(get(AMDGPU::S_OR_B64)); | |||
1690 | break; | |||
1691 | case AMDGPU::S_OR_B32_term: | |||
1692 | // This is only a terminator to get the correct spill code placement during | |||
1693 | // register allocation. | |||
1694 | MI.setDesc(get(AMDGPU::S_OR_B32)); | |||
1695 | break; | |||
1696 | ||||
1697 | case AMDGPU::S_ANDN2_B64_term: | |||
1698 | // This is only a terminator to get the correct spill code placement during | |||
1699 | // register allocation. | |||
1700 | MI.setDesc(get(AMDGPU::S_ANDN2_B64)); | |||
1701 | break; | |||
1702 | ||||
1703 | case AMDGPU::S_ANDN2_B32_term: | |||
1704 | // This is only a terminator to get the correct spill code placement during | |||
1705 | // register allocation. | |||
1706 | MI.setDesc(get(AMDGPU::S_ANDN2_B32)); | |||
1707 | break; | |||
1708 | ||||
1709 | case AMDGPU::S_AND_B64_term: | |||
1710 | // This is only a terminator to get the correct spill code placement during | |||
1711 | // register allocation. | |||
1712 | MI.setDesc(get(AMDGPU::S_AND_B64)); | |||
1713 | break; | |||
1714 | ||||
1715 | case AMDGPU::S_AND_B32_term: | |||
1716 | // This is only a terminator to get the correct spill code placement during | |||
1717 | // register allocation. | |||
1718 | MI.setDesc(get(AMDGPU::S_AND_B32)); | |||
1719 | break; | |||
1720 | ||||
1721 | case AMDGPU::V_MOV_B64_PSEUDO: { | |||
1722 | Register Dst = MI.getOperand(0).getReg(); | |||
1723 | Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); | |||
1724 | Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); | |||
1725 | ||||
1726 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
1727 | // FIXME: Will this work for 64-bit floating point immediates? | |||
1728 | assert(!SrcOp.isFPImm())(static_cast <bool> (!SrcOp.isFPImm()) ? void (0) : __assert_fail ("!SrcOp.isFPImm()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1728, __extension__ __PRETTY_FUNCTION__)); | |||
1729 | if (SrcOp.isImm()) { | |||
1730 | APInt Imm(64, SrcOp.getImm()); | |||
1731 | APInt Lo(32, Imm.getLoBits(32).getZExtValue()); | |||
1732 | APInt Hi(32, Imm.getHiBits(32).getZExtValue()); | |||
1733 | if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { | |||
1734 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) | |||
1735 | .addImm(SISrcMods::OP_SEL_1) | |||
1736 | .addImm(Lo.getSExtValue()) | |||
1737 | .addImm(SISrcMods::OP_SEL_1) | |||
1738 | .addImm(Lo.getSExtValue()) | |||
1739 | .addImm(0) // op_sel_lo | |||
1740 | .addImm(0) // op_sel_hi | |||
1741 | .addImm(0) // neg_lo | |||
1742 | .addImm(0) // neg_hi | |||
1743 | .addImm(0); // clamp | |||
1744 | } else { | |||
1745 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1746 | .addImm(Lo.getSExtValue()) | |||
1747 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1748 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1749 | .addImm(Hi.getSExtValue()) | |||
1750 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1751 | } | |||
1752 | } else { | |||
1753 | assert(SrcOp.isReg())(static_cast <bool> (SrcOp.isReg()) ? void (0) : __assert_fail ("SrcOp.isReg()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1753, __extension__ __PRETTY_FUNCTION__)); | |||
1754 | if (ST.hasPackedFP32Ops() && | |||
1755 | !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { | |||
1756 | BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) | |||
1757 | .addImm(SISrcMods::OP_SEL_1) // src0_mod | |||
1758 | .addReg(SrcOp.getReg()) | |||
1759 | .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod | |||
1760 | .addReg(SrcOp.getReg()) | |||
1761 | .addImm(0) // op_sel_lo | |||
1762 | .addImm(0) // op_sel_hi | |||
1763 | .addImm(0) // neg_lo | |||
1764 | .addImm(0) // neg_hi | |||
1765 | .addImm(0); // clamp | |||
1766 | } else { | |||
1767 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) | |||
1768 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) | |||
1769 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1770 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) | |||
1771 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) | |||
1772 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1773 | } | |||
1774 | } | |||
1775 | MI.eraseFromParent(); | |||
1776 | break; | |||
1777 | } | |||
1778 | case AMDGPU::V_MOV_B64_DPP_PSEUDO: { | |||
1779 | expandMovDPP64(MI); | |||
1780 | break; | |||
1781 | } | |||
1782 | case AMDGPU::S_MOV_B64_IMM_PSEUDO: { | |||
1783 | const MachineOperand &SrcOp = MI.getOperand(1); | |||
1784 | assert(!SrcOp.isFPImm())(static_cast <bool> (!SrcOp.isFPImm()) ? void (0) : __assert_fail ("!SrcOp.isFPImm()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1784, __extension__ __PRETTY_FUNCTION__)); | |||
1785 | APInt Imm(64, SrcOp.getImm()); | |||
1786 | if (Imm.isIntN(32) || isInlineConstant(Imm)) { | |||
1787 | MI.setDesc(get(AMDGPU::S_MOV_B64)); | |||
1788 | break; | |||
1789 | } | |||
1790 | ||||
1791 | Register Dst = MI.getOperand(0).getReg(); | |||
1792 | Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); | |||
1793 | Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); | |||
1794 | ||||
1795 | APInt Lo(32, Imm.getLoBits(32).getZExtValue()); | |||
1796 | APInt Hi(32, Imm.getHiBits(32).getZExtValue()); | |||
1797 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) | |||
1798 | .addImm(Lo.getSExtValue()) | |||
1799 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1800 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) | |||
1801 | .addImm(Hi.getSExtValue()) | |||
1802 | .addReg(Dst, RegState::Implicit | RegState::Define); | |||
1803 | MI.eraseFromParent(); | |||
1804 | break; | |||
1805 | } | |||
1806 | case AMDGPU::V_SET_INACTIVE_B32: { | |||
1807 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
1808 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1809 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
1810 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
1811 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) | |||
1812 | .add(MI.getOperand(2)); | |||
1813 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
1814 | .addReg(Exec); | |||
1815 | MI.eraseFromParent(); | |||
1816 | break; | |||
1817 | } | |||
1818 | case AMDGPU::V_SET_INACTIVE_B64: { | |||
1819 | unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; | |||
1820 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1821 | auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); | |||
1822 | FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten | |||
1823 | MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), | |||
1824 | MI.getOperand(0).getReg()) | |||
1825 | .add(MI.getOperand(2)); | |||
1826 | expandPostRAPseudo(*Copy); | |||
1827 | BuildMI(MBB, MI, DL, get(NotOpc), Exec) | |||
1828 | .addReg(Exec); | |||
1829 | MI.eraseFromParent(); | |||
1830 | break; | |||
1831 | } | |||
1832 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
1833 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
1834 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
1835 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
1836 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
1837 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
1838 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
1839 | case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
1840 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: | |||
1841 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: | |||
1842 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: | |||
1843 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: | |||
1844 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: | |||
1845 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: | |||
1846 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: | |||
1847 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: | |||
1848 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: | |||
1849 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: | |||
1850 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: | |||
1851 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: | |||
1852 | case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { | |||
1853 | const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); | |||
1854 | ||||
1855 | unsigned Opc; | |||
1856 | if (RI.hasVGPRs(EltRC)) { | |||
1857 | Opc = AMDGPU::V_MOVRELD_B32_e32; | |||
1858 | } else { | |||
1859 | Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 | |||
1860 | : AMDGPU::S_MOVRELD_B32; | |||
1861 | } | |||
1862 | ||||
1863 | const MCInstrDesc &OpDesc = get(Opc); | |||
1864 | Register VecReg = MI.getOperand(0).getReg(); | |||
1865 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1866 | unsigned SubReg = MI.getOperand(3).getImm(); | |||
1867 | assert(VecReg == MI.getOperand(1).getReg())(static_cast <bool> (VecReg == MI.getOperand(1).getReg( )) ? void (0) : __assert_fail ("VecReg == MI.getOperand(1).getReg()" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1867, __extension__ __PRETTY_FUNCTION__)); | |||
1868 | ||||
1869 | MachineInstrBuilder MIB = | |||
1870 | BuildMI(MBB, MI, DL, OpDesc) | |||
1871 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1872 | .add(MI.getOperand(2)) | |||
1873 | .addReg(VecReg, RegState::ImplicitDefine) | |||
1874 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
1875 | ||||
1876 | const int ImpDefIdx = | |||
1877 | OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); | |||
1878 | const int ImpUseIdx = ImpDefIdx + 1; | |||
1879 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
1880 | MI.eraseFromParent(); | |||
1881 | break; | |||
1882 | } | |||
1883 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: | |||
1884 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: | |||
1885 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: | |||
1886 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: | |||
1887 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: | |||
1888 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: | |||
1889 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: | |||
1890 | case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { | |||
1891 | assert(ST.useVGPRIndexMode())(static_cast <bool> (ST.useVGPRIndexMode()) ? void (0) : __assert_fail ("ST.useVGPRIndexMode()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1891, __extension__ __PRETTY_FUNCTION__)); | |||
1892 | Register VecReg = MI.getOperand(0).getReg(); | |||
1893 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1894 | Register Idx = MI.getOperand(3).getReg(); | |||
1895 | Register SubReg = MI.getOperand(4).getImm(); | |||
1896 | ||||
1897 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
1898 | .addReg(Idx) | |||
1899 | .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); | |||
1900 | SetOn->getOperand(3).setIsUndef(); | |||
1901 | ||||
1902 | const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect); | |||
1903 | MachineInstrBuilder MIB = | |||
1904 | BuildMI(MBB, MI, DL, OpDesc) | |||
1905 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1906 | .add(MI.getOperand(2)) | |||
1907 | .addReg(VecReg, RegState::ImplicitDefine) | |||
1908 | .addReg(VecReg, | |||
1909 | RegState::Implicit | (IsUndef ? RegState::Undef : 0)); | |||
1910 | ||||
1911 | const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); | |||
1912 | const int ImpUseIdx = ImpDefIdx + 1; | |||
1913 | MIB->tieOperands(ImpDefIdx, ImpUseIdx); | |||
1914 | ||||
1915 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
1916 | ||||
1917 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
1918 | ||||
1919 | MI.eraseFromParent(); | |||
1920 | break; | |||
1921 | } | |||
1922 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: | |||
1923 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: | |||
1924 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: | |||
1925 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: | |||
1926 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: | |||
1927 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: | |||
1928 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: | |||
1929 | case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { | |||
1930 | assert(ST.useVGPRIndexMode())(static_cast <bool> (ST.useVGPRIndexMode()) ? void (0) : __assert_fail ("ST.useVGPRIndexMode()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 1930, __extension__ __PRETTY_FUNCTION__)); | |||
1931 | Register Dst = MI.getOperand(0).getReg(); | |||
1932 | Register VecReg = MI.getOperand(1).getReg(); | |||
1933 | bool IsUndef = MI.getOperand(1).isUndef(); | |||
1934 | Register Idx = MI.getOperand(2).getReg(); | |||
1935 | Register SubReg = MI.getOperand(3).getImm(); | |||
1936 | ||||
1937 | MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) | |||
1938 | .addReg(Idx) | |||
1939 | .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); | |||
1940 | SetOn->getOperand(3).setIsUndef(); | |||
1941 | ||||
1942 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32)) | |||
1943 | .addDef(Dst) | |||
1944 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) | |||
1945 | .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)) | |||
1946 | .addReg(AMDGPU::M0, RegState::Implicit); | |||
1947 | ||||
1948 | MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); | |||
1949 | ||||
1950 | finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); | |||
1951 | ||||
1952 | MI.eraseFromParent(); | |||
1953 | break; | |||
1954 | } | |||
1955 | case AMDGPU::SI_PC_ADD_REL_OFFSET: { | |||
1956 | MachineFunction &MF = *MBB.getParent(); | |||
1957 | Register Reg = MI.getOperand(0).getReg(); | |||
1958 | Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); | |||
1959 | Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); | |||
1960 | ||||
1961 | // Create a bundle so these instructions won't be re-ordered by the | |||
1962 | // post-RA scheduler. | |||
1963 | MIBundleBuilder Bundler(MBB, MI); | |||
1964 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); | |||
1965 | ||||
1966 | // Add 32-bit offset from this instruction to the start of the | |||
1967 | // constant data. | |||
1968 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) | |||
1969 | .addReg(RegLo) | |||
1970 | .add(MI.getOperand(1))); | |||
1971 | ||||
1972 | MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) | |||
1973 | .addReg(RegHi); | |||
1974 | MIB.add(MI.getOperand(2)); | |||
1975 | ||||
1976 | Bundler.append(MIB); | |||
1977 | finalizeBundle(MBB, Bundler.begin()); | |||
1978 | ||||
1979 | MI.eraseFromParent(); | |||
1980 | break; | |||
1981 | } | |||
1982 | case AMDGPU::ENTER_STRICT_WWM: { | |||
1983 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
1984 | // Whole Wave Mode is entered. | |||
1985 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 | |||
1986 | : AMDGPU::S_OR_SAVEEXEC_B64)); | |||
1987 | break; | |||
1988 | } | |||
1989 | case AMDGPU::ENTER_STRICT_WQM: { | |||
1990 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
1991 | // STRICT_WQM is entered. | |||
1992 | const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
1993 | const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; | |||
1994 | const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
1995 | BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); | |||
1996 | BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); | |||
1997 | ||||
1998 | MI.eraseFromParent(); | |||
1999 | break; | |||
2000 | } | |||
2001 | case AMDGPU::EXIT_STRICT_WWM: | |||
2002 | case AMDGPU::EXIT_STRICT_WQM: { | |||
2003 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when | |||
2004 | // WWM/STRICT_WQM is exited. | |||
2005 | MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); | |||
2006 | break; | |||
2007 | } | |||
2008 | } | |||
2009 | return true; | |||
2010 | } | |||
2011 | ||||
// Expands a V_MOV_B64_DPP_PSEUDO into two V_MOV_B32_dpp instructions, one per
// 32-bit half (sub0 first, then sub1), inserted before MI in MI's block.
//
// - Physical destination: each half writes the matching sub-register of Dst.
// - Virtual destination: each half writes a fresh VGPR_32 virtual register and
//   a REG_SEQUENCE recombines the two halves into Dst.
// - Operands 1 and 2 (old and src): an immediate is split into its low/high
//   32 bits; a register is read through the sub0/sub1 sub-register.
// - Explicit operands from index 3 onwards are copied as immediates onto both
//   halves.
//
// MI is erased. Returns the two new instructions as (sub0 half, sub1 half).
2012 | std::pair<MachineInstr*, MachineInstr*> | |||
2013 | SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { | |||
2014 | assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO)(static_cast <bool> (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ) ? void (0) : __assert_fail ("MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2014, __extension__ __PRETTY_FUNCTION__)); | |||
2015 | ||||
2016 | MachineBasicBlock &MBB = *MI.getParent(); | |||
2017 | DebugLoc DL = MBB.findDebugLoc(MI); | |||
2018 | MachineFunction *MF = MBB.getParent(); | |||
2019 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2020 | Register Dst = MI.getOperand(0).getReg(); | |||
2021 | unsigned Part = 0; | |||
// Both slots are filled by the two-iteration loop below. They are
// null-initialized so the array never holds indeterminate pointers; this
// addresses the static analyzer report "Called C++ object pointer is
// uninitialized" on the Split[0]-> use further down.
2022 | MachineInstr *Split[2] = {nullptr, nullptr}; | |||
2023 | ||||
2024 | for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { | |||
2025 | auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); | |||
2026 | if (Dst.isPhysical()) { | |||
2027 | MovDPP.addDef(RI.getSubReg(Dst, Sub)); | |||
2028 | } else { | |||
2029 | assert(MRI.isSSA())(static_cast <bool> (MRI.isSSA()) ? void (0) : __assert_fail ("MRI.isSSA()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2029, __extension__ __PRETTY_FUNCTION__)); | |||
2030 | auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
2031 | MovDPP.addDef(Tmp); | |||
2032 | } | |||
2033 | ||||
2034 | for (unsigned I = 1; I <= 2; ++I) { // old and src operands. | |||
2035 | const MachineOperand &SrcOp = MI.getOperand(I); | |||
2036 | assert(!SrcOp.isFPImm())(static_cast <bool> (!SrcOp.isFPImm()) ? void (0) : __assert_fail ("!SrcOp.isFPImm()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2036, __extension__ __PRETTY_FUNCTION__)); | |||
2037 | if (SrcOp.isImm()) { | |||
// Shift the 64-bit immediate right by 0 or 32 bits and keep the low 32 bits
// to select the half belonging to this part.
2038 | APInt Imm(64, SrcOp.getImm()); | |||
2039 | Imm.ashrInPlace(Part * 32); | |||
2040 | MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); | |||
2041 | } else { | |||
2042 | assert(SrcOp.isReg())(static_cast <bool> (SrcOp.isReg()) ? void (0) : __assert_fail ("SrcOp.isReg()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2042, __extension__ __PRETTY_FUNCTION__)); | |||
2043 | Register Src = SrcOp.getReg(); | |||
2044 | if (Src.isPhysical()) | |||
2045 | MovDPP.addReg(RI.getSubReg(Src, Sub)); | |||
2046 | else | |||
2047 | MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); | |||
2048 | } | |||
2049 | } | |||
2050 | ||||
// Copy the remaining explicit operands unchanged onto this half.
2051 | for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) | |||
2052 | MovDPP.addImm(MI.getOperand(I).getImm()); | |||
2053 | ||||
2054 | Split[Part] = MovDPP; | |||
2055 | ++Part; | |||
2056 | } | |||
2057 | ||||
// For a virtual destination, stitch the two temporaries back into Dst.
2058 | if (Dst.isVirtual()) | |||
2059 | BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) | |||
2060 | .addReg(Split[0]->getOperand(0).getReg()) | |||
2061 | .addImm(AMDGPU::sub0) | |||
2062 | .addReg(Split[1]->getOperand(0).getReg()) | |||
2063 | .addImm(AMDGPU::sub1); | |||
2064 | ||||
2065 | MI.eraseFromParent(); | |||
2066 | return std::make_pair(Split[0], Split[1]); | |||
2067 | } | |||
2068 | ||||
// Exchanges the immediate values of the two modifier operands of MI that are
// named by Src0OpName and Src1OpName.
//
// Returns false without changing anything when MI has no operand named
// Src0OpName; otherwise swaps the two immediates and returns true. The Src0
// and Src1 operand references are not used by this body.
2069 | bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, | |||
2070 | MachineOperand &Src0, | |||
2071 | unsigned Src0OpName, | |||
2072 | MachineOperand &Src1, | |||
2073 | unsigned Src1OpName) const { | |||
2074 | MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); | |||
2075 | if (!Src0Mods) | |||
2076 | return false; | |||
2077 | ||||
2078 | MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); | |||
2079 | assert(Src1Mods &&(static_cast <bool> (Src1Mods && "All commutable instructions have both src0 and src1 modifiers" ) ? void (0) : __assert_fail ("Src1Mods && \"All commutable instructions have both src0 and src1 modifiers\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2080, __extension__ __PRETTY_FUNCTION__)) | |||
2080 | "All commutable instructions have both src0 and src1 modifiers")(static_cast <bool> (Src1Mods && "All commutable instructions have both src0 and src1 modifiers" ) ? void (0) : __assert_fail ("Src1Mods && \"All commutable instructions have both src0 and src1 modifiers\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2080, __extension__ __PRETTY_FUNCTION__)); | |||
2081 | ||||
// Read both values before writing either so the swap does not lose one.
2082 | int Src0ModsVal = Src0Mods->getImm(); | |||
2083 | int Src1ModsVal = Src1Mods->getImm(); | |||
2084 | ||||
2085 | Src1Mods->setImm(Src0ModsVal); | |||
2086 | Src0Mods->setImm(Src1ModsVal); | |||
2087 | return true; | |||
2088 | } | |||
2089 | ||||
// Swaps a register operand with a non-register operand of the same
// instruction, in place.
//
// RegOp is rewritten to hold NonRegOp's immediate, frame index or global
// address; NonRegOp is rewritten to hold RegOp's former register, sub-register
// index and kill/dead/undef/debug flags. Returns &MI on success, or nullptr
// (leaving both operands untouched) when NonRegOp is any other operand kind.
2090 | static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, | |||
2091 | MachineOperand &RegOp, | |||
2092 | MachineOperand &NonRegOp) { | |||
// Capture the register state first; the ChangeTo* calls below overwrite it.
2093 | Register Reg = RegOp.getReg(); | |||
2094 | unsigned SubReg = RegOp.getSubReg(); | |||
2095 | bool IsKill = RegOp.isKill(); | |||
2096 | bool IsDead = RegOp.isDead(); | |||
2097 | bool IsUndef = RegOp.isUndef(); | |||
2098 | bool IsDebug = RegOp.isDebug(); | |||
2099 | ||||
2100 | if (NonRegOp.isImm()) | |||
2101 | RegOp.ChangeToImmediate(NonRegOp.getImm()); | |||
2102 | else if (NonRegOp.isFI()) | |||
2103 | RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); | |||
2104 | else if (NonRegOp.isGlobal()) { | |||
2105 | RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), | |||
2106 | NonRegOp.getTargetFlags()); | |||
2107 | } else | |||
2108 | return nullptr; | |||
2109 | ||||
2110 | // Make sure we don't reinterpret a subreg index in the target flags. | |||
2111 | RegOp.setTargetFlags(NonRegOp.getTargetFlags()); | |||
2112 | ||||
// The new register operand is a use (isDef = false) and explicit
// (isImp = false); the remaining flags come from the old RegOp.
2113 | NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); | |||
2114 | NonRegOp.setSubReg(SubReg); | |||
2115 | ||||
2116 | return &MI; | |||
2117 | } | |||
2118 | ||||
// Commutes the src0/src1 operands of MI in place and switches MI to its
// commuted opcode.
//
// Src0Idx and Src1Idx must be the operand indices of the operands named src0
// and src1 (asserted). NewMI must be false (asserted).
//
// Returns the commuted instruction, or nullptr when:
//  - commuteOpcode() reports no commuted form (-1),
//  - the operand that would land in the src1 slot is not legal there,
//  - neither operand is a register, or
//  - the non-register operand is of a kind swapRegAndNonRegOperand rejects.
2119 | MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, | |||
2120 | unsigned Src0Idx, | |||
2121 | unsigned Src1Idx) const { | |||
2122 | assert(!NewMI && "this should never be used")(static_cast <bool> (!NewMI && "this should never be used" ) ? void (0) : __assert_fail ("!NewMI && \"this should never be used\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2122, __extension__ __PRETTY_FUNCTION__)); | |||
2123 | ||||
2124 | unsigned Opc = MI.getOpcode(); | |||
2125 | int CommutedOpcode = commuteOpcode(Opc); | |||
2126 | if (CommutedOpcode == -1) | |||
2127 | return nullptr; | |||
2128 | ||||
2129 | assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==(static_cast <bool> (AMDGPU::getNamedOperandIdx(Opc, AMDGPU ::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast <int>(Src1Idx) && "inconsistency with findCommutedOpIndices" ) ? void (0) : __assert_fail ("AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast<int>(Src1Idx) && \"inconsistency with findCommutedOpIndices\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2133, __extension__ __PRETTY_FUNCTION__)) | |||
2130 | static_cast<int>(Src0Idx) &&(static_cast <bool> (AMDGPU::getNamedOperandIdx(Opc, AMDGPU ::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast <int>(Src1Idx) && "inconsistency with findCommutedOpIndices" ) ? void (0) : __assert_fail ("AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast<int>(Src1Idx) && \"inconsistency with findCommutedOpIndices\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2133, __extension__ __PRETTY_FUNCTION__)) | |||
2131 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==(static_cast <bool> (AMDGPU::getNamedOperandIdx(Opc, AMDGPU ::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast <int>(Src1Idx) && "inconsistency with findCommutedOpIndices" ) ? void (0) : __assert_fail ("AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast<int>(Src1Idx) && \"inconsistency with findCommutedOpIndices\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2133, __extension__ __PRETTY_FUNCTION__)) | |||
2132 | static_cast<int>(Src1Idx) &&(static_cast <bool> (AMDGPU::getNamedOperandIdx(Opc, AMDGPU ::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast <int>(Src1Idx) && "inconsistency with findCommutedOpIndices" ) ? void (0) : __assert_fail ("AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast<int>(Src1Idx) && \"inconsistency with findCommutedOpIndices\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2133, __extension__ __PRETTY_FUNCTION__)) | |||
2133 | "inconsistency with findCommutedOpIndices")(static_cast <bool> (AMDGPU::getNamedOperandIdx(Opc, AMDGPU ::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast <int>(Src1Idx) && "inconsistency with findCommutedOpIndices" ) ? void (0) : __assert_fail ("AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast<int>(Src1Idx) && \"inconsistency with findCommutedOpIndices\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2133, __extension__ __PRETTY_FUNCTION__)); | |||
2134 | ||||
2135 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
2136 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
2137 | ||||
// Dispatch on which of the two operands are registers.
2138 | MachineInstr *CommutedMI = nullptr; | |||
2139 | if (Src0.isReg() && Src1.isReg()) { | |||
2140 | if (isOperandLegal(MI, Src1Idx, &Src0)) { | |||
2141 | // Be sure to copy the source modifiers to the right place. | |||
2142 | CommutedMI | |||
2143 | = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); | |||
2144 | } | |||
2145 | ||||
2146 | } else if (Src0.isReg() && !Src1.isReg()) { | |||
2147 | // src0 should always be able to support any operand type, so no need to | |||
2148 | // check operand legality. | |||
2149 | CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); | |||
2150 | } else if (!Src0.isReg() && Src1.isReg()) { | |||
2151 | if (isOperandLegal(MI, Src1Idx, &Src0)) | |||
2152 | CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); | |||
2153 | } else { | |||
2154 | // FIXME: Found two non registers to commute. This does happen. | |||
2155 | return nullptr; | |||
2156 | } | |||
2157 | ||||
// Only after the operands were swapped: swap the matching source modifiers
// and switch the instruction to the commuted opcode.
2158 | if (CommutedMI) { | |||
2159 | swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, | |||
2160 | Src1, AMDGPU::OpName::src1_modifiers); | |||
2161 | ||||
2162 | CommutedMI->setDesc(get(CommutedOpcode)); | |||
2163 | } | |||
2164 | ||||
2165 | return CommutedMI; | |||
2166 | } | |||
2167 | ||||
2168 | // This needs to be implemented because the source modifiers may be inserted | |||
2169 | // between the true commutable operands, and the base | |||
2170 | // TargetInstrInfo::commuteInstruction uses it. | |||
// Thin wrapper: forwards MI's instruction descriptor to the MCInstrDesc
// overload and returns its result.
2171 | bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, | |||
2172 | unsigned &SrcOpIdx0, | |||
2173 | unsigned &SrcOpIdx1) const { | |||
2174 | return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); | |||
2175 | } | |||
2176 | ||||
// Determines the pair of commutable operand indices for the opcode described
// by Desc.
//
// Returns false when Desc is not marked commutable or when the opcode has no
// operand named src0 or no operand named src1. Otherwise returns the result
// of fixCommutedOpIndices() called with the caller's SrcOpIdx0/SrcOpIdx1 and
// the src0/src1 operand indices.
2177 | bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, | |||
2178 | unsigned &SrcOpIdx1) const { | |||
2179 | if (!Desc.isCommutable()) | |||
2180 | return false; | |||
2181 | ||||
2182 | unsigned Opc = Desc.getOpcode(); | |||
2183 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
2184 | if (Src0Idx == -1) | |||
2185 | return false; | |||
2186 | ||||
2187 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
2188 | if (Src1Idx == -1) | |||
2189 | return false; | |||
2190 | ||||
2191 | return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); | |||
2192 | } | |||
2193 | ||||
// Reports whether the byte offset BrOffset can be encoded by branch opcode
// BranchOp.
//
// The offset is converted to dwords, rebased to the instruction following the
// branch, and then tested with isIntN() against BranchOffsetBits. BranchOp is
// only used in the assertion that it is not S_SETPC_B64.
2194 | bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, | |||
2195 | int64_t BrOffset) const { | |||
2196 | // BranchRelaxation should never have to check s_setpc_b64 because its dest | |||
2197 | // block is unanalyzable. | |||
2198 | assert(BranchOp != AMDGPU::S_SETPC_B64)(static_cast <bool> (BranchOp != AMDGPU::S_SETPC_B64) ? void (0) : __assert_fail ("BranchOp != AMDGPU::S_SETPC_B64", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2198, __extension__ __PRETTY_FUNCTION__)); | |||
2199 | ||||
2200 | // Convert to dwords. | |||
2201 | BrOffset /= 4; | |||
2202 | ||||
2203 | // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is | |||
2204 | // from the next instruction. | |||
2205 | BrOffset -= 1; | |||
2206 | ||||
2207 | return isIntN(BranchOffsetBits, BrOffset); | |||
2208 | } | |||
2209 | ||||
// Returns the destination block of branch MI, read from operand 0.
// For S_SETPC_B64 no destination is computed and nullptr is returned.
2210 | MachineBasicBlock *SIInstrInfo::getBranchDestBlock( | |||
2211 | const MachineInstr &MI) const { | |||
2212 | if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { | |||
2213 | // This would be a difficult analysis to perform, but can always be legal so | |||
2214 | // there's no need to analyze it. | |||
2215 | return nullptr; | |||
2216 | } | |||
2217 | ||||
2218 | return MI.getOperand(0).getMBB(); | |||
2219 | } | |||
2220 | ||||
// Fills the empty block MBB with a PC-relative long branch to DestBB:
//
//   s_getpc_b64 pc
//   s_add_u32   pc.sub0, pc.sub0, offset_lo
//   s_addc_u32  pc.sub1, pc.sub1, offset_hi
//   s_setpc_b64 pc
//
// offset_lo/offset_hi are temporary MC symbols defined at the end of this
// function as the low 32 bits and the arithmetic-shift-right-by-32 of
// (DestBB symbol - post_getpc label), where post_getpc is attached after the
// s_getpc_b64.
//
// Preconditions (asserted): RS is non-null, MBB is empty, and MBB has exactly
// one predecessor. The PC pair is first a virtual SReg_64 register and is then
// replaced by a register scavenged backwards from the s_getpc_b64.
//
// Returns the summed getInstSizeInBytes() of all instructions now in MBB.
// BrOffset is not referenced in this body.
2221 | unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, | |||
2222 | MachineBasicBlock &DestBB, | |||
2223 | const DebugLoc &DL, | |||
2224 | int64_t BrOffset, | |||
2225 | RegScavenger *RS) const { | |||
2226 | assert(RS && "RegScavenger required for long branching")(static_cast <bool> (RS && "RegScavenger required for long branching" ) ? void (0) : __assert_fail ("RS && \"RegScavenger required for long branching\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2226, __extension__ __PRETTY_FUNCTION__)); | |||
2227 | assert(MBB.empty() &&(static_cast <bool> (MBB.empty() && "new block should be inserted for expanding unconditional branch" ) ? void (0) : __assert_fail ("MBB.empty() && \"new block should be inserted for expanding unconditional branch\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2228, __extension__ __PRETTY_FUNCTION__)) | |||
2228 | "new block should be inserted for expanding unconditional branch")(static_cast <bool> (MBB.empty() && "new block should be inserted for expanding unconditional branch" ) ? void (0) : __assert_fail ("MBB.empty() && \"new block should be inserted for expanding unconditional branch\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2228, __extension__ __PRETTY_FUNCTION__)); | |||
2229 | assert(MBB.pred_size() == 1)(static_cast <bool> (MBB.pred_size() == 1) ? void (0) : __assert_fail ("MBB.pred_size() == 1", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2229, __extension__ __PRETTY_FUNCTION__)); | |||
2230 | ||||
2231 | MachineFunction *MF = MBB.getParent(); | |||
2232 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
2233 | ||||
2234 | // FIXME: Virtual register workaround for RegScavenger not working with empty | |||
2235 | // blocks. | |||
2236 | Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
2237 | ||||
2238 | auto I = MBB.end(); | |||
2239 | ||||
2240 | // We need to compute the offset relative to the instruction immediately after | |||
2241 | // s_getpc_b64. Insert pc arithmetic code before last terminator. | |||
2242 | MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); | |||
2243 | ||||
// Label the position right after s_getpc_b64; the branch distance is measured
// from this symbol.
2244 | auto &MCCtx = MF->getContext(); | |||
2245 | MCSymbol *PostGetPCLabel = | |||
2246 | MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); | |||
2247 | GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); | |||
2248 | ||||
// The add/addc operands reference symbols whose values are assigned only at
// the end of this function.
2249 | MCSymbol *OffsetLo = | |||
2250 | MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); | |||
2251 | MCSymbol *OffsetHi = | |||
2252 | MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); | |||
2253 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) | |||
2254 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) | |||
2255 | .addReg(PCReg, 0, AMDGPU::sub0) | |||
2256 | .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); | |||
2257 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) | |||
2258 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) | |||
2259 | .addReg(PCReg, 0, AMDGPU::sub1) | |||
2260 | .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); | |||
2261 | ||||
2262 | // Insert the indirect branch after the other terminator. | |||
2263 | BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) | |||
2264 | .addReg(PCReg); | |||
2265 | ||||
// Helper: total encoded size in bytes of every instruction in a block.
2266 | auto ComputeBlockSize = [](const TargetInstrInfo *TII, | |||
2267 | const MachineBasicBlock &MBB) { | |||
2268 | unsigned Size = 0; | |||
2269 | for (const MachineInstr &MI : MBB) | |||
2270 | Size += TII->getInstSizeInBytes(MI); | |||
2271 | return Size; | |||
2272 | }; | |||
2273 | ||||
2274 | // FIXME: If spilling is necessary, this will fail because this scavenger has | |||
2275 | // no emergency stack slots. It is non-trivial to spill in this situation, | |||
2276 | // because the restore code needs to be specially placed after the | |||
2277 | // jump. BranchRelaxation then needs to be made aware of the newly inserted | |||
2278 | // block. | |||
2279 | // | |||
2280 | // If a spill is needed for the pc register pair, we need to insert a spill | |||
2281 | // restore block right before the destination block, and insert a short branch | |||
2282 | // into the old destination block's fallthrough predecessor. | |||
2283 | // e.g.: | |||
2284 | // | |||
2285 | // s_cbranch_scc0 skip_long_branch: | |||
2286 | // | |||
2287 | // long_branch_bb: | |||
2288 | // spill s[8:9] | |||
2289 | // s_getpc_b64 s[8:9] | |||
2290 | // s_add_u32 s8, s8, restore_bb | |||
2291 | // s_addc_u32 s9, s9, 0 | |||
2292 | // s_setpc_b64 s[8:9] | |||
2293 | // | |||
2294 | // skip_long_branch: | |||
2295 | // foo; | |||
2296 | // | |||
2297 | // ..... | |||
2298 | // | |||
2299 | // dest_bb_fallthrough_predecessor: | |||
2300 | // bar; | |||
2301 | // s_branch dest_bb | |||
2302 | // | |||
2303 | // restore_bb: | |||
2304 | // restore s[8:9] | |||
2305 | // fallthrough dest_bb | |||
2306 | // | |||
2307 | // dest_bb: | |||
2308 | // buzz; | |||
2309 | ||||
// Replace the placeholder virtual register with a scavenged SReg_64 and mark
// it as used in the scavenger.
2310 | RS->enterBasicBlockEnd(MBB); | |||
2311 | Register Scav = RS->scavengeRegisterBackwards( | |||
2312 | AMDGPU::SReg_64RegClass, | |||
2313 | MachineBasicBlock::iterator(GetPC), false, 0); | |||
2314 | MRI.replaceRegWith(PCReg, Scav); | |||
2315 | MRI.clearVirtRegs(); | |||
2316 | RS->setRegUsed(Scav); | |||
2317 | ||||
2318 | // Now, the distance could be defined. | |||
2319 | auto *Offset = MCBinaryExpr::createSub( | |||
2320 | MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx), | |||
2321 | MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); | |||
2322 | // Add offset assignments. | |||
2323 | auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); | |||
2324 | OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); | |||
2325 | auto *ShAmt = MCConstantExpr::create(32, MCCtx); | |||
2326 | OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); | |||
2327 | return ComputeBlockSize(this, MBB); | |||
2328 | } | |||
2329 | ||||
// Maps a BranchPredicate to the matching S_CBRANCH_* opcode.
// Any predicate outside the six handled values hits llvm_unreachable.
// The reverse mapping is getBranchPredicate().
2330 | unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { | |||
2331 | switch (Cond) { | |||
2332 | case SIInstrInfo::SCC_TRUE: | |||
2333 | return AMDGPU::S_CBRANCH_SCC1; | |||
2334 | case SIInstrInfo::SCC_FALSE: | |||
2335 | return AMDGPU::S_CBRANCH_SCC0; | |||
2336 | case SIInstrInfo::VCCNZ: | |||
2337 | return AMDGPU::S_CBRANCH_VCCNZ; | |||
2338 | case SIInstrInfo::VCCZ: | |||
2339 | return AMDGPU::S_CBRANCH_VCCZ; | |||
2340 | case SIInstrInfo::EXECNZ: | |||
2341 | return AMDGPU::S_CBRANCH_EXECNZ; | |||
2342 | case SIInstrInfo::EXECZ: | |||
2343 | return AMDGPU::S_CBRANCH_EXECZ; | |||
2344 | default: | |||
2345 | llvm_unreachable("invalid branch predicate")::llvm::llvm_unreachable_internal("invalid branch predicate", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2345); | |||
2346 | } | |||
2347 | } | |||
2348 | ||||
// Maps an S_CBRANCH_* opcode to its BranchPredicate.
// Unlike getBranchOpcode(), an unrecognized opcode is not an error here: it
// yields INVALID_BR so callers can test for "not a conditional branch".
2349 | SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { | |||
2350 | switch (Opcode) { | |||
2351 | case AMDGPU::S_CBRANCH_SCC0: | |||
2352 | return SCC_FALSE; | |||
2353 | case AMDGPU::S_CBRANCH_SCC1: | |||
2354 | return SCC_TRUE; | |||
2355 | case AMDGPU::S_CBRANCH_VCCNZ: | |||
2356 | return VCCNZ; | |||
2357 | case AMDGPU::S_CBRANCH_VCCZ: | |||
2358 | return VCCZ; | |||
2359 | case AMDGPU::S_CBRANCH_EXECNZ: | |||
2360 | return EXECNZ; | |||
2361 | case AMDGPU::S_CBRANCH_EXECZ: | |||
2362 | return EXECZ; | |||
2363 | default: | |||
2364 | return INVALID_BR; | |||
2365 | } | |||
2366 | } | |||
2367 | ||||
// Analyzes the branch sequence starting at terminator I in MBB.
//
// Returns false after filling in the outputs for one of these shapes:
//  - S_BRANCH:                        TBB = target, Cond untouched.
//  - conditional branch, end of MBB:  TBB = conditional target.
//  - conditional branch + S_BRANCH:   TBB = conditional target,
//                                     FBB = S_BRANCH target.
// For SI_NON_UNIFORM_BRCOND_PSEUDO, Cond receives operand 0 and the target is
// operand 1. For other conditional branches, Cond receives the predicate as an
// immediate followed by operand 1, and the target is operand 0.
//
// Returns true when I's opcode maps to INVALID_BR, or when the instruction
// after the conditional branch is anything other than S_BRANCH. Cond may
// already have been appended to in the latter case.
// AllowModify is not referenced in this body.
2368 | bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, | |||
2369 | MachineBasicBlock::iterator I, | |||
2370 | MachineBasicBlock *&TBB, | |||
2371 | MachineBasicBlock *&FBB, | |||
2372 | SmallVectorImpl<MachineOperand> &Cond, | |||
2373 | bool AllowModify) const { | |||
2374 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2375 | // Unconditional Branch | |||
2376 | TBB = I->getOperand(0).getMBB(); | |||
2377 | return false; | |||
2378 | } | |||
2379 | ||||
2380 | MachineBasicBlock *CondBB = nullptr; | |||
2381 | ||||
2382 | if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
2383 | CondBB = I->getOperand(1).getMBB(); | |||
2384 | Cond.push_back(I->getOperand(0)); | |||
2385 | } else { | |||
2386 | BranchPredicate Pred = getBranchPredicate(I->getOpcode()); | |||
2387 | if (Pred == INVALID_BR) | |||
2388 | return true; | |||
2389 | ||||
2390 | CondBB = I->getOperand(0).getMBB(); | |||
2391 | Cond.push_back(MachineOperand::CreateImm(Pred)); | |||
2392 | Cond.push_back(I->getOperand(1)); // Save the branch register. | |||
2393 | } | |||
// Step past the conditional branch to see what follows it.
2394 | ++I; | |||
2395 | ||||
2396 | if (I == MBB.end()) { | |||
2397 | // Conditional branch followed by fall-through. | |||
2398 | TBB = CondBB; | |||
2399 | return false; | |||
2400 | } | |||
2401 | ||||
2402 | if (I->getOpcode() == AMDGPU::S_BRANCH) { | |||
2403 | TBB = CondBB; | |||
2404 | FBB = I->getOperand(0).getMBB(); | |||
2405 | return false; | |||
2406 | } | |||
2407 | ||||
2408 | return true; | |||
2409 | } | |||
2410 | ||||
// Analyzes the terminators of MBB.
//
// Starting at the first terminator, skips the *_term exec-management
// instructions listed in the switch below until a branch or return is found,
// then delegates to analyzeBranchImpl() from that instruction.
//
// Returns false (outputs untouched) when MBB has no terminators or only
// skipped ones. Returns true when SI_IF, SI_ELSE, SI_KILL_I1_TERMINATOR or
// SI_KILL_F32_COND_IMM_TERMINATOR is encountered. Any other non-branch,
// non-return terminator hits llvm_unreachable.
2411 | bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, | |||
2412 | MachineBasicBlock *&FBB, | |||
2413 | SmallVectorImpl<MachineOperand> &Cond, | |||
2414 | bool AllowModify) const { | |||
2415 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); | |||
2416 | auto E = MBB.end(); | |||
2417 | if (I == E) | |||
2418 | return false; | |||
2419 | ||||
2420 | // Skip over the instructions that are artificially terminators for special | |||
2421 | // exec management. | |||
2422 | while (I != E && !I->isBranch() && !I->isReturn()) { | |||
2423 | switch (I->getOpcode()) { | |||
2424 | case AMDGPU::S_MOV_B64_term: | |||
2425 | case AMDGPU::S_XOR_B64_term: | |||
2426 | case AMDGPU::S_OR_B64_term: | |||
2427 | case AMDGPU::S_ANDN2_B64_term: | |||
2428 | case AMDGPU::S_AND_B64_term: | |||
2429 | case AMDGPU::S_MOV_B32_term: | |||
2430 | case AMDGPU::S_XOR_B32_term: | |||
2431 | case AMDGPU::S_OR_B32_term: | |||
2432 | case AMDGPU::S_ANDN2_B32_term: | |||
2433 | case AMDGPU::S_AND_B32_term: | |||
2434 | break; | |||
2435 | case AMDGPU::SI_IF: | |||
2436 | case AMDGPU::SI_ELSE: | |||
2437 | case AMDGPU::SI_KILL_I1_TERMINATOR: | |||
2438 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: | |||
2439 | // FIXME: It's messy that these need to be considered here at all. | |||
2440 | return true; | |||
2441 | default: | |||
2442 | llvm_unreachable("unexpected non-branch terminator inst")::llvm::llvm_unreachable_internal("unexpected non-branch terminator inst" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2442); | |||
2443 | } | |||
2444 | ||||
2445 | ++I; | |||
2446 | } | |||
2447 | ||||
// Every terminator was one of the skipped *_term instructions.
2448 | if (I == E) | |||
2449 | return false; | |||
2450 | ||||
2451 | return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); | |||
2452 | } | |||
2453 | ||||
2454 | unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, | |||
2455 | int *BytesRemoved) const { | |||
2456 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); | |||
2457 | ||||
2458 | unsigned Count = 0; | |||
2459 | unsigned RemovedSize = 0; | |||
2460 | while (I != MBB.end()) { | |||
2461 | MachineBasicBlock::iterator Next = std::next(I); | |||
2462 | RemovedSize += getInstSizeInBytes(*I); | |||
2463 | I->eraseFromParent(); | |||
2464 | ++Count; | |||
2465 | I = Next; | |||
2466 | } | |||
2467 | ||||
2468 | if (BytesRemoved) | |||
2469 | *BytesRemoved = RemovedSize; | |||
2470 | ||||
2471 | return Count; | |||
2472 | } | |||
2473 | ||||
2474 | // Copy the flags onto the implicit condition register operand. | |||
2475 | static void preserveCondRegFlags(MachineOperand &CondReg, | |||
2476 | const MachineOperand &OrigCond) { | |||
2477 | CondReg.setIsUndef(OrigCond.isUndef()); | |||
2478 | CondReg.setIsKill(OrigCond.isKill()); | |||
2479 | } | |||
2480 | ||||
2481 | unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, | |||
2482 | MachineBasicBlock *TBB, | |||
2483 | MachineBasicBlock *FBB, | |||
2484 | ArrayRef<MachineOperand> Cond, | |||
2485 | const DebugLoc &DL, | |||
2486 | int *BytesAdded) const { | |||
2487 | if (!FBB && Cond.empty()) { | |||
2488 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2489 | .addMBB(TBB); | |||
2490 | if (BytesAdded) | |||
2491 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2492 | return 1; | |||
2493 | } | |||
2494 | ||||
2495 | if(Cond.size() == 1 && Cond[0].isReg()) { | |||
2496 | BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) | |||
2497 | .add(Cond[0]) | |||
2498 | .addMBB(TBB); | |||
2499 | return 1; | |||
2500 | } | |||
2501 | ||||
2502 | assert(TBB && Cond[0].isImm())(static_cast <bool> (TBB && Cond[0].isImm()) ? void (0) : __assert_fail ("TBB && Cond[0].isImm()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2502, __extension__ __PRETTY_FUNCTION__)); | |||
2503 | ||||
2504 | unsigned Opcode | |||
2505 | = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); | |||
2506 | ||||
2507 | if (!FBB) { | |||
2508 | Cond[1].isUndef(); | |||
2509 | MachineInstr *CondBr = | |||
2510 | BuildMI(&MBB, DL, get(Opcode)) | |||
2511 | .addMBB(TBB); | |||
2512 | ||||
2513 | // Copy the flags onto the implicit condition register operand. | |||
2514 | preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); | |||
2515 | fixImplicitOperands(*CondBr); | |||
2516 | ||||
2517 | if (BytesAdded) | |||
2518 | *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; | |||
2519 | return 1; | |||
2520 | } | |||
2521 | ||||
2522 | assert(TBB && FBB)(static_cast <bool> (TBB && FBB) ? void (0) : __assert_fail ("TBB && FBB", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2522, __extension__ __PRETTY_FUNCTION__)); | |||
2523 | ||||
2524 | MachineInstr *CondBr = | |||
2525 | BuildMI(&MBB, DL, get(Opcode)) | |||
2526 | .addMBB(TBB); | |||
2527 | fixImplicitOperands(*CondBr); | |||
2528 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) | |||
2529 | .addMBB(FBB); | |||
2530 | ||||
2531 | MachineOperand &CondReg = CondBr->getOperand(1); | |||
2532 | CondReg.setIsUndef(Cond[1].isUndef()); | |||
2533 | CondReg.setIsKill(Cond[1].isKill()); | |||
2534 | ||||
2535 | if (BytesAdded) | |||
2536 | *BytesAdded = ST.hasOffset3fBug() ? 16 : 8; | |||
2537 | ||||
2538 | return 2; | |||
2539 | } | |||
2540 | ||||
2541 | bool SIInstrInfo::reverseBranchCondition( | |||
2542 | SmallVectorImpl<MachineOperand> &Cond) const { | |||
2543 | if (Cond.size() != 2) { | |||
2544 | return true; | |||
2545 | } | |||
2546 | ||||
2547 | if (Cond[0].isImm()) { | |||
2548 | Cond[0].setImm(-Cond[0].getImm()); | |||
2549 | return false; | |||
2550 | } | |||
2551 | ||||
2552 | return true; | |||
2553 | } | |||
2554 | ||||
2555 | bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, | |||
2556 | ArrayRef<MachineOperand> Cond, | |||
2557 | Register DstReg, Register TrueReg, | |||
2558 | Register FalseReg, int &CondCycles, | |||
2559 | int &TrueCycles, int &FalseCycles) const { | |||
2560 | switch (Cond[0].getImm()) { | |||
2561 | case VCCNZ: | |||
2562 | case VCCZ: { | |||
2563 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2564 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2565 | if (MRI.getRegClass(FalseReg) != RC) | |||
2566 | return false; | |||
2567 | ||||
2568 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2569 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2570 | ||||
2571 | // Limit to equal cost for branch vs. N v_cndmask_b32s. | |||
2572 | return RI.hasVGPRs(RC) && NumInsts <= 6; | |||
2573 | } | |||
2574 | case SCC_TRUE: | |||
2575 | case SCC_FALSE: { | |||
2576 | // FIXME: We could insert for VGPRs if we could replace the original compare | |||
2577 | // with a vector one. | |||
2578 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2579 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); | |||
2580 | if (MRI.getRegClass(FalseReg) != RC) | |||
2581 | return false; | |||
2582 | ||||
2583 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; | |||
2584 | ||||
2585 | // Multiples of 8 can do s_cselect_b64 | |||
2586 | if (NumInsts % 2 == 0) | |||
2587 | NumInsts /= 2; | |||
2588 | ||||
2589 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? | |||
2590 | return RI.isSGPRClass(RC); | |||
2591 | } | |||
2592 | default: | |||
2593 | return false; | |||
2594 | } | |||
2595 | } | |||
2596 | ||||
2597 | void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, | |||
2598 | MachineBasicBlock::iterator I, const DebugLoc &DL, | |||
2599 | Register DstReg, ArrayRef<MachineOperand> Cond, | |||
2600 | Register TrueReg, Register FalseReg) const { | |||
2601 | BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); | |||
2602 | if (Pred == VCCZ || Pred == SCC_FALSE) { | |||
2603 | Pred = static_cast<BranchPredicate>(-Pred); | |||
2604 | std::swap(TrueReg, FalseReg); | |||
2605 | } | |||
2606 | ||||
2607 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
2608 | const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); | |||
2609 | unsigned DstSize = RI.getRegSizeInBits(*DstRC); | |||
2610 | ||||
2611 | if (DstSize == 32) { | |||
2612 | MachineInstr *Select; | |||
2613 | if (Pred == SCC_TRUE) { | |||
2614 | Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) | |||
2615 | .addReg(TrueReg) | |||
2616 | .addReg(FalseReg); | |||
2617 | } else { | |||
2618 | // Instruction's operands are backwards from what is expected. | |||
2619 | Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) | |||
2620 | .addReg(FalseReg) | |||
2621 | .addReg(TrueReg); | |||
2622 | } | |||
2623 | ||||
2624 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2625 | return; | |||
2626 | } | |||
2627 | ||||
2628 | if (DstSize == 64 && Pred == SCC_TRUE) { | |||
2629 | MachineInstr *Select = | |||
2630 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) | |||
2631 | .addReg(TrueReg) | |||
2632 | .addReg(FalseReg); | |||
2633 | ||||
2634 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2635 | return; | |||
2636 | } | |||
2637 | ||||
2638 | static const int16_t Sub0_15[] = { | |||
2639 | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, | |||
2640 | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, | |||
2641 | AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, | |||
2642 | AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, | |||
2643 | }; | |||
2644 | ||||
2645 | static const int16_t Sub0_15_64[] = { | |||
2646 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, | |||
2647 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, | |||
2648 | AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, | |||
2649 | AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, | |||
2650 | }; | |||
2651 | ||||
2652 | unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; | |||
2653 | const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; | |||
2654 | const int16_t *SubIndices = Sub0_15; | |||
2655 | int NElts = DstSize / 32; | |||
2656 | ||||
2657 | // 64-bit select is only available for SALU. | |||
2658 | // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. | |||
2659 | if (Pred == SCC_TRUE) { | |||
2660 | if (NElts % 2) { | |||
2661 | SelOp = AMDGPU::S_CSELECT_B32; | |||
2662 | EltRC = &AMDGPU::SGPR_32RegClass; | |||
2663 | } else { | |||
2664 | SelOp = AMDGPU::S_CSELECT_B64; | |||
2665 | EltRC = &AMDGPU::SGPR_64RegClass; | |||
2666 | SubIndices = Sub0_15_64; | |||
2667 | NElts /= 2; | |||
2668 | } | |||
2669 | } | |||
2670 | ||||
2671 | MachineInstrBuilder MIB = BuildMI( | |||
2672 | MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); | |||
2673 | ||||
2674 | I = MIB->getIterator(); | |||
2675 | ||||
2676 | SmallVector<Register, 8> Regs; | |||
2677 | for (int Idx = 0; Idx != NElts; ++Idx) { | |||
2678 | Register DstElt = MRI.createVirtualRegister(EltRC); | |||
2679 | Regs.push_back(DstElt); | |||
2680 | ||||
2681 | unsigned SubIdx = SubIndices[Idx]; | |||
2682 | ||||
2683 | MachineInstr *Select; | |||
2684 | if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { | |||
2685 | Select = | |||
2686 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2687 | .addReg(FalseReg, 0, SubIdx) | |||
2688 | .addReg(TrueReg, 0, SubIdx); | |||
2689 | } else { | |||
2690 | Select = | |||
2691 | BuildMI(MBB, I, DL, get(SelOp), DstElt) | |||
2692 | .addReg(TrueReg, 0, SubIdx) | |||
2693 | .addReg(FalseReg, 0, SubIdx); | |||
2694 | } | |||
2695 | ||||
2696 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); | |||
2697 | fixImplicitOperands(*Select); | |||
2698 | ||||
2699 | MIB.addReg(DstElt) | |||
2700 | .addImm(SubIdx); | |||
2701 | } | |||
2702 | } | |||
2703 | ||||
2704 | bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { | |||
2705 | switch (MI.getOpcode()) { | |||
2706 | case AMDGPU::V_MOV_B32_e32: | |||
2707 | case AMDGPU::V_MOV_B32_e64: | |||
2708 | case AMDGPU::V_MOV_B64_PSEUDO: { | |||
2709 | // If there are additional implicit register operands, this may be used for | |||
2710 | // register indexing so the source register operand isn't simply copied. | |||
2711 | unsigned NumOps = MI.getDesc().getNumOperands() + | |||
2712 | MI.getDesc().getNumImplicitUses(); | |||
2713 | ||||
2714 | return MI.getNumOperands() == NumOps; | |||
2715 | } | |||
2716 | case AMDGPU::S_MOV_B32: | |||
2717 | case AMDGPU::S_MOV_B64: | |||
2718 | case AMDGPU::COPY: | |||
2719 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
2720 | case AMDGPU::V_ACCVGPR_READ_B32_e64: | |||
2721 | case AMDGPU::V_ACCVGPR_MOV_B32: | |||
2722 | return true; | |||
2723 | default: | |||
2724 | return false; | |||
2725 | } | |||
2726 | } | |||
2727 | ||||
2728 | unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( | |||
2729 | unsigned Kind) const { | |||
2730 | switch(Kind) { | |||
2731 | case PseudoSourceValue::Stack: | |||
2732 | case PseudoSourceValue::FixedStack: | |||
2733 | return AMDGPUAS::PRIVATE_ADDRESS; | |||
2734 | case PseudoSourceValue::ConstantPool: | |||
2735 | case PseudoSourceValue::GOT: | |||
2736 | case PseudoSourceValue::JumpTable: | |||
2737 | case PseudoSourceValue::GlobalValueCallEntry: | |||
2738 | case PseudoSourceValue::ExternalSymbolCallEntry: | |||
2739 | case PseudoSourceValue::TargetCustom: | |||
2740 | return AMDGPUAS::CONSTANT_ADDRESS; | |||
2741 | } | |||
2742 | return AMDGPUAS::FLAT_ADDRESS; | |||
2743 | } | |||
2744 | ||||
2745 | static void removeModOperands(MachineInstr &MI) { | |||
2746 | unsigned Opc = MI.getOpcode(); | |||
2747 | int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2748 | AMDGPU::OpName::src0_modifiers); | |||
2749 | int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2750 | AMDGPU::OpName::src1_modifiers); | |||
2751 | int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
2752 | AMDGPU::OpName::src2_modifiers); | |||
2753 | ||||
2754 | MI.RemoveOperand(Src2ModIdx); | |||
2755 | MI.RemoveOperand(Src1ModIdx); | |||
2756 | MI.RemoveOperand(Src0ModIdx); | |||
2757 | } | |||
2758 | ||||
// Try to fold the immediate produced by DefMI into UseMI, the user of Reg.
//
// Nothing is folded (returns false) unless Reg has exactly one non-debug use,
// DefMI is V_MOV_B32_e32 / S_MOV_B32 / V_ACCVGPR_WRITE_B32_e64, and DefMI's
// src0 is an immediate. Two kinds of user are rewritten in place:
//   - a COPY becomes a move-immediate (V_MOV_B32_e32, S_MOV_B32 or
//     V_ACCVGPR_WRITE_B32_e64, chosen from the destination register);
//   - a MAD/MAC/FMA/FMAC _e64 (f16 or f32) becomes the madmk/fmamk form when
//     Reg is src0, or the madak/fmaak form when Reg is src2.
// Returns true when UseMI was rewritten.
2759 | bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, | |||
2760 | Register Reg, MachineRegisterInfo *MRI) const { | |||
2761 | if (!MRI->hasOneNonDBGUse(Reg)) | |||
2762 | return false; | |||
2763 | ||||
2764 | switch (DefMI.getOpcode()) { | |||
2765 | default: | |||
2766 | return false; | |||
2767 | case AMDGPU::S_MOV_B64: | |||
2768 | // TODO: We could fold 64-bit immediates, but this get compilicated | |||
2769 | // when there are sub-registers. | |||
2770 | return false; | |||
2771 | ||||
2772 | case AMDGPU::V_MOV_B32_e32: | |||
2773 | case AMDGPU::S_MOV_B32: | |||
2774 | case AMDGPU::V_ACCVGPR_WRITE_B32_e64: | |||
2775 | break; | |||
2776 | } | |||
2777 | ||||
2778 | const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); | |||
2779 | assert(ImmOp)(static_cast <bool> (ImmOp) ? void (0) : __assert_fail ( "ImmOp", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2779, __extension__ __PRETTY_FUNCTION__)); | |||
2780 | // FIXME: We could handle FrameIndex values here. | |||
2781 | if (!ImmOp->isImm()) | |||
2782 | return false; | |||
2783 | ||||
2784 | unsigned Opc = UseMI.getOpcode(); | |||
// Case 1: the user is a COPY. Turn it into a move of the immediate. The
// immediate is truncated to 32 bits, and shifted right by 16 when the copy
// reads the hi16 subregister of its source.
2785 | if (Opc == AMDGPU::COPY) { | |||
2786 | Register DstReg = UseMI.getOperand(0).getReg(); | |||
2787 | bool Is16Bit = getOpSize(UseMI, 0) == 2; | |||
2788 | bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); | |||
2789 | unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; | |||
2790 | APInt Imm(32, ImmOp->getImm()); | |||
2791 | ||||
2792 | if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) | |||
2793 | Imm = Imm.ashr(16); | |||
2794 | ||||
// An AGPR destination is only handled when the value is an inline constant.
2795 | if (RI.isAGPR(*MRI, DstReg)) { | |||
2796 | if (!isInlineConstant(Imm)) | |||
2797 | return false; | |||
2798 | NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; | |||
2799 | } | |||
2800 | ||||
// 16-bit copies: refused for VGPR destinations and for virtual destinations
// that are not the lo16 subregister; otherwise the destination is widened to
// the full 32-bit register before the opcode is changed.
2801 | if (Is16Bit) { | |||
2802 | if (isVGPRCopy) | |||
2803 | return false; // Do not clobber vgpr_hi16 | |||
2804 | ||||
2805 | if (DstReg.isVirtual() && | |||
2806 | UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) | |||
2807 | return false; | |||
2808 | ||||
2809 | UseMI.getOperand(0).setSubReg(0); | |||
2810 | if (DstReg.isPhysical()) { | |||
2811 | DstReg = RI.get32BitRegister(DstReg); | |||
2812 | UseMI.getOperand(0).setReg(DstReg); | |||
2813 | } | |||
2814 | assert(UseMI.getOperand(1).getReg().isVirtual())(static_cast <bool> (UseMI.getOperand(1).getReg().isVirtual ()) ? void (0) : __assert_fail ("UseMI.getOperand(1).getReg().isVirtual()" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 2814, __extension__ __PRETTY_FUNCTION__)); | |||
2815 | } | |||
2816 | ||||
2817 | UseMI.setDesc(get(NewOpc)); | |||
2818 | UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); | |||
2819 | UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); | |||
2820 | return true; | |||
2821 | } | |||
2822 | ||||
// Case 2: the user is a MAD/MAC/FMA/FMAC in its _e64 encoding. It is
// rewritten to the matching *MK (constant multiplied) or *AK (constant added)
// opcode, which carries the literal as an explicit operand.
2823 | if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
2824 | Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || | |||
2825 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2826 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) { | |||
2827 | // Don't fold if we are using source or output modifiers. The new VOP2 | |||
2828 | // instructions don't have them. | |||
2829 | if (hasAnyModifiersSet(UseMI)) | |||
2830 | return false; | |||
2831 | ||||
2832 | // If this is a free constant, there's no reason to do this. | |||
2833 | // TODO: We could fold this here instead of letting SIFoldOperands do it | |||
2834 | // later. | |||
2835 | MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); | |||
2836 | ||||
2837 | // Any src operand can be used for the legality check. | |||
2838 | if (isInlineConstant(UseMI, *Src0, *ImmOp)) | |||
2839 | return false; | |||
2840 | ||||
2841 | bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || | |||
2842 | Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; | |||
2843 | bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2844 | Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64; | |||
2845 | MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); | |||
2846 | MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); | |||
2847 | ||||
2848 | // Multiplied part is the constant: Use v_madmk_{f16, f32}. | |||
2849 | // We should only expect these to be on src0 due to canonicalizations. | |||
2850 | if (Src0->isReg() && Src0->getReg() == Reg) { | |||
// Both remaining sources must be registers that are not in an SGPR class.
// All bail-outs in this branch happen before UseMI is modified.
2851 | if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) | |||
2852 | return false; | |||
2853 | ||||
2854 | if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) | |||
2855 | return false; | |||
2856 | ||||
2857 | unsigned NewOpc = | |||
2858 | IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) | |||
2859 | : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); | |||
2860 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
2861 | return false; | |||
2862 | ||||
2863 | // We need to swap operands 0 and 1 since madmk constant is at operand 1. | |||
2864 | ||||
2865 | const int64_t Imm = ImmOp->getImm(); | |||
2866 | ||||
2867 | // FIXME: This would be a lot easier if we could return a new instruction | |||
2868 | // instead of having to modify in place. | |||
2869 | ||||
2870 | // Remove these first since they are at the end. | |||
2871 | UseMI.RemoveOperand( | |||
2872 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); | |||
2873 | UseMI.RemoveOperand( | |||
2874 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); | |||
2875 | ||||
// Move the old src1 register (with its subregister and kill flag) into src0;
// src1 then receives the immediate below.
2876 | Register Src1Reg = Src1->getReg(); | |||
2877 | unsigned Src1SubReg = Src1->getSubReg(); | |||
2878 | Src0->setReg(Src1Reg); | |||
2879 | Src0->setSubReg(Src1SubReg); | |||
2880 | Src0->setIsKill(Src1->isKill()); | |||
2881 | ||||
// The MAC/FMAC opcodes have src2 untied before the opcode is changed.
2882 | if (Opc == AMDGPU::V_MAC_F32_e64 || | |||
2883 | Opc == AMDGPU::V_MAC_F16_e64 || | |||
2884 | Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2885 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
2886 | UseMI.untieRegOperand( | |||
2887 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
2888 | ||||
2889 | Src1->ChangeToImmediate(Imm); | |||
2890 | ||||
2891 | removeModOperands(UseMI); | |||
2892 | UseMI.setDesc(get(NewOpc)); | |||
2893 | ||||
// NOTE(review): the function only gets here when Reg had exactly one
// non-debug use, and that use has just been rewritten (presumably dropping
// it from Reg's use list). If so, hasOneNonDBGUse(Reg) is false here and
// DefMI is never erased; "no uses left" may have been the intended check.
// TODO confirm against MachineRegisterInfo before changing.
2894 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); | |||
2895 | if (DeleteDef) | |||
2896 | DefMI.eraseFromParent(); | |||
2897 | ||||
2898 | return true; | |||
2899 | } | |||
2900 | ||||
2901 | // Added part is the constant: Use v_madak_{f16, f32}. | |||
2902 | if (Src2->isReg() && Src2->getReg() == Reg) { | |||
2903 | // Not allowed to use constant bus for another operand. | |||
2904 | // We can however allow an inline immediate as src0. | |||
2905 | bool Src0Inlined = false; | |||
2906 | if (Src0->isReg()) { | |||
2907 | // Try to inline constant if possible. | |||
2908 | // If the Def moves immediate and the use is single | |||
2909 | // We are saving VGPR here. | |||
2910 | MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); | |||
2911 | if (Def && Def->isMoveImmediate() && | |||
2912 | isInlineConstant(Def->getOperand(1)) && | |||
2913 | MRI->hasOneUse(Src0->getReg())) { | |||
2914 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
2915 | Src0Inlined = true; | |||
2916 | } else if ((Src0->getReg().isPhysical() && | |||
2917 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
2918 | RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || | |||
2919 | (Src0->getReg().isVirtual() && | |||
2920 | (ST.getConstantBusLimit(Opc) <= 1 && | |||
2921 | RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) | |||
2922 | return false; | |||
2923 | // VGPR is okay as Src0 - fallthrough | |||
2924 | } | |||
2925 | ||||
2926 | if (Src1->isReg() && !Src0Inlined ) { | |||
2927 | // We have one slot for inlinable constant so far - try to fill it | |||
2928 | MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); | |||
2929 | if (Def && Def->isMoveImmediate() && | |||
2930 | isInlineConstant(Def->getOperand(1)) && | |||
2931 | MRI->hasOneUse(Src1->getReg()) && | |||
2932 | commuteInstruction(UseMI)) { | |||
2933 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); | |||
2934 | } else if ((Src1->getReg().isPhysical() && | |||
2935 | RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || | |||
2936 | (Src1->getReg().isVirtual() && | |||
2937 | RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) | |||
2938 | return false; | |||
2939 | // VGPR is okay as Src1 - fallthrough | |||
2940 | } | |||
2941 | ||||
// NOTE(review): by this point src0 may already have been changed to an
// immediate, or UseMI commuted, by the code above. The `return false` below
// can therefore leave UseMI modified even though no fold is reported.
// Confirm callers tolerate that.
2942 | unsigned NewOpc = | |||
2943 | IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) | |||
2944 | : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); | |||
2945 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
2946 | return false; | |||
2947 | ||||
2948 | const int64_t Imm = ImmOp->getImm(); | |||
2949 | ||||
2950 | // FIXME: This would be a lot easier if we could return a new instruction | |||
2951 | // instead of having to modify in place. | |||
2952 | ||||
2953 | // Remove these first since they are at the end. | |||
2954 | UseMI.RemoveOperand( | |||
2955 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); | |||
2956 | UseMI.RemoveOperand( | |||
2957 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); | |||
2958 | ||||
2959 | if (Opc == AMDGPU::V_MAC_F32_e64 || | |||
2960 | Opc == AMDGPU::V_MAC_F16_e64 || | |||
2961 | Opc == AMDGPU::V_FMAC_F32_e64 || | |||
2962 | Opc == AMDGPU::V_FMAC_F16_e64) | |||
2963 | UseMI.untieRegOperand( | |||
2964 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); | |||
2965 | ||||
2966 | // ChangingToImmediate adds Src2 back to the instruction. | |||
2967 | Src2->ChangeToImmediate(Imm); | |||
2968 | ||||
2969 | // These come before src2. | |||
2970 | removeModOperands(UseMI); | |||
2971 | UseMI.setDesc(get(NewOpc)); | |||
2972 | // It might happen that UseMI was commuted | |||
2973 | // and we now have SGPR as SRC1. If so 2 inlined | |||
2974 | // constant and SGPR are illegal. | |||
2975 | legalizeOperands(UseMI); | |||
2976 | ||||
// NOTE(review): same concern as in the madmk branch. Reg's only use was just
// replaced by an immediate, so this check looks like it can never be true.
// TODO confirm.
2977 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); | |||
2978 | if (DeleteDef) | |||
2979 | DefMI.eraseFromParent(); | |||
2980 | ||||
2981 | return true; | |||
2982 | } | |||
2983 | } | |||
2984 | ||||
2985 | return false; | |||
2986 | } | |||
2987 | ||||
2988 | static bool | |||
2989 | memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, | |||
2990 | ArrayRef<const MachineOperand *> BaseOps2) { | |||
2991 | if (BaseOps1.size() != BaseOps2.size()) | |||
2992 | return false; | |||
2993 | for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { | |||
2994 | if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) | |||
2995 | return false; | |||
2996 | } | |||
2997 | return true; | |||
2998 | } | |||
2999 | ||||
// Return true when the ranges [OffsetA, OffsetA + WidthA) and
// [OffsetB, OffsetB + WidthB) do not overlap, i.e. the range that starts
// lower ends at or before the start of the other one. When both offsets are
// equal, range A is treated as the lower one.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  const bool AIsLow = OffsetA <= OffsetB;
  const int LowOffset = AIsLow ? OffsetA : OffsetB;
  const int HighOffset = AIsLow ? OffsetB : OffsetA;
  const int LowWidth = AIsLow ? WidthA : WidthB;

  // Compute the end of the lower range in 64 bits: offset + width can exceed
  // the range of int, and signed overflow is undefined behaviour.
  const int64_t LowEnd = static_cast<int64_t>(LowOffset) + LowWidth;
  return LowEnd <= HighOffset;
}
3007 | ||||
3008 | bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, | |||
3009 | const MachineInstr &MIb) const { | |||
3010 | SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; | |||
3011 | int64_t Offset0, Offset1; | |||
3012 | unsigned Dummy0, Dummy1; | |||
3013 | bool Offset0IsScalable, Offset1IsScalable; | |||
3014 | if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, | |||
3015 | Dummy0, &RI) || | |||
3016 | !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, | |||
3017 | Dummy1, &RI)) | |||
3018 | return false; | |||
3019 | ||||
3020 | if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) | |||
3021 | return false; | |||
3022 | ||||
3023 | if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { | |||
3024 | // FIXME: Handle ds_read2 / ds_write2. | |||
3025 | return false; | |||
3026 | } | |||
3027 | unsigned Width0 = MIa.memoperands().front()->getSize(); | |||
3028 | unsigned Width1 = MIb.memoperands().front()->getSize(); | |||
3029 | return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); | |||
3030 | } | |||
3031 | ||||
3032 | bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, | |||
3033 | const MachineInstr &MIb) const { | |||
3034 | assert(MIa.mayLoadOrStore() &&(static_cast <bool> (MIa.mayLoadOrStore() && "MIa must load from or modify a memory location" ) ? void (0) : __assert_fail ("MIa.mayLoadOrStore() && \"MIa must load from or modify a memory location\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3035, __extension__ __PRETTY_FUNCTION__)) | |||
3035 | "MIa must load from or modify a memory location")(static_cast <bool> (MIa.mayLoadOrStore() && "MIa must load from or modify a memory location" ) ? void (0) : __assert_fail ("MIa.mayLoadOrStore() && \"MIa must load from or modify a memory location\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3035, __extension__ __PRETTY_FUNCTION__)); | |||
3036 | assert(MIb.mayLoadOrStore() &&(static_cast <bool> (MIb.mayLoadOrStore() && "MIb must load from or modify a memory location" ) ? void (0) : __assert_fail ("MIb.mayLoadOrStore() && \"MIb must load from or modify a memory location\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3037, __extension__ __PRETTY_FUNCTION__)) | |||
3037 | "MIb must load from or modify a memory location")(static_cast <bool> (MIb.mayLoadOrStore() && "MIb must load from or modify a memory location" ) ? void (0) : __assert_fail ("MIb.mayLoadOrStore() && \"MIb must load from or modify a memory location\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3037, __extension__ __PRETTY_FUNCTION__)); | |||
3038 | ||||
3039 | if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) | |||
3040 | return false; | |||
3041 | ||||
3042 | // XXX - Can we relax this between address spaces? | |||
3043 | if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) | |||
3044 | return false; | |||
3045 | ||||
3046 | // TODO: Should we check the address space from the MachineMemOperand? That | |||
3047 | // would allow us to distinguish objects we know don't alias based on the | |||
3048 | // underlying address space, even if it was lowered to a different one, | |||
3049 | // e.g. private accesses lowered to use MUBUF instructions on a scratch | |||
3050 | // buffer. | |||
3051 | if (isDS(MIa)) { | |||
3052 | if (isDS(MIb)) | |||
3053 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3054 | ||||
3055 | return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); | |||
3056 | } | |||
3057 | ||||
3058 | if (isMUBUF(MIa) || isMTBUF(MIa)) { | |||
3059 | if (isMUBUF(MIb) || isMTBUF(MIb)) | |||
3060 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3061 | ||||
3062 | return !isFLAT(MIb) && !isSMRD(MIb); | |||
3063 | } | |||
3064 | ||||
3065 | if (isSMRD(MIa)) { | |||
3066 | if (isSMRD(MIb)) | |||
3067 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3068 | ||||
3069 | return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); | |||
3070 | } | |||
3071 | ||||
3072 | if (isFLAT(MIa)) { | |||
3073 | if (isFLAT(MIb)) | |||
3074 | return checkInstOffsetsDoNotOverlap(MIa, MIb); | |||
3075 | ||||
3076 | return false; | |||
3077 | } | |||
3078 | ||||
3079 | return false; | |||
3080 | } | |||
3081 | ||||
3082 | static int64_t getFoldableImm(const MachineOperand* MO) { | |||
3083 | if (!MO->isReg()) | |||
3084 | return false; | |||
3085 | const MachineFunction *MF = MO->getParent()->getParent()->getParent(); | |||
3086 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
3087 | auto Def = MRI.getUniqueVRegDef(MO->getReg()); | |||
3088 | if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && | |||
3089 | Def->getOperand(1).isImm()) | |||
3090 | return Def->getOperand(1).getImm(); | |||
3091 | return AMDGPU::NoRegister; | |||
3092 | } | |||
3093 | ||||
3094 | static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, | |||
3095 | MachineInstr &NewMI) { | |||
3096 | if (LV) { | |||
3097 | unsigned NumOps = MI.getNumOperands(); | |||
3098 | for (unsigned I = 1; I < NumOps; ++I) { | |||
3099 | MachineOperand &Op = MI.getOperand(I); | |||
3100 | if (Op.isReg() && Op.isKill()) | |||
3101 | LV->replaceKillInstruction(Op.getReg(), MI, NewMI); | |||
3102 | } | |||
3103 | } | |||
3104 | } | |||
3105 | ||||
3106 | MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, | |||
3107 | MachineInstr &MI, | |||
3108 | LiveVariables *LV) const { | |||
3109 | unsigned Opc = MI.getOpcode(); | |||
3110 | bool IsF16 = false; | |||
3111 | bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || | |||
3112 | Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || | |||
3113 | Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; | |||
3114 | bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; | |||
3115 | ||||
3116 | switch (Opc) { | |||
3117 | default: | |||
3118 | return nullptr; | |||
3119 | case AMDGPU::V_MAC_F16_e64: | |||
3120 | case AMDGPU::V_FMAC_F16_e64: | |||
3121 | IsF16 = true; | |||
3122 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | |||
3123 | case AMDGPU::V_MAC_F32_e64: | |||
3124 | case AMDGPU::V_FMAC_F32_e64: | |||
3125 | case AMDGPU::V_FMAC_F64_e64: | |||
3126 | break; | |||
3127 | case AMDGPU::V_MAC_F16_e32: | |||
3128 | case AMDGPU::V_FMAC_F16_e32: | |||
3129 | IsF16 = true; | |||
3130 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | |||
3131 | case AMDGPU::V_MAC_F32_e32: | |||
3132 | case AMDGPU::V_FMAC_F32_e32: | |||
3133 | case AMDGPU::V_FMAC_F64_e32: { | |||
3134 | int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3135 | AMDGPU::OpName::src0); | |||
3136 | const MachineOperand *Src0 = &MI.getOperand(Src0Idx); | |||
3137 | if (!Src0->isReg() && !Src0->isImm()) | |||
3138 | return nullptr; | |||
3139 | ||||
3140 | if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) | |||
3141 | return nullptr; | |||
3142 | ||||
3143 | break; | |||
3144 | } | |||
3145 | } | |||
3146 | ||||
3147 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
3148 | const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); | |||
3149 | const MachineOperand *Src0Mods = | |||
3150 | getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); | |||
3151 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3152 | const MachineOperand *Src1Mods = | |||
3153 | getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); | |||
3154 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3155 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
3156 | const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3157 | MachineInstrBuilder MIB; | |||
3158 | ||||
3159 | if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && | |||
3160 | // If we have an SGPR input, we will violate the constant bus restriction. | |||
3161 | (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || | |||
3162 | !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { | |||
3163 | if (auto Imm = getFoldableImm(Src2)) { | |||
3164 | unsigned NewOpc = | |||
3165 | IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) | |||
3166 | : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); | |||
3167 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3168 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3169 | .add(*Dst) | |||
3170 | .add(*Src0) | |||
3171 | .add(*Src1) | |||
3172 | .addImm(Imm); | |||
3173 | updateLiveVariables(LV, MI, *MIB); | |||
3174 | return MIB; | |||
3175 | } | |||
3176 | } | |||
3177 | unsigned NewOpc = IsFMA | |||
3178 | ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) | |||
3179 | : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); | |||
3180 | if (auto Imm = getFoldableImm(Src1)) { | |||
3181 | if (pseudoToMCOpcode(NewOpc) != -1) { | |||
3182 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3183 | .add(*Dst) | |||
3184 | .add(*Src0) | |||
3185 | .addImm(Imm) | |||
3186 | .add(*Src2); | |||
3187 | updateLiveVariables(LV, MI, *MIB); | |||
3188 | return MIB; | |||
3189 | } | |||
3190 | } | |||
3191 | if (auto Imm = getFoldableImm(Src0)) { | |||
3192 | if (pseudoToMCOpcode(NewOpc) != -1 && | |||
3193 | isOperandLegal( | |||
3194 | MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), | |||
3195 | Src1)) { | |||
3196 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3197 | .add(*Dst) | |||
3198 | .add(*Src1) | |||
3199 | .addImm(Imm) | |||
3200 | .add(*Src2); | |||
3201 | updateLiveVariables(LV, MI, *MIB); | |||
3202 | return MIB; | |||
3203 | } | |||
3204 | } | |||
3205 | } | |||
3206 | ||||
3207 | unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 | |||
3208 | : IsF64 ? AMDGPU::V_FMA_F64_e64 | |||
3209 | : AMDGPU::V_FMA_F32_e64) | |||
3210 | : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); | |||
3211 | if (pseudoToMCOpcode(NewOpc) == -1) | |||
3212 | return nullptr; | |||
3213 | ||||
3214 | MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) | |||
3215 | .add(*Dst) | |||
3216 | .addImm(Src0Mods ? Src0Mods->getImm() : 0) | |||
3217 | .add(*Src0) | |||
3218 | .addImm(Src1Mods ? Src1Mods->getImm() : 0) | |||
3219 | .add(*Src1) | |||
3220 | .addImm(0) // Src mods | |||
3221 | .add(*Src2) | |||
3222 | .addImm(Clamp ? Clamp->getImm() : 0) | |||
3223 | .addImm(Omod ? Omod->getImm() : 0); | |||
3224 | updateLiveVariables(LV, MI, *MIB); | |||
3225 | return MIB; | |||
3226 | } | |||
3227 | ||||
3228 | // It's not generally safe to move VALU instructions across these since it will | |||
3229 | // start using the register as a base index rather than directly. | |||
3230 | // XXX - Why isn't hasSideEffects sufficient for these? | |||
3231 | static bool changesVGPRIndexingMode(const MachineInstr &MI) { | |||
3232 | switch (MI.getOpcode()) { | |||
3233 | case AMDGPU::S_SET_GPR_IDX_ON: | |||
3234 | case AMDGPU::S_SET_GPR_IDX_MODE: | |||
3235 | case AMDGPU::S_SET_GPR_IDX_OFF: | |||
3236 | return true; | |||
3237 | default: | |||
3238 | return false; | |||
3239 | } | |||
3240 | } | |||
3241 | ||||
3242 | bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, | |||
3243 | const MachineBasicBlock *MBB, | |||
3244 | const MachineFunction &MF) const { | |||
3245 | // Skipping the check for SP writes in the base implementation. The reason it | |||
3246 | // was added was apparently due to compile time concerns. | |||
3247 | // | |||
3248 | // TODO: Do we really want this barrier? It triggers unnecessary hazard nops | |||
3249 | // but is probably avoidable. | |||
3250 | ||||
3251 | // Copied from base implementation. | |||
3252 | // Terminators and labels can't be scheduled around. | |||
3253 | if (MI.isTerminator() || MI.isPosition()) | |||
3254 | return true; | |||
3255 | ||||
3256 | // INLINEASM_BR can jump to another block | |||
3257 | if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) | |||
3258 | return true; | |||
3259 | ||||
3260 | // Target-independent instructions do not have an implicit-use of EXEC, even | |||
3261 | // when they operate on VGPRs. Treating EXEC modifications as scheduling | |||
3262 | // boundaries prevents incorrect movements of such instructions. | |||
3263 | return MI.modifiesRegister(AMDGPU::EXEC, &RI) || | |||
3264 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || | |||
3265 | MI.getOpcode() == AMDGPU::S_SETREG_B32 || | |||
3266 | changesVGPRIndexingMode(MI); | |||
3267 | } | |||
3268 | ||||
3269 | bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { | |||
3270 | return Opcode == AMDGPU::DS_ORDERED_COUNT || | |||
3271 | Opcode == AMDGPU::DS_GWS_INIT || | |||
3272 | Opcode == AMDGPU::DS_GWS_SEMA_V || | |||
3273 | Opcode == AMDGPU::DS_GWS_SEMA_BR || | |||
3274 | Opcode == AMDGPU::DS_GWS_SEMA_P || | |||
3275 | Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || | |||
3276 | Opcode == AMDGPU::DS_GWS_BARRIER; | |||
3277 | } | |||
3278 | ||||
3279 | bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { | |||
3280 | // Skip the full operand and register alias search modifiesRegister | |||
3281 | // does. There's only a handful of instructions that touch this, it's only an | |||
3282 | // implicit def, and doesn't alias any other registers. | |||
3283 | if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { | |||
3284 | for (; ImpDef && *ImpDef; ++ImpDef) { | |||
3285 | if (*ImpDef == AMDGPU::MODE) | |||
3286 | return true; | |||
3287 | } | |||
3288 | } | |||
3289 | ||||
3290 | return false; | |||
3291 | } | |||
3292 | ||||
3293 | bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { | |||
3294 | unsigned Opcode = MI.getOpcode(); | |||
3295 | ||||
3296 | if (MI.mayStore() && isSMRD(MI)) | |||
3297 | return true; // scalar store or atomic | |||
3298 | ||||
3299 | // This will terminate the function when other lanes may need to continue. | |||
3300 | if (MI.isReturn()) | |||
3301 | return true; | |||
3302 | ||||
3303 | // These instructions cause shader I/O that may cause hardware lockups | |||
3304 | // when executed with an empty EXEC mask. | |||
3305 | // | |||
3306 | // Note: exp with VM = DONE = 0 is automatically skipped by hardware when | |||
3307 | // EXEC = 0, but checking for that case here seems not worth it | |||
3308 | // given the typical code patterns. | |||
3309 | if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || | |||
3310 | isEXP(Opcode) || | |||
3311 | Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || | |||
3312 | Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) | |||
3313 | return true; | |||
3314 | ||||
3315 | if (MI.isCall() || MI.isInlineAsm()) | |||
3316 | return true; // conservative assumption | |||
3317 | ||||
3318 | // A mode change is a scalar operation that influences vector instructions. | |||
3319 | if (modifiesModeRegister(MI)) | |||
3320 | return true; | |||
3321 | ||||
3322 | // These are like SALU instructions in terms of effects, so it's questionable | |||
3323 | // whether we should return true for those. | |||
3324 | // | |||
3325 | // However, executing them with EXEC = 0 causes them to operate on undefined | |||
3326 | // data, which we avoid by returning true here. | |||
3327 | if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || | |||
3328 | Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) | |||
3329 | return true; | |||
3330 | ||||
3331 | return false; | |||
3332 | } | |||
3333 | ||||
3334 | bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, | |||
3335 | const MachineInstr &MI) const { | |||
3336 | if (MI.isMetaInstruction()) | |||
3337 | return false; | |||
3338 | ||||
3339 | // This won't read exec if this is an SGPR->SGPR copy. | |||
3340 | if (MI.isCopyLike()) { | |||
3341 | if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) | |||
3342 | return true; | |||
3343 | ||||
3344 | // Make sure this isn't copying exec as a normal operand | |||
3345 | return MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3346 | } | |||
3347 | ||||
3348 | // Make a conservative assumption about the callee. | |||
3349 | if (MI.isCall()) | |||
3350 | return true; | |||
3351 | ||||
3352 | // Be conservative with any unhandled generic opcodes. | |||
3353 | if (!isTargetSpecificOpcode(MI.getOpcode())) | |||
3354 | return true; | |||
3355 | ||||
3356 | return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); | |||
3357 | } | |||
3358 | ||||
3359 | bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { | |||
3360 | switch (Imm.getBitWidth()) { | |||
3361 | case 1: // This likely will be a condition code mask. | |||
3362 | return true; | |||
3363 | ||||
3364 | case 32: | |||
3365 | return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), | |||
3366 | ST.hasInv2PiInlineImm()); | |||
3367 | case 64: | |||
3368 | return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), | |||
3369 | ST.hasInv2PiInlineImm()); | |||
3370 | case 16: | |||
3371 | return ST.has16BitInsts() && | |||
3372 | AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), | |||
3373 | ST.hasInv2PiInlineImm()); | |||
3374 | default: | |||
3375 | llvm_unreachable("invalid bitwidth")::llvm::llvm_unreachable_internal("invalid bitwidth", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3375); | |||
3376 | } | |||
3377 | } | |||
3378 | ||||
3379 | bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, | |||
3380 | uint8_t OperandType) const { | |||
3381 | if (!MO.isImm() || | |||
3382 | OperandType < AMDGPU::OPERAND_SRC_FIRST || | |||
3383 | OperandType > AMDGPU::OPERAND_SRC_LAST) | |||
3384 | return false; | |||
3385 | ||||
3386 | // MachineOperand provides no way to tell the true operand size, since it only | |||
3387 | // records a 64-bit value. We need to know the size to determine if a 32-bit | |||
3388 | // floating point immediate bit pattern is legal for an integer immediate. It | |||
3389 | // would be for any 32-bit integer operand, but would not be for a 64-bit one. | |||
3390 | ||||
3391 | int64_t Imm = MO.getImm(); | |||
3392 | switch (OperandType) { | |||
3393 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
3394 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
3395 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
3396 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
3397 | case AMDGPU::OPERAND_REG_IMM_V2FP32: | |||
3398 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: | |||
3399 | case AMDGPU::OPERAND_REG_IMM_V2INT32: | |||
3400 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: | |||
3401 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
3402 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { | |||
3403 | int32_t Trunc = static_cast<int32_t>(Imm); | |||
3404 | return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); | |||
3405 | } | |||
3406 | case AMDGPU::OPERAND_REG_IMM_INT64: | |||
3407 | case AMDGPU::OPERAND_REG_IMM_FP64: | |||
3408 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
3409 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
3410 | case AMDGPU::OPERAND_REG_INLINE_AC_FP64: | |||
3411 | return AMDGPU::isInlinableLiteral64(MO.getImm(), | |||
3412 | ST.hasInv2PiInlineImm()); | |||
3413 | case AMDGPU::OPERAND_REG_IMM_INT16: | |||
3414 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
3415 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
3416 | // We would expect inline immediates to not be concerned with an integer/fp | |||
3417 | // distinction. However, in the case of 16-bit integer operations, the | |||
3418 | // "floating point" values appear to not work. It seems read the low 16-bits | |||
3419 | // of 32-bit immediates, which happens to always work for the integer | |||
3420 | // values. | |||
3421 | // | |||
3422 | // See llvm bugzilla 46302. | |||
3423 | // | |||
3424 | // TODO: Theoretically we could use op-sel to use the high bits of the | |||
3425 | // 32-bit FP values. | |||
3426 | return AMDGPU::isInlinableIntLiteral(Imm); | |||
3427 | case AMDGPU::OPERAND_REG_IMM_V2INT16: | |||
3428 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: | |||
3429 | case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: | |||
3430 | // This suffers the same problem as the scalar 16-bit cases. | |||
3431 | return AMDGPU::isInlinableIntLiteralV216(Imm); | |||
3432 | case AMDGPU::OPERAND_REG_IMM_FP16: | |||
3433 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
3434 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { | |||
3435 | if (isInt<16>(Imm) || isUInt<16>(Imm)) { | |||
3436 | // A few special case instructions have 16-bit operands on subtargets | |||
3437 | // where 16-bit instructions are not legal. | |||
3438 | // TODO: Do the 32-bit immediates work? We shouldn't really need to handle | |||
3439 | // constants in these cases | |||
3440 | int16_t Trunc = static_cast<int16_t>(Imm); | |||
3441 | return ST.has16BitInsts() && | |||
3442 | AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); | |||
3443 | } | |||
3444 | ||||
3445 | return false; | |||
3446 | } | |||
3447 | case AMDGPU::OPERAND_REG_IMM_V2FP16: | |||
3448 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: | |||
3449 | case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { | |||
3450 | uint32_t Trunc = static_cast<uint32_t>(Imm); | |||
3451 | return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); | |||
3452 | } | |||
3453 | default: | |||
3454 | llvm_unreachable("invalid bitwidth")::llvm::llvm_unreachable_internal("invalid bitwidth", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3454); | |||
3455 | } | |||
3456 | } | |||
3457 | ||||
3458 | bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, | |||
3459 | const MCOperandInfo &OpInfo) const { | |||
3460 | switch (MO.getType()) { | |||
3461 | case MachineOperand::MO_Register: | |||
3462 | return false; | |||
3463 | case MachineOperand::MO_Immediate: | |||
3464 | return !isInlineConstant(MO, OpInfo); | |||
3465 | case MachineOperand::MO_FrameIndex: | |||
3466 | case MachineOperand::MO_MachineBasicBlock: | |||
3467 | case MachineOperand::MO_ExternalSymbol: | |||
3468 | case MachineOperand::MO_GlobalAddress: | |||
3469 | case MachineOperand::MO_MCSymbol: | |||
3470 | return true; | |||
3471 | default: | |||
3472 | llvm_unreachable("unexpected operand type")::llvm::llvm_unreachable_internal("unexpected operand type", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3472); | |||
3473 | } | |||
3474 | } | |||
3475 | ||||
3476 | static bool compareMachineOp(const MachineOperand &Op0, | |||
3477 | const MachineOperand &Op1) { | |||
3478 | if (Op0.getType() != Op1.getType()) | |||
3479 | return false; | |||
3480 | ||||
3481 | switch (Op0.getType()) { | |||
3482 | case MachineOperand::MO_Register: | |||
3483 | return Op0.getReg() == Op1.getReg(); | |||
3484 | case MachineOperand::MO_Immediate: | |||
3485 | return Op0.getImm() == Op1.getImm(); | |||
3486 | default: | |||
3487 | llvm_unreachable("Didn't expect to be comparing these operand types")::llvm::llvm_unreachable_internal("Didn't expect to be comparing these operand types" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3487); | |||
3488 | } | |||
3489 | } | |||
3490 | ||||
3491 | bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, | |||
3492 | const MachineOperand &MO) const { | |||
3493 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
3494 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; | |||
3495 | ||||
3496 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal())(static_cast <bool> (MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) ? void (0) : __assert_fail ("MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3496, __extension__ __PRETTY_FUNCTION__)); | |||
3497 | ||||
3498 | if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) | |||
3499 | return true; | |||
3500 | ||||
3501 | if (OpInfo.RegClass < 0) | |||
3502 | return false; | |||
3503 | ||||
3504 | if (MO.isImm() && isInlineConstant(MO, OpInfo)) { | |||
3505 | if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && | |||
3506 | OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
3507 | AMDGPU::OpName::src2)) | |||
3508 | return false; | |||
3509 | return RI.opCanUseInlineConstant(OpInfo.OperandType); | |||
3510 | } | |||
3511 | ||||
3512 | if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) | |||
3513 | return false; | |||
3514 | ||||
3515 | if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) | |||
3516 | return true; | |||
3517 | ||||
3518 | return ST.hasVOP3Literal(); | |||
3519 | } | |||
3520 | ||||
3521 | bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { | |||
3522 | // GFX90A does not have V_MUL_LEGACY_F32_e32. | |||
3523 | if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) | |||
3524 | return false; | |||
3525 | ||||
3526 | int Op32 = AMDGPU::getVOPe32(Opcode); | |||
3527 | if (Op32 == -1) | |||
3528 | return false; | |||
3529 | ||||
3530 | return pseudoToMCOpcode(Op32) != -1; | |||
3531 | } | |||
3532 | ||||
3533 | bool SIInstrInfo::hasModifiers(unsigned Opcode) const { | |||
3534 | // The src0_modifier operand is present on all instructions | |||
3535 | // that have modifiers. | |||
3536 | ||||
3537 | return AMDGPU::getNamedOperandIdx(Opcode, | |||
3538 | AMDGPU::OpName::src0_modifiers) != -1; | |||
3539 | } | |||
3540 | ||||
3541 | bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, | |||
3542 | unsigned OpName) const { | |||
3543 | const MachineOperand *Mods = getNamedOperand(MI, OpName); | |||
3544 | return Mods && Mods->getImm(); | |||
3545 | } | |||
3546 | ||||
3547 | bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { | |||
3548 | return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || | |||
3549 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || | |||
3550 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || | |||
3551 | hasModifiersSet(MI, AMDGPU::OpName::clamp) || | |||
3552 | hasModifiersSet(MI, AMDGPU::OpName::omod); | |||
3553 | } | |||
3554 | ||||
3555 | bool SIInstrInfo::canShrink(const MachineInstr &MI, | |||
3556 | const MachineRegisterInfo &MRI) const { | |||
3557 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3558 | // Can't shrink instruction with three operands. | |||
3559 | // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add | |||
3560 | // a special case for it. It can only be shrunk if the third operand | |||
3561 | // is vcc, and src0_modifiers and src1_modifiers are not set. | |||
3562 | // We should handle this the same way we handle vopc, by addding | |||
3563 | // a register allocation hint pre-regalloc and then do the shrinking | |||
3564 | // post-regalloc. | |||
3565 | if (Src2) { | |||
3566 | switch (MI.getOpcode()) { | |||
3567 | default: return false; | |||
3568 | ||||
3569 | case AMDGPU::V_ADDC_U32_e64: | |||
3570 | case AMDGPU::V_SUBB_U32_e64: | |||
3571 | case AMDGPU::V_SUBBREV_U32_e64: { | |||
3572 | const MachineOperand *Src1 | |||
3573 | = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3574 | if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) | |||
3575 | return false; | |||
3576 | // Additional verification is needed for sdst/src2. | |||
3577 | return true; | |||
3578 | } | |||
3579 | case AMDGPU::V_MAC_F32_e64: | |||
3580 | case AMDGPU::V_MAC_F16_e64: | |||
3581 | case AMDGPU::V_FMAC_F32_e64: | |||
3582 | case AMDGPU::V_FMAC_F16_e64: | |||
3583 | case AMDGPU::V_FMAC_F64_e64: | |||
3584 | if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || | |||
3585 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) | |||
3586 | return false; | |||
3587 | break; | |||
3588 | ||||
3589 | case AMDGPU::V_CNDMASK_B32_e64: | |||
3590 | break; | |||
3591 | } | |||
3592 | } | |||
3593 | ||||
3594 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3595 | if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || | |||
3596 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) | |||
3597 | return false; | |||
3598 | ||||
3599 | // We don't need to check src0, all input types are legal, so just make sure | |||
3600 | // src0 isn't using any modifiers. | |||
3601 | if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) | |||
3602 | return false; | |||
3603 | ||||
3604 | // Can it be shrunk to a valid 32 bit opcode? | |||
3605 | if (!hasVALU32BitEncoding(MI.getOpcode())) | |||
3606 | return false; | |||
3607 | ||||
3608 | // Check output modifiers | |||
3609 | return !hasModifiersSet(MI, AMDGPU::OpName::omod) && | |||
3610 | !hasModifiersSet(MI, AMDGPU::OpName::clamp); | |||
3611 | } | |||
3612 | ||||
3613 | // Set VCC operand with all flags from \p Orig, except for setting it as | |||
3614 | // implicit. | |||
3615 | static void copyFlagsToImplicitVCC(MachineInstr &MI, | |||
3616 | const MachineOperand &Orig) { | |||
3617 | ||||
3618 | for (MachineOperand &Use : MI.implicit_operands()) { | |||
3619 | if (Use.isUse() && | |||
3620 | (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { | |||
3621 | Use.setIsUndef(Orig.isUndef()); | |||
3622 | Use.setIsKill(Orig.isKill()); | |||
3623 | return; | |||
3624 | } | |||
3625 | } | |||
3626 | } | |||
3627 | ||||
3628 | MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, | |||
3629 | unsigned Op32) const { | |||
3630 | MachineBasicBlock *MBB = MI.getParent();; | |||
3631 | MachineInstrBuilder Inst32 = | |||
3632 | BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) | |||
3633 | .setMIFlags(MI.getFlags()); | |||
3634 | ||||
3635 | // Add the dst operand if the 32-bit encoding also has an explicit $vdst. | |||
3636 | // For VOPC instructions, this is replaced by an implicit def of vcc. | |||
3637 | int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); | |||
3638 | if (Op32DstIdx != -1) { | |||
3639 | // dst | |||
3640 | Inst32.add(MI.getOperand(0)); | |||
3641 | } else { | |||
3642 | assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||(static_cast <bool> (((MI.getOperand(0).getReg() == AMDGPU ::VCC) || (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && "Unexpected case") ? void (0) : __assert_fail ("((MI.getOperand(0).getReg() == AMDGPU::VCC) || (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && \"Unexpected case\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3644, __extension__ __PRETTY_FUNCTION__)) | |||
3643 | (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&(static_cast <bool> (((MI.getOperand(0).getReg() == AMDGPU ::VCC) || (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && "Unexpected case") ? void (0) : __assert_fail ("((MI.getOperand(0).getReg() == AMDGPU::VCC) || (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && \"Unexpected case\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3644, __extension__ __PRETTY_FUNCTION__)) | |||
3644 | "Unexpected case")(static_cast <bool> (((MI.getOperand(0).getReg() == AMDGPU ::VCC) || (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && "Unexpected case") ? void (0) : __assert_fail ("((MI.getOperand(0).getReg() == AMDGPU::VCC) || (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && \"Unexpected case\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 3644, __extension__ __PRETTY_FUNCTION__)); | |||
3645 | } | |||
3646 | ||||
3647 | Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); | |||
3648 | ||||
3649 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); | |||
3650 | if (Src1) | |||
3651 | Inst32.add(*Src1); | |||
3652 | ||||
3653 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); | |||
3654 | ||||
3655 | if (Src2) { | |||
3656 | int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); | |||
3657 | if (Op32Src2Idx != -1) { | |||
3658 | Inst32.add(*Src2); | |||
3659 | } else { | |||
3660 | // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is | |||
3661 | // replaced with an implicit read of vcc or vcc_lo. The implicit read | |||
3662 | // of vcc was already added during the initial BuildMI, but we | |||
3663 | // 1) may need to change vcc to vcc_lo to preserve the original register | |||
3664 | // 2) have to preserve the original flags. | |||
3665 | fixImplicitOperands(*Inst32); | |||
3666 | copyFlagsToImplicitVCC(*Inst32, *Src2); | |||
3667 | } | |||
3668 | } | |||
3669 | ||||
3670 | return Inst32; | |||
3671 | } | |||
3672 | ||||
3673 | bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, | |||
3674 | const MachineOperand &MO, | |||
3675 | const MCOperandInfo &OpInfo) const { | |||
3676 | // Literal constants use the constant bus. | |||
3677 | //if (isLiteralConstantLike(MO, OpInfo)) | |||
3678 | // return true; | |||
3679 | if (MO.isImm()) | |||
3680 | return !isInlineConstant(MO, OpInfo); | |||
3681 | ||||
3682 | if (!MO.isReg()) | |||
3683 | return true; // Misc other operands like FrameIndex | |||
3684 | ||||
3685 | if (!MO.isUse()) | |||
3686 | return false; | |||
3687 | ||||
3688 | if (MO.getReg().isVirtual()) | |||
3689 | return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); | |||
3690 | ||||
3691 | // Null is free | |||
3692 | if (MO.getReg() == AMDGPU::SGPR_NULL) | |||
3693 | return false; | |||
3694 | ||||
3695 | // SGPRs use the constant bus | |||
3696 | if (MO.isImplicit()) { | |||
3697 | return MO.getReg() == AMDGPU::M0 || | |||
3698 | MO.getReg() == AMDGPU::VCC || | |||
3699 | MO.getReg() == AMDGPU::VCC_LO; | |||
3700 | } else { | |||
3701 | return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || | |||
3702 | AMDGPU::SReg_64RegClass.contains(MO.getReg()); | |||
3703 | } | |||
3704 | } | |||
3705 | ||||
3706 | static Register findImplicitSGPRRead(const MachineInstr &MI) { | |||
3707 | for (const MachineOperand &MO : MI.implicit_operands()) { | |||
3708 | // We only care about reads. | |||
3709 | if (MO.isDef()) | |||
3710 | continue; | |||
3711 | ||||
3712 | switch (MO.getReg()) { | |||
3713 | case AMDGPU::VCC: | |||
3714 | case AMDGPU::VCC_LO: | |||
3715 | case AMDGPU::VCC_HI: | |||
3716 | case AMDGPU::M0: | |||
3717 | case AMDGPU::FLAT_SCR: | |||
3718 | return MO.getReg(); | |||
3719 | ||||
3720 | default: | |||
3721 | break; | |||
3722 | } | |||
3723 | } | |||
3724 | ||||
3725 | return AMDGPU::NoRegister; | |||
3726 | } | |||
3727 | ||||
3728 | static bool shouldReadExec(const MachineInstr &MI) { | |||
3729 | if (SIInstrInfo::isVALU(MI)) { | |||
3730 | switch (MI.getOpcode()) { | |||
3731 | case AMDGPU::V_READLANE_B32: | |||
3732 | case AMDGPU::V_WRITELANE_B32: | |||
3733 | return false; | |||
3734 | } | |||
3735 | ||||
3736 | return true; | |||
3737 | } | |||
3738 | ||||
3739 | if (MI.isPreISelOpcode() || | |||
3740 | SIInstrInfo::isGenericOpcode(MI.getOpcode()) || | |||
3741 | SIInstrInfo::isSALU(MI) || | |||
3742 | SIInstrInfo::isSMRD(MI)) | |||
3743 | return false; | |||
3744 | ||||
3745 | return true; | |||
3746 | } | |||
3747 | ||||
3748 | static bool isSubRegOf(const SIRegisterInfo &TRI, | |||
3749 | const MachineOperand &SuperVec, | |||
3750 | const MachineOperand &SubReg) { | |||
3751 | if (SubReg.getReg().isPhysical()) | |||
3752 | return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); | |||
3753 | ||||
3754 | return SubReg.getSubReg() != AMDGPU::NoSubRegister && | |||
3755 | SubReg.getReg() == SuperVec.getReg(); | |||
3756 | } | |||
3757 | ||||
3758 | bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, | |||
3759 | StringRef &ErrInfo) const { | |||
3760 | uint16_t Opcode = MI.getOpcode(); | |||
3761 | if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) | |||
3762 | return true; | |||
3763 | ||||
3764 | const MachineFunction *MF = MI.getParent()->getParent(); | |||
3765 | const MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
3766 | ||||
3767 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); | |||
3768 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); | |||
3769 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); | |||
3770 | ||||
3771 | // Make sure the number of operands is correct. | |||
3772 | const MCInstrDesc &Desc = get(Opcode); | |||
3773 | if (!Desc.isVariadic() && | |||
3774 | Desc.getNumOperands() != MI.getNumExplicitOperands()) { | |||
3775 | ErrInfo = "Instruction has wrong number of operands."; | |||
3776 | return false; | |||
3777 | } | |||
3778 | ||||
3779 | if (MI.isInlineAsm()) { | |||
3780 | // Verify register classes for inlineasm constraints. | |||
3781 | for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); | |||
3782 | I != E; ++I) { | |||
3783 | const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); | |||
3784 | if (!RC) | |||
3785 | continue; | |||
3786 | ||||
3787 | const MachineOperand &Op = MI.getOperand(I); | |||
3788 | if (!Op.isReg()) | |||
3789 | continue; | |||
3790 | ||||
3791 | Register Reg = Op.getReg(); | |||
3792 | if (!Reg.isVirtual() && !RC->contains(Reg)) { | |||
3793 | ErrInfo = "inlineasm operand has incorrect register class."; | |||
3794 | return false; | |||
3795 | } | |||
3796 | } | |||
3797 | ||||
3798 | return true; | |||
3799 | } | |||
3800 | ||||
3801 | if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { | |||
3802 | ErrInfo = "missing memory operand from MIMG instruction."; | |||
3803 | return false; | |||
3804 | } | |||
3805 | ||||
3806 | // Make sure the register classes are correct. | |||
3807 | for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { | |||
3808 | const MachineOperand &MO = MI.getOperand(i); | |||
3809 | if (MO.isFPImm()) { | |||
3810 | ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " | |||
3811 | "all fp values to integers."; | |||
3812 | return false; | |||
3813 | } | |||
3814 | ||||
3815 | int RegClass = Desc.OpInfo[i].RegClass; | |||
3816 | ||||
3817 | switch (Desc.OpInfo[i].OperandType) { | |||
3818 | case MCOI::OPERAND_REGISTER: | |||
3819 | if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { | |||
3820 | ErrInfo = "Illegal immediate value for operand."; | |||
3821 | return false; | |||
3822 | } | |||
3823 | break; | |||
3824 | case AMDGPU::OPERAND_REG_IMM_INT32: | |||
3825 | case AMDGPU::OPERAND_REG_IMM_FP32: | |||
3826 | break; | |||
3827 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: | |||
3828 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: | |||
3829 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: | |||
3830 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: | |||
3831 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: | |||
3832 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: | |||
3833 | case AMDGPU::OPERAND_REG_INLINE_AC_INT32: | |||
3834 | case AMDGPU::OPERAND_REG_INLINE_AC_FP32: | |||
3835 | case AMDGPU::OPERAND_REG_INLINE_AC_INT16: | |||
3836 | case AMDGPU::OPERAND_REG_INLINE_AC_FP16: | |||
3837 | case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { | |||
3838 | if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { | |||
3839 | ErrInfo = "Illegal immediate value for operand."; | |||
3840 | return false; | |||
3841 | } | |||
3842 | break; | |||
3843 | } | |||
3844 | case MCOI::OPERAND_IMMEDIATE: | |||
3845 | case AMDGPU::OPERAND_KIMM32: | |||
3846 | // Check if this operand is an immediate. | |||
3847 | // FrameIndex operands will be replaced by immediates, so they are | |||
3848 | // allowed. | |||
3849 | if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { | |||
3850 | ErrInfo = "Expected immediate, but got non-immediate"; | |||
3851 | return false; | |||
3852 | } | |||
3853 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | |||
3854 | default: | |||
3855 | continue; | |||
3856 | } | |||
3857 | ||||
3858 | if (!MO.isReg()) | |||
3859 | continue; | |||
3860 | Register Reg = MO.getReg(); | |||
3861 | if (!Reg) | |||
3862 | continue; | |||
3863 | ||||
3864 | // FIXME: Ideally we would have separate instruction definitions with the | |||
3865 | // aligned register constraint. | |||
3866 | // FIXME: We do not verify inline asm operands, but custom inline asm | |||
3867 | // verification is broken anyway | |||
3868 | if (ST.needsAlignedVGPRs()) { | |||
3869 | const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); | |||
3870 | const bool IsVGPR = RI.hasVGPRs(RC); | |||
3871 | const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC); | |||
3872 | if ((IsVGPR || IsAGPR) && MO.getSubReg()) { | |||
3873 | const TargetRegisterClass *SubRC = | |||
3874 | RI.getSubRegClass(RC, MO.getSubReg()); | |||
3875 | RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); | |||
3876 | if (RC) | |||
3877 | RC = SubRC; | |||
3878 | } | |||
3879 | ||||
3880 | // Check that this is the aligned version of the class. | |||
3881 | if (!RC || !RI.isProperlyAlignedRC(*RC)) { | |||
3882 | ErrInfo = "Subtarget requires even aligned vector registers"; | |||
3883 | return false; | |||
3884 | } | |||
3885 | } | |||
3886 | ||||
3887 | if (RegClass != -1) { | |||
3888 | if (Reg.isVirtual()) | |||
3889 | continue; | |||
3890 | ||||
3891 | const TargetRegisterClass *RC = RI.getRegClass(RegClass); | |||
3892 | if (!RC->contains(Reg)) { | |||
3893 | ErrInfo = "Operand has incorrect register class."; | |||
3894 | return false; | |||
3895 | } | |||
3896 | } | |||
3897 | } | |||
3898 | ||||
3899 | // Verify SDWA | |||
3900 | if (isSDWA(MI)) { | |||
3901 | if (!ST.hasSDWA()) { | |||
3902 | ErrInfo = "SDWA is not supported on this target"; | |||
3903 | return false; | |||
3904 | } | |||
3905 | ||||
3906 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
3907 | ||||
3908 | const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; | |||
3909 | ||||
3910 | for (int OpIdx: OpIndicies) { | |||
3911 | if (OpIdx == -1) | |||
3912 | continue; | |||
3913 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
3914 | ||||
3915 | if (!ST.hasSDWAScalar()) { | |||
3916 | // Only VGPRS on VI | |||
3917 | if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { | |||
3918 | ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; | |||
3919 | return false; | |||
3920 | } | |||
3921 | } else { | |||
3922 | // No immediates on GFX9 | |||
3923 | if (!MO.isReg()) { | |||
3924 | ErrInfo = | |||
3925 | "Only reg allowed as operands in SDWA instructions on GFX9+"; | |||
3926 | return false; | |||
3927 | } | |||
3928 | } | |||
3929 | } | |||
3930 | ||||
3931 | if (!ST.hasSDWAOmod()) { | |||
3932 | // No omod allowed on VI | |||
3933 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3934 | if (OMod != nullptr && | |||
3935 | (!OMod->isImm() || OMod->getImm() != 0)) { | |||
3936 | ErrInfo = "OMod not allowed in SDWA instructions on VI"; | |||
3937 | return false; | |||
3938 | } | |||
3939 | } | |||
3940 | ||||
3941 | uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); | |||
3942 | if (isVOPC(BasicOpcode)) { | |||
3943 | if (!ST.hasSDWASdst() && DstIdx != -1) { | |||
3944 | // Only vcc allowed as dst on VI for VOPC | |||
3945 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
3946 | if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { | |||
3947 | ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; | |||
3948 | return false; | |||
3949 | } | |||
3950 | } else if (!ST.hasSDWAOutModsVOPC()) { | |||
3951 | // No clamp allowed on GFX9 for VOPC | |||
3952 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); | |||
3953 | if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { | |||
3954 | ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; | |||
3955 | return false; | |||
3956 | } | |||
3957 | ||||
3958 | // No omod allowed on GFX9 for VOPC | |||
3959 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); | |||
3960 | if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { | |||
3961 | ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; | |||
3962 | return false; | |||
3963 | } | |||
3964 | } | |||
3965 | } | |||
3966 | ||||
3967 | const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
3968 | if (DstUnused && DstUnused->isImm() && | |||
3969 | DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { | |||
3970 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
3971 | if (!Dst.isReg() || !Dst.isTied()) { | |||
3972 | ErrInfo = "Dst register should have tied register"; | |||
3973 | return false; | |||
3974 | } | |||
3975 | ||||
3976 | const MachineOperand &TiedMO = | |||
3977 | MI.getOperand(MI.findTiedOperandIdx(DstIdx)); | |||
3978 | if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { | |||
3979 | ErrInfo = | |||
3980 | "Dst register should be tied to implicit use of preserved register"; | |||
3981 | return false; | |||
3982 | } else if (TiedMO.getReg().isPhysical() && | |||
3983 | Dst.getReg() != TiedMO.getReg()) { | |||
3984 | ErrInfo = "Dst register should use same physical register as preserved"; | |||
3985 | return false; | |||
3986 | } | |||
3987 | } | |||
3988 | } | |||
3989 | ||||
3990 | // Verify MIMG | |||
3991 | if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { | |||
3992 | // Ensure that the return type used is large enough for all the options | |||
3993 | // being used. TFE/LWE require an extra result register. | |||
3994 | const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); | |||
3995 | if (DMask) { | |||
3996 | uint64_t DMaskImm = DMask->getImm(); | |||
3997 | uint32_t RegCount = | |||
3998 | isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); | |||
3999 | const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); | |||
4000 | const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); | |||
4001 | const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); | |||
4002 | ||||
4003 | // Adjust for packed 16 bit values | |||
4004 | if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) | |||
4005 | RegCount >>= 1; | |||
4006 | ||||
4007 | // Adjust if using LWE or TFE | |||
4008 | if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) | |||
4009 | RegCount += 1; | |||
4010 | ||||
4011 | const uint32_t DstIdx = | |||
4012 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); | |||
4013 | const MachineOperand &Dst = MI.getOperand(DstIdx); | |||
4014 | if (Dst.isReg()) { | |||
4015 | const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); | |||
4016 | uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; | |||
4017 | if (RegCount > DstSize) { | |||
4018 | ErrInfo = "MIMG instruction returns too many registers for dst " | |||
4019 | "register class"; | |||
4020 | return false; | |||
4021 | } | |||
4022 | } | |||
4023 | } | |||
4024 | } | |||
4025 | ||||
4026 | // Verify VOP*. Ignore multiple sgpr operands on writelane. | |||
4027 | if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 | |||
4028 | && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { | |||
4029 | // Only look at the true operands. Only a real operand can use the constant | |||
4030 | // bus, and we don't want to check pseudo-operands like the source modifier | |||
4031 | // flags. | |||
4032 | const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; | |||
4033 | ||||
4034 | unsigned ConstantBusCount = 0; | |||
4035 | bool UsesLiteral = false; | |||
4036 | const MachineOperand *LiteralVal = nullptr; | |||
4037 | ||||
4038 | if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) | |||
4039 | ++ConstantBusCount; | |||
4040 | ||||
4041 | SmallVector<Register, 2> SGPRsUsed; | |||
4042 | Register SGPRUsed; | |||
4043 | ||||
4044 | for (int OpIdx : OpIndices) { | |||
4045 | if (OpIdx == -1) | |||
4046 | break; | |||
4047 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4048 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { | |||
4049 | if (MO.isReg()) { | |||
4050 | SGPRUsed = MO.getReg(); | |||
4051 | if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) { | |||
4052 | return SGPRUsed != SGPR; | |||
4053 | })) { | |||
4054 | ++ConstantBusCount; | |||
4055 | SGPRsUsed.push_back(SGPRUsed); | |||
4056 | } | |||
4057 | } else { | |||
4058 | if (!UsesLiteral) { | |||
4059 | ++ConstantBusCount; | |||
4060 | UsesLiteral = true; | |||
4061 | LiteralVal = &MO; | |||
4062 | } else if (!MO.isIdenticalTo(*LiteralVal)) { | |||
4063 | assert(isVOP3(MI))(static_cast <bool> (isVOP3(MI)) ? void (0) : __assert_fail ("isVOP3(MI)", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4063, __extension__ __PRETTY_FUNCTION__)); | |||
4064 | ErrInfo = "VOP3 instruction uses more than one literal"; | |||
4065 | return false; | |||
4066 | } | |||
4067 | } | |||
4068 | } | |||
4069 | } | |||
4070 | ||||
4071 | SGPRUsed = findImplicitSGPRRead(MI); | |||
4072 | if (SGPRUsed != AMDGPU::NoRegister) { | |||
4073 | // Implicit uses may safely overlap true operands | |||
4074 | if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { | |||
4075 | return !RI.regsOverlap(SGPRUsed, SGPR); | |||
4076 | })) { | |||
4077 | ++ConstantBusCount; | |||
4078 | SGPRsUsed.push_back(SGPRUsed); | |||
4079 | } | |||
4080 | } | |||
4081 | ||||
4082 | // v_writelane_b32 is an exception from constant bus restriction: | |||
4083 | // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const | |||
4084 | if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && | |||
4085 | Opcode != AMDGPU::V_WRITELANE_B32) { | |||
4086 | ErrInfo = "VOP* instruction violates constant bus restriction"; | |||
4087 | return false; | |||
4088 | } | |||
4089 | ||||
4090 | if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) { | |||
4091 | ErrInfo = "VOP3 instruction uses literal"; | |||
4092 | return false; | |||
4093 | } | |||
4094 | } | |||
4095 | ||||
4096 | // Special case for writelane - this can break the multiple constant bus rule, | |||
4097 | // but still can't use more than one SGPR register | |||
4098 | if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { | |||
4099 | unsigned SGPRCount = 0; | |||
4100 | Register SGPRUsed = AMDGPU::NoRegister; | |||
4101 | ||||
4102 | for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { | |||
4103 | if (OpIdx == -1) | |||
4104 | break; | |||
4105 | ||||
4106 | const MachineOperand &MO = MI.getOperand(OpIdx); | |||
4107 | ||||
4108 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { | |||
4109 | if (MO.isReg() && MO.getReg() != AMDGPU::M0) { | |||
4110 | if (MO.getReg() != SGPRUsed) | |||
4111 | ++SGPRCount; | |||
4112 | SGPRUsed = MO.getReg(); | |||
4113 | } | |||
4114 | } | |||
4115 | if (SGPRCount > ST.getConstantBusLimit(Opcode)) { | |||
4116 | ErrInfo = "WRITELANE instruction violates constant bus restriction"; | |||
4117 | return false; | |||
4118 | } | |||
4119 | } | |||
4120 | } | |||
4121 | ||||
4122 | // Verify misc. restrictions on specific instructions. | |||
4123 | if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || | |||
4124 | Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { | |||
4125 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4126 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4127 | const MachineOperand &Src2 = MI.getOperand(Src2Idx); | |||
4128 | if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { | |||
4129 | if (!compareMachineOp(Src0, Src1) && | |||
4130 | !compareMachineOp(Src0, Src2)) { | |||
4131 | ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; | |||
4132 | return false; | |||
4133 | } | |||
4134 | } | |||
4135 | if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & | |||
4136 | SISrcMods::ABS) || | |||
4137 | (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & | |||
4138 | SISrcMods::ABS) || | |||
4139 | (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & | |||
4140 | SISrcMods::ABS)) { | |||
4141 | ErrInfo = "ABS not allowed in VOP3B instructions"; | |||
4142 | return false; | |||
4143 | } | |||
4144 | } | |||
4145 | ||||
4146 | if (isSOP2(MI) || isSOPC(MI)) { | |||
4147 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4148 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4149 | unsigned Immediates = 0; | |||
4150 | ||||
4151 | if (!Src0.isReg() && | |||
4152 | !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) | |||
4153 | Immediates++; | |||
4154 | if (!Src1.isReg() && | |||
4155 | !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) | |||
4156 | Immediates++; | |||
4157 | ||||
4158 | if (Immediates > 1) { | |||
4159 | ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; | |||
4160 | return false; | |||
4161 | } | |||
4162 | } | |||
4163 | ||||
4164 | if (isSOPK(MI)) { | |||
4165 | auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); | |||
4166 | if (Desc.isBranch()) { | |||
4167 | if (!Op->isMBB()) { | |||
4168 | ErrInfo = "invalid branch target for SOPK instruction"; | |||
4169 | return false; | |||
4170 | } | |||
4171 | } else { | |||
4172 | uint64_t Imm = Op->getImm(); | |||
4173 | if (sopkIsZext(MI)) { | |||
4174 | if (!isUInt<16>(Imm)) { | |||
4175 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4176 | return false; | |||
4177 | } | |||
4178 | } else { | |||
4179 | if (!isInt<16>(Imm)) { | |||
4180 | ErrInfo = "invalid immediate for SOPK instruction"; | |||
4181 | return false; | |||
4182 | } | |||
4183 | } | |||
4184 | } | |||
4185 | } | |||
4186 | ||||
4187 | if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || | |||
4188 | Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || | |||
4189 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4190 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { | |||
4191 | const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || | |||
4192 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; | |||
4193 | ||||
4194 | const unsigned StaticNumOps = Desc.getNumOperands() + | |||
4195 | Desc.getNumImplicitUses(); | |||
4196 | const unsigned NumImplicitOps = IsDst ? 2 : 1; | |||
4197 | ||||
4198 | // Allow additional implicit operands. This allows a fixup done by the post | |||
4199 | // RA scheduler where the main implicit operand is killed and implicit-defs | |||
4200 | // are added for sub-registers that remain live after this instruction. | |||
4201 | if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { | |||
4202 | ErrInfo = "missing implicit register operands"; | |||
4203 | return false; | |||
4204 | } | |||
4205 | ||||
4206 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4207 | if (IsDst) { | |||
4208 | if (!Dst->isUse()) { | |||
4209 | ErrInfo = "v_movreld_b32 vdst should be a use operand"; | |||
4210 | return false; | |||
4211 | } | |||
4212 | ||||
4213 | unsigned UseOpIdx; | |||
4214 | if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || | |||
4215 | UseOpIdx != StaticNumOps + 1) { | |||
4216 | ErrInfo = "movrel implicit operands should be tied"; | |||
4217 | return false; | |||
4218 | } | |||
4219 | } | |||
4220 | ||||
4221 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4222 | const MachineOperand &ImpUse | |||
4223 | = MI.getOperand(StaticNumOps + NumImplicitOps - 1); | |||
4224 | if (!ImpUse.isReg() || !ImpUse.isUse() || | |||
4225 | !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { | |||
4226 | ErrInfo = "src0 should be subreg of implicit vector use"; | |||
4227 | return false; | |||
4228 | } | |||
4229 | } | |||
4230 | ||||
4231 | // Make sure we aren't losing exec uses in the td files. This mostly requires | |||
4232 | // being careful when using let Uses to try to add other use registers. | |||
4233 | if (shouldReadExec(MI)) { | |||
4234 | if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { | |||
4235 | ErrInfo = "VALU instruction does not implicitly read exec mask"; | |||
4236 | return false; | |||
4237 | } | |||
4238 | } | |||
4239 | ||||
4240 | if (isSMRD(MI)) { | |||
4241 | if (MI.mayStore()) { | |||
4242 | // The register offset form of scalar stores may only use m0 as the | |||
4243 | // soffset register. | |||
4244 | const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); | |||
4245 | if (Soff && Soff->getReg() != AMDGPU::M0) { | |||
4246 | ErrInfo = "scalar stores must use m0 as offset register"; | |||
4247 | return false; | |||
4248 | } | |||
4249 | } | |||
4250 | } | |||
4251 | ||||
4252 | if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { | |||
4253 | const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
4254 | if (Offset->getImm() != 0) { | |||
4255 | ErrInfo = "subtarget does not support offsets in flat instructions"; | |||
4256 | return false; | |||
4257 | } | |||
4258 | } | |||
4259 | ||||
4260 | if (isMIMG(MI)) { | |||
4261 | const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); | |||
4262 | if (DimOp) { | |||
4263 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, | |||
4264 | AMDGPU::OpName::vaddr0); | |||
4265 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); | |||
4266 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); | |||
4267 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | |||
4268 | AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); | |||
4269 | const AMDGPU::MIMGDimInfo *Dim = | |||
4270 | AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); | |||
4271 | ||||
4272 | if (!Dim) { | |||
4273 | ErrInfo = "dim is out of range"; | |||
4274 | return false; | |||
4275 | } | |||
4276 | ||||
4277 | bool IsA16 = false; | |||
4278 | if (ST.hasR128A16()) { | |||
4279 | const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); | |||
4280 | IsA16 = R128A16->getImm() != 0; | |||
4281 | } else if (ST.hasGFX10A16()) { | |||
4282 | const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); | |||
4283 | IsA16 = A16->getImm() != 0; | |||
4284 | } | |||
4285 | ||||
4286 | bool IsNSA = SRsrcIdx - VAddr0Idx > 1; | |||
4287 | ||||
4288 | unsigned AddrWords = | |||
4289 | AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); | |||
4290 | ||||
4291 | unsigned VAddrWords; | |||
4292 | if (IsNSA) { | |||
4293 | VAddrWords = SRsrcIdx - VAddr0Idx; | |||
4294 | } else { | |||
4295 | const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); | |||
4296 | VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; | |||
4297 | if (AddrWords > 8) | |||
4298 | AddrWords = 16; | |||
4299 | } | |||
4300 | ||||
4301 | if (VAddrWords != AddrWords) { | |||
4302 | LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWordsdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-instr-info")) { dbgs() << "bad vaddr size, expected " << AddrWords << " but got " << VAddrWords << "\n"; } } while (false) | |||
4303 | << " but got " << VAddrWords << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-instr-info")) { dbgs() << "bad vaddr size, expected " << AddrWords << " but got " << VAddrWords << "\n"; } } while (false); | |||
4304 | ErrInfo = "bad vaddr size"; | |||
4305 | return false; | |||
4306 | } | |||
4307 | } | |||
4308 | } | |||
4309 | ||||
4310 | const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); | |||
4311 | if (DppCt) { | |||
4312 | using namespace AMDGPU::DPP; | |||
4313 | ||||
4314 | unsigned DC = DppCt->getImm(); | |||
4315 | if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || | |||
4316 | DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || | |||
4317 | (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || | |||
4318 | (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || | |||
4319 | (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || | |||
4320 | (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || | |||
4321 | (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { | |||
4322 | ErrInfo = "Invalid dpp_ctrl value"; | |||
4323 | return false; | |||
4324 | } | |||
4325 | if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && | |||
4326 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4327 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4328 | "wavefront shifts are not supported on GFX10+"; | |||
4329 | return false; | |||
4330 | } | |||
4331 | if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && | |||
4332 | ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
4333 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4334 | "broadcasts are not supported on GFX10+"; | |||
4335 | return false; | |||
4336 | } | |||
4337 | if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && | |||
4338 | ST.getGeneration() < AMDGPUSubtarget::GFX10) { | |||
4339 | if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && | |||
4340 | DC <= DppCtrl::ROW_NEWBCAST_LAST && | |||
4341 | !ST.hasGFX90AInsts()) { | |||
4342 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4343 | "row_newbroadcast/row_share is not supported before " | |||
4344 | "GFX90A/GFX10"; | |||
4345 | return false; | |||
4346 | } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { | |||
4347 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4348 | "row_share and row_xmask are not supported before GFX10"; | |||
4349 | return false; | |||
4350 | } | |||
4351 | } | |||
4352 | ||||
4353 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); | |||
4354 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); | |||
4355 | ||||
4356 | if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && | |||
4357 | ((DstIdx >= 0 && | |||
4358 | (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || | |||
4359 | Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) || | |||
4360 | ((Src0Idx >= 0 && | |||
4361 | (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || | |||
4362 | Desc.OpInfo[Src0Idx].RegClass == | |||
4363 | AMDGPU::VReg_64_Align2RegClassID)))) && | |||
4364 | !AMDGPU::isLegal64BitDPPControl(DC)) { | |||
4365 | ErrInfo = "Invalid dpp_ctrl value: " | |||
4366 | "64 bit dpp only support row_newbcast"; | |||
4367 | return false; | |||
4368 | } | |||
4369 | } | |||
4370 | ||||
4371 | if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { | |||
4372 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
4373 | uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0 | |||
4374 | : AMDGPU::OpName::vdata; | |||
4375 | const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); | |||
4376 | const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); | |||
4377 | if (Data && !Data->isReg()) | |||
4378 | Data = nullptr; | |||
4379 | ||||
4380 | if (ST.hasGFX90AInsts()) { | |||
4381 | if (Dst && Data && | |||
4382 | (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { | |||
4383 | ErrInfo = "Invalid register class: " | |||
4384 | "vdata and vdst should be both VGPR or AGPR"; | |||
4385 | return false; | |||
4386 | } | |||
4387 | if (Data && Data2 && | |||
4388 | (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { | |||
4389 | ErrInfo = "Invalid register class: " | |||
4390 | "both data operands should be VGPR or AGPR"; | |||
4391 | return false; | |||
4392 | } | |||
4393 | } else { | |||
4394 | if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || | |||
4395 | (Data && RI.isAGPR(MRI, Data->getReg())) || | |||
4396 | (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { | |||
4397 | ErrInfo = "Invalid register class: " | |||
4398 | "agpr loads and stores not supported on this GPU"; | |||
4399 | return false; | |||
4400 | } | |||
4401 | } | |||
4402 | } | |||
4403 | ||||
4404 | if (ST.needsAlignedVGPRs() && | |||
4405 | (MI.getOpcode() == AMDGPU::DS_GWS_INIT || | |||
4406 | MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || | |||
4407 | MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { | |||
4408 | const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); | |||
4409 | Register Reg = Op->getReg(); | |||
4410 | bool Aligned = true; | |||
4411 | if (Reg.isPhysical()) { | |||
4412 | Aligned = !(RI.getHWRegIndex(Reg) & 1); | |||
4413 | } else { | |||
4414 | const TargetRegisterClass &RC = *MRI.getRegClass(Reg); | |||
4415 | Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && | |||
4416 | !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); | |||
4417 | } | |||
4418 | ||||
4419 | if (!Aligned) { | |||
4420 | ErrInfo = "Subtarget requires even aligned vector registers " | |||
4421 | "for DS_GWS instructions"; | |||
4422 | return false; | |||
4423 | } | |||
4424 | } | |||
4425 | ||||
4426 | return true; | |||
4427 | } | |||
4428 | ||||
4429 | unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { | |||
4430 | switch (MI.getOpcode()) { | |||
4431 | default: return AMDGPU::INSTRUCTION_LIST_END; | |||
4432 | case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; | |||
4433 | case AMDGPU::COPY: return AMDGPU::COPY; | |||
4434 | case AMDGPU::PHI: return AMDGPU::PHI; | |||
4435 | case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; | |||
4436 | case AMDGPU::WQM: return AMDGPU::WQM; | |||
4437 | case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; | |||
4438 | case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; | |||
4439 | case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; | |||
4440 | case AMDGPU::S_MOV_B32: { | |||
4441 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4442 | return MI.getOperand(1).isReg() || | |||
4443 | RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? | |||
4444 | AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; | |||
4445 | } | |||
4446 | case AMDGPU::S_ADD_I32: | |||
4447 | return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; | |||
4448 | case AMDGPU::S_ADDC_U32: | |||
4449 | return AMDGPU::V_ADDC_U32_e32; | |||
4450 | case AMDGPU::S_SUB_I32: | |||
4451 | return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; | |||
4452 | // FIXME: These are not consistently handled, and selected when the carry is | |||
4453 | // used. | |||
4454 | case AMDGPU::S_ADD_U32: | |||
4455 | return AMDGPU::V_ADD_CO_U32_e32; | |||
4456 | case AMDGPU::S_SUB_U32: | |||
4457 | return AMDGPU::V_SUB_CO_U32_e32; | |||
4458 | case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; | |||
4459 | case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; | |||
4460 | case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; | |||
4461 | case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; | |||
4462 | case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; | |||
4463 | case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; | |||
4464 | case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; | |||
4465 | case AMDGPU::S_XNOR_B32: | |||
4466 | return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; | |||
4467 | case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; | |||
4468 | case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; | |||
4469 | case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; | |||
4470 | case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; | |||
4471 | case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; | |||
4472 | case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; | |||
4473 | case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; | |||
4474 | case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; | |||
4475 | case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; | |||
4476 | case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; | |||
4477 | case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; | |||
4478 | case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; | |||
4479 | case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; | |||
4480 | case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; | |||
4481 | case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; | |||
4482 | case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; | |||
4483 | case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; | |||
4484 | case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; | |||
4485 | case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64; | |||
4486 | case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64; | |||
4487 | case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64; | |||
4488 | case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64; | |||
4489 | case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64; | |||
4490 | case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64; | |||
4491 | case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64; | |||
4492 | case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64; | |||
4493 | case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64; | |||
4494 | case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64; | |||
4495 | case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64; | |||
4496 | case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64; | |||
4497 | case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64; | |||
4498 | case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64; | |||
4499 | case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; | |||
4500 | case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; | |||
4501 | case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; | |||
4502 | case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; | |||
4503 | case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; | |||
4504 | case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; | |||
4505 | } | |||
4506 | llvm_unreachable(::llvm::llvm_unreachable_internal("Unexpected scalar opcode without corresponding vector one!" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4507) | |||
4507 | "Unexpected scalar opcode without corresponding vector one!")::llvm::llvm_unreachable_internal("Unexpected scalar opcode without corresponding vector one!" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4507); | |||
4508 | } | |||
4509 | ||||
4510 | static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, | |||
4511 | const MachineRegisterInfo &MRI, | |||
4512 | const MCInstrDesc &TID, | |||
4513 | unsigned RCID, | |||
4514 | bool IsAllocatable) { | |||
4515 | if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && | |||
4516 | (TID.mayLoad() || TID.mayStore() || | |||
4517 | (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { | |||
4518 | switch (RCID) { | |||
4519 | case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; | |||
4520 | case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; | |||
4521 | case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID; | |||
4522 | case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID; | |||
4523 | case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID; | |||
4524 | default: | |||
4525 | break; | |||
4526 | } | |||
4527 | } | |||
4528 | return RCID; | |||
4529 | } | |||
4530 | ||||
4531 | const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, | |||
4532 | unsigned OpNum, const TargetRegisterInfo *TRI, | |||
4533 | const MachineFunction &MF) | |||
4534 | const { | |||
4535 | if (OpNum >= TID.getNumOperands()) | |||
4536 | return nullptr; | |||
4537 | auto RegClass = TID.OpInfo[OpNum].RegClass; | |||
4538 | bool IsAllocatable = false; | |||
4539 | if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { | |||
4540 | // vdst and vdata should be both VGPR or AGPR, same for the DS instructions | |||
4541 | // with two data operands. Request register class constainted to VGPR only | |||
4542 | // of both operands present as Machine Copy Propagation can not check this | |||
4543 | // constraint and possibly other passes too. | |||
4544 | // | |||
4545 | // The check is limited to FLAT and DS because atomics in non-flat encoding | |||
4546 | // have their vdst and vdata tied to be the same register. | |||
4547 | const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4548 | AMDGPU::OpName::vdst); | |||
4549 | const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4550 | (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 | |||
4551 | : AMDGPU::OpName::vdata); | |||
4552 | if (DataIdx != -1) { | |||
4553 | IsAllocatable = VDstIdx != -1 || | |||
4554 | AMDGPU::getNamedOperandIdx(TID.Opcode, | |||
4555 | AMDGPU::OpName::data1) != -1; | |||
4556 | } | |||
4557 | } | |||
4558 | RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass, | |||
4559 | IsAllocatable); | |||
4560 | return RI.getRegClass(RegClass); | |||
4561 | } | |||
4562 | ||||
4563 | const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, | |||
4564 | unsigned OpNo) const { | |||
4565 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
4566 | const MCInstrDesc &Desc = get(MI.getOpcode()); | |||
4567 | if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || | |||
4568 | Desc.OpInfo[OpNo].RegClass == -1) { | |||
4569 | Register Reg = MI.getOperand(OpNo).getReg(); | |||
4570 | ||||
4571 | if (Reg.isVirtual()) | |||
4572 | return MRI.getRegClass(Reg); | |||
4573 | return RI.getPhysRegClass(Reg); | |||
4574 | } | |||
4575 | ||||
4576 | unsigned RCID = Desc.OpInfo[OpNo].RegClass; | |||
4577 | RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true); | |||
4578 | return RI.getRegClass(RCID); | |||
4579 | } | |||
4580 | ||||
4581 | void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { | |||
4582 | MachineBasicBlock::iterator I = MI; | |||
4583 | MachineBasicBlock *MBB = MI.getParent(); | |||
4584 | MachineOperand &MO = MI.getOperand(OpIdx); | |||
4585 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
4586 | unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; | |||
4587 | const TargetRegisterClass *RC = RI.getRegClass(RCID); | |||
4588 | unsigned Size = RI.getRegSizeInBits(*RC); | |||
4589 | unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; | |||
4590 | if (MO.isReg()) | |||
4591 | Opcode = AMDGPU::COPY; | |||
4592 | else if (RI.isSGPRClass(RC)) | |||
4593 | Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; | |||
4594 | ||||
4595 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); | |||
4596 | const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); | |||
4597 | if (RI.getCommonSubClass(VRC64, VRC)) | |||
4598 | VRC = VRC64; | |||
4599 | else | |||
4600 | VRC = &AMDGPU::VGPR_32RegClass; | |||
4601 | ||||
4602 | Register Reg = MRI.createVirtualRegister(VRC); | |||
4603 | DebugLoc DL = MBB->findDebugLoc(I); | |||
4604 | BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); | |||
4605 | MO.ChangeToRegister(Reg, false); | |||
4606 | } | |||
4607 | ||||
4608 | unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, | |||
4609 | MachineRegisterInfo &MRI, | |||
4610 | MachineOperand &SuperReg, | |||
4611 | const TargetRegisterClass *SuperRC, | |||
4612 | unsigned SubIdx, | |||
4613 | const TargetRegisterClass *SubRC) | |||
4614 | const { | |||
4615 | MachineBasicBlock *MBB = MI->getParent(); | |||
4616 | DebugLoc DL = MI->getDebugLoc(); | |||
4617 | Register SubReg = MRI.createVirtualRegister(SubRC); | |||
4618 | ||||
4619 | if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { | |||
4620 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
4621 | .addReg(SuperReg.getReg(), 0, SubIdx); | |||
4622 | return SubReg; | |||
4623 | } | |||
4624 | ||||
4625 | // Just in case the super register is itself a sub-register, copy it to a new | |||
4626 | // value so we don't need to worry about merging its subreg index with the | |||
4627 | // SubIdx passed to this function. The register coalescer should be able to | |||
4628 | // eliminate this extra copy. | |||
4629 | Register NewSuperReg = MRI.createVirtualRegister(SuperRC); | |||
4630 | ||||
4631 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) | |||
4632 | .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); | |||
4633 | ||||
4634 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) | |||
4635 | .addReg(NewSuperReg, 0, SubIdx); | |||
4636 | ||||
4637 | return SubReg; | |||
4638 | } | |||
4639 | ||||
4640 | MachineOperand SIInstrInfo::buildExtractSubRegOrImm( | |||
4641 | MachineBasicBlock::iterator MII, | |||
4642 | MachineRegisterInfo &MRI, | |||
4643 | MachineOperand &Op, | |||
4644 | const TargetRegisterClass *SuperRC, | |||
4645 | unsigned SubIdx, | |||
4646 | const TargetRegisterClass *SubRC) const { | |||
4647 | if (Op.isImm()) { | |||
4648 | if (SubIdx == AMDGPU::sub0) | |||
4649 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); | |||
4650 | if (SubIdx == AMDGPU::sub1) | |||
4651 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); | |||
4652 | ||||
4653 | llvm_unreachable("Unhandled register index for immediate")::llvm::llvm_unreachable_internal("Unhandled register index for immediate" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4653); | |||
4654 | } | |||
4655 | ||||
4656 | unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, | |||
4657 | SubIdx, SubRC); | |||
4658 | return MachineOperand::CreateReg(SubReg, false); | |||
4659 | } | |||
4660 | ||||
4661 | // Change the order of operands from (0, 1, 2) to (0, 2, 1) | |||
4662 | void SIInstrInfo::swapOperands(MachineInstr &Inst) const { | |||
4663 | assert(Inst.getNumExplicitOperands() == 3)(static_cast <bool> (Inst.getNumExplicitOperands() == 3 ) ? void (0) : __assert_fail ("Inst.getNumExplicitOperands() == 3" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4663, __extension__ __PRETTY_FUNCTION__)); | |||
4664 | MachineOperand Op1 = Inst.getOperand(1); | |||
4665 | Inst.RemoveOperand(1); | |||
4666 | Inst.addOperand(Op1); | |||
4667 | } | |||
4668 | ||||
4669 | bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, | |||
4670 | const MCOperandInfo &OpInfo, | |||
4671 | const MachineOperand &MO) const { | |||
4672 | if (!MO.isReg()) | |||
4673 | return false; | |||
4674 | ||||
4675 | Register Reg = MO.getReg(); | |||
4676 | ||||
4677 | const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); | |||
4678 | if (Reg.isPhysical()) | |||
4679 | return DRC->contains(Reg); | |||
4680 | ||||
4681 | const TargetRegisterClass *RC = MRI.getRegClass(Reg); | |||
4682 | ||||
4683 | if (MO.getSubReg()) { | |||
4684 | const MachineFunction *MF = MO.getParent()->getParent()->getParent(); | |||
4685 | const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); | |||
4686 | if (!SuperRC) | |||
4687 | return false; | |||
4688 | ||||
4689 | DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); | |||
4690 | if (!DRC) | |||
4691 | return false; | |||
4692 | } | |||
4693 | return RC->hasSuperClassEq(DRC); | |||
4694 | } | |||
4695 | ||||
4696 | bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, | |||
4697 | const MCOperandInfo &OpInfo, | |||
4698 | const MachineOperand &MO) const { | |||
4699 | if (MO.isReg()) | |||
4700 | return isLegalRegOperand(MRI, OpInfo, MO); | |||
4701 | ||||
4702 | // Handle non-register types that are treated like immediates. | |||
4703 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal())(static_cast <bool> (MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) ? void (0) : __assert_fail ("MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4703, __extension__ __PRETTY_FUNCTION__)); | |||
4704 | return true; | |||
4705 | } | |||
4706 | ||||
4707 | bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, | |||
4708 | const MachineOperand *MO) const { | |||
4709 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
4710 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
4711 | const MCInstrDesc &InstDesc = MI.getDesc(); | |||
4712 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; | |||
4713 | const TargetRegisterClass *DefinedRC = | |||
4714 | OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; | |||
4715 | if (!MO) | |||
4716 | MO = &MI.getOperand(OpIdx); | |||
4717 | ||||
4718 | int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); | |||
4719 | int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
4720 | if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { | |||
4721 | if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) | |||
4722 | return false; | |||
4723 | ||||
4724 | SmallDenseSet<RegSubRegPair> SGPRsUsed; | |||
4725 | if (MO->isReg()) | |||
4726 | SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); | |||
4727 | ||||
4728 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { | |||
4729 | if (i == OpIdx) | |||
4730 | continue; | |||
4731 | const MachineOperand &Op = MI.getOperand(i); | |||
4732 | if (Op.isReg()) { | |||
4733 | RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); | |||
4734 | if (!SGPRsUsed.count(SGPR) && | |||
4735 | usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { | |||
4736 | if (--ConstantBusLimit <= 0) | |||
4737 | return false; | |||
4738 | SGPRsUsed.insert(SGPR); | |||
4739 | } | |||
4740 | } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { | |||
4741 | if (--ConstantBusLimit <= 0) | |||
4742 | return false; | |||
4743 | } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && | |||
4744 | isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { | |||
4745 | if (!VOP3LiteralLimit--) | |||
4746 | return false; | |||
4747 | if (--ConstantBusLimit <= 0) | |||
4748 | return false; | |||
4749 | } | |||
4750 | } | |||
4751 | } | |||
4752 | ||||
4753 | if (MO->isReg()) { | |||
4754 | assert(DefinedRC)(static_cast <bool> (DefinedRC) ? void (0) : __assert_fail ("DefinedRC", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4754, __extension__ __PRETTY_FUNCTION__)); | |||
4755 | if (!isLegalRegOperand(MRI, OpInfo, *MO)) | |||
4756 | return false; | |||
4757 | bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); | |||
4758 | if (IsAGPR && !ST.hasMAIInsts()) | |||
4759 | return false; | |||
4760 | unsigned Opc = MI.getOpcode(); | |||
4761 | if (IsAGPR && | |||
4762 | (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && | |||
4763 | (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) | |||
4764 | return false; | |||
4765 | // Atomics should have both vdst and vdata either vgpr or agpr. | |||
4766 | const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); | |||
4767 | const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, | |||
4768 | isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); | |||
4769 | if ((int)OpIdx == VDstIdx && DataIdx != -1 && | |||
4770 | MI.getOperand(DataIdx).isReg() && | |||
4771 | RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) | |||
4772 | return false; | |||
4773 | if ((int)OpIdx == DataIdx) { | |||
4774 | if (VDstIdx != -1 && | |||
4775 | RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) | |||
4776 | return false; | |||
4777 | // DS instructions with 2 src operands also must have tied RC. | |||
4778 | const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, | |||
4779 | AMDGPU::OpName::data1); | |||
4780 | if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && | |||
4781 | RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) | |||
4782 | return false; | |||
4783 | } | |||
4784 | if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && | |||
4785 | (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && | |||
4786 | RI.isSGPRReg(MRI, MO->getReg())) | |||
4787 | return false; | |||
4788 | return true; | |||
4789 | } | |||
4790 | ||||
4791 | // Handle non-register types that are treated like immediates. | |||
4792 | assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal())(static_cast <bool> (MO->isImm() || MO->isTargetIndex () || MO->isFI() || MO->isGlobal()) ? void (0) : __assert_fail ("MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4792, __extension__ __PRETTY_FUNCTION__)); | |||
4793 | ||||
4794 | if (!DefinedRC) { | |||
4795 | // This operand expects an immediate. | |||
4796 | return true; | |||
4797 | } | |||
4798 | ||||
4799 | return isImmOperandLegal(MI, OpIdx, *MO); | |||
4800 | } | |||
4801 | ||||
4802 | void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, | |||
4803 | MachineInstr &MI) const { | |||
4804 | unsigned Opc = MI.getOpcode(); | |||
4805 | const MCInstrDesc &InstrDesc = get(Opc); | |||
4806 | ||||
4807 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
4808 | MachineOperand &Src0 = MI.getOperand(Src0Idx); | |||
4809 | ||||
4810 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
4811 | MachineOperand &Src1 = MI.getOperand(Src1Idx); | |||
4812 | ||||
4813 | // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 | |||
4814 | // we need to only have one constant bus use before GFX10. | |||
4815 | bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; | |||
4816 | if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && | |||
4817 | Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || | |||
4818 | isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) | |||
4819 | legalizeOpWithMove(MI, Src0Idx); | |||
4820 | ||||
4821 | // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for | |||
4822 | // both the value to write (src0) and lane select (src1). Fix up non-SGPR | |||
4823 | // src0/src1 with V_READFIRSTLANE. | |||
4824 | if (Opc == AMDGPU::V_WRITELANE_B32) { | |||
4825 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4826 | if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { | |||
4827 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4828 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4829 | .add(Src0); | |||
4830 | Src0.ChangeToRegister(Reg, false); | |||
4831 | } | |||
4832 | if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { | |||
4833 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4834 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4835 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4836 | .add(Src1); | |||
4837 | Src1.ChangeToRegister(Reg, false); | |||
4838 | } | |||
4839 | return; | |||
4840 | } | |||
4841 | ||||
4842 | // No VOP2 instructions support AGPRs. | |||
4843 | if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) | |||
4844 | legalizeOpWithMove(MI, Src0Idx); | |||
4845 | ||||
4846 | if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) | |||
4847 | legalizeOpWithMove(MI, Src1Idx); | |||
4848 | ||||
4849 | // VOP2 src0 instructions support all operand types, so we don't need to check | |||
4850 | // their legality. If src1 is already legal, we don't need to do anything. | |||
4851 | if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) | |||
4852 | return; | |||
4853 | ||||
4854 | // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for | |||
4855 | // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane | |||
4856 | // select is uniform. | |||
4857 | if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && | |||
4858 | RI.isVGPR(MRI, Src1.getReg())) { | |||
4859 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4860 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4861 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4862 | .add(Src1); | |||
4863 | Src1.ChangeToRegister(Reg, false); | |||
4864 | return; | |||
4865 | } | |||
4866 | ||||
4867 | // We do not use commuteInstruction here because it is too aggressive and will | |||
4868 | // commute if it is possible. We only want to commute here if it improves | |||
4869 | // legality. This can be called a fairly large number of times so don't waste | |||
4870 | // compile time pointlessly swapping and checking legality again. | |||
4871 | if (HasImplicitSGPR || !MI.isCommutable()) { | |||
4872 | legalizeOpWithMove(MI, Src1Idx); | |||
4873 | return; | |||
4874 | } | |||
4875 | ||||
4876 | // If src0 can be used as src1, commuting will make the operands legal. | |||
4877 | // Otherwise we have to give up and insert a move. | |||
4878 | // | |||
4879 | // TODO: Other immediate-like operand kinds could be commuted if there was a | |||
4880 | // MachineOperand::ChangeTo* for them. | |||
4881 | if ((!Src1.isImm() && !Src1.isReg()) || | |||
4882 | !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { | |||
4883 | legalizeOpWithMove(MI, Src1Idx); | |||
4884 | return; | |||
4885 | } | |||
4886 | ||||
4887 | int CommutedOpc = commuteOpcode(MI); | |||
4888 | if (CommutedOpc == -1) { | |||
4889 | legalizeOpWithMove(MI, Src1Idx); | |||
4890 | return; | |||
4891 | } | |||
4892 | ||||
4893 | MI.setDesc(get(CommutedOpc)); | |||
4894 | ||||
4895 | Register Src0Reg = Src0.getReg(); | |||
4896 | unsigned Src0SubReg = Src0.getSubReg(); | |||
4897 | bool Src0Kill = Src0.isKill(); | |||
4898 | ||||
4899 | if (Src1.isImm()) | |||
4900 | Src0.ChangeToImmediate(Src1.getImm()); | |||
4901 | else if (Src1.isReg()) { | |||
4902 | Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); | |||
4903 | Src0.setSubReg(Src1.getSubReg()); | |||
4904 | } else | |||
4905 | llvm_unreachable("Should only have register or immediate operands")::llvm::llvm_unreachable_internal("Should only have register or immediate operands" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4905); | |||
4906 | ||||
4907 | Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); | |||
4908 | Src1.setSubReg(Src0SubReg); | |||
4909 | fixImplicitOperands(MI); | |||
4910 | } | |||
4911 | ||||
4912 | // Legalize VOP3 operands. All operand types are supported for any operand | |||
4913 | // but only one literal constant and only starting from GFX10. | |||
4914 | void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, | |||
4915 | MachineInstr &MI) const { | |||
4916 | unsigned Opc = MI.getOpcode(); | |||
4917 | ||||
4918 | int VOP3Idx[3] = { | |||
4919 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), | |||
4920 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), | |||
4921 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) | |||
4922 | }; | |||
4923 | ||||
4924 | if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || | |||
4925 | Opc == AMDGPU::V_PERMLANEX16_B32_e64) { | |||
4926 | // src1 and src2 must be scalar | |||
4927 | MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); | |||
4928 | MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); | |||
4929 | const DebugLoc &DL = MI.getDebugLoc(); | |||
4930 | if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { | |||
4931 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4932 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4933 | .add(Src1); | |||
4934 | Src1.ChangeToRegister(Reg, false); | |||
4935 | } | |||
4936 | if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { | |||
4937 | Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
4938 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) | |||
4939 | .add(Src2); | |||
4940 | Src2.ChangeToRegister(Reg, false); | |||
4941 | } | |||
4942 | } | |||
4943 | ||||
4944 | // Find the one SGPR operand we are allowed to use. | |||
4945 | int ConstantBusLimit = ST.getConstantBusLimit(Opc); | |||
4946 | int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; | |||
4947 | SmallDenseSet<unsigned> SGPRsUsed; | |||
4948 | Register SGPRReg = findUsedSGPR(MI, VOP3Idx); | |||
4949 | if (SGPRReg != AMDGPU::NoRegister) { | |||
4950 | SGPRsUsed.insert(SGPRReg); | |||
4951 | --ConstantBusLimit; | |||
4952 | } | |||
4953 | ||||
4954 | for (unsigned i = 0; i < 3; ++i) { | |||
4955 | int Idx = VOP3Idx[i]; | |||
4956 | if (Idx == -1) | |||
4957 | break; | |||
4958 | MachineOperand &MO = MI.getOperand(Idx); | |||
4959 | ||||
4960 | if (!MO.isReg()) { | |||
4961 | if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) | |||
4962 | continue; | |||
4963 | ||||
4964 | if (LiteralLimit > 0 && ConstantBusLimit > 0) { | |||
4965 | --LiteralLimit; | |||
4966 | --ConstantBusLimit; | |||
4967 | continue; | |||
4968 | } | |||
4969 | ||||
4970 | --LiteralLimit; | |||
4971 | --ConstantBusLimit; | |||
4972 | legalizeOpWithMove(MI, Idx); | |||
4973 | continue; | |||
4974 | } | |||
4975 | ||||
4976 | if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) && | |||
4977 | !isOperandLegal(MI, Idx, &MO)) { | |||
4978 | legalizeOpWithMove(MI, Idx); | |||
4979 | continue; | |||
4980 | } | |||
4981 | ||||
4982 | if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg()))) | |||
4983 | continue; // VGPRs are legal | |||
4984 | ||||
4985 | // We can use one SGPR in each VOP3 instruction prior to GFX10 | |||
4986 | // and two starting from GFX10. | |||
4987 | if (SGPRsUsed.count(MO.getReg())) | |||
4988 | continue; | |||
4989 | if (ConstantBusLimit > 0) { | |||
4990 | SGPRsUsed.insert(MO.getReg()); | |||
4991 | --ConstantBusLimit; | |||
4992 | continue; | |||
4993 | } | |||
4994 | ||||
4995 | // If we make it this far, then the operand is not legal and we must | |||
4996 | // legalize it. | |||
4997 | legalizeOpWithMove(MI, Idx); | |||
4998 | } | |||
4999 | } | |||
5000 | ||||
5001 | Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, | |||
5002 | MachineRegisterInfo &MRI) const { | |||
5003 | const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); | |||
5004 | const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); | |||
5005 | Register DstReg = MRI.createVirtualRegister(SRC); | |||
5006 | unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; | |||
5007 | ||||
5008 | if (RI.hasAGPRs(VRC)) { | |||
5009 | VRC = RI.getEquivalentVGPRClass(VRC); | |||
5010 | Register NewSrcReg = MRI.createVirtualRegister(VRC); | |||
5011 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5012 | get(TargetOpcode::COPY), NewSrcReg) | |||
5013 | .addReg(SrcReg); | |||
5014 | SrcReg = NewSrcReg; | |||
5015 | } | |||
5016 | ||||
5017 | if (SubRegs == 1) { | |||
5018 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5019 | get(AMDGPU::V_READFIRSTLANE_B32), DstReg) | |||
5020 | .addReg(SrcReg); | |||
5021 | return DstReg; | |||
5022 | } | |||
5023 | ||||
5024 | SmallVector<unsigned, 8> SRegs; | |||
5025 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
5026 | Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5027 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5028 | get(AMDGPU::V_READFIRSTLANE_B32), SGPR) | |||
5029 | .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); | |||
5030 | SRegs.push_back(SGPR); | |||
5031 | } | |||
5032 | ||||
5033 | MachineInstrBuilder MIB = | |||
5034 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), | |||
5035 | get(AMDGPU::REG_SEQUENCE), DstReg); | |||
5036 | for (unsigned i = 0; i < SubRegs; ++i) { | |||
5037 | MIB.addReg(SRegs[i]); | |||
5038 | MIB.addImm(RI.getSubRegFromChannel(i)); | |||
5039 | } | |||
5040 | return DstReg; | |||
5041 | } | |||
5042 | ||||
5043 | void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, | |||
5044 | MachineInstr &MI) const { | |||
5045 | ||||
5046 | // If the pointer is store in VGPRs, then we need to move them to | |||
5047 | // SGPRs using v_readfirstlane. This is safe because we only select | |||
5048 | // loads with uniform pointers to SMRD instruction so we know the | |||
5049 | // pointer value is uniform. | |||
5050 | MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); | |||
5051 | if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { | |||
5052 | Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); | |||
5053 | SBase->setReg(SGPR); | |||
5054 | } | |||
5055 | MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); | |||
5056 | if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { | |||
5057 | Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); | |||
5058 | SOff->setReg(SGPR); | |||
5059 | } | |||
5060 | } | |||
5061 | ||||
5062 | bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { | |||
5063 | unsigned Opc = Inst.getOpcode(); | |||
5064 | int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); | |||
5065 | if (OldSAddrIdx < 0) | |||
5066 | return false; | |||
5067 | ||||
5068 | assert(isSegmentSpecificFLAT(Inst))(static_cast <bool> (isSegmentSpecificFLAT(Inst)) ? void (0) : __assert_fail ("isSegmentSpecificFLAT(Inst)", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 5068, __extension__ __PRETTY_FUNCTION__)); | |||
5069 | ||||
5070 | int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); | |||
5071 | if (NewOpc < 0) | |||
5072 | NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc); | |||
5073 | if (NewOpc < 0) | |||
5074 | return false; | |||
5075 | ||||
5076 | MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo(); | |||
5077 | MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx); | |||
5078 | if (RI.isSGPRReg(MRI, SAddr.getReg())) | |||
5079 | return false; | |||
5080 | ||||
5081 | int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr); | |||
5082 | if (NewVAddrIdx < 0) | |||
5083 | return false; | |||
5084 | ||||
5085 | int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); | |||
5086 | ||||
5087 | // Check vaddr, it shall be zero or absent. | |||
5088 | MachineInstr *VAddrDef = nullptr; | |||
5089 | if (OldVAddrIdx >= 0) { | |||
5090 | MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); | |||
5091 | VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); | |||
5092 | if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || | |||
5093 | !VAddrDef->getOperand(1).isImm() || | |||
5094 | VAddrDef->getOperand(1).getImm() != 0) | |||
5095 | return false; | |||
5096 | } | |||
5097 | ||||
5098 | const MCInstrDesc &NewDesc = get(NewOpc); | |||
5099 | Inst.setDesc(NewDesc); | |||
5100 | ||||
5101 | // Callers expect interator to be valid after this call, so modify the | |||
5102 | // instruction in place. | |||
5103 | if (OldVAddrIdx == NewVAddrIdx) { | |||
5104 | MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); | |||
5105 | // Clear use list from the old vaddr holding a zero register. | |||
5106 | MRI.removeRegOperandFromUseList(&NewVAddr); | |||
5107 | MRI.moveOperands(&NewVAddr, &SAddr, 1); | |||
5108 | Inst.RemoveOperand(OldSAddrIdx); | |||
5109 | // Update the use list with the pointer we have just moved from vaddr to | |||
5110 | // saddr poisition. Otherwise new vaddr will be missing from the use list. | |||
5111 | MRI.removeRegOperandFromUseList(&NewVAddr); | |||
5112 | MRI.addRegOperandToUseList(&NewVAddr); | |||
5113 | } else { | |||
5114 | assert(OldSAddrIdx == NewVAddrIdx)(static_cast <bool> (OldSAddrIdx == NewVAddrIdx) ? void (0) : __assert_fail ("OldSAddrIdx == NewVAddrIdx", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 5114, __extension__ __PRETTY_FUNCTION__)); | |||
5115 | ||||
5116 | if (OldVAddrIdx >= 0) { | |||
5117 | int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, | |||
5118 | AMDGPU::OpName::vdst_in); | |||
5119 | ||||
5120 | // RemoveOperand doesn't try to fixup tied operand indexes at it goes, so | |||
5121 | // it asserts. Untie the operands for now and retie them afterwards. | |||
5122 | if (NewVDstIn != -1) { | |||
5123 | int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); | |||
5124 | Inst.untieRegOperand(OldVDstIn); | |||
5125 | } | |||
5126 | ||||
5127 | Inst.RemoveOperand(OldVAddrIdx); | |||
5128 | ||||
5129 | if (NewVDstIn != -1) { | |||
5130 | int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); | |||
5131 | Inst.tieOperands(NewVDst, NewVDstIn); | |||
5132 | } | |||
5133 | } | |||
5134 | } | |||
5135 | ||||
5136 | if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg())) | |||
5137 | VAddrDef->eraseFromParent(); | |||
5138 | ||||
5139 | return true; | |||
5140 | } | |||
5141 | ||||
5142 | // FIXME: Remove this when SelectionDAG is obsoleted. | |||
5143 | void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, | |||
5144 | MachineInstr &MI) const { | |||
5145 | if (!isSegmentSpecificFLAT(MI)) | |||
5146 | return; | |||
5147 | ||||
5148 | // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence | |||
5149 | // thinks they are uniform, so a readfirstlane should be valid. | |||
5150 | MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); | |||
5151 | if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) | |||
5152 | return; | |||
5153 | ||||
5154 | if (moveFlatAddrToVGPR(MI)) | |||
5155 | return; | |||
5156 | ||||
5157 | Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); | |||
5158 | SAddr->setReg(ToSGPR); | |||
5159 | } | |||
5160 | ||||
5161 | void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, | |||
5162 | MachineBasicBlock::iterator I, | |||
5163 | const TargetRegisterClass *DstRC, | |||
5164 | MachineOperand &Op, | |||
5165 | MachineRegisterInfo &MRI, | |||
5166 | const DebugLoc &DL) const { | |||
5167 | Register OpReg = Op.getReg(); | |||
5168 | unsigned OpSubReg = Op.getSubReg(); | |||
5169 | ||||
5170 | const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( | |||
5171 | RI.getRegClassForReg(MRI, OpReg), OpSubReg); | |||
5172 | ||||
5173 | // Check if operand is already the correct register class. | |||
5174 | if (DstRC == OpRC) | |||
5175 | return; | |||
5176 | ||||
5177 | Register DstReg = MRI.createVirtualRegister(DstRC); | |||
5178 | MachineInstr *Copy = | |||
5179 | BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); | |||
5180 | ||||
5181 | Op.setReg(DstReg); | |||
5182 | Op.setSubReg(0); | |||
5183 | ||||
5184 | MachineInstr *Def = MRI.getVRegDef(OpReg); | |||
5185 | if (!Def) | |||
5186 | return; | |||
5187 | ||||
5188 | // Try to eliminate the copy if it is copying an immediate value. | |||
5189 | if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) | |||
5190 | FoldImmediate(*Copy, *Def, OpReg, &MRI); | |||
5191 | ||||
5192 | bool ImpDef = Def->isImplicitDef(); | |||
5193 | while (!ImpDef && Def && Def->isCopy()) { | |||
5194 | if (Def->getOperand(1).getReg().isPhysical()) | |||
5195 | break; | |||
5196 | Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); | |||
5197 | ImpDef = Def && Def->isImplicitDef(); | |||
5198 | } | |||
5199 | if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && | |||
5200 | !ImpDef) | |||
5201 | Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); | |||
5202 | } | |||
5203 | ||||
5204 | // Emit the actual waterfall loop, executing the wrapped instruction for each | |||
5205 | // unique value of \p Rsrc across all lanes. In the best case we execute 1 | |||
5206 | // iteration, in the worst case we execute 64 (once per lane). | |||
5207 | static void | |||
5208 | emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, | |||
5209 | MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, | |||
5210 | const DebugLoc &DL, MachineOperand &Rsrc) { | |||
5211 | MachineFunction &MF = *OrigBB.getParent(); | |||
5212 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
5213 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
5214 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5215 | unsigned SaveExecOpc = | |||
5216 | ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; | |||
5217 | unsigned XorTermOpc = | |||
5218 | ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; | |||
5219 | unsigned AndOpc = | |||
5220 | ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; | |||
5221 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5222 | ||||
5223 | MachineBasicBlock::iterator I = LoopBB.begin(); | |||
5224 | ||||
5225 | SmallVector<Register, 8> ReadlanePieces; | |||
5226 | Register CondReg = AMDGPU::NoRegister; | |||
5227 | ||||
5228 | Register VRsrc = Rsrc.getReg(); | |||
5229 | unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); | |||
5230 | ||||
5231 | unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI); | |||
5232 | unsigned NumSubRegs = RegSize / 32; | |||
5233 | assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size")(static_cast <bool> (NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size") ? void (0) : __assert_fail ("NumSubRegs % 2 == 0 && NumSubRegs <= 32 && \"Unhandled register size\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 5233, __extension__ __PRETTY_FUNCTION__)); | |||
5234 | ||||
5235 | for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { | |||
5236 | ||||
5237 | Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5238 | Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5239 | ||||
5240 | // Read the next variant <- also loop target. | |||
5241 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) | |||
5242 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx)); | |||
5243 | ||||
5244 | // Read the next variant <- also loop target. | |||
5245 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) | |||
5246 | .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1)); | |||
5247 | ||||
5248 | ReadlanePieces.push_back(CurRegLo); | |||
5249 | ReadlanePieces.push_back(CurRegHi); | |||
5250 | ||||
5251 | // Comparison is to be done as 64-bit. | |||
5252 | Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); | |||
5253 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) | |||
5254 | .addReg(CurRegLo) | |||
5255 | .addImm(AMDGPU::sub0) | |||
5256 | .addReg(CurRegHi) | |||
5257 | .addImm(AMDGPU::sub1); | |||
5258 | ||||
5259 | Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); | |||
5260 | auto Cmp = | |||
5261 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) | |||
5262 | .addReg(CurReg); | |||
5263 | if (NumSubRegs <= 2) | |||
5264 | Cmp.addReg(VRsrc); | |||
5265 | else | |||
5266 | Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); | |||
5267 | ||||
5268 | // Combine the comparision results with AND. | |||
5269 | if (CondReg == AMDGPU::NoRegister) // First. | |||
5270 | CondReg = NewCondReg; | |||
5271 | else { // If not the first, we create an AND. | |||
5272 | Register AndReg = MRI.createVirtualRegister(BoolXExecRC); | |||
5273 | BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) | |||
5274 | .addReg(CondReg) | |||
5275 | .addReg(NewCondReg); | |||
5276 | CondReg = AndReg; | |||
5277 | } | |||
5278 | } // End for loop. | |||
5279 | ||||
5280 | auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc)); | |||
5281 | Register SRsrc = MRI.createVirtualRegister(SRsrcRC); | |||
5282 | ||||
5283 | // Build scalar Rsrc. | |||
5284 | auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc); | |||
5285 | unsigned Channel = 0; | |||
5286 | for (Register Piece : ReadlanePieces) { | |||
5287 | Merge.addReg(Piece) | |||
5288 | .addImm(TRI->getSubRegFromChannel(Channel++)); | |||
5289 | } | |||
5290 | ||||
5291 | // Update Rsrc operand to use the SGPR Rsrc. | |||
5292 | Rsrc.setReg(SRsrc); | |||
5293 | Rsrc.setIsKill(true); | |||
5294 | ||||
5295 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
5296 | MRI.setSimpleHint(SaveExec, CondReg); | |||
5297 | ||||
5298 | // Update EXEC to matching lanes, saving original to SaveExec. | |||
5299 | BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) | |||
5300 | .addReg(CondReg, RegState::Kill); | |||
5301 | ||||
5302 | // The original instruction is here; we insert the terminators after it. | |||
5303 | I = LoopBB.end(); | |||
5304 | ||||
5305 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. | |||
5306 | BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) | |||
5307 | .addReg(Exec) | |||
5308 | .addReg(SaveExec); | |||
5309 | ||||
5310 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); | |||
5311 | } | |||
5312 | ||||
5313 | // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register | |||
5314 | // with SGPRs by iterating over all unique values across all lanes. | |||
5315 | // Returns the loop basic block that now contains \p MI. | |||
5316 | static MachineBasicBlock * | |||
5317 | loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, | |||
5318 | MachineOperand &Rsrc, MachineDominatorTree *MDT, | |||
5319 | MachineBasicBlock::iterator Begin = nullptr, | |||
5320 | MachineBasicBlock::iterator End = nullptr) { | |||
5321 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5322 | MachineFunction &MF = *MBB.getParent(); | |||
5323 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
5324 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); | |||
5325 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5326 | if (!Begin.isValid()) | |||
5327 | Begin = &MI; | |||
5328 | if (!End.isValid()) { | |||
5329 | End = &MI; | |||
5330 | ++End; | |||
5331 | } | |||
5332 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5333 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5334 | unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; | |||
5335 | const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5336 | ||||
5337 | Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); | |||
5338 | ||||
5339 | // Save the EXEC mask | |||
5340 | BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); | |||
5341 | ||||
5342 | // Killed uses in the instruction we are waterfalling around will be | |||
5343 | // incorrect due to the added control-flow. | |||
5344 | MachineBasicBlock::iterator AfterMI = MI; | |||
5345 | ++AfterMI; | |||
5346 | for (auto I = Begin; I != AfterMI; I++) { | |||
5347 | for (auto &MO : I->uses()) { | |||
5348 | if (MO.isReg() && MO.isUse()) { | |||
5349 | MRI.clearKillFlags(MO.getReg()); | |||
5350 | } | |||
5351 | } | |||
5352 | } | |||
5353 | ||||
5354 | // To insert the loop we need to split the block. Move everything after this | |||
5355 | // point to a new block, and insert a new empty block between the two. | |||
5356 | MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); | |||
5357 | MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); | |||
5358 | MachineFunction::iterator MBBI(MBB); | |||
5359 | ++MBBI; | |||
5360 | ||||
5361 | MF.insert(MBBI, LoopBB); | |||
5362 | MF.insert(MBBI, RemainderBB); | |||
5363 | ||||
5364 | LoopBB->addSuccessor(LoopBB); | |||
5365 | LoopBB->addSuccessor(RemainderBB); | |||
5366 | ||||
5367 | // Move Begin to MI to the LoopBB, and the remainder of the block to | |||
5368 | // RemainderBB. | |||
5369 | RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); | |||
5370 | RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); | |||
5371 | LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); | |||
5372 | ||||
5373 | MBB.addSuccessor(LoopBB); | |||
5374 | ||||
5375 | // Update dominators. We know that MBB immediately dominates LoopBB, that | |||
5376 | // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately | |||
5377 | // dominates all of the successors transferred to it from MBB that MBB used | |||
5378 | // to properly dominate. | |||
5379 | if (MDT) { | |||
5380 | MDT->addNewBlock(LoopBB, &MBB); | |||
5381 | MDT->addNewBlock(RemainderBB, LoopBB); | |||
5382 | for (auto &Succ : RemainderBB->successors()) { | |||
5383 | if (MDT->properlyDominates(&MBB, Succ)) { | |||
5384 | MDT->changeImmediateDominator(Succ, RemainderBB); | |||
5385 | } | |||
5386 | } | |||
5387 | } | |||
5388 | ||||
5389 | emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); | |||
5390 | ||||
5391 | // Restore the EXEC mask | |||
5392 | MachineBasicBlock::iterator First = RemainderBB->begin(); | |||
5393 | BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); | |||
5394 | return LoopBB; | |||
5395 | } | |||
5396 | ||||
5397 | // Extract pointer from Rsrc and return a zero-value Rsrc replacement. | |||
5398 | static std::tuple<unsigned, unsigned> | |||
5399 | extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { | |||
5400 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5401 | MachineFunction &MF = *MBB.getParent(); | |||
5402 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5403 | ||||
5404 | // Extract the ptr from the resource descriptor. | |||
5405 | unsigned RsrcPtr = | |||
5406 | TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, | |||
5407 | AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); | |||
5408 | ||||
5409 | // Create an empty resource descriptor | |||
5410 | Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
5411 | Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5412 | Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); | |||
5413 | Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); | |||
5414 | uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); | |||
5415 | ||||
5416 | // Zero64 = 0 | |||
5417 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) | |||
5418 | .addImm(0); | |||
5419 | ||||
5420 | // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} | |||
5421 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) | |||
5422 | .addImm(RsrcDataFormat & 0xFFFFFFFF); | |||
5423 | ||||
5424 | // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} | |||
5425 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) | |||
5426 | .addImm(RsrcDataFormat >> 32); | |||
5427 | ||||
5428 | // NewSRsrc = {Zero64, SRsrcFormat} | |||
5429 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) | |||
5430 | .addReg(Zero64) | |||
5431 | .addImm(AMDGPU::sub0_sub1) | |||
5432 | .addReg(SRsrcFormatLo) | |||
5433 | .addImm(AMDGPU::sub2) | |||
5434 | .addReg(SRsrcFormatHi) | |||
5435 | .addImm(AMDGPU::sub3); | |||
5436 | ||||
5437 | return std::make_tuple(RsrcPtr, NewSRsrc); | |||
5438 | } | |||
5439 | ||||
5440 | MachineBasicBlock * | |||
5441 | SIInstrInfo::legalizeOperands(MachineInstr &MI, | |||
5442 | MachineDominatorTree *MDT) const { | |||
5443 | MachineFunction &MF = *MI.getParent()->getParent(); | |||
5444 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
5445 | MachineBasicBlock *CreatedBB = nullptr; | |||
5446 | ||||
5447 | // Legalize VOP2 | |||
5448 | if (isVOP2(MI) || isVOPC(MI)) { | |||
5449 | legalizeOperandsVOP2(MRI, MI); | |||
5450 | return CreatedBB; | |||
5451 | } | |||
5452 | ||||
5453 | // Legalize VOP3 | |||
5454 | if (isVOP3(MI)) { | |||
5455 | legalizeOperandsVOP3(MRI, MI); | |||
5456 | return CreatedBB; | |||
5457 | } | |||
5458 | ||||
5459 | // Legalize SMRD | |||
5460 | if (isSMRD(MI)) { | |||
5461 | legalizeOperandsSMRD(MRI, MI); | |||
5462 | return CreatedBB; | |||
5463 | } | |||
5464 | ||||
5465 | // Legalize FLAT | |||
5466 | if (isFLAT(MI)) { | |||
5467 | legalizeOperandsFLAT(MRI, MI); | |||
5468 | return CreatedBB; | |||
5469 | } | |||
5470 | ||||
5471 | // Legalize REG_SEQUENCE and PHI | |||
5472 | // The register class of the operands much be the same type as the register | |||
5473 | // class of the output. | |||
5474 | if (MI.getOpcode() == AMDGPU::PHI) { | |||
5475 | const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; | |||
5476 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { | |||
5477 | if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) | |||
5478 | continue; | |||
5479 | const TargetRegisterClass *OpRC = | |||
5480 | MRI.getRegClass(MI.getOperand(i).getReg()); | |||
5481 | if (RI.hasVectorRegisters(OpRC)) { | |||
5482 | VRC = OpRC; | |||
5483 | } else { | |||
5484 | SRC = OpRC; | |||
5485 | } | |||
5486 | } | |||
5487 | ||||
5488 | // If any of the operands are VGPR registers, then they all most be | |||
5489 | // otherwise we will create illegal VGPR->SGPR copies when legalizing | |||
5490 | // them. | |||
5491 | if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { | |||
5492 | if (!VRC) { | |||
5493 | assert(SRC)(static_cast <bool> (SRC) ? void (0) : __assert_fail ("SRC" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 5493, __extension__ __PRETTY_FUNCTION__)); | |||
5494 | if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { | |||
5495 | VRC = &AMDGPU::VReg_1RegClass; | |||
5496 | } else | |||
5497 | VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) | |||
5498 | ? RI.getEquivalentAGPRClass(SRC) | |||
5499 | : RI.getEquivalentVGPRClass(SRC); | |||
5500 | } else { | |||
5501 | VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) | |||
5502 | ? RI.getEquivalentAGPRClass(VRC) | |||
5503 | : RI.getEquivalentVGPRClass(VRC); | |||
5504 | } | |||
5505 | RC = VRC; | |||
5506 | } else { | |||
5507 | RC = SRC; | |||
5508 | } | |||
5509 | ||||
5510 | // Update all the operands so they have the same type. | |||
5511 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5512 | MachineOperand &Op = MI.getOperand(I); | |||
5513 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5514 | continue; | |||
5515 | ||||
5516 | // MI is a PHI instruction. | |||
5517 | MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); | |||
5518 | MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); | |||
5519 | ||||
5520 | // Avoid creating no-op copies with the same src and dst reg class. These | |||
5521 | // confuse some of the machine passes. | |||
5522 | legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); | |||
5523 | } | |||
5524 | } | |||
5525 | ||||
5526 | // REG_SEQUENCE doesn't really require operand legalization, but if one has a | |||
5527 | // VGPR dest type and SGPR sources, insert copies so all operands are | |||
5528 | // VGPRs. This seems to help operand folding / the register coalescer. | |||
5529 | if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { | |||
5530 | MachineBasicBlock *MBB = MI.getParent(); | |||
5531 | const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); | |||
5532 | if (RI.hasVGPRs(DstRC)) { | |||
5533 | // Update all the operands so they are VGPR register classes. These may | |||
5534 | // not be the same register class because REG_SEQUENCE supports mixing | |||
5535 | // subregister index types e.g. sub0_sub1 + sub2 + sub3 | |||
5536 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
5537 | MachineOperand &Op = MI.getOperand(I); | |||
5538 | if (!Op.isReg() || !Op.getReg().isVirtual()) | |||
5539 | continue; | |||
5540 | ||||
5541 | const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); | |||
5542 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); | |||
5543 | if (VRC == OpRC) | |||
5544 | continue; | |||
5545 | ||||
5546 | legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); | |||
5547 | Op.setIsKill(); | |||
5548 | } | |||
5549 | } | |||
5550 | ||||
5551 | return CreatedBB; | |||
5552 | } | |||
5553 | ||||
5554 | // Legalize INSERT_SUBREG | |||
5555 | // src0 must have the same register class as dst | |||
5556 | if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { | |||
5557 | Register Dst = MI.getOperand(0).getReg(); | |||
5558 | Register Src0 = MI.getOperand(1).getReg(); | |||
5559 | const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); | |||
5560 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); | |||
5561 | if (DstRC != Src0RC) { | |||
5562 | MachineBasicBlock *MBB = MI.getParent(); | |||
5563 | MachineOperand &Op = MI.getOperand(1); | |||
5564 | legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); | |||
5565 | } | |||
5566 | return CreatedBB; | |||
5567 | } | |||
5568 | ||||
5569 | // Legalize SI_INIT_M0 | |||
5570 | if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { | |||
5571 | MachineOperand &Src = MI.getOperand(0); | |||
5572 | if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) | |||
5573 | Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); | |||
5574 | return CreatedBB; | |||
5575 | } | |||
5576 | ||||
5577 | // Legalize MIMG and MUBUF/MTBUF for shaders. | |||
5578 | // | |||
5579 | // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via | |||
5580 | // scratch memory access. In both cases, the legalization never involves | |||
5581 | // conversion to the addr64 form. | |||
5582 | if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && | |||
5583 | (isMUBUF(MI) || isMTBUF(MI)))) { | |||
5584 | MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); | |||
5585 | if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) | |||
5586 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); | |||
5587 | ||||
5588 | MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); | |||
5589 | if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) | |||
5590 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); | |||
5591 | ||||
5592 | return CreatedBB; | |||
5593 | } | |||
5594 | ||||
5595 | // Legalize SI_CALL | |||
5596 | if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { | |||
5597 | MachineOperand *Dest = &MI.getOperand(0); | |||
5598 | if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { | |||
5599 | // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and | |||
5600 | // following copies, we also need to move copies from and to physical | |||
5601 | // registers into the loop block. | |||
5602 | unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); | |||
5603 | unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); | |||
5604 | ||||
5605 | // Also move the copies to physical registers into the loop block | |||
5606 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5607 | MachineBasicBlock::iterator Start(&MI); | |||
5608 | while (Start->getOpcode() != FrameSetupOpcode) | |||
5609 | --Start; | |||
5610 | MachineBasicBlock::iterator End(&MI); | |||
5611 | while (End->getOpcode() != FrameDestroyOpcode) | |||
5612 | ++End; | |||
5613 | // Also include following copies of the return value | |||
5614 | ++End; | |||
5615 | while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && | |||
5616 | MI.definesRegister(End->getOperand(1).getReg())) | |||
5617 | ++End; | |||
5618 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End); | |||
5619 | } | |||
5620 | } | |||
5621 | ||||
5622 | // Legalize MUBUF* instructions. | |||
5623 | int RsrcIdx = | |||
5624 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); | |||
5625 | if (RsrcIdx != -1) { | |||
5626 | // We have an MUBUF instruction | |||
5627 | MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); | |||
5628 | unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; | |||
5629 | if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), | |||
5630 | RI.getRegClass(RsrcRC))) { | |||
5631 | // The operands are legal. | |||
5632 | // FIXME: We may need to legalize operands besided srsrc. | |||
5633 | return CreatedBB; | |||
5634 | } | |||
5635 | ||||
5636 | // Legalize a VGPR Rsrc. | |||
5637 | // | |||
5638 | // If the instruction is _ADDR64, we can avoid a waterfall by extracting | |||
5639 | // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using | |||
5640 | // a zero-value SRsrc. | |||
5641 | // | |||
5642 | // If the instruction is _OFFSET (both idxen and offen disabled), and we | |||
5643 | // support ADDR64 instructions, we can convert to ADDR64 and do the same as | |||
5644 | // above. | |||
5645 | // | |||
5646 | // Otherwise we are on non-ADDR64 hardware, and/or we have | |||
5647 | // idxen/offen/bothen and we fall back to a waterfall loop. | |||
5648 | ||||
5649 | MachineBasicBlock &MBB = *MI.getParent(); | |||
5650 | ||||
5651 | MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); | |||
5652 | if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { | |||
5653 | // This is already an ADDR64 instruction so we need to add the pointer | |||
5654 | // extracted from the resource descriptor to the current value of VAddr. | |||
5655 | Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5656 | Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
5657 | Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
5658 | ||||
5659 | const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5660 | Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); | |||
5661 | Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); | |||
5662 | ||||
5663 | unsigned RsrcPtr, NewSRsrc; | |||
5664 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); | |||
5665 | ||||
5666 | // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 | |||
5667 | const DebugLoc &DL = MI.getDebugLoc(); | |||
5668 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) | |||
5669 | .addDef(CondReg0) | |||
5670 | .addReg(RsrcPtr, 0, AMDGPU::sub0) | |||
5671 | .addReg(VAddr->getReg(), 0, AMDGPU::sub0) | |||
5672 | .addImm(0); | |||
5673 | ||||
5674 | // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 | |||
5675 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) | |||
5676 | .addDef(CondReg1, RegState::Dead) | |||
5677 | .addReg(RsrcPtr, 0, AMDGPU::sub1) | |||
5678 | .addReg(VAddr->getReg(), 0, AMDGPU::sub1) | |||
5679 | .addReg(CondReg0, RegState::Kill) | |||
5680 | .addImm(0); | |||
5681 | ||||
5682 | // NewVaddr = {NewVaddrHi, NewVaddrLo} | |||
5683 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) | |||
5684 | .addReg(NewVAddrLo) | |||
5685 | .addImm(AMDGPU::sub0) | |||
5686 | .addReg(NewVAddrHi) | |||
5687 | .addImm(AMDGPU::sub1); | |||
5688 | ||||
5689 | VAddr->setReg(NewVAddr); | |||
5690 | Rsrc->setReg(NewSRsrc); | |||
5691 | } else if (!VAddr && ST.hasAddr64()) { | |||
5692 | // This instructions is the _OFFSET variant, so we need to convert it to | |||
5693 | // ADDR64. | |||
5694 | assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&(static_cast <bool> (ST.getGeneration() < AMDGPUSubtarget ::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here" ) ? void (0) : __assert_fail ("ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && \"FIXME: Need to emit flat atomics here\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 5695, __extension__ __PRETTY_FUNCTION__)) | |||
5695 | "FIXME: Need to emit flat atomics here")(static_cast <bool> (ST.getGeneration() < AMDGPUSubtarget ::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here" ) ? void (0) : __assert_fail ("ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && \"FIXME: Need to emit flat atomics here\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 5695, __extension__ __PRETTY_FUNCTION__)); | |||
5696 | ||||
5697 | unsigned RsrcPtr, NewSRsrc; | |||
5698 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); | |||
5699 | ||||
5700 | Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
5701 | MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); | |||
5702 | MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); | |||
5703 | MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); | |||
5704 | unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); | |||
5705 | ||||
5706 | // Atomics rith return have have an additional tied operand and are | |||
5707 | // missing some of the special bits. | |||
5708 | MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); | |||
5709 | MachineInstr *Addr64; | |||
5710 | ||||
5711 | if (!VDataIn) { | |||
5712 | // Regular buffer load / store. | |||
5713 | MachineInstrBuilder MIB = | |||
5714 | BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) | |||
5715 | .add(*VData) | |||
5716 | .addReg(NewVAddr) | |||
5717 | .addReg(NewSRsrc) | |||
5718 | .add(*SOffset) | |||
5719 | .add(*Offset); | |||
5720 | ||||
5721 | if (const MachineOperand *CPol = | |||
5722 | getNamedOperand(MI, AMDGPU::OpName::cpol)) { | |||
5723 | MIB.addImm(CPol->getImm()); | |||
5724 | } | |||
5725 | ||||
5726 | if (const MachineOperand *TFE = | |||
5727 | getNamedOperand(MI, AMDGPU::OpName::tfe)) { | |||
5728 | MIB.addImm(TFE->getImm()); | |||
5729 | } | |||
5730 | ||||
5731 | MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); | |||
5732 | ||||
5733 | MIB.cloneMemRefs(MI); | |||
5734 | Addr64 = MIB; | |||
5735 | } else { | |||
5736 | // Atomics with return. | |||
5737 | Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) | |||
5738 | .add(*VData) | |||
5739 | .add(*VDataIn) | |||
5740 | .addReg(NewVAddr) | |||
5741 | .addReg(NewSRsrc) | |||
5742 | .add(*SOffset) | |||
5743 | .add(*Offset) | |||
5744 | .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol)) | |||
5745 | .cloneMemRefs(MI); | |||
5746 | } | |||
5747 | ||||
5748 | MI.removeFromParent(); | |||
5749 | ||||
5750 | // NewVaddr = {NewVaddrHi, NewVaddrLo} | |||
5751 | BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), | |||
5752 | NewVAddr) | |||
5753 | .addReg(RsrcPtr, 0, AMDGPU::sub0) | |||
5754 | .addImm(AMDGPU::sub0) | |||
5755 | .addReg(RsrcPtr, 0, AMDGPU::sub1) | |||
5756 | .addImm(AMDGPU::sub1); | |||
5757 | } else { | |||
5758 | // This is another variant; legalize Rsrc with waterfall loop from VGPRs | |||
5759 | // to SGPRs. | |||
5760 | CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); | |||
5761 | return CreatedBB; | |||
5762 | } | |||
5763 | } | |||
5764 | return CreatedBB; | |||
5765 | } | |||
5766 | ||||
5767 | MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, | |||
5768 | MachineDominatorTree *MDT) const { | |||
5769 | SetVectorType Worklist; | |||
5770 | Worklist.insert(&TopInst); | |||
5771 | MachineBasicBlock *CreatedBB = nullptr; | |||
5772 | MachineBasicBlock *CreatedBBTmp = nullptr; | |||
5773 | ||||
5774 | while (!Worklist.empty()) { | |||
5775 | MachineInstr &Inst = *Worklist.pop_back_val(); | |||
5776 | MachineBasicBlock *MBB = Inst.getParent(); | |||
5777 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
5778 | ||||
5779 | unsigned Opcode = Inst.getOpcode(); | |||
5780 | unsigned NewOpcode = getVALUOp(Inst); | |||
5781 | ||||
5782 | // Handle some special cases | |||
5783 | switch (Opcode) { | |||
5784 | default: | |||
5785 | break; | |||
5786 | case AMDGPU::S_ADD_U64_PSEUDO: | |||
5787 | case AMDGPU::S_SUB_U64_PSEUDO: | |||
5788 | splitScalar64BitAddSub(Worklist, Inst, MDT); | |||
5789 | Inst.eraseFromParent(); | |||
5790 | continue; | |||
5791 | case AMDGPU::S_ADD_I32: | |||
5792 | case AMDGPU::S_SUB_I32: { | |||
5793 | // FIXME: The u32 versions currently selected use the carry. | |||
5794 | bool Changed; | |||
5795 | std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); | |||
5796 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5797 | CreatedBB = CreatedBBTmp; | |||
5798 | if (Changed) | |||
5799 | continue; | |||
5800 | ||||
5801 | // Default handling | |||
5802 | break; | |||
5803 | } | |||
5804 | case AMDGPU::S_AND_B64: | |||
5805 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); | |||
5806 | Inst.eraseFromParent(); | |||
5807 | continue; | |||
5808 | ||||
5809 | case AMDGPU::S_OR_B64: | |||
5810 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); | |||
5811 | Inst.eraseFromParent(); | |||
5812 | continue; | |||
5813 | ||||
5814 | case AMDGPU::S_XOR_B64: | |||
5815 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); | |||
5816 | Inst.eraseFromParent(); | |||
5817 | continue; | |||
5818 | ||||
5819 | case AMDGPU::S_NAND_B64: | |||
5820 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); | |||
5821 | Inst.eraseFromParent(); | |||
5822 | continue; | |||
5823 | ||||
5824 | case AMDGPU::S_NOR_B64: | |||
5825 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); | |||
5826 | Inst.eraseFromParent(); | |||
5827 | continue; | |||
5828 | ||||
5829 | case AMDGPU::S_XNOR_B64: | |||
5830 | if (ST.hasDLInsts()) | |||
5831 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); | |||
5832 | else | |||
5833 | splitScalar64BitXnor(Worklist, Inst, MDT); | |||
5834 | Inst.eraseFromParent(); | |||
5835 | continue; | |||
5836 | ||||
5837 | case AMDGPU::S_ANDN2_B64: | |||
5838 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); | |||
5839 | Inst.eraseFromParent(); | |||
5840 | continue; | |||
5841 | ||||
5842 | case AMDGPU::S_ORN2_B64: | |||
5843 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); | |||
5844 | Inst.eraseFromParent(); | |||
5845 | continue; | |||
5846 | ||||
5847 | case AMDGPU::S_BREV_B64: | |||
5848 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); | |||
5849 | Inst.eraseFromParent(); | |||
5850 | continue; | |||
5851 | ||||
5852 | case AMDGPU::S_NOT_B64: | |||
5853 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); | |||
5854 | Inst.eraseFromParent(); | |||
5855 | continue; | |||
5856 | ||||
5857 | case AMDGPU::S_BCNT1_I32_B64: | |||
5858 | splitScalar64BitBCNT(Worklist, Inst); | |||
5859 | Inst.eraseFromParent(); | |||
5860 | continue; | |||
5861 | ||||
5862 | case AMDGPU::S_BFE_I64: | |||
5863 | splitScalar64BitBFE(Worklist, Inst); | |||
5864 | Inst.eraseFromParent(); | |||
5865 | continue; | |||
5866 | ||||
5867 | case AMDGPU::S_LSHL_B32: | |||
5868 | if (ST.hasOnlyRevVALUShifts()) { | |||
5869 | NewOpcode = AMDGPU::V_LSHLREV_B32_e64; | |||
5870 | swapOperands(Inst); | |||
5871 | } | |||
5872 | break; | |||
5873 | case AMDGPU::S_ASHR_I32: | |||
5874 | if (ST.hasOnlyRevVALUShifts()) { | |||
5875 | NewOpcode = AMDGPU::V_ASHRREV_I32_e64; | |||
5876 | swapOperands(Inst); | |||
5877 | } | |||
5878 | break; | |||
5879 | case AMDGPU::S_LSHR_B32: | |||
5880 | if (ST.hasOnlyRevVALUShifts()) { | |||
5881 | NewOpcode = AMDGPU::V_LSHRREV_B32_e64; | |||
5882 | swapOperands(Inst); | |||
5883 | } | |||
5884 | break; | |||
5885 | case AMDGPU::S_LSHL_B64: | |||
5886 | if (ST.hasOnlyRevVALUShifts()) { | |||
5887 | NewOpcode = AMDGPU::V_LSHLREV_B64_e64; | |||
5888 | swapOperands(Inst); | |||
5889 | } | |||
5890 | break; | |||
5891 | case AMDGPU::S_ASHR_I64: | |||
5892 | if (ST.hasOnlyRevVALUShifts()) { | |||
5893 | NewOpcode = AMDGPU::V_ASHRREV_I64_e64; | |||
5894 | swapOperands(Inst); | |||
5895 | } | |||
5896 | break; | |||
5897 | case AMDGPU::S_LSHR_B64: | |||
5898 | if (ST.hasOnlyRevVALUShifts()) { | |||
5899 | NewOpcode = AMDGPU::V_LSHRREV_B64_e64; | |||
5900 | swapOperands(Inst); | |||
5901 | } | |||
5902 | break; | |||
5903 | ||||
5904 | case AMDGPU::S_ABS_I32: | |||
5905 | lowerScalarAbs(Worklist, Inst); | |||
5906 | Inst.eraseFromParent(); | |||
5907 | continue; | |||
5908 | ||||
5909 | case AMDGPU::S_CBRANCH_SCC0: | |||
5910 | case AMDGPU::S_CBRANCH_SCC1: { | |||
5911 | // Clear unused bits of vcc | |||
5912 | Register CondReg = Inst.getOperand(1).getReg(); | |||
5913 | bool IsSCC = CondReg == AMDGPU::SCC; | |||
5914 | Register VCC = RI.getVCC(); | |||
5915 | Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; | |||
5916 | unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; | |||
5917 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) | |||
5918 | .addReg(EXEC) | |||
5919 | .addReg(IsSCC ? VCC : CondReg); | |||
5920 | Inst.RemoveOperand(1); | |||
5921 | } | |||
5922 | break; | |||
5923 | ||||
5924 | case AMDGPU::S_BFE_U64: | |||
5925 | case AMDGPU::S_BFM_B64: | |||
5926 | llvm_unreachable("Moving this op to VALU not implemented")::llvm::llvm_unreachable_internal("Moving this op to VALU not implemented" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 5926); | |||
5927 | ||||
5928 | case AMDGPU::S_PACK_LL_B32_B16: | |||
5929 | case AMDGPU::S_PACK_LH_B32_B16: | |||
5930 | case AMDGPU::S_PACK_HH_B32_B16: | |||
5931 | movePackToVALU(Worklist, MRI, Inst); | |||
5932 | Inst.eraseFromParent(); | |||
5933 | continue; | |||
5934 | ||||
5935 | case AMDGPU::S_XNOR_B32: | |||
5936 | lowerScalarXnor(Worklist, Inst); | |||
5937 | Inst.eraseFromParent(); | |||
5938 | continue; | |||
5939 | ||||
5940 | case AMDGPU::S_NAND_B32: | |||
5941 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); | |||
5942 | Inst.eraseFromParent(); | |||
5943 | continue; | |||
5944 | ||||
5945 | case AMDGPU::S_NOR_B32: | |||
5946 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); | |||
5947 | Inst.eraseFromParent(); | |||
5948 | continue; | |||
5949 | ||||
5950 | case AMDGPU::S_ANDN2_B32: | |||
5951 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); | |||
5952 | Inst.eraseFromParent(); | |||
5953 | continue; | |||
5954 | ||||
5955 | case AMDGPU::S_ORN2_B32: | |||
5956 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); | |||
5957 | Inst.eraseFromParent(); | |||
5958 | continue; | |||
5959 | ||||
5960 | // TODO: remove as soon as everything is ready | |||
5961 | // to replace VGPR to SGPR copy with V_READFIRSTLANEs. | |||
5962 | // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO | |||
5963 | // can only be selected from the uniform SDNode. | |||
5964 | case AMDGPU::S_ADD_CO_PSEUDO: | |||
5965 | case AMDGPU::S_SUB_CO_PSEUDO: { | |||
5966 | unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) | |||
5967 | ? AMDGPU::V_ADDC_U32_e64 | |||
5968 | : AMDGPU::V_SUBB_U32_e64; | |||
5969 | const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
5970 | ||||
5971 | Register CarryInReg = Inst.getOperand(4).getReg(); | |||
5972 | if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { | |||
5973 | Register NewCarryReg = MRI.createVirtualRegister(CarryRC); | |||
5974 | BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) | |||
5975 | .addReg(CarryInReg); | |||
5976 | } | |||
5977 | ||||
5978 | Register CarryOutReg = Inst.getOperand(1).getReg(); | |||
5979 | ||||
5980 | Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( | |||
5981 | MRI.getRegClass(Inst.getOperand(0).getReg()))); | |||
5982 | MachineInstr *CarryOp = | |||
5983 | BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) | |||
5984 | .addReg(CarryOutReg, RegState::Define) | |||
5985 | .add(Inst.getOperand(2)) | |||
5986 | .add(Inst.getOperand(3)) | |||
5987 | .addReg(CarryInReg) | |||
5988 | .addImm(0); | |||
5989 | CreatedBBTmp = legalizeOperands(*CarryOp); | |||
5990 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
5991 | CreatedBB = CreatedBBTmp; | |||
5992 | MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); | |||
5993 | addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); | |||
5994 | Inst.eraseFromParent(); | |||
5995 | } | |||
5996 | continue; | |||
5997 | case AMDGPU::S_UADDO_PSEUDO: | |||
5998 | case AMDGPU::S_USUBO_PSEUDO: { | |||
5999 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6000 | MachineOperand &Dest0 = Inst.getOperand(0); | |||
6001 | MachineOperand &Dest1 = Inst.getOperand(1); | |||
6002 | MachineOperand &Src0 = Inst.getOperand(2); | |||
6003 | MachineOperand &Src1 = Inst.getOperand(3); | |||
6004 | ||||
6005 | unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) | |||
6006 | ? AMDGPU::V_ADD_CO_U32_e64 | |||
6007 | : AMDGPU::V_SUB_CO_U32_e64; | |||
6008 | const TargetRegisterClass *NewRC = | |||
6009 | RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); | |||
6010 | Register DestReg = MRI.createVirtualRegister(NewRC); | |||
6011 | MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) | |||
6012 | .addReg(Dest1.getReg(), RegState::Define) | |||
6013 | .add(Src0) | |||
6014 | .add(Src1) | |||
6015 | .addImm(0); // clamp bit | |||
6016 | ||||
6017 | CreatedBBTmp = legalizeOperands(*NewInstr, MDT); | |||
6018 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6019 | CreatedBB = CreatedBBTmp; | |||
6020 | ||||
6021 | MRI.replaceRegWith(Dest0.getReg(), DestReg); | |||
6022 | addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, | |||
6023 | Worklist); | |||
6024 | Inst.eraseFromParent(); | |||
6025 | } | |||
6026 | continue; | |||
6027 | ||||
6028 | case AMDGPU::S_CSELECT_B32: | |||
6029 | case AMDGPU::S_CSELECT_B64: | |||
6030 | lowerSelect(Worklist, Inst, MDT); | |||
6031 | Inst.eraseFromParent(); | |||
6032 | continue; | |||
6033 | case AMDGPU::S_CMP_EQ_I32: | |||
6034 | case AMDGPU::S_CMP_LG_I32: | |||
6035 | case AMDGPU::S_CMP_GT_I32: | |||
6036 | case AMDGPU::S_CMP_GE_I32: | |||
6037 | case AMDGPU::S_CMP_LT_I32: | |||
6038 | case AMDGPU::S_CMP_LE_I32: | |||
6039 | case AMDGPU::S_CMP_EQ_U32: | |||
6040 | case AMDGPU::S_CMP_LG_U32: | |||
6041 | case AMDGPU::S_CMP_GT_U32: | |||
6042 | case AMDGPU::S_CMP_GE_U32: | |||
6043 | case AMDGPU::S_CMP_LT_U32: | |||
6044 | case AMDGPU::S_CMP_LE_U32: | |||
6045 | case AMDGPU::S_CMP_EQ_U64: | |||
6046 | case AMDGPU::S_CMP_LG_U64: { | |||
6047 | const MCInstrDesc &NewDesc = get(NewOpcode); | |||
6048 | Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); | |||
6049 | MachineInstr *NewInstr = | |||
6050 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) | |||
6051 | .add(Inst.getOperand(0)) | |||
6052 | .add(Inst.getOperand(1)); | |||
6053 | legalizeOperands(*NewInstr, MDT); | |||
6054 | int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); | |||
6055 | MachineOperand SCCOp = Inst.getOperand(SCCIdx); | |||
6056 | addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); | |||
6057 | Inst.eraseFromParent(); | |||
6058 | } | |||
6059 | continue; | |||
6060 | } | |||
6061 | ||||
6062 | ||||
6063 | if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { | |||
6064 | // We cannot move this instruction to the VALU, so we should try to | |||
6065 | // legalize its operands instead. | |||
6066 | CreatedBBTmp = legalizeOperands(Inst, MDT); | |||
6067 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6068 | CreatedBB = CreatedBBTmp; | |||
6069 | continue; | |||
6070 | } | |||
6071 | ||||
6072 | // Use the new VALU Opcode. | |||
6073 | const MCInstrDesc &NewDesc = get(NewOpcode); | |||
6074 | Inst.setDesc(NewDesc); | |||
6075 | ||||
6076 | // Remove any references to SCC. Vector instructions can't read from it, and | |||
6077 | // We're just about to add the implicit use / defs of VCC, and we don't want | |||
6078 | // both. | |||
6079 | for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { | |||
6080 | MachineOperand &Op = Inst.getOperand(i); | |||
6081 | if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { | |||
6082 | // Only propagate through live-def of SCC. | |||
6083 | if (Op.isDef() && !Op.isDead()) | |||
6084 | addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); | |||
6085 | if (Op.isUse()) | |||
6086 | addSCCDefsToVALUWorklist(Op, Worklist); | |||
6087 | Inst.RemoveOperand(i); | |||
6088 | } | |||
6089 | } | |||
6090 | ||||
6091 | if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { | |||
6092 | // We are converting these to a BFE, so we need to add the missing | |||
6093 | // operands for the size and offset. | |||
6094 | unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; | |||
6095 | Inst.addOperand(MachineOperand::CreateImm(0)); | |||
6096 | Inst.addOperand(MachineOperand::CreateImm(Size)); | |||
6097 | ||||
6098 | } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { | |||
6099 | // The VALU version adds the second operand to the result, so insert an | |||
6100 | // extra 0 operand. | |||
6101 | Inst.addOperand(MachineOperand::CreateImm(0)); | |||
6102 | } | |||
6103 | ||||
6104 | Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); | |||
6105 | fixImplicitOperands(Inst); | |||
6106 | ||||
6107 | if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { | |||
6108 | const MachineOperand &OffsetWidthOp = Inst.getOperand(2); | |||
6109 | // If we need to move this to VGPRs, we need to unpack the second operand | |||
6110 | // back into the 2 separate ones for bit offset and width. | |||
6111 | assert(OffsetWidthOp.isImm() &&(static_cast <bool> (OffsetWidthOp.isImm() && "Scalar BFE is only implemented for constant width and offset" ) ? void (0) : __assert_fail ("OffsetWidthOp.isImm() && \"Scalar BFE is only implemented for constant width and offset\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6112, __extension__ __PRETTY_FUNCTION__)) | |||
6112 | "Scalar BFE is only implemented for constant width and offset")(static_cast <bool> (OffsetWidthOp.isImm() && "Scalar BFE is only implemented for constant width and offset" ) ? void (0) : __assert_fail ("OffsetWidthOp.isImm() && \"Scalar BFE is only implemented for constant width and offset\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6112, __extension__ __PRETTY_FUNCTION__)); | |||
6113 | uint32_t Imm = OffsetWidthOp.getImm(); | |||
6114 | ||||
6115 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. | |||
6116 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. | |||
6117 | Inst.RemoveOperand(2); // Remove old immediate. | |||
6118 | Inst.addOperand(MachineOperand::CreateImm(Offset)); | |||
6119 | Inst.addOperand(MachineOperand::CreateImm(BitWidth)); | |||
6120 | } | |||
6121 | ||||
6122 | bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); | |||
6123 | unsigned NewDstReg = AMDGPU::NoRegister; | |||
6124 | if (HasDst) { | |||
6125 | Register DstReg = Inst.getOperand(0).getReg(); | |||
6126 | if (DstReg.isPhysical()) | |||
6127 | continue; | |||
6128 | ||||
6129 | // Update the destination register class. | |||
6130 | const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); | |||
6131 | if (!NewDstRC) | |||
6132 | continue; | |||
6133 | ||||
6134 | if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && | |||
6135 | NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { | |||
6136 | // Instead of creating a copy where src and dst are the same register | |||
6137 | // class, we just replace all uses of dst with src. These kinds of | |||
6138 | // copies interfere with the heuristics MachineSink uses to decide | |||
6139 | // whether or not to split a critical edge. Since the pass assumes | |||
6140 | // that copies will end up as machine instructions and not be | |||
6141 | // eliminated. | |||
6142 | addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); | |||
6143 | MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); | |||
6144 | MRI.clearKillFlags(Inst.getOperand(1).getReg()); | |||
6145 | Inst.getOperand(0).setReg(DstReg); | |||
6146 | ||||
6147 | // Make sure we don't leave around a dead VGPR->SGPR copy. Normally | |||
6148 | // these are deleted later, but at -O0 it would leave a suspicious | |||
6149 | // looking illegal copy of an undef register. | |||
6150 | for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) | |||
6151 | Inst.RemoveOperand(I); | |||
6152 | Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); | |||
6153 | continue; | |||
6154 | } | |||
6155 | ||||
6156 | NewDstReg = MRI.createVirtualRegister(NewDstRC); | |||
6157 | MRI.replaceRegWith(DstReg, NewDstReg); | |||
6158 | } | |||
6159 | ||||
6160 | // Legalize the operands | |||
6161 | CreatedBBTmp = legalizeOperands(Inst, MDT); | |||
6162 | if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) | |||
6163 | CreatedBB = CreatedBBTmp; | |||
6164 | ||||
6165 | if (HasDst) | |||
6166 | addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); | |||
6167 | } | |||
6168 | return CreatedBB; | |||
6169 | } | |||
6170 | ||||
6171 | // Add/sub require special handling to deal with carry outs. | |||
//
// When the subtarget reports hasAddNoCarry(), rewrites \p Inst in place into
// V_ADD_U32_e64 / V_SUB_U32_e64: operand 3 (asserted to be SCC) is removed, a
// clamp immediate is appended, and every use of the old destination is
// redirected to a fresh VGPR_32 virtual register.
//
// Returns {true, block returned by legalizeOperands()} when the instruction
// was rewritten, and {false, nullptr} when the subtarget lacks the no-carry
// forms, in which case \p Inst is left untouched.
6172 | std::pair<bool, MachineBasicBlock *> | |||
6173 | SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, | |||
6174 | MachineDominatorTree *MDT) const { | |||
6175 | if (ST.hasAddNoCarry()) { | |||
6176 | // Assume there is no user of scc since we don't select this in that case. | |||
6177 | // Since scc isn't used, it doesn't really matter if the i32 or u32 variant | |||
6178 | // is used. | |||
6179 | ||||
6180 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6181 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6182 | ||||
6183 | Register OldDstReg = Inst.getOperand(0).getReg(); | |||
6184 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6185 | ||||
6186 | unsigned Opc = Inst.getOpcode(); | |||
6187 | assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32)(static_cast <bool> (Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32) ? void (0) : __assert_fail ("Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6187, __extension__ __PRETTY_FUNCTION__)); | |||
6188 | ||||
6189 | unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? | |||
6190 | AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; | |||
6191 | ||||
// Drop the SCC operand before switching the descriptor to the VALU opcode.
6192 | assert(Inst.getOperand(3).getReg() == AMDGPU::SCC)(static_cast <bool> (Inst.getOperand(3).getReg() == AMDGPU ::SCC) ? void (0) : __assert_fail ("Inst.getOperand(3).getReg() == AMDGPU::SCC" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6192, __extension__ __PRETTY_FUNCTION__)); | |||
6193 | Inst.RemoveOperand(3); | |||
6194 | ||||
6195 | Inst.setDesc(get(NewOpc)); | |||
6196 | Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit | |||
6197 | Inst.addImplicitDefUseOperands(*MBB.getParent()); | |||
6198 | MRI.replaceRegWith(OldDstReg, ResultReg); | |||
6199 | MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT); | |||
6200 | ||||
// Users of the new VGPR result are queued so they get moved as well.
6201 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6202 | return std::make_pair(true, NewBB); | |||
6203 | } | |||
6204 | ||||
6205 | return std::make_pair(false, nullptr); | |||
6206 | } | |||
6207 | ||||
// Lowers the scalar select \p Inst (operands: dest, src0, src1, condition) to
// a V_CNDMASK_B32_e64 inserted immediately before it. \p Inst itself is not
// erased here.
//
// If the condition operand is SCC, the condition is first materialized into
// a fresh SReg_1_XEXEC register: either by copying the source of the closest
// preceding "COPY to SCC" in the same block, or, when the closest SCC def is
// not such a copy (or none is found), by emitting an S_CSELECT -1, 0.
//
// NOTE(review): the result register is always VGPR_32 and the opcode is
// always V_CNDMASK_B32_e64, although the visible caller also routes
// S_CSELECT_B64 here - confirm that 64-bit selects are handled as intended.
6208 | void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, | |||
6209 | MachineDominatorTree *MDT) const { | |||
6210 | ||||
6211 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6212 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6213 | MachineBasicBlock::iterator MII = Inst; | |||
6214 | DebugLoc DL = Inst.getDebugLoc(); | |||
6215 | ||||
6216 | MachineOperand &Dest = Inst.getOperand(0); | |||
6217 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6218 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6219 | MachineOperand &Cond = Inst.getOperand(3); | |||
6220 | ||||
6221 | Register SCCSource = Cond.getReg(); | |||
6222 | bool IsSCC = (SCCSource == AMDGPU::SCC); | |||
6223 | ||||
6224 | // If this is a trivial select where the condition is effectively not SCC | |||
6225 | // (SCCSource is a source of copy to SCC), then the select is semantically | |||
6226 | // equivalent to copying SCCSource. Hence, there is no need to create | |||
6227 | // V_CNDMASK, we can just use that and bail out. | |||
6228 | if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() && | |||
6229 | (Src1.getImm() == 0)) { | |||
6230 | MRI.replaceRegWith(Dest.getReg(), SCCSource); | |||
6231 | return; | |||
6232 | } | |||
6233 | ||||
6234 | const TargetRegisterClass *TC = | |||
6235 | RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
6236 | ||||
// Only used when the condition is SCC; holds the materialized condition.
6237 | Register CopySCC = MRI.createVirtualRegister(TC); | |||
6238 | ||||
6239 | if (IsSCC) { | |||
6240 | // Now look for the closest SCC def if it is a copy | |||
6241 | // replacing the SCCSource with the COPY source register | |||
// The walk goes backwards from the instruction preceding Inst to the start
// of the block and stops at the first instruction that defines SCC.
6242 | bool CopyFound = false; | |||
6243 | for (MachineInstr &CandI : | |||
6244 | make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), | |||
6245 | Inst.getParent()->rend())) { | |||
6246 | if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != | |||
6247 | -1) { | |||
6248 | if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { | |||
6249 | BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC) | |||
6250 | .addReg(CandI.getOperand(1).getReg()); | |||
6251 | CopyFound = true; | |||
6252 | } | |||
6253 | break; | |||
6254 | } | |||
6255 | } | |||
6256 | if (!CopyFound) { | |||
6257 | // SCC def is not a copy | |||
6258 | // Insert a trivial select instead of creating a copy, because a copy from | |||
6259 | // SCC would semantically mean just copying a single bit, but we may need | |||
6260 | // the result to be a vector condition mask that needs preserving. | |||
6261 | unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64 | |||
6262 | : AMDGPU::S_CSELECT_B32; | |||
6263 | auto NewSelect = | |||
6264 | BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); | |||
// Propagate the undef flag of the original condition operand. Operand 3 is
// presumably the implicit SCC use added from the instruction description -
// TODO confirm.
6265 | NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); | |||
6266 | } | |||
6267 | } | |||
6268 | ||||
6269 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6270 | ||||
// Each source is preceded by an immediate 0 operand (presumably its source
// modifiers - TODO confirm); the condition register comes last.
6271 | auto UpdatedInst = | |||
6272 | BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) | |||
6273 | .addImm(0) | |||
6274 | .add(Src1) // False | |||
6275 | .addImm(0) | |||
6276 | .add(Src0) // True | |||
6277 | .addReg(IsSCC ? CopySCC : SCCSource); | |||
6278 | ||||
6279 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6280 | legalizeOperands(*UpdatedInst, MDT); | |||
6281 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6282 | } | |||
6283 | ||||
// Lowers the scalar absolute-value instruction \p Inst (operands: dest, src)
// to two VALU instructions inserted before it: TmpReg = 0 - src, followed by
// ResultReg = V_MAX_I32(src, TmpReg). All uses of the old destination are
// redirected to ResultReg and its users are queued on \p Worklist. \p Inst
// itself is not erased here.
6284 | void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, | |||
6285 | MachineInstr &Inst) const { | |||
6286 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6287 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6288 | MachineBasicBlock::iterator MII = Inst; | |||
6289 | DebugLoc DL = Inst.getDebugLoc(); | |||
6290 | ||||
6291 | MachineOperand &Dest = Inst.getOperand(0); | |||
6292 | MachineOperand &Src = Inst.getOperand(1); | |||
6293 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6294 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6295 | ||||
// Pick the subtract opcode according to whether the subtarget has the
// no-carry add/sub forms.
6296 | unsigned SubOp = ST.hasAddNoCarry() ? | |||
6297 | AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; | |||
6298 | ||||
// TmpReg = 0 - Src
6299 | BuildMI(MBB, MII, DL, get(SubOp), TmpReg) | |||
6300 | .addImm(0) | |||
6301 | .addReg(Src.getReg()); | |||
6302 | ||||
// ResultReg = max(Src, TmpReg)
6303 | BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) | |||
6304 | .addReg(Src.getReg()) | |||
6305 | .addReg(TmpReg); | |||
6306 | ||||
6307 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6308 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6309 | } | |||
6310 | ||||
// Lowers the 32-bit scalar XNOR \p Inst (operands: dest, src0, src1).
//
// With hasDLInsts() a single V_XNOR_B32_e64 is emitted after both sources
// have been legalized to VGPR_32. Otherwise the operation is expanded into a
// scalar S_NOT_B32 / S_XOR_B32 pair; the S_XOR_B32 (and the S_NOT_B32 when it
// is applied to the XOR result) is pushed onto \p Worklist for further
// processing. \p Inst itself is not erased here.
6311 | void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, | |||
6312 | MachineInstr &Inst) const { | |||
6313 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6314 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6315 | MachineBasicBlock::iterator MII = Inst; | |||
6316 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6317 | ||||
6318 | MachineOperand &Dest = Inst.getOperand(0); | |||
6319 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6320 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6321 | ||||
6322 | if (ST.hasDLInsts()) { | |||
6323 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6324 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); | |||
6325 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); | |||
6326 | ||||
6327 | BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) | |||
6328 | .add(Src0) | |||
6329 | .add(Src1); | |||
6330 | ||||
6331 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6332 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6333 | } else { | |||
6334 | // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can | |||
6335 | // invert either source and then perform the XOR. If either source is a | |||
6336 | // scalar register, then we can leave the inversion on the scalar unit to | |||
6337 | // achieve a better distribution of scalar and vector instructions. | |||
6338 | bool Src0IsSGPR = Src0.isReg() && | |||
6339 | RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); | |||
6340 | bool Src1IsSGPR = Src1.isReg() && | |||
6341 | RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); | |||
// Xor is assigned on every branch of the if/else chain below.
6342 | MachineInstr *Xor; | |||
6343 | Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
6344 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
6345 | ||||
6346 | // Build a pair of scalar instructions and add them to the work list. | |||
6347 | // The next iteration over the work list will lower these to the vector | |||
6348 | // unit as necessary. | |||
6349 | if (Src0IsSGPR) { | |||
// NewDest = (~Src0) ^ Src1; the S_NOT_B32 is not added to the work list.
6350 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); | |||
6351 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) | |||
6352 | .addReg(Temp) | |||
6353 | .add(Src1); | |||
6354 | } else if (Src1IsSGPR) { | |||
// NewDest = Src0 ^ (~Src1); the S_NOT_B32 is not added to the work list.
6355 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); | |||
6356 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) | |||
6357 | .add(Src0) | |||
6358 | .addReg(Temp); | |||
6359 | } else { | |||
// Neither source is an SGPR register: NewDest = ~(Src0 ^ Src1), and both
// new instructions are added to the work list.
6360 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) | |||
6361 | .add(Src0) | |||
6362 | .add(Src1); | |||
6363 | MachineInstr *Not = | |||
6364 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); | |||
6365 | Worklist.insert(Not); | |||
6366 | } | |||
6367 | ||||
6368 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6369 | ||||
6370 | Worklist.insert(Xor); | |||
6371 | ||||
6372 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6373 | } | |||
6374 | } | |||
6375 | ||||
// Expands a negated scalar binary operation \p Inst (operands: dest, src0,
// src1) into two scalar instructions inserted before it:
//   Interm  = Opcode(src0, src1)
//   NewDest = S_NOT_B32(Interm)
// Both new instructions are pushed onto \p Worklist, all uses of the old
// destination are redirected to NewDest, and its users are queued as well.
// \p Opcode is the non-negated binary opcode to emit. \p Inst itself is not
// erased here.
6376 | void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, | |||
6377 | MachineInstr &Inst, | |||
6378 | unsigned Opcode) const { | |||
6379 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6380 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6381 | MachineBasicBlock::iterator MII = Inst; | |||
6382 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6383 | ||||
6384 | MachineOperand &Dest = Inst.getOperand(0); | |||
6385 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6386 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6387 | ||||
6388 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
6389 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
6390 | ||||
6391 | MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) | |||
6392 | .add(Src0) | |||
6393 | .add(Src1); | |||
6394 | ||||
6395 | MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) | |||
6396 | .addReg(Interm); | |||
6397 | ||||
6398 | Worklist.insert(&Op); | |||
6399 | Worklist.insert(&Not); | |||
6400 | ||||
6401 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6402 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6403 | } | |||
6404 | ||||
// Expands a scalar binary operation with an inverted second source \p Inst
// (operands: dest, src0, src1) into two scalar instructions inserted before
// it:
//   Interm  = S_NOT_B32(src1)
//   NewDest = Opcode(src0, Interm)
// Both new instructions are pushed onto \p Worklist, all uses of the old
// destination are redirected to NewDest, and its users are queued as well.
// \p Opcode is the plain binary opcode to emit. \p Inst itself is not erased
// here.
6405 | void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, | |||
6406 | MachineInstr &Inst, | |||
6407 | unsigned Opcode) const { | |||
6408 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6409 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6410 | MachineBasicBlock::iterator MII = Inst; | |||
6411 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6412 | ||||
6413 | MachineOperand &Dest = Inst.getOperand(0); | |||
6414 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6415 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6416 | ||||
6417 | Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
6418 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); | |||
6419 | ||||
6420 | MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) | |||
6421 | .add(Src1); | |||
6422 | ||||
6423 | MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) | |||
6424 | .add(Src0) | |||
6425 | .addReg(Interm); | |||
6426 | ||||
6427 | Worklist.insert(&Not); | |||
6428 | Worklist.insert(&Op); | |||
6429 | ||||
6430 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6431 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); | |||
6432 | } | |||
6433 | ||||
// Splits the 64-bit scalar unary operation \p Inst (operands: dest, src0)
// into two instructions with opcode \p Opcode, one applied to the sub0 half
// and one to the sub1 half of the source, and recombines the two results
// with a REG_SEQUENCE into a register of the VGPR class equivalent to the
// original destination class.
//
// When \p Swap is true the two half results are exchanged before being
// recombined, i.e. the result computed from sub0 becomes sub1 of the
// destination and vice versa.
//
// Both half instructions are pushed onto \p Worklist. \p Inst itself is not
// erased here.
6434 | void SIInstrInfo::splitScalar64BitUnaryOp( | |||
6435 | SetVectorType &Worklist, MachineInstr &Inst, | |||
6436 | unsigned Opcode, bool Swap) const { | |||
6437 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6438 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6439 | ||||
6440 | MachineOperand &Dest = Inst.getOperand(0); | |||
6441 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6442 | DebugLoc DL = Inst.getDebugLoc(); | |||
6443 | ||||
6444 | MachineBasicBlock::iterator MII = Inst; | |||
6445 | ||||
6446 | const MCInstrDesc &InstDesc = get(Opcode); | |||
// A non-register source falls back to SGPR_32 as its register class.
6447 | const TargetRegisterClass *Src0RC = Src0.isReg() ? | |||
6448 | MRI.getRegClass(Src0.getReg()) : | |||
6449 | &AMDGPU::SGPR_32RegClass; | |||
6450 | ||||
6451 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6452 | ||||
6453 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6454 | AMDGPU::sub0, Src0SubRC); | |||
6455 | ||||
6456 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6457 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); | |||
6458 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); | |||
6459 | ||||
6460 | Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); | |||
6461 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); | |||
6462 | ||||
6463 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6464 | AMDGPU::sub1, Src0SubRC); | |||
6465 | ||||
6466 | Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); | |||
6467 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); | |||
6468 | ||||
// Only the register names are exchanged; LoHalf/HiHalf keep their defs, so
// the halves end up crossed in the REG_SEQUENCE below.
6469 | if (Swap) | |||
6470 | std::swap(DestSub0, DestSub1); | |||
6471 | ||||
6472 | Register FullDestReg = MRI.createVirtualRegister(NewDestRC); | |||
6473 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6474 | .addReg(DestSub0) | |||
6475 | .addImm(AMDGPU::sub0) | |||
6476 | .addReg(DestSub1) | |||
6477 | .addImm(AMDGPU::sub1); | |||
6478 | ||||
6479 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6480 | ||||
6481 | Worklist.insert(&LoHalf); | |||
6482 | Worklist.insert(&HiHalf); | |||
6483 | ||||
6484 | // We don't need to legalizeOperands here because for a single operand, src0 | |||
6485 | // will support any kind of input. | |||
6486 | ||||
6487 | // Move all users of this moved value. | |||
6488 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6489 | } | |||
6490 | ||||
6491 | void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, | |||
6492 | MachineInstr &Inst, | |||
6493 | MachineDominatorTree *MDT) const { | |||
6494 | bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); | |||
6495 | ||||
6496 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6497 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6498 | const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); | |||
6499 | ||||
6500 | Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6501 | Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6502 | Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6503 | ||||
6504 | Register CarryReg = MRI.createVirtualRegister(CarryRC); | |||
6505 | Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); | |||
6506 | ||||
6507 | MachineOperand &Dest = Inst.getOperand(0); | |||
6508 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6509 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6510 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6511 | MachineBasicBlock::iterator MII = Inst; | |||
6512 | ||||
6513 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); | |||
6514 | const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); | |||
6515 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6516 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); | |||
6517 | ||||
6518 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6519 | AMDGPU::sub0, Src0SubRC); | |||
6520 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6521 | AMDGPU::sub0, Src1SubRC); | |||
6522 | ||||
6523 | ||||
6524 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6525 | AMDGPU::sub1, Src0SubRC); | |||
6526 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6527 | AMDGPU::sub1, Src1SubRC); | |||
6528 | ||||
6529 | unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; | |||
6530 | MachineInstr *LoHalf = | |||
6531 | BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) | |||
6532 | .addReg(CarryReg, RegState::Define) | |||
6533 | .add(SrcReg0Sub0) | |||
6534 | .add(SrcReg1Sub0) | |||
6535 | .addImm(0); // clamp bit | |||
6536 | ||||
6537 | unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; | |||
6538 | MachineInstr *HiHalf = | |||
6539 | BuildMI(MBB, MII, DL, get(HiOpc), DestSub1) | |||
6540 | .addReg(DeadCarryReg, RegState::Define | RegState::Dead) | |||
6541 | .add(SrcReg0Sub1) | |||
6542 | .add(SrcReg1Sub1) | |||
6543 | .addReg(CarryReg, RegState::Kill) | |||
6544 | .addImm(0); // clamp bit | |||
6545 | ||||
6546 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6547 | .addReg(DestSub0) | |||
6548 | .addImm(AMDGPU::sub0) | |||
6549 | .addReg(DestSub1) | |||
6550 | .addImm(AMDGPU::sub1); | |||
6551 | ||||
6552 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6553 | ||||
6554 | // Try to legalize the operands in case we need to swap the order to keep it | |||
6555 | // valid. | |||
6556 | legalizeOperands(*LoHalf, MDT); | |||
6557 | legalizeOperands(*HiHalf, MDT); | |||
6558 | ||||
6559 | // Move all users of this moved value. | |||
6560 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6561 | } | |||
6562 | ||||
6563 | void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, | |||
6564 | MachineInstr &Inst, unsigned Opcode, | |||
6565 | MachineDominatorTree *MDT) const { | |||
6566 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6567 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6568 | ||||
6569 | MachineOperand &Dest = Inst.getOperand(0); | |||
6570 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6571 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6572 | DebugLoc DL = Inst.getDebugLoc(); | |||
6573 | ||||
6574 | MachineBasicBlock::iterator MII = Inst; | |||
6575 | ||||
6576 | const MCInstrDesc &InstDesc = get(Opcode); | |||
6577 | const TargetRegisterClass *Src0RC = Src0.isReg() ? | |||
6578 | MRI.getRegClass(Src0.getReg()) : | |||
6579 | &AMDGPU::SGPR_32RegClass; | |||
6580 | ||||
6581 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); | |||
6582 | const TargetRegisterClass *Src1RC = Src1.isReg() ? | |||
6583 | MRI.getRegClass(Src1.getReg()) : | |||
6584 | &AMDGPU::SGPR_32RegClass; | |||
6585 | ||||
6586 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); | |||
6587 | ||||
6588 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6589 | AMDGPU::sub0, Src0SubRC); | |||
6590 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6591 | AMDGPU::sub0, Src1SubRC); | |||
6592 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, | |||
6593 | AMDGPU::sub1, Src0SubRC); | |||
6594 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, | |||
6595 | AMDGPU::sub1, Src1SubRC); | |||
6596 | ||||
6597 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6598 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); | |||
6599 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); | |||
6600 | ||||
6601 | Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); | |||
6602 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) | |||
6603 | .add(SrcReg0Sub0) | |||
6604 | .add(SrcReg1Sub0); | |||
6605 | ||||
6606 | Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); | |||
6607 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) | |||
6608 | .add(SrcReg0Sub1) | |||
6609 | .add(SrcReg1Sub1); | |||
6610 | ||||
6611 | Register FullDestReg = MRI.createVirtualRegister(NewDestRC); | |||
6612 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) | |||
6613 | .addReg(DestSub0) | |||
6614 | .addImm(AMDGPU::sub0) | |||
6615 | .addReg(DestSub1) | |||
6616 | .addImm(AMDGPU::sub1); | |||
6617 | ||||
6618 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); | |||
6619 | ||||
6620 | Worklist.insert(&LoHalf); | |||
6621 | Worklist.insert(&HiHalf); | |||
6622 | ||||
6623 | // Move all users of this moved vlaue. | |||
6624 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); | |||
6625 | } | |||
6626 | ||||
6627 | void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, | |||
6628 | MachineInstr &Inst, | |||
6629 | MachineDominatorTree *MDT) const { | |||
6630 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6631 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6632 | ||||
6633 | MachineOperand &Dest = Inst.getOperand(0); | |||
6634 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6635 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6636 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6637 | ||||
6638 | MachineBasicBlock::iterator MII = Inst; | |||
6639 | ||||
6640 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); | |||
6641 | ||||
6642 | Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); | |||
6643 | ||||
6644 | MachineOperand* Op0; | |||
6645 | MachineOperand* Op1; | |||
6646 | ||||
6647 | if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { | |||
6648 | Op0 = &Src0; | |||
6649 | Op1 = &Src1; | |||
6650 | } else { | |||
6651 | Op0 = &Src1; | |||
6652 | Op1 = &Src0; | |||
6653 | } | |||
6654 | ||||
6655 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) | |||
6656 | .add(*Op0); | |||
6657 | ||||
6658 | Register NewDest = MRI.createVirtualRegister(DestRC); | |||
6659 | ||||
6660 | MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) | |||
6661 | .addReg(Interm) | |||
6662 | .add(*Op1); | |||
6663 | ||||
6664 | MRI.replaceRegWith(Dest.getReg(), NewDest); | |||
6665 | ||||
6666 | Worklist.insert(&Xor); | |||
6667 | } | |||
6668 | ||||
6669 | void SIInstrInfo::splitScalar64BitBCNT( | |||
6670 | SetVectorType &Worklist, MachineInstr &Inst) const { | |||
6671 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6672 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6673 | ||||
6674 | MachineBasicBlock::iterator MII = Inst; | |||
6675 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6676 | ||||
6677 | MachineOperand &Dest = Inst.getOperand(0); | |||
6678 | MachineOperand &Src = Inst.getOperand(1); | |||
6679 | ||||
6680 | const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); | |||
6681 | const TargetRegisterClass *SrcRC = Src.isReg() ? | |||
6682 | MRI.getRegClass(Src.getReg()) : | |||
6683 | &AMDGPU::SGPR_32RegClass; | |||
6684 | ||||
6685 | Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6686 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6687 | ||||
6688 | const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); | |||
6689 | ||||
6690 | MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, | |||
6691 | AMDGPU::sub0, SrcSubRC); | |||
6692 | MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, | |||
6693 | AMDGPU::sub1, SrcSubRC); | |||
6694 | ||||
6695 | BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); | |||
6696 | ||||
6697 | BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); | |||
6698 | ||||
6699 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6700 | ||||
6701 | // We don't need to legalize operands here. src0 for etiher instruction can be | |||
6702 | // an SGPR, and the second input is unused or determined here. | |||
6703 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6704 | } | |||
6705 | ||||
6706 | void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, | |||
6707 | MachineInstr &Inst) const { | |||
6708 | MachineBasicBlock &MBB = *Inst.getParent(); | |||
6709 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
6710 | MachineBasicBlock::iterator MII = Inst; | |||
6711 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6712 | ||||
6713 | MachineOperand &Dest = Inst.getOperand(0); | |||
6714 | uint32_t Imm = Inst.getOperand(2).getImm(); | |||
6715 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. | |||
6716 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. | |||
6717 | ||||
6718 | (void) Offset; | |||
6719 | ||||
6720 | // Only sext_inreg cases handled. | |||
6721 | assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&(static_cast <bool> (Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && Offset == 0 && "Not implemented") ? void (0) : __assert_fail ("Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && Offset == 0 && \"Not implemented\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6722, __extension__ __PRETTY_FUNCTION__)) | |||
6722 | Offset == 0 && "Not implemented")(static_cast <bool> (Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && Offset == 0 && "Not implemented") ? void (0) : __assert_fail ("Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && Offset == 0 && \"Not implemented\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6722, __extension__ __PRETTY_FUNCTION__)); | |||
6723 | ||||
6724 | if (BitWidth < 32) { | |||
6725 | Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6726 | Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6727 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6728 | ||||
6729 | BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo) | |||
6730 | .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) | |||
6731 | .addImm(0) | |||
6732 | .addImm(BitWidth); | |||
6733 | ||||
6734 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) | |||
6735 | .addImm(31) | |||
6736 | .addReg(MidRegLo); | |||
6737 | ||||
6738 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) | |||
6739 | .addReg(MidRegLo) | |||
6740 | .addImm(AMDGPU::sub0) | |||
6741 | .addReg(MidRegHi) | |||
6742 | .addImm(AMDGPU::sub1); | |||
6743 | ||||
6744 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6745 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6746 | return; | |||
6747 | } | |||
6748 | ||||
6749 | MachineOperand &Src = Inst.getOperand(1); | |||
6750 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6751 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); | |||
6752 | ||||
6753 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) | |||
6754 | .addImm(31) | |||
6755 | .addReg(Src.getReg(), 0, AMDGPU::sub0); | |||
6756 | ||||
6757 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) | |||
6758 | .addReg(Src.getReg(), 0, AMDGPU::sub0) | |||
6759 | .addImm(AMDGPU::sub0) | |||
6760 | .addReg(TmpReg) | |||
6761 | .addImm(AMDGPU::sub1); | |||
6762 | ||||
6763 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6764 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6765 | } | |||
6766 | ||||
6767 | void SIInstrInfo::addUsersToMoveToVALUWorklist( | |||
6768 | Register DstReg, | |||
6769 | MachineRegisterInfo &MRI, | |||
6770 | SetVectorType &Worklist) const { | |||
6771 | for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), | |||
6772 | E = MRI.use_end(); I != E;) { | |||
6773 | MachineInstr &UseMI = *I->getParent(); | |||
6774 | ||||
6775 | unsigned OpNo = 0; | |||
6776 | ||||
6777 | switch (UseMI.getOpcode()) { | |||
6778 | case AMDGPU::COPY: | |||
6779 | case AMDGPU::WQM: | |||
6780 | case AMDGPU::SOFT_WQM: | |||
6781 | case AMDGPU::STRICT_WWM: | |||
6782 | case AMDGPU::STRICT_WQM: | |||
6783 | case AMDGPU::REG_SEQUENCE: | |||
6784 | case AMDGPU::PHI: | |||
6785 | case AMDGPU::INSERT_SUBREG: | |||
6786 | break; | |||
6787 | default: | |||
6788 | OpNo = I.getOperandNo(); | |||
6789 | break; | |||
6790 | } | |||
6791 | ||||
6792 | if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { | |||
6793 | Worklist.insert(&UseMI); | |||
6794 | ||||
6795 | do { | |||
6796 | ++I; | |||
6797 | } while (I != E && I->getParent() == &UseMI); | |||
6798 | } else { | |||
6799 | ++I; | |||
6800 | } | |||
6801 | } | |||
6802 | } | |||
6803 | ||||
6804 | void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, | |||
6805 | MachineRegisterInfo &MRI, | |||
6806 | MachineInstr &Inst) const { | |||
6807 | Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6808 | MachineBasicBlock *MBB = Inst.getParent(); | |||
6809 | MachineOperand &Src0 = Inst.getOperand(1); | |||
6810 | MachineOperand &Src1 = Inst.getOperand(2); | |||
6811 | const DebugLoc &DL = Inst.getDebugLoc(); | |||
6812 | ||||
6813 | switch (Inst.getOpcode()) { | |||
6814 | case AMDGPU::S_PACK_LL_B32_B16: { | |||
6815 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6816 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6817 | ||||
6818 | // FIXME: Can do a lot better if we know the high bits of src0 or src1 are | |||
6819 | // 0. | |||
6820 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
6821 | .addImm(0xffff); | |||
6822 | ||||
6823 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) | |||
6824 | .addReg(ImmReg, RegState::Kill) | |||
6825 | .add(Src0); | |||
6826 | ||||
6827 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) | |||
6828 | .add(Src1) | |||
6829 | .addImm(16) | |||
6830 | .addReg(TmpReg, RegState::Kill); | |||
6831 | break; | |||
6832 | } | |||
6833 | case AMDGPU::S_PACK_LH_B32_B16: { | |||
6834 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6835 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
6836 | .addImm(0xffff); | |||
6837 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg) | |||
6838 | .addReg(ImmReg, RegState::Kill) | |||
6839 | .add(Src0) | |||
6840 | .add(Src1); | |||
6841 | break; | |||
6842 | } | |||
6843 | case AMDGPU::S_PACK_HH_B32_B16: { | |||
6844 | Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6845 | Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); | |||
6846 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) | |||
6847 | .addImm(16) | |||
6848 | .add(Src0); | |||
6849 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) | |||
6850 | .addImm(0xffff0000); | |||
6851 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg) | |||
6852 | .add(Src1) | |||
6853 | .addReg(ImmReg, RegState::Kill) | |||
6854 | .addReg(TmpReg, RegState::Kill); | |||
6855 | break; | |||
6856 | } | |||
6857 | default: | |||
6858 | llvm_unreachable("unhandled s_pack_* instruction")::llvm::llvm_unreachable_internal("unhandled s_pack_* instruction" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6858); | |||
6859 | } | |||
6860 | ||||
6861 | MachineOperand &Dest = Inst.getOperand(0); | |||
6862 | MRI.replaceRegWith(Dest.getReg(), ResultReg); | |||
6863 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); | |||
6864 | } | |||
6865 | ||||
6866 | void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, | |||
6867 | MachineInstr &SCCDefInst, | |||
6868 | SetVectorType &Worklist, | |||
6869 | Register NewCond) const { | |||
6870 | ||||
6871 | // Ensure that def inst defines SCC, which is still live. | |||
6872 | assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&(static_cast <bool> (Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && !Op.isDead() && Op.getParent() == &SCCDefInst) ? void (0) : __assert_fail ("Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && !Op.isDead() && Op.getParent() == &SCCDefInst" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6873, __extension__ __PRETTY_FUNCTION__)) | |||
6873 | !Op.isDead() && Op.getParent() == &SCCDefInst)(static_cast <bool> (Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && !Op.isDead() && Op.getParent() == &SCCDefInst) ? void (0) : __assert_fail ("Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && !Op.isDead() && Op.getParent() == &SCCDefInst" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6873, __extension__ __PRETTY_FUNCTION__)); | |||
6874 | SmallVector<MachineInstr *, 4> CopyToDelete; | |||
6875 | // This assumes that all the users of SCC are in the same block | |||
6876 | // as the SCC def. | |||
6877 | for (MachineInstr &MI : // Skip the def inst itself. | |||
6878 | make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), | |||
6879 | SCCDefInst.getParent()->end())) { | |||
6880 | // Check if SCC is used first. | |||
6881 | int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI); | |||
6882 | if (SCCIdx != -1) { | |||
6883 | if (MI.isCopy()) { | |||
6884 | MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
6885 | Register DestReg = MI.getOperand(0).getReg(); | |||
6886 | ||||
6887 | MRI.replaceRegWith(DestReg, NewCond); | |||
6888 | CopyToDelete.push_back(&MI); | |||
6889 | } else { | |||
6890 | ||||
6891 | if (NewCond.isValid()) | |||
6892 | MI.getOperand(SCCIdx).setReg(NewCond); | |||
6893 | ||||
6894 | Worklist.insert(&MI); | |||
6895 | } | |||
6896 | } | |||
6897 | // Exit if we find another SCC def. | |||
6898 | if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) | |||
6899 | break; | |||
6900 | } | |||
6901 | for (auto &Copy : CopyToDelete) | |||
6902 | Copy->eraseFromParent(); | |||
6903 | } | |||
6904 | ||||
6905 | // Instructions that use SCC may be converted to VALU instructions. When that | |||
6906 | // happens, the SCC register is changed to VCC_LO. The instruction that defines | |||
6907 | // SCC must be changed to an instruction that defines VCC. This function makes | |||
6908 | // sure that the instruction that defines SCC is added to the moveToVALU | |||
6909 | // worklist. | |||
6910 | void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op, | |||
6911 | SetVectorType &Worklist) const { | |||
6912 | assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse())(static_cast <bool> (Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()) ? void (0) : __assert_fail ("Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 6912, __extension__ __PRETTY_FUNCTION__)); | |||
6913 | ||||
6914 | MachineInstr *SCCUseInst = Op.getParent(); | |||
6915 | // Look for a preceeding instruction that either defines VCC or SCC. If VCC | |||
6916 | // then there is nothing to do because the defining instruction has been | |||
6917 | // converted to a VALU already. If SCC then that instruction needs to be | |||
6918 | // converted to a VALU. | |||
6919 | for (MachineInstr &MI : | |||
6920 | make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)), | |||
6921 | SCCUseInst->getParent()->rend())) { | |||
6922 | if (MI.modifiesRegister(AMDGPU::VCC, &RI)) | |||
6923 | break; | |||
6924 | if (MI.definesRegister(AMDGPU::SCC, &RI)) { | |||
6925 | Worklist.insert(&MI); | |||
6926 | break; | |||
6927 | } | |||
6928 | } | |||
6929 | } | |||
6930 | ||||
6931 | const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( | |||
6932 | const MachineInstr &Inst) const { | |||
6933 | const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); | |||
6934 | ||||
6935 | switch (Inst.getOpcode()) { | |||
6936 | // For target instructions, getOpRegClass just returns the virtual register | |||
6937 | // class associated with the operand, so we need to find an equivalent VGPR | |||
6938 | // register class in order to move the instruction to the VALU. | |||
6939 | case AMDGPU::COPY: | |||
6940 | case AMDGPU::PHI: | |||
6941 | case AMDGPU::REG_SEQUENCE: | |||
6942 | case AMDGPU::INSERT_SUBREG: | |||
6943 | case AMDGPU::WQM: | |||
6944 | case AMDGPU::SOFT_WQM: | |||
6945 | case AMDGPU::STRICT_WWM: | |||
6946 | case AMDGPU::STRICT_WQM: { | |||
6947 | const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); | |||
6948 | if (RI.hasAGPRs(SrcRC)) { | |||
6949 | if (RI.hasAGPRs(NewDstRC)) | |||
6950 | return nullptr; | |||
6951 | ||||
6952 | switch (Inst.getOpcode()) { | |||
6953 | case AMDGPU::PHI: | |||
6954 | case AMDGPU::REG_SEQUENCE: | |||
6955 | case AMDGPU::INSERT_SUBREG: | |||
6956 | NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); | |||
6957 | break; | |||
6958 | default: | |||
6959 | NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); | |||
6960 | } | |||
6961 | ||||
6962 | if (!NewDstRC) | |||
6963 | return nullptr; | |||
6964 | } else { | |||
6965 | if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) | |||
6966 | return nullptr; | |||
6967 | ||||
6968 | NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); | |||
6969 | if (!NewDstRC) | |||
6970 | return nullptr; | |||
6971 | } | |||
6972 | ||||
6973 | return NewDstRC; | |||
6974 | } | |||
6975 | default: | |||
6976 | return NewDstRC; | |||
6977 | } | |||
6978 | } | |||
6979 | ||||
6980 | // Find the one SGPR operand we are allowed to use. | |||
6981 | Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, | |||
6982 | int OpIndices[3]) const { | |||
6983 | const MCInstrDesc &Desc = MI.getDesc(); | |||
6984 | ||||
6985 | // Find the one SGPR operand we are allowed to use. | |||
6986 | // | |||
6987 | // First we need to consider the instruction's operand requirements before | |||
6988 | // legalizing. Some operands are required to be SGPRs, such as implicit uses | |||
6989 | // of VCC, but we are still bound by the constant bus requirement to only use | |||
6990 | // one. | |||
6991 | // | |||
6992 | // If the operand's class is an SGPR, we can never move it. | |||
6993 | ||||
6994 | Register SGPRReg = findImplicitSGPRRead(MI); | |||
6995 | if (SGPRReg != AMDGPU::NoRegister) | |||
6996 | return SGPRReg; | |||
6997 | ||||
6998 | Register UsedSGPRs[3] = { AMDGPU::NoRegister }; | |||
6999 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); | |||
7000 | ||||
7001 | for (unsigned i = 0; i < 3; ++i) { | |||
7002 | int Idx = OpIndices[i]; | |||
7003 | if (Idx == -1) | |||
7004 | break; | |||
7005 | ||||
7006 | const MachineOperand &MO = MI.getOperand(Idx); | |||
7007 | if (!MO.isReg()) | |||
7008 | continue; | |||
7009 | ||||
7010 | // Is this operand statically required to be an SGPR based on the operand | |||
7011 | // constraints? | |||
7012 | const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); | |||
7013 | bool IsRequiredSGPR = RI.isSGPRClass(OpRC); | |||
7014 | if (IsRequiredSGPR) | |||
7015 | return MO.getReg(); | |||
7016 | ||||
7017 | // If this could be a VGPR or an SGPR, Check the dynamic register class. | |||
7018 | Register Reg = MO.getReg(); | |||
7019 | const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); | |||
7020 | if (RI.isSGPRClass(RegRC)) | |||
7021 | UsedSGPRs[i] = Reg; | |||
7022 | } | |||
7023 | ||||
7024 | // We don't have a required SGPR operand, so we have a bit more freedom in | |||
7025 | // selecting operands to move. | |||
7026 | ||||
7027 | // Try to select the most used SGPR. If an SGPR is equal to one of the | |||
7028 | // others, we choose that. | |||
7029 | // | |||
7030 | // e.g. | |||
7031 | // V_FMA_F32 v0, s0, s0, s0 -> No moves | |||
7032 | // V_FMA_F32 v0, s0, s1, s0 -> Move s1 | |||
7033 | ||||
7034 | // TODO: If some of the operands are 64-bit SGPRs and some 32, we should | |||
7035 | // prefer those. | |||
7036 | ||||
7037 | if (UsedSGPRs[0] != AMDGPU::NoRegister) { | |||
7038 | if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) | |||
7039 | SGPRReg = UsedSGPRs[0]; | |||
7040 | } | |||
7041 | ||||
7042 | if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { | |||
7043 | if (UsedSGPRs[1] == UsedSGPRs[2]) | |||
7044 | SGPRReg = UsedSGPRs[1]; | |||
7045 | } | |||
7046 | ||||
7047 | return SGPRReg; | |||
7048 | } | |||
7049 | ||||
7050 | MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, | |||
7051 | unsigned OperandName) const { | |||
7052 | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); | |||
7053 | if (Idx == -1) | |||
7054 | return nullptr; | |||
7055 | ||||
7056 | return &MI.getOperand(Idx); | |||
7057 | } | |||
7058 | ||||
7059 | uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { | |||
7060 | if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { | |||
7061 | return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) | | |||
7062 | (1ULL << 56) | // RESOURCE_LEVEL = 1 | |||
7063 | (3ULL << 60); // OOB_SELECT = 3 | |||
7064 | } | |||
7065 | ||||
7066 | uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; | |||
7067 | if (ST.isAmdHsaOS()) { | |||
7068 | // Set ATC = 1. GFX9 doesn't have this bit. | |||
7069 | if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) | |||
7070 | RsrcDataFormat |= (1ULL << 56); | |||
7071 | ||||
7072 | // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. | |||
7073 | // BTW, it disables TC L2 and therefore decreases performance. | |||
7074 | if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) | |||
7075 | RsrcDataFormat |= (2ULL << 59); | |||
7076 | } | |||
7077 | ||||
7078 | return RsrcDataFormat; | |||
7079 | } | |||
7080 | ||||
7081 | uint64_t SIInstrInfo::getScratchRsrcWords23() const { | |||
7082 | uint64_t Rsrc23 = getDefaultRsrcDataFormat() | | |||
7083 | AMDGPU::RSRC_TID_ENABLE | | |||
7084 | 0xffffffff; // Size; | |||
7085 | ||||
7086 | // GFX9 doesn't have ELEMENT_SIZE. | |||
7087 | if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { | |||
7088 | uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1; | |||
7089 | Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; | |||
7090 | } | |||
7091 | ||||
7092 | // IndexStride = 64 / 32. | |||
7093 | uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; | |||
7094 | Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; | |||
7095 | ||||
7096 | // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. | |||
7097 | // Clear them unless we want a huge stride. | |||
7098 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && | |||
7099 | ST.getGeneration() <= AMDGPUSubtarget::GFX9) | |||
7100 | Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; | |||
7101 | ||||
7102 | return Rsrc23; | |||
7103 | } | |||
7104 | ||||
7105 | bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { | |||
7106 | unsigned Opc = MI.getOpcode(); | |||
7107 | ||||
7108 | return isSMRD(Opc); | |||
7109 | } | |||
7110 | ||||
7111 | bool SIInstrInfo::isHighLatencyDef(int Opc) const { | |||
7112 | return get(Opc).mayLoad() && | |||
7113 | (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); | |||
7114 | } | |||
7115 | ||||
7116 | unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, | |||
7117 | int &FrameIndex) const { | |||
7118 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); | |||
7119 | if (!Addr || !Addr->isFI()) | |||
7120 | return AMDGPU::NoRegister; | |||
7121 | ||||
7122 | assert(!MI.memoperands_empty() &&(static_cast <bool> (!MI.memoperands_empty() && (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ) ? void (0) : __assert_fail ("!MI.memoperands_empty() && (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7123, __extension__ __PRETTY_FUNCTION__)) | |||
7123 | (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS)(static_cast <bool> (!MI.memoperands_empty() && (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ) ? void (0) : __assert_fail ("!MI.memoperands_empty() && (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7123, __extension__ __PRETTY_FUNCTION__)); | |||
7124 | ||||
7125 | FrameIndex = Addr->getIndex(); | |||
7126 | return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); | |||
7127 | } | |||
7128 | ||||
7129 | unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, | |||
7130 | int &FrameIndex) const { | |||
7131 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); | |||
7132 | assert(Addr && Addr->isFI())(static_cast <bool> (Addr && Addr->isFI()) ? void (0) : __assert_fail ("Addr && Addr->isFI()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7132, __extension__ __PRETTY_FUNCTION__)); | |||
7133 | FrameIndex = Addr->getIndex(); | |||
7134 | return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); | |||
7135 | } | |||
7136 | ||||
7137 | unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, | |||
7138 | int &FrameIndex) const { | |||
7139 | if (!MI.mayLoad()) | |||
7140 | return AMDGPU::NoRegister; | |||
7141 | ||||
7142 | if (isMUBUF(MI) || isVGPRSpill(MI)) | |||
7143 | return isStackAccess(MI, FrameIndex); | |||
7144 | ||||
7145 | if (isSGPRSpill(MI)) | |||
7146 | return isSGPRStackAccess(MI, FrameIndex); | |||
7147 | ||||
7148 | return AMDGPU::NoRegister; | |||
7149 | } | |||
7150 | ||||
7151 | unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, | |||
7152 | int &FrameIndex) const { | |||
7153 | if (!MI.mayStore()) | |||
7154 | return AMDGPU::NoRegister; | |||
7155 | ||||
7156 | if (isMUBUF(MI) || isVGPRSpill(MI)) | |||
7157 | return isStackAccess(MI, FrameIndex); | |||
7158 | ||||
7159 | if (isSGPRSpill(MI)) | |||
7160 | return isSGPRStackAccess(MI, FrameIndex); | |||
7161 | ||||
7162 | return AMDGPU::NoRegister; | |||
7163 | } | |||
7164 | ||||
7165 | unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { | |||
7166 | unsigned Size = 0; | |||
7167 | MachineBasicBlock::const_instr_iterator I = MI.getIterator(); | |||
7168 | MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); | |||
7169 | while (++I != E && I->isInsideBundle()) { | |||
7170 | assert(!I->isBundle() && "No nested bundle!")(static_cast <bool> (!I->isBundle() && "No nested bundle!" ) ? void (0) : __assert_fail ("!I->isBundle() && \"No nested bundle!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7170, __extension__ __PRETTY_FUNCTION__)); | |||
7171 | Size += getInstSizeInBytes(*I); | |||
7172 | } | |||
7173 | ||||
7174 | return Size; | |||
7175 | } | |||
7176 | ||||
7177 | unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { | |||
7178 | unsigned Opc = MI.getOpcode(); | |||
7179 | const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); | |||
7180 | unsigned DescSize = Desc.getSize(); | |||
7181 | ||||
7182 | // If we have a definitive size, we can use it. Otherwise we need to inspect | |||
7183 | // the operands to know the size. | |||
7184 | if (isFixedSize(MI)) { | |||
7185 | unsigned Size = DescSize; | |||
7186 | ||||
7187 | // If we hit the buggy offset, an extra nop will be inserted in MC so | |||
7188 | // estimate the worst case. | |||
7189 | if (MI.isBranch() && ST.hasOffset3fBug()) | |||
7190 | Size += 4; | |||
7191 | ||||
7192 | return Size; | |||
7193 | } | |||
7194 | ||||
7195 | // 4-byte instructions may have a 32-bit literal encoded after them. Check | |||
7196 | // operands that coud ever be literals. | |||
7197 | if (isVALU(MI) || isSALU(MI)) { | |||
7198 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); | |||
7199 | if (Src0Idx == -1) | |||
7200 | return DescSize; // No operands. | |||
7201 | ||||
7202 | if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) | |||
7203 | return isVOP3(MI) ? 12 : (DescSize + 4); | |||
7204 | ||||
7205 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); | |||
7206 | if (Src1Idx == -1) | |||
7207 | return DescSize; | |||
7208 | ||||
7209 | if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) | |||
7210 | return isVOP3(MI) ? 12 : (DescSize + 4); | |||
7211 | ||||
7212 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); | |||
7213 | if (Src2Idx == -1) | |||
7214 | return DescSize; | |||
7215 | ||||
7216 | if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) | |||
7217 | return isVOP3(MI) ? 12 : (DescSize + 4); | |||
7218 | ||||
7219 | return DescSize; | |||
7220 | } | |||
7221 | ||||
7222 | // Check whether we have extra NSA words. | |||
7223 | if (isMIMG(MI)) { | |||
7224 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); | |||
7225 | if (VAddr0Idx < 0) | |||
7226 | return 8; | |||
7227 | ||||
7228 | int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); | |||
7229 | return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); | |||
7230 | } | |||
7231 | ||||
7232 | switch (Opc) { | |||
7233 | case TargetOpcode::BUNDLE: | |||
7234 | return getInstBundleSize(MI); | |||
7235 | case TargetOpcode::INLINEASM: | |||
7236 | case TargetOpcode::INLINEASM_BR: { | |||
7237 | const MachineFunction *MF = MI.getParent()->getParent(); | |||
7238 | const char *AsmStr = MI.getOperand(0).getSymbolName(); | |||
7239 | return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); | |||
7240 | } | |||
7241 | default: | |||
7242 | if (MI.isMetaInstruction()) | |||
7243 | return 0; | |||
7244 | return DescSize; | |||
7245 | } | |||
7246 | } | |||
7247 | ||||
7248 | bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { | |||
7249 | if (!isFLAT(MI)) | |||
7250 | return false; | |||
7251 | ||||
7252 | if (MI.memoperands_empty()) | |||
7253 | return true; | |||
7254 | ||||
7255 | for (const MachineMemOperand *MMO : MI.memoperands()) { | |||
7256 | if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) | |||
7257 | return true; | |||
7258 | } | |||
7259 | return false; | |||
7260 | } | |||
7261 | ||||
7262 | bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { | |||
7263 | return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; | |||
7264 | } | |||
7265 | ||||
7266 | void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, | |||
7267 | MachineBasicBlock *IfEnd) const { | |||
7268 | MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); | |||
7269 | assert(TI != IfEntry->end())(static_cast <bool> (TI != IfEntry->end()) ? void (0 ) : __assert_fail ("TI != IfEntry->end()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7269, __extension__ __PRETTY_FUNCTION__)); | |||
7270 | ||||
7271 | MachineInstr *Branch = &(*TI); | |||
7272 | MachineFunction *MF = IfEntry->getParent(); | |||
7273 | MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); | |||
7274 | ||||
7275 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
7276 | Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7277 | MachineInstr *SIIF = | |||
7278 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) | |||
7279 | .add(Branch->getOperand(0)) | |||
7280 | .add(Branch->getOperand(1)); | |||
7281 | MachineInstr *SIEND = | |||
7282 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) | |||
7283 | .addReg(DstReg); | |||
7284 | ||||
7285 | IfEntry->erase(TI); | |||
7286 | IfEntry->insert(IfEntry->end(), SIIF); | |||
7287 | IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); | |||
7288 | } | |||
7289 | } | |||
7290 | ||||
7291 | void SIInstrInfo::convertNonUniformLoopRegion( | |||
7292 | MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { | |||
7293 | MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); | |||
7294 | // We expect 2 terminators, one conditional and one unconditional. | |||
7295 | assert(TI != LoopEnd->end())(static_cast <bool> (TI != LoopEnd->end()) ? void (0 ) : __assert_fail ("TI != LoopEnd->end()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7295, __extension__ __PRETTY_FUNCTION__)); | |||
7296 | ||||
7297 | MachineInstr *Branch = &(*TI); | |||
7298 | MachineFunction *MF = LoopEnd->getParent(); | |||
7299 | MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); | |||
7300 | ||||
7301 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { | |||
7302 | ||||
7303 | Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7304 | Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7305 | MachineInstrBuilder HeaderPHIBuilder = | |||
7306 | BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); | |||
7307 | for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), | |||
7308 | E = LoopEntry->pred_end(); | |||
7309 | PI != E; ++PI) { | |||
7310 | if (*PI == LoopEnd) { | |||
7311 | HeaderPHIBuilder.addReg(BackEdgeReg); | |||
7312 | } else { | |||
7313 | MachineBasicBlock *PMBB = *PI; | |||
7314 | Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7315 | materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), | |||
7316 | ZeroReg, 0); | |||
7317 | HeaderPHIBuilder.addReg(ZeroReg); | |||
7318 | } | |||
7319 | HeaderPHIBuilder.addMBB(*PI); | |||
7320 | } | |||
7321 | MachineInstr *HeaderPhi = HeaderPHIBuilder; | |||
7322 | MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), | |||
7323 | get(AMDGPU::SI_IF_BREAK), BackEdgeReg) | |||
7324 | .addReg(DstReg) | |||
7325 | .add(Branch->getOperand(0)); | |||
7326 | MachineInstr *SILOOP = | |||
7327 | BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) | |||
7328 | .addReg(BackEdgeReg) | |||
7329 | .addMBB(LoopEntry); | |||
7330 | ||||
7331 | LoopEntry->insert(LoopEntry->begin(), HeaderPhi); | |||
7332 | LoopEnd->erase(TI); | |||
7333 | LoopEnd->insert(LoopEnd->end(), SIIFBREAK); | |||
7334 | LoopEnd->insert(LoopEnd->end(), SILOOP); | |||
7335 | } | |||
7336 | } | |||
7337 | ||||
7338 | ArrayRef<std::pair<int, const char *>> | |||
7339 | SIInstrInfo::getSerializableTargetIndices() const { | |||
7340 | static const std::pair<int, const char *> TargetIndices[] = { | |||
7341 | {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, | |||
7342 | {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, | |||
7343 | {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, | |||
7344 | {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, | |||
7345 | {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; | |||
7346 | return makeArrayRef(TargetIndices); | |||
7347 | } | |||
7348 | ||||
7349 | /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The | |||
7350 | /// post-RA version of misched uses CreateTargetMIHazardRecognizer. | |||
7351 | ScheduleHazardRecognizer * | |||
7352 | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, | |||
7353 | const ScheduleDAG *DAG) const { | |||
7354 | return new GCNHazardRecognizer(DAG->MF); | |||
7355 | } | |||
7356 | ||||
7357 | /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer | |||
7358 | /// pass. | |||
7359 | ScheduleHazardRecognizer * | |||
7360 | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { | |||
7361 | return new GCNHazardRecognizer(MF); | |||
7362 | } | |||
7363 | ||||
7364 | std::pair<unsigned, unsigned> | |||
7365 | SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { | |||
7366 | return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); | |||
7367 | } | |||
7368 | ||||
7369 | ArrayRef<std::pair<unsigned, const char *>> | |||
7370 | SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { | |||
7371 | static const std::pair<unsigned, const char *> TargetFlags[] = { | |||
7372 | { MO_GOTPCREL, "amdgpu-gotprel" }, | |||
7373 | { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, | |||
7374 | { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, | |||
7375 | { MO_REL32_LO, "amdgpu-rel32-lo" }, | |||
7376 | { MO_REL32_HI, "amdgpu-rel32-hi" }, | |||
7377 | { MO_ABS32_LO, "amdgpu-abs32-lo" }, | |||
7378 | { MO_ABS32_HI, "amdgpu-abs32-hi" }, | |||
7379 | }; | |||
7380 | ||||
7381 | return makeArrayRef(TargetFlags); | |||
7382 | } | |||
7383 | ||||
7384 | bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { | |||
7385 | return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && | |||
7386 | MI.modifiesRegister(AMDGPU::EXEC, &RI); | |||
7387 | } | |||
7388 | ||||
7389 | MachineInstrBuilder | |||
7390 | SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, | |||
7391 | MachineBasicBlock::iterator I, | |||
7392 | const DebugLoc &DL, | |||
7393 | Register DestReg) const { | |||
7394 | if (ST.hasAddNoCarry()) | |||
7395 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); | |||
7396 | ||||
7397 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | |||
7398 | Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); | |||
7399 | MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); | |||
7400 | ||||
7401 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) | |||
7402 | .addReg(UnusedCarry, RegState::Define | RegState::Dead); | |||
7403 | } | |||
7404 | ||||
7405 | MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, | |||
7406 | MachineBasicBlock::iterator I, | |||
7407 | const DebugLoc &DL, | |||
7408 | Register DestReg, | |||
7409 | RegScavenger &RS) const { | |||
7410 | if (ST.hasAddNoCarry()) | |||
7411 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); | |||
7412 | ||||
7413 | // If available, prefer to use vcc. | |||
7414 | Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) | |||
7415 | ? Register(RI.getVCC()) | |||
7416 | : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); | |||
7417 | ||||
7418 | // TODO: Users need to deal with this. | |||
7419 | if (!UnusedCarry.isValid()) | |||
7420 | return MachineInstrBuilder(); | |||
7421 | ||||
7422 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) | |||
7423 | .addReg(UnusedCarry, RegState::Define | RegState::Dead); | |||
7424 | } | |||
7425 | ||||
7426 | bool SIInstrInfo::isKillTerminator(unsigned Opcode) { | |||
7427 | switch (Opcode) { | |||
7428 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: | |||
7429 | case AMDGPU::SI_KILL_I1_TERMINATOR: | |||
7430 | return true; | |||
7431 | default: | |||
7432 | return false; | |||
7433 | } | |||
7434 | } | |||
7435 | ||||
7436 | const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { | |||
7437 | switch (Opcode) { | |||
7438 | case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: | |||
7439 | return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); | |||
7440 | case AMDGPU::SI_KILL_I1_PSEUDO: | |||
7441 | return get(AMDGPU::SI_KILL_I1_TERMINATOR); | |||
7442 | default: | |||
7443 | llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO")::llvm::llvm_unreachable_internal("invalid opcode, expected SI_KILL_*_PSEUDO" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7443); | |||
7444 | } | |||
7445 | } | |||
7446 | ||||
7447 | void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { | |||
7448 | if (!ST.isWave32()) | |||
7449 | return; | |||
7450 | ||||
7451 | for (auto &Op : MI.implicit_operands()) { | |||
7452 | if (Op.isReg() && Op.getReg() == AMDGPU::VCC) | |||
7453 | Op.setReg(AMDGPU::VCC_LO); | |||
7454 | } | |||
7455 | } | |||
7456 | ||||
7457 | bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { | |||
7458 | if (!isSMRD(MI)) | |||
7459 | return false; | |||
7460 | ||||
7461 | // Check that it is using a buffer resource. | |||
7462 | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); | |||
7463 | if (Idx == -1) // e.g. s_memtime | |||
7464 | return false; | |||
7465 | ||||
7466 | const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; | |||
7467 | return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); | |||
7468 | } | |||
7469 | ||||
7470 | // Depending on the used address space and instructions, some immediate offsets | |||
7471 | // are allowed and some are not. | |||
7472 | // In general, flat instruction offsets can only be non-negative, global and | |||
7473 | // scratch instruction offsets can also be negative. | |||
7474 | // | |||
7475 | // There are several bugs related to these offsets: | |||
7476 | // On gfx10.1, flat instructions that go into the global address space cannot | |||
7477 | // use an offset. | |||
7478 | // | |||
7479 | // For scratch instructions, the address can be either an SGPR or a VGPR. | |||
7480 | // The following offsets can be used, depending on the architecture (x means | |||
7481 | // cannot be used): | |||
7482 | // +----------------------------+------+------+ | |||
7483 | // | Address-Mode | SGPR | VGPR | | |||
7484 | // +----------------------------+------+------+ | |||
7485 | // | gfx9 | | | | |||
7486 | // | negative, 4-aligned offset | x | ok | | |||
7487 | // | negative, unaligned offset | x | ok | | |||
7488 | // +----------------------------+------+------+ | |||
7489 | // | gfx10 | | | | |||
7490 | // | negative, 4-aligned offset | ok | ok | | |||
7491 | // | negative, unaligned offset | ok | x | | |||
7492 | // +----------------------------+------+------+ | |||
7493 | // | gfx10.3 | | | | |||
7494 | // | negative, 4-aligned offset | ok | ok | | |||
7495 | // | negative, unaligned offset | ok | ok | | |||
7496 | // +----------------------------+------+------+ | |||
7497 | // | |||
7498 | // This function ignores the addressing mode, so if an offset cannot be used in | |||
7499 | // one addressing mode, it is considered illegal. | |||
7500 | bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, | |||
7501 | uint64_t FlatVariant) const { | |||
7502 | // TODO: Should 0 be special cased? | |||
7503 | if (!ST.hasFlatInstOffsets()) | |||
7504 | return false; | |||
7505 | ||||
7506 | if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT && | |||
7507 | (AddrSpace == AMDGPUAS::FLAT_ADDRESS || | |||
7508 | AddrSpace == AMDGPUAS::GLOBAL_ADDRESS)) | |||
7509 | return false; | |||
7510 | ||||
7511 | bool Signed = FlatVariant != SIInstrFlags::FLAT; | |||
7512 | if (ST.hasNegativeScratchOffsetBug() && | |||
7513 | FlatVariant == SIInstrFlags::FlatScratch) | |||
7514 | Signed = false; | |||
7515 | if (ST.hasNegativeUnalignedScratchOffsetBug() && | |||
7516 | FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && | |||
7517 | (Offset % 4) != 0) { | |||
7518 | return false; | |||
7519 | } | |||
7520 | ||||
7521 | unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed); | |||
7522 | return Signed ? isIntN(N, Offset) : isUIntN(N, Offset); | |||
7523 | } | |||
7524 | ||||
7525 | // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not. | |||
7526 | std::pair<int64_t, int64_t> | |||
7527 | SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, | |||
7528 | uint64_t FlatVariant) const { | |||
7529 | int64_t RemainderOffset = COffsetVal; | |||
7530 | int64_t ImmField = 0; | |||
7531 | bool Signed = FlatVariant != SIInstrFlags::FLAT; | |||
7532 | if (ST.hasNegativeScratchOffsetBug() && | |||
7533 | FlatVariant == SIInstrFlags::FlatScratch) | |||
7534 | Signed = false; | |||
7535 | ||||
7536 | const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed); | |||
7537 | if (Signed) { | |||
7538 | // Use signed division by a power of two to truncate towards 0. | |||
7539 | int64_t D = 1LL << (NumBits - 1); | |||
7540 | RemainderOffset = (COffsetVal / D) * D; | |||
7541 | ImmField = COffsetVal - RemainderOffset; | |||
7542 | ||||
7543 | if (ST.hasNegativeUnalignedScratchOffsetBug() && | |||
7544 | FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 && | |||
7545 | (ImmField % 4) != 0) { | |||
7546 | // Make ImmField a multiple of 4 | |||
7547 | RemainderOffset += ImmField % 4; | |||
7548 | ImmField -= ImmField % 4; | |||
7549 | } | |||
7550 | } else if (COffsetVal >= 0) { | |||
7551 | ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); | |||
7552 | RemainderOffset = COffsetVal - ImmField; | |||
7553 | } | |||
7554 | ||||
7555 | assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant))(static_cast <bool> (isLegalFLATOffset(ImmField, AddrSpace , FlatVariant)) ? void (0) : __assert_fail ("isLegalFLATOffset(ImmField, AddrSpace, FlatVariant)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7555, __extension__ __PRETTY_FUNCTION__)); | |||
7556 | assert(RemainderOffset + ImmField == COffsetVal)(static_cast <bool> (RemainderOffset + ImmField == COffsetVal ) ? void (0) : __assert_fail ("RemainderOffset + ImmField == COffsetVal" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7556, __extension__ __PRETTY_FUNCTION__)); | |||
7557 | return {ImmField, RemainderOffset}; | |||
7558 | } | |||
7559 | ||||
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
// Values are spelled out explicitly so they can be compared against that
// definition.
enum SIEncodingFamily {
  SI     = 0,
  VI     = 1,
  SDWA   = 2,
  SDWA9  = 3,
  GFX80  = 4,
  GFX9   = 5,
  GFX10  = 6,
  SDWA10 = 7,
  GFX90A = 8,
};
7572 | ||||
7573 | static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { | |||
7574 | switch (ST.getGeneration()) { | |||
7575 | default: | |||
7576 | break; | |||
7577 | case AMDGPUSubtarget::SOUTHERN_ISLANDS: | |||
7578 | case AMDGPUSubtarget::SEA_ISLANDS: | |||
7579 | return SIEncodingFamily::SI; | |||
7580 | case AMDGPUSubtarget::VOLCANIC_ISLANDS: | |||
7581 | case AMDGPUSubtarget::GFX9: | |||
7582 | return SIEncodingFamily::VI; | |||
7583 | case AMDGPUSubtarget::GFX10: | |||
7584 | return SIEncodingFamily::GFX10; | |||
7585 | } | |||
7586 | llvm_unreachable("Unknown subtarget generation!")::llvm::llvm_unreachable_internal("Unknown subtarget generation!" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7586); | |||
7587 | } | |||
7588 | ||||
7589 | bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { | |||
7590 | switch(MCOp) { | |||
7591 | // These opcodes use indirect register addressing so | |||
7592 | // they need special handling by codegen (currently missing). | |||
7593 | // Therefore it is too risky to allow these opcodes | |||
7594 | // to be selected by dpp combiner or sdwa peepholer. | |||
7595 | case AMDGPU::V_MOVRELS_B32_dpp_gfx10: | |||
7596 | case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: | |||
7597 | case AMDGPU::V_MOVRELD_B32_dpp_gfx10: | |||
7598 | case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: | |||
7599 | case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: | |||
7600 | case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: | |||
7601 | case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: | |||
7602 | case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: | |||
7603 | return true; | |||
7604 | default: | |||
7605 | return false; | |||
7606 | } | |||
7607 | } | |||
7608 | ||||
7609 | int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { | |||
7610 | SIEncodingFamily Gen = subtargetEncodingFamily(ST); | |||
7611 | ||||
7612 | if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && | |||
7613 | ST.getGeneration() == AMDGPUSubtarget::GFX9) | |||
7614 | Gen = SIEncodingFamily::GFX9; | |||
7615 | ||||
7616 | // Adjust the encoding family to GFX80 for D16 buffer instructions when the | |||
7617 | // subtarget has UnpackedD16VMem feature. | |||
7618 | // TODO: remove this when we discard GFX80 encoding. | |||
7619 | if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) | |||
7620 | Gen = SIEncodingFamily::GFX80; | |||
7621 | ||||
7622 | if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { | |||
7623 | switch (ST.getGeneration()) { | |||
7624 | default: | |||
7625 | Gen = SIEncodingFamily::SDWA; | |||
7626 | break; | |||
7627 | case AMDGPUSubtarget::GFX9: | |||
7628 | Gen = SIEncodingFamily::SDWA9; | |||
7629 | break; | |||
7630 | case AMDGPUSubtarget::GFX10: | |||
7631 | Gen = SIEncodingFamily::SDWA10; | |||
7632 | break; | |||
7633 | } | |||
7634 | } | |||
7635 | ||||
7636 | int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); | |||
7637 | ||||
7638 | // -1 means that Opcode is already a native instruction. | |||
7639 | if (MCOp == -1) | |||
7640 | return Opcode; | |||
7641 | ||||
7642 | if (ST.hasGFX90AInsts()) { | |||
7643 | uint16_t NMCOp = (uint16_t)-1; | |||
7644 | NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); | |||
7645 | if (NMCOp == (uint16_t)-1) | |||
7646 | NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); | |||
7647 | if (NMCOp != (uint16_t)-1) | |||
7648 | MCOp = NMCOp; | |||
7649 | } | |||
7650 | ||||
7651 | // (uint16_t)-1 means that Opcode is a pseudo instruction that has | |||
7652 | // no encoding in the given subtarget generation. | |||
7653 | if (MCOp == (uint16_t)-1) | |||
7654 | return -1; | |||
7655 | ||||
7656 | if (isAsmOnlyOpcode(MCOp)) | |||
7657 | return -1; | |||
7658 | ||||
7659 | return MCOp; | |||
7660 | } | |||
7661 | ||||
7662 | static | |||
7663 | TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { | |||
7664 | assert(RegOpnd.isReg())(static_cast <bool> (RegOpnd.isReg()) ? void (0) : __assert_fail ("RegOpnd.isReg()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7664, __extension__ __PRETTY_FUNCTION__)); | |||
7665 | return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : | |||
7666 | getRegSubRegPair(RegOpnd); | |||
7667 | } | |||
7668 | ||||
7669 | TargetInstrInfo::RegSubRegPair | |||
7670 | llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { | |||
7671 | assert(MI.isRegSequence())(static_cast <bool> (MI.isRegSequence()) ? void (0) : __assert_fail ("MI.isRegSequence()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7671, __extension__ __PRETTY_FUNCTION__)); | |||
7672 | for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) | |||
7673 | if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { | |||
7674 | auto &RegOp = MI.getOperand(1 + 2 * I); | |||
7675 | return getRegOrUndef(RegOp); | |||
7676 | } | |||
7677 | return TargetInstrInfo::RegSubRegPair(); | |||
7678 | } | |||
7679 | ||||
7680 | // Try to find the definition of reg:subreg in subreg-manipulation pseudos | |||
7681 | // Following a subreg of reg:subreg isn't supported | |||
7682 | static bool followSubRegDef(MachineInstr &MI, | |||
7683 | TargetInstrInfo::RegSubRegPair &RSR) { | |||
7684 | if (!RSR.SubReg) | |||
7685 | return false; | |||
7686 | switch (MI.getOpcode()) { | |||
7687 | default: break; | |||
7688 | case AMDGPU::REG_SEQUENCE: | |||
7689 | RSR = getRegSequenceSubReg(MI, RSR.SubReg); | |||
7690 | return true; | |||
7691 | // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg | |||
7692 | case AMDGPU::INSERT_SUBREG: | |||
7693 | if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) | |||
7694 | // inserted the subreg we're looking for | |||
7695 | RSR = getRegOrUndef(MI.getOperand(2)); | |||
7696 | else { // the subreg in the rest of the reg | |||
7697 | auto R1 = getRegOrUndef(MI.getOperand(1)); | |||
7698 | if (R1.SubReg) // subreg of subreg isn't supported | |||
7699 | return false; | |||
7700 | RSR.Reg = R1.Reg; | |||
7701 | } | |||
7702 | return true; | |||
7703 | } | |||
7704 | return false; | |||
7705 | } | |||
7706 | ||||
7707 | MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, | |||
7708 | MachineRegisterInfo &MRI) { | |||
7709 | assert(MRI.isSSA())(static_cast <bool> (MRI.isSSA()) ? void (0) : __assert_fail ("MRI.isSSA()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7709, __extension__ __PRETTY_FUNCTION__)); | |||
7710 | if (!P.Reg.isVirtual()) | |||
7711 | return nullptr; | |||
7712 | ||||
7713 | auto RSR = P; | |||
7714 | auto *DefInst = MRI.getVRegDef(RSR.Reg); | |||
7715 | while (auto *MI = DefInst) { | |||
7716 | DefInst = nullptr; | |||
7717 | switch (MI->getOpcode()) { | |||
7718 | case AMDGPU::COPY: | |||
7719 | case AMDGPU::V_MOV_B32_e32: { | |||
7720 | auto &Op1 = MI->getOperand(1); | |||
7721 | if (Op1.isReg() && Op1.getReg().isVirtual()) { | |||
7722 | if (Op1.isUndef()) | |||
7723 | return nullptr; | |||
7724 | RSR = getRegSubRegPair(Op1); | |||
7725 | DefInst = MRI.getVRegDef(RSR.Reg); | |||
7726 | } | |||
7727 | break; | |||
7728 | } | |||
7729 | default: | |||
7730 | if (followSubRegDef(*MI, RSR)) { | |||
7731 | if (!RSR.Reg) | |||
7732 | return nullptr; | |||
7733 | DefInst = MRI.getVRegDef(RSR.Reg); | |||
7734 | } | |||
7735 | } | |||
7736 | if (!DefInst) | |||
7737 | return MI; | |||
7738 | } | |||
7739 | return nullptr; | |||
7740 | } | |||
7741 | ||||
7742 | bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, | |||
7743 | Register VReg, | |||
7744 | const MachineInstr &DefMI, | |||
7745 | const MachineInstr &UseMI) { | |||
7746 | assert(MRI.isSSA() && "Must be run on SSA")(static_cast <bool> (MRI.isSSA() && "Must be run on SSA" ) ? void (0) : __assert_fail ("MRI.isSSA() && \"Must be run on SSA\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7746, __extension__ __PRETTY_FUNCTION__)); | |||
7747 | ||||
7748 | auto *TRI = MRI.getTargetRegisterInfo(); | |||
7749 | auto *DefBB = DefMI.getParent(); | |||
7750 | ||||
7751 | // Don't bother searching between blocks, although it is possible this block | |||
7752 | // doesn't modify exec. | |||
7753 | if (UseMI.getParent() != DefBB) | |||
7754 | return true; | |||
7755 | ||||
7756 | const int MaxInstScan = 20; | |||
7757 | int NumInst = 0; | |||
7758 | ||||
7759 | // Stop scan at the use. | |||
7760 | auto E = UseMI.getIterator(); | |||
7761 | for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { | |||
7762 | if (I->isDebugInstr()) | |||
7763 | continue; | |||
7764 | ||||
7765 | if (++NumInst > MaxInstScan) | |||
7766 | return true; | |||
7767 | ||||
7768 | if (I->modifiesRegister(AMDGPU::EXEC, TRI)) | |||
7769 | return true; | |||
7770 | } | |||
7771 | ||||
7772 | return false; | |||
7773 | } | |||
7774 | ||||
7775 | bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, | |||
7776 | Register VReg, | |||
7777 | const MachineInstr &DefMI) { | |||
7778 | assert(MRI.isSSA() && "Must be run on SSA")(static_cast <bool> (MRI.isSSA() && "Must be run on SSA" ) ? void (0) : __assert_fail ("MRI.isSSA() && \"Must be run on SSA\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7778, __extension__ __PRETTY_FUNCTION__)); | |||
7779 | ||||
7780 | auto *TRI = MRI.getTargetRegisterInfo(); | |||
7781 | auto *DefBB = DefMI.getParent(); | |||
7782 | ||||
7783 | const int MaxUseScan = 10; | |||
7784 | int NumUse = 0; | |||
7785 | ||||
7786 | for (auto &Use : MRI.use_nodbg_operands(VReg)) { | |||
7787 | auto &UseInst = *Use.getParent(); | |||
7788 | // Don't bother searching between blocks, although it is possible this block | |||
7789 | // doesn't modify exec. | |||
7790 | if (UseInst.getParent() != DefBB) | |||
7791 | return true; | |||
7792 | ||||
7793 | if (++NumUse > MaxUseScan) | |||
7794 | return true; | |||
7795 | } | |||
7796 | ||||
7797 | if (NumUse == 0) | |||
7798 | return false; | |||
7799 | ||||
7800 | const int MaxInstScan = 20; | |||
7801 | int NumInst = 0; | |||
7802 | ||||
7803 | // Stop scan when we have seen all the uses. | |||
7804 | for (auto I = std::next(DefMI.getIterator()); ; ++I) { | |||
7805 | assert(I != DefBB->end())(static_cast <bool> (I != DefBB->end()) ? void (0) : __assert_fail ("I != DefBB->end()", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp" , 7805, __extension__ __PRETTY_FUNCTION__)); | |||
7806 | ||||
7807 | if (I->isDebugInstr()) | |||
7808 | continue; | |||
7809 | ||||
7810 | if (++NumInst > MaxInstScan) | |||
7811 | return true; | |||
7812 | ||||
7813 | for (const MachineOperand &Op : I->operands()) { | |||
7814 | // We don't check reg masks here as they're used only on calls: | |||
7815 | // 1. EXEC is only considered const within one BB | |||
7816 | // 2. Call should be a terminator instruction if present in a BB | |||
7817 | ||||
7818 | if (!Op.isReg()) | |||
7819 | continue; | |||
7820 | ||||
7821 | Register Reg = Op.getReg(); | |||
7822 | if (Op.isUse()) { | |||
7823 | if (Reg == VReg && --NumUse == 0) | |||
7824 | return false; | |||
7825 | } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC)) | |||
7826 | return true; | |||
7827 | } | |||
7828 | } | |||
7829 | } | |||
7830 | ||||
7831 | MachineInstr *SIInstrInfo::createPHIDestinationCopy( | |||
7832 | MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, | |||
7833 | const DebugLoc &DL, Register Src, Register Dst) const { | |||
7834 | auto Cur = MBB.begin(); | |||
7835 | if (Cur != MBB.end()) | |||
7836 | do { | |||
7837 | if (!Cur->isPHI() && Cur->readsRegister(Dst)) | |||
7838 | return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); | |||
7839 | ++Cur; | |||
7840 | } while (Cur != MBB.end() && Cur != LastPHIIt); | |||
7841 | ||||
7842 | return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, | |||
7843 | Dst); | |||
7844 | } | |||
7845 | ||||
7846 | MachineInstr *SIInstrInfo::createPHISourceCopy( | |||
7847 | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, | |||
7848 | const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { | |||
7849 | if (InsPt != MBB.end() && | |||
7850 | (InsPt->getOpcode() == AMDGPU::SI_IF || | |||
7851 | InsPt->getOpcode() == AMDGPU::SI_ELSE || | |||
7852 | InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && | |||
7853 | InsPt->definesRegister(Src)) { | |||
7854 | InsPt++; | |||
7855 | return BuildMI(MBB, InsPt, DL, | |||
7856 | get(ST.isWave32() ? AMDGPU::S_MOV_B32_term | |||
7857 | : AMDGPU::S_MOV_B64_term), | |||
7858 | Dst) | |||
7859 | .addReg(Src, 0, SrcSubReg) | |||
7860 | .addReg(AMDGPU::EXEC, RegState::Implicit); | |||
7861 | } | |||
7862 | return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, | |||
7863 | Dst); | |||
7864 | } | |||
7865 | ||||
7866 | bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } | |||
7867 | ||||
7868 | MachineInstr *SIInstrInfo::foldMemoryOperandImpl( | |||
7869 | MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, | |||
7870 | MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, | |||
7871 | VirtRegMap *VRM) const { | |||
7872 | // This is a bit of a hack (copied from AArch64). Consider this instruction: | |||
7873 | // | |||
7874 | // %0:sreg_32 = COPY $m0 | |||
7875 | // | |||
7876 | // We explicitly chose SReg_32 for the virtual register so such a copy might | |||
7877 | // be eliminated by RegisterCoalescer. However, that may not be possible, and | |||
7878 | // %0 may even spill. We can't spill $m0 normally (it would require copying to | |||
7879 | // a numbered SGPR anyway), and since it is in the SReg_32 register class, | |||
7880 | // TargetInstrInfo::foldMemoryOperand() is going to try. | |||
7881 | // A similar issue also exists with spilling and reloading $exec registers. | |||
7882 | // | |||
7883 | // To prevent that, constrain the %0 register class here. | |||
7884 | if (MI.isFullCopy()) { | |||
7885 | Register DstReg = MI.getOperand(0).getReg(); | |||
7886 | Register SrcReg = MI.getOperand(1).getReg(); | |||
7887 | if ((DstReg.isVirtual() || SrcReg.isVirtual()) && | |||
7888 | (DstReg.isVirtual() != SrcReg.isVirtual())) { | |||
7889 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
7890 | Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg; | |||
7891 | const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); | |||
7892 | if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { | |||
7893 | MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); | |||
7894 | return nullptr; | |||
7895 | } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { | |||
7896 | MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); | |||
7897 | return nullptr; | |||
7898 | } | |||
7899 | } | |||
7900 | } | |||
7901 | ||||
7902 | return nullptr; | |||
7903 | } | |||
7904 | ||||
7905 | unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, | |||
7906 | const MachineInstr &MI, | |||
7907 | unsigned *PredCost) const { | |||
7908 | if (MI.isBundle()) { | |||
7909 | MachineBasicBlock::const_instr_iterator I(MI.getIterator()); | |||
7910 | MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); | |||
7911 | unsigned Lat = 0, Count = 0; | |||
7912 | for (++I; I != E && I->isBundledWithPred(); ++I) { | |||
7913 | ++Count; | |||
7914 | Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); | |||
7915 | } | |||
7916 | return Lat + Count - 1; | |||
7917 | } | |||
7918 | ||||
7919 | return SchedModel.computeInstrLatency(&MI); | |||
7920 | } | |||
7921 | ||||
7922 | unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) { | |||
7923 | switch (MF.getFunction().getCallingConv()) { | |||
7924 | case CallingConv::AMDGPU_PS: | |||
7925 | return 1; | |||
7926 | case CallingConv::AMDGPU_VS: | |||
7927 | return 2; | |||
7928 | case CallingConv::AMDGPU_GS: | |||
7929 | return 3; | |||
7930 | case CallingConv::AMDGPU_HS: | |||
7931 | case CallingConv::AMDGPU_LS: | |||
7932 | case CallingConv::AMDGPU_ES: | |||
7933 | report_fatal_error("ds_ordered_count unsupported for this calling conv"); | |||
7934 | case CallingConv::AMDGPU_CS: | |||
7935 | case CallingConv::AMDGPU_KERNEL: | |||
7936 | case CallingConv::C: | |||
7937 | case CallingConv::Fast: | |||
7938 | default: | |||
7939 | // Assume other calling conventions are various compute callable functions | |||
7940 | return 0; | |||
7941 | } | |||
7942 | } |
1 | //===-- llvm/CodeGen/Register.h ---------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_CODEGEN_REGISTER_H |
10 | #define LLVM_CODEGEN_REGISTER_H |
11 | |
12 | #include "llvm/MC/MCRegister.h" |
13 | #include <cassert> |
14 | |
15 | namespace llvm { |
16 | |
17 | /// Wrapper class representing virtual and physical registers. Should be passed |
18 | /// by value. |
19 | class Register { |
20 | unsigned Reg; |
21 | |
22 | public: |
23 | constexpr Register(unsigned Val = 0): Reg(Val) {} |
24 | constexpr Register(MCRegister Val): Reg(Val) {} |
25 | |
26 | // Register numbers can represent physical registers, virtual registers, and |
27 | // sometimes stack slots. The unsigned values are divided into these ranges: |
28 | // |
29 | // 0 Not a register, can be used as a sentinel. |
30 | // [1;2^30) Physical registers assigned by TableGen. |
31 | // [2^30;2^31) Stack slots. (Rarely used.) |
32 | // [2^31;2^32) Virtual registers assigned by MachineRegisterInfo. |
33 | // |
34 | // Further sentinels can be allocated from the small negative integers. |
35 | // DenseMapInfo<unsigned> uses -1u and -2u. |
36 | static_assert(std::numeric_limits<decltype(Reg)>::max() >= 0xFFFFFFFF, |
37 | "Reg isn't large enough to hold full range."); |
38 | |
39 | /// isStackSlot - Sometimes it is useful the be able to store a non-negative |
40 | /// frame index in a variable that normally holds a register. isStackSlot() |
41 | /// returns true if Reg is in the range used for stack slots. |
42 | /// |
43 | /// FIXME: remove in favor of member. |
44 | static bool isStackSlot(unsigned Reg) { |
45 | return MCRegister::isStackSlot(Reg); |
46 | } |
47 | |
48 | /// Return true if this is a stack slot. |
49 | bool isStack() const { return MCRegister::isStackSlot(Reg); } |
50 | |
51 | /// Compute the frame index from a register value representing a stack slot. |
52 | static int stackSlot2Index(Register Reg) { |
53 | assert(Reg.isStack() && "Not a stack slot")(static_cast <bool> (Reg.isStack() && "Not a stack slot" ) ? void (0) : __assert_fail ("Reg.isStack() && \"Not a stack slot\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include/llvm/CodeGen/Register.h" , 53, __extension__ __PRETTY_FUNCTION__)); |
54 | return int(Reg - MCRegister::FirstStackSlot); |
55 | } |
56 | |
57 | /// Convert a non-negative frame index to a stack slot register value. |
58 | static Register index2StackSlot(int FI) { |
59 | assert(FI >= 0 && "Cannot hold a negative frame index.")(static_cast <bool> (FI >= 0 && "Cannot hold a negative frame index." ) ? void (0) : __assert_fail ("FI >= 0 && \"Cannot hold a negative frame index.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include/llvm/CodeGen/Register.h" , 59, __extension__ __PRETTY_FUNCTION__)); |
60 | return Register(FI + MCRegister::FirstStackSlot); |
61 | } |
62 | |
63 | /// Return true if the specified register number is in |
64 | /// the physical register namespace. |
65 | static bool isPhysicalRegister(unsigned Reg) { |
66 | return MCRegister::isPhysicalRegister(Reg); |
67 | } |
68 | |
69 | /// Return true if the specified register number is in |
70 | /// the virtual register namespace. |
71 | static bool isVirtualRegister(unsigned Reg) { |
72 | return Reg & MCRegister::VirtualRegFlag && !isStackSlot(Reg); |
73 | } |
74 | |
75 | /// Convert a virtual register number to a 0-based index. |
76 | /// The first virtual register in a function will get the index 0. |
77 | static unsigned virtReg2Index(Register Reg) { |
78 | assert(isVirtualRegister(Reg) && "Not a virtual register")(static_cast <bool> (isVirtualRegister(Reg) && "Not a virtual register" ) ? void (0) : __assert_fail ("isVirtualRegister(Reg) && \"Not a virtual register\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include/llvm/CodeGen/Register.h" , 78, __extension__ __PRETTY_FUNCTION__)); |
79 | return Reg & ~MCRegister::VirtualRegFlag; |
80 | } |
81 | |
82 | /// Convert a 0-based index to a virtual register number. |
83 | /// This is the inverse operation of VirtReg2IndexFunctor below. |
84 | static Register index2VirtReg(unsigned Index) { |
85 | assert(Index < (1u << 31) && "Index too large for virtual register range.")(static_cast <bool> (Index < (1u << 31) && "Index too large for virtual register range.") ? void (0) : __assert_fail ("Index < (1u << 31) && \"Index too large for virtual register range.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include/llvm/CodeGen/Register.h" , 85, __extension__ __PRETTY_FUNCTION__)); |
86 | return Index | MCRegister::VirtualRegFlag; |
87 | } |
88 | |
89 | /// Return true if the specified register number is in the virtual register |
90 | /// namespace. |
91 | bool isVirtual() const { |
92 | return isVirtualRegister(Reg); |
93 | } |
94 | |
95 | /// Return true if the specified register number is in the physical register |
96 | /// namespace. |
97 | bool isPhysical() const { |
98 | return isPhysicalRegister(Reg); |
99 | } |
100 | |
101 | /// Convert a virtual register number to a 0-based index. The first virtual |
102 | /// register in a function will get the index 0. |
103 | unsigned virtRegIndex() const { |
104 | return virtReg2Index(Reg); |
105 | } |
106 | |
107 | constexpr operator unsigned() const { |
108 | return Reg; |
109 | } |
110 | |
111 | unsigned id() const { return Reg; } |
112 | |
113 | operator MCRegister() const { |
114 | return MCRegister(Reg); |
115 | } |
116 | |
117 | /// Utility to check-convert this value to a MCRegister. The caller is |
118 | /// expected to have already validated that this Register is, indeed, |
119 | /// physical. |
120 | MCRegister asMCReg() const { |
121 | assert(Reg == MCRegister::NoRegister ||(static_cast <bool> (Reg == MCRegister::NoRegister || MCRegister ::isPhysicalRegister(Reg)) ? void (0) : __assert_fail ("Reg == MCRegister::NoRegister || MCRegister::isPhysicalRegister(Reg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include/llvm/CodeGen/Register.h" , 122, __extension__ __PRETTY_FUNCTION__)) |
122 | MCRegister::isPhysicalRegister(Reg))(static_cast <bool> (Reg == MCRegister::NoRegister || MCRegister ::isPhysicalRegister(Reg)) ? void (0) : __assert_fail ("Reg == MCRegister::NoRegister || MCRegister::isPhysicalRegister(Reg)" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include/llvm/CodeGen/Register.h" , 122, __extension__ __PRETTY_FUNCTION__)); |
123 | return MCRegister(Reg); |
124 | } |
125 | |
126 | bool isValid() const { return Reg != MCRegister::NoRegister; } |
127 | |
128 | /// Comparisons between register objects |
129 | bool operator==(const Register &Other) const { return Reg == Other.Reg; } |
130 | bool operator!=(const Register &Other) const { return Reg != Other.Reg; } |
131 | bool operator==(const MCRegister &Other) const { return Reg == Other.id(); } |
132 | bool operator!=(const MCRegister &Other) const { return Reg != Other.id(); } |
133 | |
134 | /// Comparisons against register constants. E.g. |
135 | /// * R == AArch64::WZR |
136 | /// * R == 0 |
137 | /// * R == VirtRegMap::NO_PHYS_REG |
138 | bool operator==(unsigned Other) const { return Reg == Other; } |
139 | bool operator!=(unsigned Other) const { return Reg != Other; } |
140 | bool operator==(int Other) const { return Reg == unsigned(Other); } |
141 | bool operator!=(int Other) const { return Reg != unsigned(Other); } |
142 | // MSVC requires that we explicitly declare these two as well. |
143 | bool operator==(MCPhysReg Other) const { return Reg == unsigned(Other); } |
144 | bool operator!=(MCPhysReg Other) const { return Reg != unsigned(Other); } |
145 | }; |
146 | |
147 | // Provide DenseMapInfo for Register |
148 | template<> struct DenseMapInfo<Register> { |
149 | static inline unsigned getEmptyKey() { |
150 | return DenseMapInfo<unsigned>::getEmptyKey(); |
151 | } |
152 | static inline unsigned getTombstoneKey() { |
153 | return DenseMapInfo<unsigned>::getTombstoneKey(); |
154 | } |
155 | static unsigned getHashValue(const Register &Val) { |
156 | return DenseMapInfo<unsigned>::getHashValue(Val.id()); |
157 | } |
158 | static bool isEqual(const Register &LHS, const Register &RHS) { |
159 | return DenseMapInfo<unsigned>::isEqual(LHS.id(), RHS.id()); |
160 | } |
161 | }; |
162 | |
163 | } |
164 | |
165 | #endif // LLVM_CODEGEN_REGISTER_H |