File: | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp |
Warning: | line 2965, column 7 Value stored to 'ValMapping' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This file implements the targeting of the RegisterBankInfo class for |
10 | /// AMDGPU. |
11 | /// |
12 | /// \par |
13 | /// |
14 | /// AMDGPU has unique register bank constraints that require special high level |
15 | /// strategies to deal with. There are two main true physical register banks |
16 | /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a |
17 | /// sort of pseudo-register bank needed to represent SGPRs used in a vector |
18 | /// boolean context. There is also the AGPR bank, which is a special purpose |
19 | /// physical register bank present on some subtargets. |
20 | /// |
21 | /// Copying from VGPR to SGPR is generally illegal, unless the value is known to |
22 | /// be uniform. It is generally not valid to legalize operands by inserting |
23 | /// copies as on other targets. Operations which require uniform, SGPR operands |
24 | /// generally require scalarization by repeatedly executing the instruction, |
25 | /// activating each set of lanes using a unique set of input values. This is |
26 | /// referred to as a waterfall loop. |
27 | /// |
28 | /// \par Booleans |
29 | /// |
30 | /// Booleans (s1 values) require special consideration. A vector compare result |
31 | /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit |
32 | /// register. These are represented with the VCC bank. During selection, we need |
33 | /// to be able to unambiguously go back from a register class to a register |
34 | /// bank. To distinguish whether an SGPR should use the SGPR or VCC register |
35 | /// bank, we need to know the use context type. An SGPR s1 value always means a |
36 | /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets |
37 | /// SCC, which is a 1-bit unaddressable register. This will need to be copied to |
38 | /// a 32-bit virtual register. Taken together, this means we need to adjust the |
39 | /// type of boolean operations to be regbank legal. All SALU booleans need to be |
40 | /// widened to 32-bits, and all VALU booleans need to be s1 values. |
41 | /// |
42 | /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact |
43 | /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc |
44 | /// bank. A non-boolean source (such as a truncate from a 1-bit load from |
45 | /// memory) will require a copy to the VCC bank which will require clearing the |
46 | /// high bits and inserting a compare. |
47 | /// |
48 | /// \par Constant bus restriction |
49 | /// |
50 | /// VALU instructions have a limitation known as the constant bus |
51 | /// restriction. Most VALU instructions can use SGPR operands, but may read at |
52 | /// most 1 SGPR or constant literal value (this to 2 in gfx10 for most |
53 | /// instructions). This is one unique SGPR, so the same SGPR may be used for |
54 | /// multiple operands. From a register bank perspective, any combination of |
55 | /// operands should be legal as an SGPR, but this is contextually dependent on |
56 | /// the SGPR operands all being the same register. It is therefore optimal to |
57 | /// choose the SGPR with the most uses to minimize the number of copies. |
58 | /// |
59 | /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_* |
60 | /// operation should have its source operands all mapped to VGPRs (except for |
61 | /// VCC), inserting copies from any SGPR operands. This is the most trivial legal |
62 | /// mapping. Anything beyond the simplest 1:1 instruction selection would be too |
63 | /// complicated to solve here. Every optimization pattern or instruction |
64 | /// selected to multiple outputs would have to enforce this rule, and there |
65 | /// would be additional complexity in tracking this rule for every G_* |
66 | /// operation. By forcing all inputs to VGPRs, it also simplifies the task of |
67 | /// picking the optimal operand combination from a post-isel optimization pass. |
68 | /// |
69 | //===----------------------------------------------------------------------===// |
70 | |
71 | #include "AMDGPURegisterBankInfo.h" |
72 | |
73 | #include "AMDGPUGlobalISelUtils.h" |
74 | #include "AMDGPUInstrInfo.h" |
75 | #include "AMDGPUSubtarget.h" |
76 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
77 | #include "SIMachineFunctionInfo.h" |
78 | #include "SIRegisterInfo.h" |
79 | #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h" |
80 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
81 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
82 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
83 | #include "llvm/CodeGen/GlobalISel/RegisterBank.h" |
84 | #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" |
85 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
86 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
87 | #include "llvm/IR/Constants.h" |
88 | |
89 | #define GET_TARGET_REGBANK_IMPL |
90 | #include "AMDGPUGenRegisterBank.inc" |
91 | |
92 | // This file will be TableGen'ed at some point. |
93 | #include "AMDGPUGenRegisterBankInfo.def" |
94 | |
95 | using namespace llvm; |
96 | using namespace MIPatternMatch; |
97 | |
98 | namespace { |
99 | |
// Observer to apply a register bank to new registers created by
// LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  // Bank to assign to any new, unconstrained virtual register.
  const RegisterBank *NewBank;
  // Instructions observed via createdInstr, processed on destruction.
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  // Banks are applied lazily here rather than in createdInstr, because
  // createdInstr fires before the new instruction's operands are filled in.
  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to SALU.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        // The original extension is now dead; DstReg is defined by the select.
        MI.eraseFromParent();
      }

      // DstReg has not been assigned a bank yet (the select path above only
      // set banks on the new constants), so assign it here in both cases.
      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    // Sanity check: a G_TRUNC result must never land in the VCC bank (see the
    // file header's discussion of legalization artifacts and s1 values).
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    // Assign NewBank (or VCC for s1 values) to every still-unconstrained
    // virtual register operand.
    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {}
};
192 | |
193 | } |
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
  : AMDGPUGenRegisterBankInfo(),
    Subtarget(ST),
    TRI(Subtarget.getRegisterInfo()),
    TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  // NOTE(review): the lambda is `static` yet captures `this`, so only the
  // first-constructed instance's pointer is ever baked into the callback.
  // That is harmless here because the body only runs assertions checking that
  // the tablegen'd bank IDs resolve to the expected bank singletons — but
  // confirm before adding any per-instance work to this once-initializer.
  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    // Silence the unused-capture warning in NDEBUG builds.
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
212 | |
213 | static bool isVectorRegisterBank(const RegisterBank &Bank) { |
214 | unsigned BankID = Bank.getID(); |
215 | return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; |
216 | } |
217 | |
218 | unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, |
219 | const RegisterBank &Src, |
220 | unsigned Size) const { |
221 | // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? |
222 | if (Dst.getID() == AMDGPU::SGPRRegBankID && |
223 | isVectorRegisterBank(Src)) { |
224 | return std::numeric_limits<unsigned>::max(); |
225 | } |
226 | |
227 | // Bool values are tricky, because the meaning is based on context. The SCC |
228 | // and VCC banks are for the natural scalar and vector conditions produced by |
229 | // a compare. |
230 | // |
231 | // Legalization doesn't know about the necessary context, so an s1 use may |
232 | // have been a truncate from an arbitrary value, in which case a copy (lowered |
233 | // as a compare with 0) needs to be inserted. |
234 | if (Size == 1 && |
235 | (Dst.getID() == AMDGPU::SGPRRegBankID) && |
236 | (isVectorRegisterBank(Src) || |
237 | Src.getID() == AMDGPU::SGPRRegBankID || |
238 | Src.getID() == AMDGPU::VCCRegBankID)) |
239 | return std::numeric_limits<unsigned>::max(); |
240 | |
241 | if (Src.getID() == AMDGPU::VCCRegBankID) |
242 | return std::numeric_limits<unsigned>::max(); |
243 | |
244 | // There is no direct copy between AGPRs. |
245 | if (Dst.getID() == AMDGPU::AGPRRegBankID && |
246 | Src.getID() == AMDGPU::AGPRRegBankID) |
247 | return 4; |
248 | |
249 | return RegisterBankInfo::copyCost(Dst, Src, Size); |
250 | } |
251 | |
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  // NOTE(review): this assert contradicts the early return above. Any mapping
  // reaching this point has NumBreakDowns < 2 (and Length < 64), so the
  // NumBreakDowns == 2 clause can never hold and the assert would fire on any
  // single-breakdown mapping in an asserts-enabled build. The early return's
  // ">= 2" and the assert's "== 2" disagree — confirm which is intended.
  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
276 | |
277 | const RegisterBank & |
278 | AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, |
279 | LLT Ty) const { |
280 | if (&RC == &AMDGPU::SReg_1RegClass) |
281 | return AMDGPU::VCCRegBank; |
282 | |
283 | // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a |
284 | // VCC-like use. |
285 | if (TRI->isSGPRClass(&RC)) { |
286 | // FIXME: This probably came from a copy from a physical register, which |
287 | // should be inferrrable from the copied to-type. We don't have many boolean |
288 | // physical register constraints so just assume a normal SGPR for now. |
289 | if (!Ty.isValid()) |
290 | return AMDGPU::SGPRRegBank; |
291 | |
292 | return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; |
293 | } |
294 | |
295 | return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank; |
296 | } |
297 | |
298 | template <unsigned NumOps> |
299 | RegisterBankInfo::InstructionMappings |
300 | AMDGPURegisterBankInfo::addMappingFromTable( |
301 | const MachineInstr &MI, const MachineRegisterInfo &MRI, |
302 | const std::array<unsigned, NumOps> RegSrcOpIdx, |
303 | ArrayRef<OpRegBankEntry<NumOps>> Table) const { |
304 | |
305 | InstructionMappings AltMappings; |
306 | |
307 | SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); |
308 | |
309 | unsigned Sizes[NumOps]; |
310 | for (unsigned I = 0; I < NumOps; ++I) { |
311 | Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); |
312 | Sizes[I] = getSizeInBits(Reg, MRI, *TRI); |
313 | } |
314 | |
315 | for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { |
316 | unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); |
317 | Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); |
318 | } |
319 | |
320 | // getInstrMapping's default mapping uses ID 1, so start at 2. |
321 | unsigned MappingID = 2; |
322 | for (const auto &Entry : Table) { |
323 | for (unsigned I = 0; I < NumOps; ++I) { |
324 | int OpIdx = RegSrcOpIdx[I]; |
325 | Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); |
326 | } |
327 | |
328 | AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, |
329 | getOperandsMapping(Operands), |
330 | Operands.size())); |
331 | } |
332 | |
333 | return AltMappings; |
334 | } |
335 | |
// Alternative operand-bank mappings for plain (no-side-effect) intrinsics,
// with costs reflecting how many readfirstlane/waterfall fixups each needs.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // Operand 0 is the def; 2 and 3 are the intrinsic's register sources.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // Operand 0 is the def; 2, 3 and 4 are the intrinsic's register sources.
    // NOTE(review): this comment previously read "rsrc, voffset, offset",
    // which describes a buffer intrinsic, not writelane — likely a
    // copy-paste. Verify the operand meanings against the writelane
    // intrinsic definition.
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
375 | |
// Alternative operand-bank mappings for intrinsics with side effects. Costs
// grow with the number of readfirstlane/waterfall-loop fixups required.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}
432 | |
433 | static bool memOpHasNoClobbered(const MachineMemOperand *MMO) { |
434 | const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue()); |
435 | return I && I->getMetadata("amdgpu.noclobber"); |
436 | } |
437 | |
438 | // FIXME: Returns uniform if there's no source value information. This is |
439 | // probably wrong. |
440 | static bool isScalarLoadLegal(const MachineInstr &MI) { |
441 | if (!MI.hasOneMemOperand()) |
442 | return false; |
443 | |
444 | const MachineMemOperand *MMO = *MI.memoperands_begin(); |
445 | const unsigned AS = MMO->getAddrSpace(); |
446 | const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || |
447 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; |
448 | |
449 | // There are no extending SMRD/SMEM loads, and they require 4-byte alignment. |
450 | return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 && |
451 | // Can't do a scalar atomic load. |
452 | !MMO->isAtomic() && |
453 | // Don't use scalar loads for volatile accesses to non-constant address |
454 | // spaces. |
455 | (IsConst || !MMO->isVolatile()) && |
456 | // Memory must be known constant, or not written before this load. |
457 | (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) && |
458 | AMDGPUInstrInfo::isUniformMMO(MMO); |
459 | } |
460 | |
461 | RegisterBankInfo::InstructionMappings |
462 | AMDGPURegisterBankInfo::getInstrAlternativeMappings( |
463 | const MachineInstr &MI) const { |
464 | |
465 | const MachineFunction &MF = *MI.getParent()->getParent(); |
466 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
467 | |
468 | |
469 | InstructionMappings AltMappings; |
470 | switch (MI.getOpcode()) { |
471 | case TargetOpcode::G_CONSTANT: { |
472 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
473 | if (Size == 1) { |
474 | static const OpRegBankEntry<1> Table[3] = { |
475 | { { AMDGPU::VGPRRegBankID }, 1 }, |
476 | { { AMDGPU::SGPRRegBankID }, 1 }, |
477 | { { AMDGPU::VCCRegBankID }, 1 } |
478 | }; |
479 | |
480 | return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); |
481 | } |
482 | |
483 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; |
484 | } |
485 | case TargetOpcode::G_FCONSTANT: |
486 | case TargetOpcode::G_FRAME_INDEX: |
487 | case TargetOpcode::G_GLOBAL_VALUE: { |
488 | static const OpRegBankEntry<1> Table[2] = { |
489 | { { AMDGPU::VGPRRegBankID }, 1 }, |
490 | { { AMDGPU::SGPRRegBankID }, 1 } |
491 | }; |
492 | |
493 | return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); |
494 | } |
495 | case TargetOpcode::G_AND: |
496 | case TargetOpcode::G_OR: |
497 | case TargetOpcode::G_XOR: { |
498 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
499 | |
500 | if (Size == 1) { |
501 | // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. |
502 | const InstructionMapping &SCCMapping = getInstructionMapping( |
503 | 1, 1, getOperandsMapping( |
504 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), |
505 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32), |
506 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}), |
507 | 3); // Num Operands |
508 | AltMappings.push_back(&SCCMapping); |
509 | |
510 | const InstructionMapping &VCCMapping0 = getInstructionMapping( |
511 | 2, 1, getOperandsMapping( |
512 | {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), |
513 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size), |
514 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}), |
515 | 3); // Num Operands |
516 | AltMappings.push_back(&VCCMapping0); |
517 | return AltMappings; |
518 | } |
519 | |
520 | if (Size != 64) |
521 | break; |
522 | |
523 | const InstructionMapping &SSMapping = getInstructionMapping( |
524 | 1, 1, getOperandsMapping( |
525 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
526 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
527 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
528 | 3); // Num Operands |
529 | AltMappings.push_back(&SSMapping); |
530 | |
531 | const InstructionMapping &VVMapping = getInstructionMapping( |
532 | 2, 2, getOperandsMapping( |
533 | {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
534 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
535 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
536 | 3); // Num Operands |
537 | AltMappings.push_back(&VVMapping); |
538 | break; |
539 | } |
540 | case TargetOpcode::G_LOAD: |
541 | case TargetOpcode::G_ZEXTLOAD: |
542 | case TargetOpcode::G_SEXTLOAD: { |
543 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
544 | LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); |
545 | unsigned PtrSize = PtrTy.getSizeInBits(); |
546 | unsigned AS = PtrTy.getAddressSpace(); |
547 | LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); |
548 | |
549 | if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS && |
550 | AS != AMDGPUAS::PRIVATE_ADDRESS) && |
551 | isScalarLoadLegal(MI)) { |
552 | const InstructionMapping &SSMapping = getInstructionMapping( |
553 | 1, 1, getOperandsMapping( |
554 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
555 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}), |
556 | 2); // Num Operands |
557 | AltMappings.push_back(&SSMapping); |
558 | } |
559 | |
560 | const InstructionMapping &VVMapping = getInstructionMapping( |
561 | 2, 1, getOperandsMapping( |
562 | {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), |
563 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}), |
564 | 2); // Num Operands |
565 | AltMappings.push_back(&VVMapping); |
566 | |
567 | // It may be possible to have a vgpr = load sgpr mapping here, because |
568 | // the mubuf instructions support this kind of load, but probably for only |
569 | // gfx7 and older. However, the addressing mode matching in the instruction |
570 | // selector should be able to do a better job of detecting and selecting |
571 | // these kinds of loads from the vgpr = load vgpr mapping. |
572 | |
573 | return AltMappings; |
574 | |
575 | } |
576 | case TargetOpcode::G_SELECT: { |
577 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
578 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
579 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
580 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), |
581 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
582 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), |
583 | 4); // Num Operands |
584 | AltMappings.push_back(&SSMapping); |
585 | |
586 | const InstructionMapping &VVMapping = getInstructionMapping(2, 1, |
587 | getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
588 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
589 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size), |
590 | AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}), |
591 | 4); // Num Operands |
592 | AltMappings.push_back(&VVMapping); |
593 | |
594 | return AltMappings; |
595 | } |
596 | case TargetOpcode::G_SMIN: |
597 | case TargetOpcode::G_SMAX: |
598 | case TargetOpcode::G_UMIN: |
599 | case TargetOpcode::G_UMAX: { |
600 | static const OpRegBankEntry<3> Table[2] = { |
601 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, |
602 | |
603 | // Scalar requires cmp+select, and extends if 16-bit. |
604 | // FIXME: Should there be separate costs for 32 and 16-bit |
605 | { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 } |
606 | }; |
607 | |
608 | const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } }; |
609 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); |
610 | } |
611 | case TargetOpcode::G_UADDE: |
612 | case TargetOpcode::G_USUBE: |
613 | case TargetOpcode::G_SADDE: |
614 | case TargetOpcode::G_SSUBE: { |
615 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
616 | const InstructionMapping &SSMapping = getInstructionMapping(1, 1, |
617 | getOperandsMapping( |
618 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
619 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), |
620 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
621 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), |
622 | AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}), |
623 | 5); // Num Operands |
624 | AltMappings.push_back(&SSMapping); |
625 | |
626 | const InstructionMapping &VVMapping = getInstructionMapping(2, 1, |
627 | getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
628 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), |
629 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
630 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), |
631 | AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), |
632 | 5); // Num Operands |
633 | AltMappings.push_back(&VVMapping); |
634 | return AltMappings; |
635 | } |
636 | case AMDGPU::G_BRCOND: { |
637 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1)((MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1 ) ? static_cast<void> (0) : __assert_fail ("MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 637, __PRETTY_FUNCTION__)); |
638 | |
639 | // TODO: Change type to 32 for scalar |
640 | const InstructionMapping &SMapping = getInstructionMapping( |
641 | 1, 1, getOperandsMapping( |
642 | {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}), |
643 | 2); // Num Operands |
644 | AltMappings.push_back(&SMapping); |
645 | |
646 | const InstructionMapping &VMapping = getInstructionMapping( |
647 | 1, 1, getOperandsMapping( |
648 | {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }), |
649 | 2); // Num Operands |
650 | AltMappings.push_back(&VMapping); |
651 | return AltMappings; |
652 | } |
653 | case AMDGPU::G_INTRINSIC: |
654 | return getInstrAlternativeMappingsIntrinsic(MI, MRI); |
655 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: |
656 | return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI); |
657 | default: |
658 | break; |
659 | } |
660 | return RegisterBankInfo::getInstrAlternativeMappings(MI); |
661 | } |
662 | |
663 | void AMDGPURegisterBankInfo::split64BitValueForMapping( |
664 | MachineIRBuilder &B, |
665 | SmallVector<Register, 2> &Regs, |
666 | LLT HalfTy, |
667 | Register Reg) const { |
668 | assert(HalfTy.getSizeInBits() == 32)((HalfTy.getSizeInBits() == 32) ? static_cast<void> (0) : __assert_fail ("HalfTy.getSizeInBits() == 32", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 668, __PRETTY_FUNCTION__)); |
669 | MachineRegisterInfo *MRI = B.getMRI(); |
670 | Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); |
671 | Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); |
672 | const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI); |
673 | MRI->setRegBank(LoLHS, *Bank); |
674 | MRI->setRegBank(HiLHS, *Bank); |
675 | |
676 | Regs.push_back(LoLHS); |
677 | Regs.push_back(HiLHS); |
678 | |
679 | B.buildInstr(AMDGPU::G_UNMERGE_VALUES) |
680 | .addDef(LoLHS) |
681 | .addDef(HiLHS) |
682 | .addUse(Reg); |
683 | } |
684 | |
685 | /// Replace the current type each register in \p Regs has with \p NewTy |
686 | static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, |
687 | LLT NewTy) { |
688 | for (Register Reg : Regs) { |
689 | assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits())((MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()) ? static_cast<void> (0) : __assert_fail ("MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 689, __PRETTY_FUNCTION__)); |
690 | MRI.setType(Reg, NewTy); |
691 | } |
692 | } |
693 | |
694 | static LLT getHalfSizedType(LLT Ty) { |
695 | if (Ty.isVector()) { |
696 | assert(Ty.getNumElements() % 2 == 0)((Ty.getNumElements() % 2 == 0) ? static_cast<void> (0) : __assert_fail ("Ty.getNumElements() % 2 == 0", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 696, __PRETTY_FUNCTION__)); |
697 | return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType()); |
698 | } |
699 | |
700 | assert(Ty.getSizeInBits() % 2 == 0)((Ty.getSizeInBits() % 2 == 0) ? static_cast<void> (0) : __assert_fail ("Ty.getSizeInBits() % 2 == 0", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 700, __PRETTY_FUNCTION__)); |
701 | return LLT::scalar(Ty.getSizeInBits() / 2); |
702 | } |
703 | |
704 | /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If |
705 | /// any of the required SGPR operands are VGPRs, perform a waterfall loop to |
706 | /// execute the instruction for each unique combination of values in all lanes |
707 | /// in the wave. The block will be split such that rest of the instructions are |
708 | /// moved to a new block. |
709 | /// |
710 | /// Essentially performs this loop: |
711 | // |
712 | /// Save Execution Mask |
713 | /// For (Lane : Wavefront) { |
714 | /// Enable Lane, Disable all other lanes |
715 | /// SGPR = read SGPR value for current lane from VGPR |
716 | /// VGPRResult[Lane] = use_op SGPR |
717 | /// } |
718 | /// Restore Execution Mask |
719 | /// |
720 | /// There is additional complexity to try for compare values to identify the |
721 | /// unique values used. |
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  // For each value defined inside the range we track: the original def
  // register, an implicit_def "incoming" value for the loop-header phi, and
  // the phi result that carries the value across loop iterations.
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  // Select the wave-size dependent opcodes and exec register up front.
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  // Collect every def in the range; each one needs a phi in the loop header.
  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Loop-header phi for the exec mask of the lanes still left to process.
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  // Phis for each value defined inside the loop body.
  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  // Rewrite every VGPR use that must be an SGPR: readfirstlane the current
  // value, and build the "lane matches" condition that gates this iteration.
  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      LLT OpTy = MRI.getType(Op.getReg());
      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(Op.getReg());

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read M0 value to all possible Idx values.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(Op.getReg());
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, and the conditions.
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
                                          : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.
        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, so use the
              // merged pieces
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, and the conditions.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}
1042 | |
1043 | // Return any unique registers used by \p MI at \p OpIndices that need to be |
1044 | // handled in a waterfall loop. Returns these registers in \p |
1045 | // SGPROperandRegs. Returns true if there are any operansd to handle and a |
1046 | // waterfall loop is necessary. |
1047 | bool AMDGPURegisterBankInfo::collectWaterfallOperands( |
1048 | SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, |
1049 | MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { |
1050 | for (unsigned Op : OpIndices) { |
1051 | assert(MI.getOperand(Op).isUse())((MI.getOperand(Op).isUse()) ? static_cast<void> (0) : __assert_fail ("MI.getOperand(Op).isUse()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1051, __PRETTY_FUNCTION__)); |
1052 | Register Reg = MI.getOperand(Op).getReg(); |
1053 | const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); |
1054 | if (OpBank->getID() == AMDGPU::VGPRRegBankID) |
1055 | SGPROperandRegs.insert(Reg); |
1056 | } |
1057 | |
1058 | // No operands need to be replaced, so no need to loop. |
1059 | return !SGPROperandRegs.empty(); |
1060 | } |
1061 | |
1062 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
1063 | MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, |
1064 | ArrayRef<unsigned> OpIndices) const { |
1065 | // Use a set to avoid extra readfirstlanes in the case where multiple operands |
1066 | // are the same register. |
1067 | SmallSet<Register, 4> SGPROperandRegs; |
1068 | |
1069 | if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) |
1070 | return false; |
1071 | |
1072 | MachineBasicBlock::iterator I = MI.getIterator(); |
1073 | return executeInWaterfallLoop(B, make_range(I, std::next(I)), |
1074 | SGPROperandRegs, MRI); |
1075 | } |
1076 | |
1077 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( |
1078 | MachineInstr &MI, MachineRegisterInfo &MRI, |
1079 | ArrayRef<unsigned> OpIndices) const { |
1080 | MachineIRBuilder B(MI); |
1081 | return executeInWaterfallLoop(B, MI, MRI, OpIndices); |
1082 | } |
1083 | |
1084 | // Legalize an operand that must be an SGPR by inserting a readfirstlane. |
1085 | void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( |
1086 | MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const { |
1087 | Register Reg = MI.getOperand(OpIdx).getReg(); |
1088 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
1089 | if (Bank != &AMDGPU::VGPRRegBank) |
1090 | return; |
1091 | |
1092 | MachineIRBuilder B(MI); |
1093 | Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); |
1094 | B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) |
1095 | .addDef(SGPR) |
1096 | .addReg(Reg); |
1097 | |
1098 | MRI.setType(SGPR, MRI.getType(Reg)); |
1099 | |
1100 | const TargetRegisterClass *Constrained = |
1101 | constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); |
1102 | (void)Constrained; |
1103 | assert(Constrained && "Failed to constrain readfirstlane src reg")((Constrained && "Failed to constrain readfirstlane src reg" ) ? static_cast<void> (0) : __assert_fail ("Constrained && \"Failed to constrain readfirstlane src reg\"" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1103, __PRETTY_FUNCTION__)); |
1104 | |
1105 | MI.getOperand(OpIdx).setReg(SGPR); |
1106 | } |
1107 | |
1108 | // When regbankselect repairs registers, it will insert a repair instruction |
1109 | // which defines the repaired register. Then it calls applyMapping and expects |
1110 | // that the targets will either delete or rewrite the originally wrote to the |
1111 | // repaired registers. Beccause of this, we end up in a situation where |
1112 | // we have 2 instructions defining the same registers. |
1113 | static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI, |
1114 | Register Reg, |
1115 | const MachineInstr &MI) { |
1116 | // Is there some way we can assert that there are exactly 2 def instructions? |
1117 | for (MachineInstr &Other : MRI.def_instructions(Reg)) { |
1118 | if (&Other != &MI) |
1119 | return &Other; |
1120 | } |
1121 | |
1122 | return nullptr; |
1123 | } |
1124 | |
// Split a VGPR load wider than the 128-bit maximum non-SMRD load size into
// 128-bit pieces via the legalizer, then rewire the repair instruction that
// RegBankSelect inserted. Returns true if the load was split.
bool AMDGPURegisterBankInfo::applyMappingWideLoad(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty()) {
    const RegisterBank *PtrBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    if (PtrBank == &AMDGPU::SGPRRegBank)
      return false;
    SrcRegs.push_back(MI.getOperand(1).getReg());
  }

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  // Number of original elements that fit into one 128-bit piece.
  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  // All of the newly created loads and intermediate values get the VGPR bank.
  ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
      LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg as well.

  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  // Extract each element of the combined result into the registers the
  // repair code expects to see defined.
  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = B.buildConstant(LLT::scalar(32), DefIdx).getReg(0);
    MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}
1196 | |
1197 | bool AMDGPURegisterBankInfo::applyMappingImage( |
1198 | MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, |
1199 | MachineRegisterInfo &MRI, int RsrcIdx) const { |
1200 | const int NumDefs = MI.getNumExplicitDefs(); |
1201 | |
1202 | // The reported argument index is relative to the IR intrinsic call arguments, |
1203 | // so we need to shift by the number of defs and the intrinsic ID. |
1204 | RsrcIdx += NumDefs + 1; |
1205 | |
1206 | // Insert copies to VGPR arguments. |
1207 | applyDefaultMapping(OpdMapper); |
1208 | |
1209 | // Fixup any SGPR arguments. |
1210 | SmallVector<unsigned, 4> SGPRIndexes; |
1211 | for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { |
1212 | if (!MI.getOperand(I).isReg()) |
1213 | continue; |
1214 | |
1215 | // If this intrinsic has a sampler, it immediately follows rsrc. |
1216 | if (I == RsrcIdx || I == RsrcIdx + 1) |
1217 | SGPRIndexes.push_back(I); |
1218 | } |
1219 | |
1220 | executeInWaterfallLoop(MI, MRI, SGPRIndexes); |
1221 | return true; |
1222 | } |
1223 | |
1224 | static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI, |
1225 | Register Reg) { |
1226 | MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); |
1227 | if (!Def) |
1228 | return Reg; |
1229 | |
1230 | // TODO: Guard against this being an implicit def |
1231 | return Def->getOperand(0).getReg(); |
1232 | } |
1233 | |
// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
//
// Returns SOffset + ImmOffset when the whole combined offset folded to a
// constant, and 0 in every other case (see the XXX notes below); the caller
// uses this return value as the memory-operand offset.
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset,
                                 Register &VOffsetReg,
                                 Register &SOffsetReg,
                                 int64_t &InstOffsetVal,
                                 unsigned Align) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  // Case 1: the entire offset is a known constant. Split it into an SGPR
  // soffset plus an immediate that fits the instruction's offset field, and
  // use a zero voffset.
  if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset,
                                 &RBI.Subtarget, Align)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  // Case 2: base register plus a constant offset that splitMUBUFOffset can
  // break into soffset + instruction immediate.
  Register Base;
  unsigned Offset;
  MachineInstr *Unused;

  std::tie(Base, Offset, Unused)
    = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                             &RBI.Subtarget, Align)) {
    // A VGPR base becomes the voffset; the constant pieces fill soffset/imm.
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Case 3: handle the variable sgpr + vgpr case — route each addend of a
  // G_ADD to the operand matching its bank.
  if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  // Fallback: whole offset in voffset, zero soffset.
  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}
1322 | |
1323 | static LLT divideLLT(LLT Ty, int Factor) { |
1324 | if (Ty.isVector()) |
1325 | return LLT::vector(Ty.getNumElements() / Factor, Ty.getElementType()); |
1326 | return LLT::scalar(Ty.getSizeInBits() / Factor); |
1327 | } |
1328 | |
1329 | bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( |
1330 | const OperandsMapper &OpdMapper) const { |
1331 | MachineInstr &MI = OpdMapper.getMI(); |
1332 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1333 | |
1334 | const LLT S32 = LLT::scalar(32); |
1335 | Register Dst = MI.getOperand(0).getReg(); |
1336 | LLT Ty = MRI.getType(Dst); |
1337 | |
1338 | const RegisterBank *RSrcBank = |
1339 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; |
1340 | const RegisterBank *OffsetBank = |
1341 | OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; |
1342 | if (RSrcBank == &AMDGPU::SGPRRegBank && |
1343 | OffsetBank == &AMDGPU::SGPRRegBank) |
1344 | return true; // Legal mapping |
1345 | |
1346 | // FIXME: 96-bit case was widened during legalize. We neeed to narrow it back |
1347 | // here but don't have an MMO. |
1348 | |
1349 | unsigned LoadSize = Ty.getSizeInBits(); |
1350 | int NumLoads = 1; |
1351 | if (LoadSize == 256 || LoadSize == 512) { |
1352 | NumLoads = LoadSize / 128; |
1353 | Ty = divideLLT(Ty, NumLoads); |
1354 | } |
1355 | |
1356 | // Use the alignment to ensure that the required offsets will fit into the |
1357 | // immediate offsets. |
1358 | const unsigned Align = NumLoads > 1 ? 16 * NumLoads : 1; |
1359 | |
1360 | MachineIRBuilder B(MI); |
1361 | MachineFunction &MF = B.getMF(); |
1362 | |
1363 | Register SOffset; |
1364 | Register VOffset; |
1365 | int64_t ImmOffset = 0; |
1366 | |
1367 | unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(), |
1368 | VOffset, SOffset, ImmOffset, Align); |
1369 | |
1370 | // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we |
1371 | // can, but we neeed to track an MMO for that. |
1372 | const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; |
1373 | const unsigned MemAlign = 4; // FIXME: ABI type alignment? |
1374 | MachineMemOperand *BaseMMO = MF.getMachineMemOperand( |
1375 | MachinePointerInfo(), |
1376 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
1377 | MachineMemOperand::MOInvariant, |
1378 | MemSize, MemAlign); |
1379 | if (MMOOffset != 0) |
1380 | BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize); |
1381 | |
1382 | // If only the offset is divergent, emit a MUBUF buffer load instead. We can |
1383 | // assume that the buffer is unswizzled. |
1384 | |
1385 | Register RSrc = MI.getOperand(1).getReg(); |
1386 | Register VIndex = B.buildConstant(S32, 0).getReg(0); |
1387 | B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); |
1388 | |
1389 | SmallVector<Register, 4> LoadParts(NumLoads); |
1390 | |
1391 | MachineBasicBlock::iterator MII = MI.getIterator(); |
1392 | MachineInstrSpan Span(MII, &B.getMBB()); |
1393 | |
1394 | for (int i = 0; i < NumLoads; ++i) { |
1395 | if (NumLoads == 1) { |
1396 | LoadParts[i] = Dst; |
1397 | } else { |
1398 | LoadParts[i] = MRI.createGenericVirtualRegister(Ty); |
1399 | MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); |
1400 | } |
1401 | |
1402 | MachineMemOperand *MMO = BaseMMO; |
1403 | if (i != 0) |
1404 | BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize); |
1405 | |
1406 | B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD) |
1407 | .addDef(LoadParts[i]) // vdata |
1408 | .addUse(RSrc) // rsrc |
1409 | .addUse(VIndex) // vindex |
1410 | .addUse(VOffset) // voffset |
1411 | .addUse(SOffset) // soffset |
1412 | .addImm(ImmOffset + 16 * i) // offset(imm) |
1413 | .addImm(0) // cachepolicy, swizzled buffer(imm) |
1414 | .addImm(0) // idxen(imm) |
1415 | .addMemOperand(MMO); |
1416 | } |
1417 | |
1418 | // TODO: If only the resource is a VGPR, it may be better to execute the |
1419 | // scalar load in the waterfall loop if the resource is expected to frequently |
1420 | // be dynamically uniform. |
1421 | if (RSrcBank != &AMDGPU::SGPRRegBank) { |
1422 | // Remove the original instruction to avoid potentially confusing the |
1423 | // waterfall loop logic. |
1424 | B.setInstr(*Span.begin()); |
1425 | MI.eraseFromParent(); |
1426 | |
1427 | SmallSet<Register, 4> OpsToWaterfall; |
1428 | |
1429 | OpsToWaterfall.insert(RSrc); |
1430 | executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), |
1431 | OpsToWaterfall, MRI); |
1432 | } |
1433 | |
1434 | if (NumLoads != 1) { |
1435 | if (Ty.isVector()) |
1436 | B.buildConcatVectors(Dst, LoadParts); |
1437 | else |
1438 | B.buildMerge(Dst, LoadParts); |
1439 | } |
1440 | |
1441 | // We removed the instruction earlier with a waterfall loop. |
1442 | if (RSrcBank == &AMDGPU::SGPRRegBank) |
1443 | MI.eraseFromParent(); |
1444 | |
1445 | return true; |
1446 | } |
1447 | |
/// Apply the mapping for a bitfield-extract intrinsic. A 32-bit VGPR result
/// is already legal; an SGPR result is lowered to S_BFE_I32/U32 (or the
/// 64-bit variants), which pack offset and width into a single operand.
bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
    const OperandsMapper &OpdMapper, bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // TODO: 64-bit version is scalar only, so we need to expand this.
    return false;
  }

  Register SrcReg = MI.getOperand(2).getReg();
  Register OffsetReg = MI.getOperand(3).getReg();
  Register WidthReg = MI.getOperand(4).getReg();

  // The scalar form packs the offset and width in a single operand.

  // Route every register created while building the expansion to SGPR.
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  GISelObserverWrapper Observer(&ApplyBank);
  MachineIRBuilder B(MI);
  B.setChangeObserver(Observer);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}
1506 | |
1507 | // FIXME: Duplicated from LegalizerHelper |
1508 | static CmpInst::Predicate minMaxToCompare(unsigned Opc) { |
1509 | switch (Opc) { |
1510 | case TargetOpcode::G_SMIN: |
1511 | return CmpInst::ICMP_SLT; |
1512 | case TargetOpcode::G_SMAX: |
1513 | return CmpInst::ICMP_SGT; |
1514 | case TargetOpcode::G_UMIN: |
1515 | return CmpInst::ICMP_ULT; |
1516 | case TargetOpcode::G_UMAX: |
1517 | return CmpInst::ICMP_UGT; |
1518 | default: |
1519 | llvm_unreachable("not in integer min/max")::llvm::llvm_unreachable_internal("not in integer min/max", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1519); |
1520 | } |
1521 | } |
1522 | |
1523 | static unsigned minMaxToExtend(unsigned Opc) { |
1524 | switch (Opc) { |
1525 | case TargetOpcode::G_SMIN: |
1526 | case TargetOpcode::G_SMAX: |
1527 | return TargetOpcode::G_SEXT; |
1528 | case TargetOpcode::G_UMIN: |
1529 | case TargetOpcode::G_UMAX: |
1530 | return TargetOpcode::G_ZEXT; |
1531 | default: |
1532 | llvm_unreachable("not in integer min/max")::llvm::llvm_unreachable_internal("not in integer min/max", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1532); |
1533 | } |
1534 | } |
1535 | |
// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
// any illegal vector extend or unmerge operations.
static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
  const LLT S32 = LLT::scalar(32);
  // Reinterpret the packed <2 x s16> as a single s32 and extract the halves
  // with shifts and masks instead of an unmerge.
  auto Bitcast = B.buildBitcast(S32, Src);

  if (ExtOpcode == TargetOpcode::G_SEXT) {
    // Low half: sign-extend in place; high half: arithmetic shift right.
    auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
    auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
    return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  // A logical shift right leaves the high half zero-extended.
  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
  if (ExtOpcode == TargetOpcode::G_ZEXT) {
    auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
    return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
  }

  // Any-extend: the low half may keep garbage in its high bits, so the
  // bitcast itself is usable directly.
  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
  return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
}
1558 | |
1559 | static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B, |
1560 | CmpInst::Predicate Pred, |
1561 | Register Dst, Register Src0, |
1562 | Register Src1) { |
1563 | const LLT CmpType = LLT::scalar(32); |
1564 | auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1); |
1565 | return B.buildSelect(Dst, Cmp, Src0, Src1); |
1566 | } |
1567 | |
1568 | // FIXME: Duplicated from LegalizerHelper, except changing the boolean type. |
1569 | void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B, |
1570 | MachineInstr &MI) const { |
1571 | Register Dst = MI.getOperand(0).getReg(); |
1572 | Register Src0 = MI.getOperand(1).getReg(); |
1573 | Register Src1 = MI.getOperand(2).getReg(); |
1574 | |
1575 | const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); |
1576 | MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1); |
1577 | |
1578 | Register CmpReg = Sel->getOperand(1).getReg(); |
1579 | B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank); |
1580 | MI.eraseFromParent(); |
1581 | } |
1582 | |
1583 | // For cases where only a single copy is inserted for matching register banks. |
1584 | // Replace the register in the instruction operand |
1585 | static bool substituteSimpleCopyRegs( |
1586 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { |
1587 | SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); |
1588 | if (!SrcReg.empty()) { |
1589 | assert(SrcReg.size() == 1)((SrcReg.size() == 1) ? static_cast<void> (0) : __assert_fail ("SrcReg.size() == 1", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1589, __PRETTY_FUNCTION__)); |
1590 | OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); |
1591 | return true; |
1592 | } |
1593 | |
1594 | return false; |
1595 | } |
1596 | |
1597 | /// Handle register layout difference for f16 images for some subtargets. |
1598 | Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, |
1599 | MachineRegisterInfo &MRI, |
1600 | Register Reg) const { |
1601 | if (!Subtarget.hasUnpackedD16VMem()) |
1602 | return Reg; |
1603 | |
1604 | const LLT S16 = LLT::scalar(16); |
1605 | LLT StoreVT = MRI.getType(Reg); |
1606 | if (!StoreVT.isVector() || StoreVT.getElementType() != S16) |
1607 | return Reg; |
1608 | |
1609 | auto Unmerge = B.buildUnmerge(S16, Reg); |
1610 | |
1611 | |
1612 | SmallVector<Register, 4> WideRegs; |
1613 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) |
1614 | WideRegs.push_back(Unmerge.getReg(I)); |
1615 | |
1616 | const LLT S32 = LLT::scalar(32); |
1617 | int NumElts = StoreVT.getNumElements(); |
1618 | |
1619 | return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); |
1620 | } |
1621 | |
1622 | static std::pair<Register, unsigned> |
1623 | getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { |
1624 | int64_t Const; |
1625 | if (mi_match(Reg, MRI, m_ICst(Const))) |
1626 | return std::make_pair(Register(), Const); |
1627 | |
1628 | Register Base; |
1629 | if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) |
1630 | return std::make_pair(Base, Const); |
1631 | |
1632 | // TODO: Handle G_OR used for add case |
1633 | return std::make_pair(Reg, 0); |
1634 | } |
1635 | |
/// Split \p OrigOffset into a base register plus an immediate that fits the
/// MUBUF 12-bit offset field. Returns {base register, immediate}.
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      // Rounding down went negative: keep the whole value in the register.
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      // Fold the overflow into the base register (materializing a constant
      // base if there was none).
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  // Callers always need a base register, even if it is just zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}
1679 | |
1680 | static bool isZero(Register Reg, MachineRegisterInfo &MRI) { |
1681 | int64_t C; |
1682 | return mi_match(Reg, MRI, m_ICst(C)) && C == 0; |
1683 | } |
1684 | |
/// Extract the GLC bit (bit 0) from a buffer intrinsic cache-policy immediate.
static unsigned extractGLC(unsigned CachePolicy) {
  return CachePolicy & 1u;
}
1688 | |
/// Extract the SLC bit (bit 1) from a buffer intrinsic cache-policy immediate.
static unsigned extractSLC(unsigned CachePolicy) {
  return (CachePolicy & 2u) >> 1;
}
1692 | |
/// Extract the DLC bit (bit 2) from a buffer intrinsic cache-policy immediate.
static unsigned extractDLC(unsigned CachePolicy) {
  return (CachePolicy & 4u) >> 2;
}
1696 | |
/// Select a raw buffer store intrinsic directly to a MUBUF store instruction,
/// waterfalling the operands that must be uniform. Returns the built store.
MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                             MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  // Operands 2 (rsrc) and 4 (soffset) are handled by a waterfall loop.
  executeInWaterfallLoop(B, MI, MRI, {2, 4});

  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  int EltSize = Ty.getScalarSizeInBits();
  int Size = Ty.getSizeInBits();

  // FIXME: Broken integer truncstore.
  if (EltSize != 32)
    report_fatal_error("unhandled intrinsic store");

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  const int MemSize = (*MI.memoperands_begin())->getSize();

  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned CachePolicy = MI.getOperand(5).getImm();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  // Only use the OFFEN form when there is a nonzero voffset.
  const bool Offen = !isZero(VOffset, MRI);

  // Pick the MUBUF store opcode from the memory access size; dword stores
  // wider than 32 bits use the multi-dword variants.
  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
  switch (8 * MemSize) {
  case 8:
    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
    break;
  case 16:
    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
    break;
  default:
    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
    if (Size > 32)
      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
    break;
  }

  // Set the insertion point back to the instruction in case it was moved into a
  // loop.
  B.setInstr(MI);

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractGLC(CachePolicy))
     .addImm(extractSLC(CachePolicy))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractDLC(CachePolicy))
     .cloneMemRefs(MI);

  // FIXME: We need a way to report failure from applyMappingImpl.
  // Insert constrain copies before inserting the loop.
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    report_fatal_error("failed to constrain selected store intrinsic");

  return MIB;
}
1774 | |
/// Copy \p SrcReg (scalar) into the vector register \p DstReg using
/// V_MOV_B32, splitting 64-bit values into two 32-bit moves joined by a
/// REG_SEQUENCE. Returns false if constraining the register classes failed.
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(DstReg)
      .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  // 64-bit case: move each 32-bit half (sub0/sub1) separately, then
  // recombine the halves into the destination.
  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg0)
    .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg1)
    .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(DstReg)
    .addUse(TmpReg0)
    .addImm(AMDGPU::sub0)
    .addUse(TmpReg1)
    .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}
1807 | |
1808 | /// Utility function for pushing dynamic vector indexes with a constant offset |
1809 | /// into waterwall loops. |
1810 | static void reinsertVectorIndexAdd(MachineIRBuilder &B, |
1811 | MachineInstr &IdxUseInstr, |
1812 | unsigned OpIdx, |
1813 | unsigned ConstOffset) { |
1814 | MachineRegisterInfo &MRI = *B.getMRI(); |
1815 | const LLT S32 = LLT::scalar(32); |
1816 | Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); |
1817 | B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); |
1818 | |
1819 | auto MaterializedOffset = B.buildConstant(S32, ConstOffset); |
1820 | |
1821 | auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); |
1822 | MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); |
1823 | MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); |
1824 | IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); |
1825 | } |
1826 | |
1827 | void AMDGPURegisterBankInfo::applyMappingImpl( |
1828 | const OperandsMapper &OpdMapper) const { |
1829 | MachineInstr &MI = OpdMapper.getMI(); |
1830 | unsigned Opc = MI.getOpcode(); |
1831 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); |
1832 | switch (Opc) { |
1833 | case AMDGPU::G_PHI: { |
1834 | Register DstReg = MI.getOperand(0).getReg(); |
1835 | LLT DstTy = MRI.getType(DstReg); |
1836 | if (DstTy != LLT::scalar(1)) |
1837 | break; |
1838 | |
1839 | const LLT S32 = LLT::scalar(32); |
1840 | const RegisterBank *DstBank = |
1841 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
1842 | if (DstBank == &AMDGPU::VCCRegBank) { |
1843 | applyDefaultMapping(OpdMapper); |
1844 | // The standard handling only considers the result register bank for |
1845 | // phis. For VCC, blindly inserting a copy when the phi is lowered will |
1846 | // produce an invalid copy. We can only copy with some kind of compare to |
1847 | // get a vector boolean result. Insert a regitser bank copy that will be |
1848 | // correctly lowered to a compare. |
1849 | MachineIRBuilder B(*MI.getParent()->getParent()); |
1850 | |
1851 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
1852 | Register SrcReg = MI.getOperand(I).getReg(); |
1853 | const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); |
1854 | |
1855 | if (SrcBank != &AMDGPU::VCCRegBank) { |
1856 | MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); |
1857 | B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); |
1858 | |
1859 | auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); |
1860 | MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); |
1861 | MI.getOperand(I).setReg(Copy.getReg(0)); |
1862 | } |
1863 | } |
1864 | |
1865 | return; |
1866 | } |
1867 | |
1868 | // Phi handling is strange and only considers the bank of the destination. |
1869 | substituteSimpleCopyRegs(OpdMapper, 0); |
1870 | |
1871 | // Promote SGPR/VGPR booleans to s32 |
1872 | MachineFunction *MF = MI.getParent()->getParent(); |
1873 | ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); |
1874 | GISelObserverWrapper Observer(&ApplyBank); |
1875 | MachineIRBuilder B(MI); |
1876 | LegalizerHelper Helper(*MF, Observer, B); |
1877 | |
1878 | if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) |
1879 | llvm_unreachable("widen scalar should have succeeded")::llvm::llvm_unreachable_internal("widen scalar should have succeeded" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1879); |
1880 | |
1881 | return; |
1882 | } |
1883 | case AMDGPU::G_ICMP: |
1884 | case AMDGPU::G_UADDO: |
1885 | case AMDGPU::G_USUBO: |
1886 | case AMDGPU::G_UADDE: |
1887 | case AMDGPU::G_SADDE: |
1888 | case AMDGPU::G_USUBE: |
1889 | case AMDGPU::G_SSUBE: { |
1890 | unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; |
1891 | Register DstReg = MI.getOperand(BoolDstOp).getReg(); |
1892 | |
1893 | const RegisterBank *DstBank = |
1894 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
1895 | if (DstBank != &AMDGPU::SGPRRegBank) |
1896 | break; |
1897 | |
1898 | const bool HasCarryIn = MI.getNumOperands() == 5; |
1899 | |
1900 | // If this is a scalar compare, promote the result to s32, as the selection |
1901 | // will end up using a copy to a 32-bit vreg. |
1902 | const LLT S32 = LLT::scalar(32); |
1903 | Register NewDstReg = MRI.createGenericVirtualRegister(S32); |
1904 | MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); |
1905 | MI.getOperand(BoolDstOp).setReg(NewDstReg); |
1906 | MachineIRBuilder B(MI); |
1907 | |
1908 | if (HasCarryIn) { |
1909 | Register NewSrcReg = MRI.createGenericVirtualRegister(S32); |
1910 | MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); |
1911 | B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); |
1912 | MI.getOperand(4).setReg(NewSrcReg); |
1913 | } |
1914 | |
1915 | MachineBasicBlock *MBB = MI.getParent(); |
1916 | B.setInsertPt(*MBB, std::next(MI.getIterator())); |
1917 | |
1918 | // If we had a constrained VCC result register, a copy was inserted to VCC |
1919 | // from SGPR. |
1920 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); |
1921 | if (DefRegs.empty()) |
1922 | DefRegs.push_back(DstReg); |
1923 | B.buildTrunc(DefRegs[0], NewDstReg); |
1924 | return; |
1925 | } |
1926 | case AMDGPU::G_SELECT: { |
1927 | Register DstReg = MI.getOperand(0).getReg(); |
1928 | LLT DstTy = MRI.getType(DstReg); |
1929 | |
1930 | SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); |
1931 | if (CondRegs.empty()) |
1932 | CondRegs.push_back(MI.getOperand(1).getReg()); |
1933 | else { |
1934 | assert(CondRegs.size() == 1)((CondRegs.size() == 1) ? static_cast<void> (0) : __assert_fail ("CondRegs.size() == 1", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1934, __PRETTY_FUNCTION__)); |
1935 | } |
1936 | |
1937 | const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); |
1938 | if (CondBank == &AMDGPU::SGPRRegBank) { |
1939 | MachineIRBuilder B(MI); |
1940 | const LLT S32 = LLT::scalar(32); |
1941 | Register NewCondReg = MRI.createGenericVirtualRegister(S32); |
1942 | MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); |
1943 | |
1944 | MI.getOperand(1).setReg(NewCondReg); |
1945 | B.buildZExt(NewCondReg, CondRegs[0]); |
1946 | } |
1947 | |
1948 | if (DstTy.getSizeInBits() != 64) |
1949 | break; |
1950 | |
1951 | MachineIRBuilder B(MI); |
1952 | LLT HalfTy = getHalfSizedType(DstTy); |
1953 | |
1954 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); |
1955 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); |
1956 | SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); |
1957 | |
1958 | // All inputs are SGPRs, nothing special to do. |
1959 | if (DefRegs.empty()) { |
1960 | assert(Src1Regs.empty() && Src2Regs.empty())((Src1Regs.empty() && Src2Regs.empty()) ? static_cast <void> (0) : __assert_fail ("Src1Regs.empty() && Src2Regs.empty()" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1960, __PRETTY_FUNCTION__)); |
1961 | break; |
1962 | } |
1963 | |
1964 | if (Src1Regs.empty()) |
1965 | split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); |
1966 | else { |
1967 | setRegsToType(MRI, Src1Regs, HalfTy); |
1968 | } |
1969 | |
1970 | if (Src2Regs.empty()) |
1971 | split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); |
1972 | else |
1973 | setRegsToType(MRI, Src2Regs, HalfTy); |
1974 | |
1975 | setRegsToType(MRI, DefRegs, HalfTy); |
1976 | |
1977 | B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); |
1978 | B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); |
1979 | |
1980 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
1981 | MI.eraseFromParent(); |
1982 | return; |
1983 | } |
1984 | case AMDGPU::G_BRCOND: { |
1985 | Register CondReg = MI.getOperand(0).getReg(); |
1986 | // FIXME: Should use legalizer helper, but should change bool ext type. |
1987 | const RegisterBank *CondBank = |
1988 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
1989 | |
1990 | if (CondBank == &AMDGPU::SGPRRegBank) { |
1991 | MachineIRBuilder B(MI); |
1992 | const LLT S32 = LLT::scalar(32); |
1993 | Register NewCondReg = MRI.createGenericVirtualRegister(S32); |
1994 | MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); |
1995 | |
1996 | MI.getOperand(0).setReg(NewCondReg); |
1997 | B.buildZExt(NewCondReg, CondReg); |
1998 | return; |
1999 | } |
2000 | |
2001 | break; |
2002 | } |
2003 | case AMDGPU::G_AND: |
2004 | case AMDGPU::G_OR: |
2005 | case AMDGPU::G_XOR: { |
2006 | // 64-bit and is only available on the SALU, so split into 2 32-bit ops if |
2007 | // there is a VGPR input. |
2008 | Register DstReg = MI.getOperand(0).getReg(); |
2009 | LLT DstTy = MRI.getType(DstReg); |
2010 | |
2011 | if (DstTy.getSizeInBits() == 1) { |
2012 | const RegisterBank *DstBank = |
2013 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
2014 | if (DstBank == &AMDGPU::VCCRegBank) |
2015 | break; |
2016 | |
2017 | MachineFunction *MF = MI.getParent()->getParent(); |
2018 | ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); |
2019 | GISelObserverWrapper Observer(&ApplyBank); |
2020 | MachineIRBuilder B(MI); |
2021 | LegalizerHelper Helper(*MF, Observer, B); |
2022 | |
2023 | if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != |
2024 | LegalizerHelper::Legalized) |
2025 | llvm_unreachable("widen scalar should have succeeded")::llvm::llvm_unreachable_internal("widen scalar should have succeeded" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2025); |
2026 | return; |
2027 | } |
2028 | |
2029 | if (DstTy.getSizeInBits() != 64) |
2030 | break; |
2031 | |
2032 | LLT HalfTy = getHalfSizedType(DstTy); |
2033 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); |
2034 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); |
2035 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); |
2036 | |
2037 | // All inputs are SGPRs, nothing special to do. |
2038 | if (DefRegs.empty()) { |
2039 | assert(Src0Regs.empty() && Src1Regs.empty())((Src0Regs.empty() && Src1Regs.empty()) ? static_cast <void> (0) : __assert_fail ("Src0Regs.empty() && Src1Regs.empty()" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2039, __PRETTY_FUNCTION__)); |
2040 | break; |
2041 | } |
2042 | |
2043 | assert(DefRegs.size() == 2)((DefRegs.size() == 2) ? static_cast<void> (0) : __assert_fail ("DefRegs.size() == 2", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2043, __PRETTY_FUNCTION__)); |
2044 | assert(Src0Regs.size() == Src1Regs.size() &&((Src0Regs.size() == Src1Regs.size() && (Src0Regs.empty () || Src0Regs.size() == 2)) ? static_cast<void> (0) : __assert_fail ("Src0Regs.size() == Src1Regs.size() && (Src0Regs.empty() || Src0Regs.size() == 2)" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2045, __PRETTY_FUNCTION__)) |
2045 | (Src0Regs.empty() || Src0Regs.size() == 2))((Src0Regs.size() == Src1Regs.size() && (Src0Regs.empty () || Src0Regs.size() == 2)) ? static_cast<void> (0) : __assert_fail ("Src0Regs.size() == Src1Regs.size() && (Src0Regs.empty() || Src0Regs.size() == 2)" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2045, __PRETTY_FUNCTION__)); |
2046 | |
2047 | // Depending on where the source registers came from, the generic code may |
2048 | // have decided to split the inputs already or not. If not, we still need to |
2049 | // extract the values. |
2050 | MachineIRBuilder B(MI); |
2051 | |
2052 | if (Src0Regs.empty()) |
2053 | split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); |
2054 | else |
2055 | setRegsToType(MRI, Src0Regs, HalfTy); |
2056 | |
2057 | if (Src1Regs.empty()) |
2058 | split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); |
2059 | else |
2060 | setRegsToType(MRI, Src1Regs, HalfTy); |
2061 | |
2062 | setRegsToType(MRI, DefRegs, HalfTy); |
2063 | |
2064 | B.buildInstr(Opc) |
2065 | .addDef(DefRegs[0]) |
2066 | .addUse(Src0Regs[0]) |
2067 | .addUse(Src1Regs[0]); |
2068 | |
2069 | B.buildInstr(Opc) |
2070 | .addDef(DefRegs[1]) |
2071 | .addUse(Src0Regs[1]) |
2072 | .addUse(Src1Regs[1]); |
2073 | |
2074 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2075 | MI.eraseFromParent(); |
2076 | return; |
2077 | } |
2078 | case AMDGPU::G_ADD: |
2079 | case AMDGPU::G_SUB: |
2080 | case AMDGPU::G_MUL: { |
2081 | Register DstReg = MI.getOperand(0).getReg(); |
2082 | LLT DstTy = MRI.getType(DstReg); |
2083 | if (DstTy != LLT::scalar(16)) |
2084 | break; |
2085 | |
2086 | const RegisterBank *DstBank = |
2087 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
2088 | if (DstBank == &AMDGPU::VGPRRegBank) |
2089 | break; |
2090 | |
2091 | // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. |
2092 | MachineFunction *MF = MI.getParent()->getParent(); |
2093 | MachineIRBuilder B(MI); |
2094 | ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); |
2095 | GISelObserverWrapper Observer(&ApplySALU); |
2096 | LegalizerHelper Helper(*MF, Observer, B); |
2097 | |
2098 | if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != |
2099 | LegalizerHelper::Legalized) |
2100 | llvm_unreachable("widen scalar should have succeeded")::llvm::llvm_unreachable_internal("widen scalar should have succeeded" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2100); |
2101 | return; |
2102 | } |
2103 | case AMDGPU::G_SMIN: |
2104 | case AMDGPU::G_SMAX: |
2105 | case AMDGPU::G_UMIN: |
2106 | case AMDGPU::G_UMAX: { |
2107 | Register DstReg = MI.getOperand(0).getReg(); |
2108 | const RegisterBank *DstBank = |
2109 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
2110 | if (DstBank == &AMDGPU::VGPRRegBank) |
2111 | break; |
2112 | |
2113 | MachineFunction *MF = MI.getParent()->getParent(); |
2114 | MachineIRBuilder B(MI); |
2115 | |
2116 | // Turn scalar min/max into a compare and select. |
2117 | LLT Ty = MRI.getType(DstReg); |
2118 | const LLT S32 = LLT::scalar(32); |
2119 | const LLT S16 = LLT::scalar(16); |
2120 | const LLT V2S16 = LLT::vector(2, 16); |
2121 | |
2122 | if (Ty == V2S16) { |
2123 | ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); |
2124 | GISelObserverWrapper Observer(&ApplySALU); |
2125 | B.setChangeObserver(Observer); |
2126 | |
2127 | // Need to widen to s32, and expand as cmp + select, and avoid producing |
2128 | // illegal vector extends or unmerges that would need further |
2129 | // legalization. |
2130 | // |
2131 | // TODO: Should we just readfirstlane? That should probably be handled |
2132 | // with a UniformVGPR register bank that wouldn't need special |
2133 | // consideration here. |
2134 | |
2135 | Register Dst = MI.getOperand(0).getReg(); |
2136 | Register Src0 = MI.getOperand(1).getReg(); |
2137 | Register Src1 = MI.getOperand(2).getReg(); |
2138 | |
2139 | Register WideSrc0Lo, WideSrc0Hi; |
2140 | Register WideSrc1Lo, WideSrc1Hi; |
2141 | |
2142 | unsigned ExtendOp = minMaxToExtend(MI.getOpcode()); |
2143 | |
2144 | std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp); |
2145 | std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp); |
2146 | |
2147 | Register Lo = MRI.createGenericVirtualRegister(S32); |
2148 | Register Hi = MRI.createGenericVirtualRegister(S32); |
2149 | const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); |
2150 | buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo); |
2151 | buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi); |
2152 | |
2153 | B.buildBuildVectorTrunc(Dst, {Lo, Hi}); |
2154 | MI.eraseFromParent(); |
2155 | } else if (Ty == S16) { |
2156 | ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); |
2157 | GISelObserverWrapper Observer(&ApplySALU); |
2158 | LegalizerHelper Helper(*MF, Observer, B); |
2159 | |
2160 | // Need to widen to s32, and expand as cmp + select. |
2161 | if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) |
2162 | llvm_unreachable("widenScalar should have succeeded")::llvm::llvm_unreachable_internal("widenScalar should have succeeded" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2162); |
2163 | |
2164 | // FIXME: This is relying on widenScalar leaving MI in place. |
2165 | lowerScalarMinMax(B, MI); |
2166 | } else |
2167 | lowerScalarMinMax(B, MI); |
2168 | |
2169 | return; |
2170 | } |
2171 | case AMDGPU::G_SEXT_INREG: { |
2172 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); |
2173 | if (SrcRegs.empty()) |
2174 | break; // Nothing to repair |
2175 | |
2176 | const LLT S32 = LLT::scalar(32); |
2177 | MachineIRBuilder B(MI); |
2178 | ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); |
2179 | GISelObserverWrapper Observer(&O); |
2180 | B.setChangeObserver(Observer); |
2181 | |
2182 | // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs |
2183 | // we would need to further expand, and doesn't let us directly set the |
2184 | // result registers. |
2185 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); |
2186 | |
2187 | int Amt = MI.getOperand(2).getImm(); |
2188 | if (Amt <= 32) { |
2189 | if (Amt == 32) { |
2190 | // The low bits are unchanged. |
2191 | B.buildCopy(DstRegs[0], SrcRegs[0]); |
2192 | } else { |
2193 | // Extend in the low bits and propagate the sign bit to the high half. |
2194 | B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt); |
2195 | } |
2196 | |
2197 | B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); |
2198 | } else { |
2199 | // The low bits are unchanged, and extend in the high bits. |
2200 | B.buildCopy(DstRegs[0], SrcRegs[0]); |
2201 | B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); |
2202 | } |
2203 | |
2204 | Register DstReg = MI.getOperand(0).getReg(); |
2205 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); |
2206 | MI.eraseFromParent(); |
2207 | return; |
2208 | } |
2209 | case AMDGPU::G_CTPOP: |
2210 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
2211 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { |
2212 | MachineIRBuilder B(MI); |
2213 | MachineFunction &MF = B.getMF(); |
2214 | |
2215 | const RegisterBank *DstBank = |
2216 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
2217 | if (DstBank == &AMDGPU::SGPRRegBank) |
2218 | break; |
2219 | |
2220 | Register SrcReg = MI.getOperand(1).getReg(); |
2221 | const LLT S32 = LLT::scalar(32); |
2222 | LLT Ty = MRI.getType(SrcReg); |
2223 | if (Ty == S32) |
2224 | break; |
2225 | |
2226 | ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); |
2227 | GISelObserverWrapper Observer(&ApplyVALU); |
2228 | LegalizerHelper Helper(MF, Observer, B); |
2229 | |
2230 | if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) |
2231 | llvm_unreachable("narrowScalar should have succeeded")::llvm::llvm_unreachable_internal("narrowScalar should have succeeded" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2231); |
2232 | return; |
2233 | } |
2234 | case AMDGPU::G_SEXT: |
2235 | case AMDGPU::G_ZEXT: { |
2236 | Register SrcReg = MI.getOperand(1).getReg(); |
2237 | LLT SrcTy = MRI.getType(SrcReg); |
2238 | bool Signed = Opc == AMDGPU::G_SEXT; |
2239 | |
2240 | MachineIRBuilder B(MI); |
2241 | const RegisterBank *SrcBank = |
2242 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; |
2243 | |
2244 | Register DstReg = MI.getOperand(0).getReg(); |
2245 | LLT DstTy = MRI.getType(DstReg); |
2246 | if (DstTy.isScalar() && |
2247 | SrcBank != &AMDGPU::SGPRRegBank && |
2248 | SrcBank != &AMDGPU::VCCRegBank && |
2249 | // FIXME: Should handle any type that round to s64 when irregular |
2250 | // breakdowns supported. |
2251 | DstTy.getSizeInBits() == 64 && |
2252 | SrcTy.getSizeInBits() <= 32) { |
2253 | const LLT S32 = LLT::scalar(32); |
2254 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); |
2255 | |
2256 | // Extend to 32-bit, and then extend the low half. |
2257 | if (Signed) { |
2258 | // TODO: Should really be buildSExtOrCopy |
2259 | B.buildSExtOrTrunc(DefRegs[0], SrcReg); |
2260 | |
2261 | // Replicate sign bit from 32-bit extended part. |
2262 | auto ShiftAmt = B.buildConstant(S32, 31); |
2263 | MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); |
2264 | B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt); |
2265 | } else { |
2266 | B.buildZExtOrTrunc(DefRegs[0], SrcReg); |
2267 | B.buildConstant(DefRegs[1], 0); |
2268 | } |
2269 | |
2270 | MRI.setRegBank(DstReg, *SrcBank); |
2271 | MI.eraseFromParent(); |
2272 | return; |
2273 | } |
2274 | |
2275 | if (SrcTy != LLT::scalar(1)) |
2276 | return; |
2277 | |
2278 | if (SrcBank == &AMDGPU::VCCRegBank) { |
2279 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); |
2280 | |
2281 | const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; |
2282 | |
2283 | unsigned DstSize = DstTy.getSizeInBits(); |
2284 | // 64-bit select is SGPR only |
2285 | const bool UseSel64 = DstSize > 32 && |
2286 | SrcBank->getID() == AMDGPU::SGPRRegBankID; |
2287 | |
2288 | // TODO: Should s16 select be legal? |
2289 | LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); |
2290 | auto True = B.buildConstant(SelType, Signed ? -1 : 1); |
2291 | auto False = B.buildConstant(SelType, 0); |
2292 | |
2293 | MRI.setRegBank(True.getReg(0), *DstBank); |
2294 | MRI.setRegBank(False.getReg(0), *DstBank); |
2295 | MRI.setRegBank(DstReg, *DstBank); |
2296 | |
2297 | if (DstSize > 32) { |
2298 | B.buildSelect(DefRegs[0], SrcReg, True, False); |
2299 | B.buildCopy(DefRegs[1], DefRegs[0]); |
2300 | } else if (DstSize < 32) { |
2301 | auto Sel = B.buildSelect(SelType, SrcReg, True, False); |
2302 | MRI.setRegBank(Sel.getReg(0), *DstBank); |
2303 | B.buildTrunc(DstReg, Sel); |
2304 | } else { |
2305 | B.buildSelect(DstReg, SrcReg, True, False); |
2306 | } |
2307 | |
2308 | MI.eraseFromParent(); |
2309 | return; |
2310 | } |
2311 | |
2312 | // Fixup the case with an s1 src that isn't a condition register. Use shifts |
2313 | // instead of introducing a compare to avoid an unnecessary condition |
2314 | // register (and since there's no scalar 16-bit compares). |
2315 | auto Ext = B.buildAnyExt(DstTy, SrcReg); |
2316 | auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1); |
2317 | auto Shl = B.buildShl(DstTy, Ext, ShiftAmt); |
2318 | |
2319 | if (MI.getOpcode() == AMDGPU::G_SEXT) |
2320 | B.buildAShr(DstReg, Shl, ShiftAmt); |
2321 | else |
2322 | B.buildLShr(DstReg, Shl, ShiftAmt); |
2323 | |
2324 | MRI.setRegBank(DstReg, *SrcBank); |
2325 | MRI.setRegBank(Ext.getReg(0), *SrcBank); |
2326 | MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank); |
2327 | MRI.setRegBank(Shl.getReg(0), *SrcBank); |
2328 | MI.eraseFromParent(); |
2329 | return; |
2330 | } |
2331 | case AMDGPU::G_BUILD_VECTOR: |
2332 | case AMDGPU::G_BUILD_VECTOR_TRUNC: { |
2333 | Register DstReg = MI.getOperand(0).getReg(); |
2334 | LLT DstTy = MRI.getType(DstReg); |
2335 | if (DstTy != LLT::vector(2, 16)) |
2336 | break; |
2337 | |
2338 | assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty())((MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty ()) ? static_cast<void> (0) : __assert_fail ("MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2338, __PRETTY_FUNCTION__)); |
2339 | substituteSimpleCopyRegs(OpdMapper, 1); |
2340 | substituteSimpleCopyRegs(OpdMapper, 2); |
2341 | |
2342 | const RegisterBank *DstBank = |
2343 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
2344 | if (DstBank == &AMDGPU::SGPRRegBank) |
2345 | break; // Can use S_PACK_* instructions. |
2346 | |
2347 | MachineIRBuilder B(MI); |
2348 | |
2349 | Register Lo = MI.getOperand(1).getReg(); |
2350 | Register Hi = MI.getOperand(2).getReg(); |
2351 | const LLT S32 = LLT::scalar(32); |
2352 | |
2353 | const RegisterBank *BankLo = |
2354 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; |
2355 | const RegisterBank *BankHi = |
2356 | OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; |
2357 | |
2358 | Register ZextLo; |
2359 | Register ShiftHi; |
2360 | |
2361 | if (Opc == AMDGPU::G_BUILD_VECTOR) { |
2362 | ZextLo = B.buildZExt(S32, Lo).getReg(0); |
2363 | MRI.setRegBank(ZextLo, *BankLo); |
2364 | |
2365 | Register ZextHi = B.buildZExt(S32, Hi).getReg(0); |
2366 | MRI.setRegBank(ZextHi, *BankHi); |
2367 | |
2368 | auto ShiftAmt = B.buildConstant(S32, 16); |
2369 | MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); |
2370 | |
2371 | ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); |
2372 | MRI.setRegBank(ShiftHi, *BankHi); |
2373 | } else { |
2374 | Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); |
2375 | MRI.setRegBank(MaskLo, *BankLo); |
2376 | |
2377 | auto ShiftAmt = B.buildConstant(S32, 16); |
2378 | MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); |
2379 | |
2380 | ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); |
2381 | MRI.setRegBank(ShiftHi, *BankHi); |
2382 | |
2383 | ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); |
2384 | MRI.setRegBank(ZextLo, *BankLo); |
2385 | } |
2386 | |
2387 | auto Or = B.buildOr(S32, ZextLo, ShiftHi); |
2388 | MRI.setRegBank(Or.getReg(0), *DstBank); |
2389 | |
2390 | B.buildBitcast(DstReg, Or); |
2391 | MI.eraseFromParent(); |
2392 | return; |
2393 | } |
2394 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { |
2395 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); |
2396 | |
2397 | assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty())((OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs (2).empty()) ? static_cast<void> (0) : __assert_fail ("OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2397, __PRETTY_FUNCTION__)); |
2398 | |
2399 | Register DstReg = MI.getOperand(0).getReg(); |
2400 | Register SrcReg = MI.getOperand(1).getReg(); |
2401 | |
2402 | const LLT S32 = LLT::scalar(32); |
2403 | LLT DstTy = MRI.getType(DstReg); |
2404 | LLT SrcTy = MRI.getType(SrcReg); |
2405 | |
2406 | MachineIRBuilder B(MI); |
2407 | |
2408 | const ValueMapping &DstMapping |
2409 | = OpdMapper.getInstrMapping().getOperandMapping(0); |
2410 | const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; |
2411 | const RegisterBank *SrcBank = |
2412 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; |
2413 | const RegisterBank *IdxBank = |
2414 | OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; |
2415 | |
2416 | Register BaseIdxReg; |
2417 | unsigned ConstOffset; |
2418 | MachineInstr *OffsetDef; |
2419 | std::tie(BaseIdxReg, ConstOffset, OffsetDef) = |
2420 | AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); |
2421 | |
2422 | // See if the index is an add of a constant which will be foldable by moving |
2423 | // the base register of the index later if this is going to be executed in a |
2424 | // waterfall loop. This is essentially to reassociate the add of a constant |
2425 | // with the readfirstlane. |
2426 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
2427 | ConstOffset > 0 && |
2428 | ConstOffset < SrcTy.getNumElements(); |
2429 | |
2430 | // Move the base register. We'll re-insert the add later. |
2431 | if (ShouldMoveIndexIntoLoop) |
2432 | MI.getOperand(2).setReg(BaseIdxReg); |
2433 | |
2434 | // If this is a VGPR result only because the index was a VGPR result, the |
2435 | // actual indexing will be done on the SGPR source vector, which will |
2436 | // produce a scalar result. We need to copy to the VGPR result inside the |
2437 | // waterfall loop. |
2438 | const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && |
2439 | SrcBank == &AMDGPU::SGPRRegBank; |
2440 | if (DstRegs.empty()) { |
2441 | applyDefaultMapping(OpdMapper); |
2442 | |
2443 | executeInWaterfallLoop(MI, MRI, { 2 }); |
2444 | |
2445 | if (NeedCopyToVGPR) { |
2446 | // We don't want a phi for this temporary reg. |
2447 | Register TmpReg = MRI.createGenericVirtualRegister(DstTy); |
2448 | MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); |
2449 | MI.getOperand(0).setReg(TmpReg); |
2450 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); |
2451 | |
2452 | // Use a v_mov_b32 here to make the exec dependency explicit. |
2453 | buildVCopy(B, DstReg, TmpReg); |
2454 | } |
2455 | |
2456 | // Re-insert the constant offset add inside the waterfall loop. |
2457 | if (ShouldMoveIndexIntoLoop) |
2458 | reinsertVectorIndexAdd(B, MI, 2, ConstOffset); |
2459 | |
2460 | return; |
2461 | } |
2462 | |
2463 | assert(DstTy.getSizeInBits() == 64)((DstTy.getSizeInBits() == 64) ? static_cast<void> (0) : __assert_fail ("DstTy.getSizeInBits() == 64", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2463, __PRETTY_FUNCTION__)); |
2464 | |
2465 | LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); |
2466 | |
2467 | auto CastSrc = B.buildBitcast(Vec32, SrcReg); |
2468 | auto One = B.buildConstant(S32, 1); |
2469 | |
2470 | MachineBasicBlock::iterator MII = MI.getIterator(); |
2471 | |
2472 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
2473 | // new instructions into a waterfall loop if necessary. |
2474 | // |
2475 | // Don't put the bitcast or constant in the loop. |
2476 | MachineInstrSpan Span(MII, &B.getMBB()); |
2477 | |
2478 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
2479 | auto IdxLo = B.buildShl(S32, BaseIdxReg, One); |
2480 | auto IdxHi = B.buildAdd(S32, IdxLo, One); |
2481 | |
2482 | auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); |
2483 | auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); |
2484 | |
2485 | MRI.setRegBank(DstReg, *DstBank); |
2486 | MRI.setRegBank(CastSrc.getReg(0), *SrcBank); |
2487 | MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); |
2488 | MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); |
2489 | MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); |
2490 | |
2491 | SmallSet<Register, 4> OpsToWaterfall; |
2492 | if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { |
2493 | MI.eraseFromParent(); |
2494 | return; |
2495 | } |
2496 | |
2497 | // Remove the original instruction to avoid potentially confusing the |
2498 | // waterfall loop logic. |
2499 | B.setInstr(*Span.begin()); |
2500 | MI.eraseFromParent(); |
2501 | executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), |
2502 | OpsToWaterfall, MRI); |
2503 | |
2504 | if (NeedCopyToVGPR) { |
2505 | MachineBasicBlock *LoopBB = Extract1->getParent(); |
2506 | Register TmpReg0 = MRI.createGenericVirtualRegister(S32); |
2507 | Register TmpReg1 = MRI.createGenericVirtualRegister(S32); |
2508 | MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); |
2509 | MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); |
2510 | |
2511 | Extract0->getOperand(0).setReg(TmpReg0); |
2512 | Extract1->getOperand(0).setReg(TmpReg1); |
2513 | |
2514 | B.setInsertPt(*LoopBB, ++Extract1->getIterator()); |
2515 | |
2516 | buildVCopy(B, DstRegs[0], TmpReg0); |
2517 | buildVCopy(B, DstRegs[1], TmpReg1); |
2518 | } |
2519 | |
2520 | if (ShouldMoveIndexIntoLoop) |
2521 | reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); |
2522 | |
2523 | return; |
2524 | } |
2525 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
2526 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); |
2527 | |
2528 | Register DstReg = MI.getOperand(0).getReg(); |
2529 | LLT VecTy = MRI.getType(DstReg); |
2530 | |
2531 | assert(OpdMapper.getVRegs(0).empty())((OpdMapper.getVRegs(0).empty()) ? static_cast<void> (0 ) : __assert_fail ("OpdMapper.getVRegs(0).empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2531, __PRETTY_FUNCTION__)); |
2532 | assert(OpdMapper.getVRegs(3).empty())((OpdMapper.getVRegs(3).empty()) ? static_cast<void> (0 ) : __assert_fail ("OpdMapper.getVRegs(3).empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2532, __PRETTY_FUNCTION__)); |
2533 | |
2534 | const RegisterBank *IdxBank = |
2535 | OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; |
2536 | |
2537 | if (substituteSimpleCopyRegs(OpdMapper, 1)) |
2538 | MRI.setType(MI.getOperand(1).getReg(), VecTy); |
2539 | |
2540 | Register SrcReg = MI.getOperand(1).getReg(); |
2541 | Register InsReg = MI.getOperand(2).getReg(); |
2542 | LLT InsTy = MRI.getType(InsReg); |
2543 | (void)InsTy; |
2544 | |
2545 | Register BaseIdxReg; |
2546 | unsigned ConstOffset; |
2547 | MachineInstr *OffsetDef; |
2548 | std::tie(BaseIdxReg, ConstOffset, OffsetDef) = |
2549 | AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); |
2550 | |
2551 | // See if the index is an add of a constant which will be foldable by moving |
2552 | // the base register of the index later if this is going to be executed in a |
2553 | // waterfall loop. This is essentially to reassociate the add of a constant |
2554 | // with the readfirstlane. |
2555 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && |
2556 | ConstOffset > 0 && |
2557 | ConstOffset < VecTy.getNumElements(); |
2558 | |
2559 | // Move the base register. We'll re-insert the add later. |
2560 | if (ShouldMoveIndexIntoLoop) |
2561 | MI.getOperand(3).setReg(BaseIdxReg); |
2562 | |
2563 | |
2564 | if (InsRegs.empty()) { |
2565 | executeInWaterfallLoop(MI, MRI, { 3 }); |
2566 | |
2567 | // Re-insert the constant offset add inside the waterfall loop. |
2568 | if (ShouldMoveIndexIntoLoop) { |
2569 | MachineIRBuilder B(MI); |
2570 | reinsertVectorIndexAdd(B, MI, 3, ConstOffset); |
2571 | } |
2572 | |
2573 | return; |
2574 | } |
2575 | |
2576 | |
2577 | assert(InsTy.getSizeInBits() == 64)((InsTy.getSizeInBits() == 64) ? static_cast<void> (0) : __assert_fail ("InsTy.getSizeInBits() == 64", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2577, __PRETTY_FUNCTION__)); |
2578 | |
2579 | const LLT S32 = LLT::scalar(32); |
2580 | LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); |
2581 | |
2582 | MachineIRBuilder B(MI); |
2583 | auto CastSrc = B.buildBitcast(Vec32, SrcReg); |
2584 | auto One = B.buildConstant(S32, 1); |
2585 | |
2586 | // Split the vector index into 32-bit pieces. Prepare to move all of the |
2587 | // new instructions into a waterfall loop if necessary. |
2588 | // |
2589 | // Don't put the bitcast or constant in the loop. |
2590 | MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); |
2591 | |
2592 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). |
2593 | auto IdxLo = B.buildShl(S32, BaseIdxReg, One); |
2594 | auto IdxHi = B.buildAdd(S32, IdxLo, One); |
2595 | |
2596 | auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); |
2597 | auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); |
2598 | |
2599 | const RegisterBank *DstBank = |
2600 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; |
2601 | const RegisterBank *SrcBank = |
2602 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; |
2603 | const RegisterBank *InsSrcBank = |
2604 | OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; |
2605 | |
2606 | MRI.setRegBank(InsReg, *InsSrcBank); |
2607 | MRI.setRegBank(CastSrc.getReg(0), *SrcBank); |
2608 | MRI.setRegBank(InsLo.getReg(0), *DstBank); |
2609 | MRI.setRegBank(InsHi.getReg(0), *DstBank); |
2610 | MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); |
2611 | MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); |
2612 | MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); |
2613 | |
2614 | |
2615 | SmallSet<Register, 4> OpsToWaterfall; |
2616 | if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { |
2617 | B.setInsertPt(B.getMBB(), MI); |
2618 | B.buildBitcast(DstReg, InsHi); |
2619 | MI.eraseFromParent(); |
2620 | return; |
2621 | } |
2622 | |
2623 | B.setInstr(*Span.begin()); |
2624 | MI.eraseFromParent(); |
2625 | |
2626 | // Figure out the point after the waterfall loop before mangling the control |
2627 | // flow. |
2628 | executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), |
2629 | OpsToWaterfall, MRI); |
2630 | |
2631 | // The insertion point is now right after the original instruction. |
2632 | // |
2633 | // Keep the bitcast to the original vector type out of the loop. Doing this |
2634 | // saved an extra phi we don't need inside the loop. |
2635 | B.buildBitcast(DstReg, InsHi); |
2636 | |
2637 | // Re-insert the constant offset add inside the waterfall loop. |
2638 | if (ShouldMoveIndexIntoLoop) |
2639 | reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); |
2640 | |
2641 | return; |
2642 | } |
2643 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: |
2644 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
2645 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: |
2646 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
2647 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: |
2648 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: |
2649 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: |
2650 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: |
2651 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: |
2652 | case AMDGPU::G_AMDGPU_BUFFER_STORE: |
2653 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: |
2654 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: |
2655 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: |
2656 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: |
2657 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: |
2658 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { |
2659 | applyDefaultMapping(OpdMapper); |
2660 | executeInWaterfallLoop(MI, MRI, {1, 4}); |
2661 | return; |
2662 | } |
2663 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: |
2664 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: |
2665 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: |
2666 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: |
2667 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: |
2668 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: |
2669 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: |
2670 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: |
2671 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: |
2672 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: |
2673 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: |
2674 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { |
2675 | applyDefaultMapping(OpdMapper); |
2676 | executeInWaterfallLoop(MI, MRI, {2, 5}); |
2677 | return; |
2678 | } |
2679 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { |
2680 | applyDefaultMapping(OpdMapper); |
2681 | executeInWaterfallLoop(MI, MRI, {3, 6}); |
2682 | return; |
2683 | } |
2684 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { |
2685 | applyMappingSBufferLoad(OpdMapper); |
2686 | return; |
2687 | } |
2688 | case AMDGPU::G_INTRINSIC: { |
2689 | switch (MI.getIntrinsicID()) { |
2690 | case Intrinsic::amdgcn_readlane: { |
2691 | substituteSimpleCopyRegs(OpdMapper, 2); |
2692 | |
2693 | assert(OpdMapper.getVRegs(0).empty())((OpdMapper.getVRegs(0).empty()) ? static_cast<void> (0 ) : __assert_fail ("OpdMapper.getVRegs(0).empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2693, __PRETTY_FUNCTION__)); |
2694 | assert(OpdMapper.getVRegs(3).empty())((OpdMapper.getVRegs(3).empty()) ? static_cast<void> (0 ) : __assert_fail ("OpdMapper.getVRegs(3).empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2694, __PRETTY_FUNCTION__)); |
2695 | |
2696 | // Make sure the index is an SGPR. It doesn't make sense to run this in a |
2697 | // waterfall loop, so assume it's a uniform value. |
2698 | constrainOpWithReadfirstlane(MI, MRI, 3); // Index |
2699 | return; |
2700 | } |
2701 | case Intrinsic::amdgcn_writelane: { |
2702 | assert(OpdMapper.getVRegs(0).empty())((OpdMapper.getVRegs(0).empty()) ? static_cast<void> (0 ) : __assert_fail ("OpdMapper.getVRegs(0).empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2702, __PRETTY_FUNCTION__)); |
2703 | assert(OpdMapper.getVRegs(2).empty())((OpdMapper.getVRegs(2).empty()) ? static_cast<void> (0 ) : __assert_fail ("OpdMapper.getVRegs(2).empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2703, __PRETTY_FUNCTION__)); |
2704 | assert(OpdMapper.getVRegs(3).empty())((OpdMapper.getVRegs(3).empty()) ? static_cast<void> (0 ) : __assert_fail ("OpdMapper.getVRegs(3).empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2704, __PRETTY_FUNCTION__)); |
2705 | |
2706 | substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val |
2707 | constrainOpWithReadfirstlane(MI, MRI, 2); // Source value |
2708 | constrainOpWithReadfirstlane(MI, MRI, 3); // Index |
2709 | return; |
2710 | } |
2711 | case Intrinsic::amdgcn_interp_p1: |
2712 | case Intrinsic::amdgcn_interp_p2: |
2713 | case Intrinsic::amdgcn_interp_mov: |
2714 | case Intrinsic::amdgcn_interp_p1_f16: |
2715 | case Intrinsic::amdgcn_interp_p2_f16: { |
2716 | applyDefaultMapping(OpdMapper); |
2717 | |
2718 | // Readlane for m0 value, which is always the last operand. |
2719 | // FIXME: Should this be a waterfall loop instead? |
2720 | constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index |
2721 | return; |
2722 | } |
2723 | case Intrinsic::amdgcn_permlane16: |
2724 | case Intrinsic::amdgcn_permlanex16: { |
2725 | // Doing a waterfall loop over these wouldn't make any sense. |
2726 | substituteSimpleCopyRegs(OpdMapper, 2); |
2727 | substituteSimpleCopyRegs(OpdMapper, 3); |
2728 | constrainOpWithReadfirstlane(MI, MRI, 4); |
2729 | constrainOpWithReadfirstlane(MI, MRI, 5); |
2730 | return; |
2731 | } |
2732 | case Intrinsic::amdgcn_sbfe: |
2733 | applyMappingBFEIntrinsic(OpdMapper, true); |
2734 | return; |
2735 | case Intrinsic::amdgcn_ubfe: |
2736 | applyMappingBFEIntrinsic(OpdMapper, false); |
2737 | return; |
2738 | } |
2739 | break; |
2740 | } |
2741 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { |
2742 | auto IntrID = MI.getIntrinsicID(); |
2743 | switch (IntrID) { |
2744 | case Intrinsic::amdgcn_ds_ordered_add: |
2745 | case Intrinsic::amdgcn_ds_ordered_swap: { |
2746 | // This is only allowed to execute with 1 lane, so readfirstlane is safe. |
2747 | assert(OpdMapper.getVRegs(0).empty())((OpdMapper.getVRegs(0).empty()) ? static_cast<void> (0 ) : __assert_fail ("OpdMapper.getVRegs(0).empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2747, __PRETTY_FUNCTION__)); |
2748 | substituteSimpleCopyRegs(OpdMapper, 3); |
2749 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 |
2750 | return; |
2751 | } |
2752 | case Intrinsic::amdgcn_ds_gws_init: |
2753 | case Intrinsic::amdgcn_ds_gws_barrier: |
2754 | case Intrinsic::amdgcn_ds_gws_sema_br: { |
2755 | // Only the first lane is executes, so readfirstlane is safe. |
2756 | substituteSimpleCopyRegs(OpdMapper, 1); |
2757 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 |
2758 | return; |
2759 | } |
2760 | case Intrinsic::amdgcn_ds_gws_sema_v: |
2761 | case Intrinsic::amdgcn_ds_gws_sema_p: |
2762 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
2763 | // Only the first lane is executes, so readfirstlane is safe. |
2764 | constrainOpWithReadfirstlane(MI, MRI, 1); // M0 |
2765 | return; |
2766 | } |
2767 | case Intrinsic::amdgcn_ds_append: |
2768 | case Intrinsic::amdgcn_ds_consume: { |
2769 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 |
2770 | return; |
2771 | } |
2772 | case Intrinsic::amdgcn_s_sendmsg: |
2773 | case Intrinsic::amdgcn_s_sendmsghalt: { |
2774 | // FIXME: Should this use a waterfall loop? |
2775 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 |
2776 | return; |
2777 | } |
2778 | default: { |
2779 | if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = |
2780 | AMDGPU::lookupRsrcIntrinsic(IntrID)) { |
2781 | // Non-images can have complications from operands that allow both SGPR |
2782 | // and VGPR. For now it's too complicated to figure out the final opcode |
2783 | // to derive the register bank from the MCInstrDesc. |
2784 | if (RSrcIntrin->IsImage) { |
2785 | applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); |
2786 | return; |
2787 | } |
2788 | } |
2789 | |
2790 | break; |
2791 | } |
2792 | } |
2793 | break; |
2794 | } |
2795 | case AMDGPU::G_LOAD: |
2796 | case AMDGPU::G_ZEXTLOAD: |
2797 | case AMDGPU::G_SEXTLOAD: { |
2798 | if (applyMappingWideLoad(MI, OpdMapper, MRI)) |
2799 | return; |
2800 | break; |
2801 | } |
2802 | default: |
2803 | break; |
2804 | } |
2805 | |
2806 | return applyDefaultMapping(OpdMapper); |
2807 | } |
2808 | |
2809 | bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { |
2810 | const MachineFunction &MF = *MI.getParent()->getParent(); |
2811 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2812 | for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { |
2813 | if (!MI.getOperand(i).isReg()) |
2814 | continue; |
2815 | Register Reg = MI.getOperand(i).getReg(); |
2816 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { |
2817 | if (Bank->getID() != AMDGPU::SGPRRegBankID) |
2818 | return false; |
2819 | } |
2820 | } |
2821 | return true; |
2822 | } |
2823 | |
2824 | const RegisterBankInfo::InstructionMapping & |
2825 | AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { |
2826 | const MachineFunction &MF = *MI.getParent()->getParent(); |
2827 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2828 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
2829 | |
2830 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
2831 | const MachineOperand &SrcOp = MI.getOperand(i); |
2832 | if (!SrcOp.isReg()) |
2833 | continue; |
2834 | |
2835 | unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); |
2836 | OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
2837 | } |
2838 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), |
2839 | MI.getNumOperands()); |
2840 | } |
2841 | |
2842 | const RegisterBankInfo::InstructionMapping & |
2843 | AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { |
2844 | const MachineFunction &MF = *MI.getParent()->getParent(); |
2845 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2846 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
2847 | |
2848 | // Even though we technically could use SGPRs, this would require knowledge of |
2849 | // the constant bus restriction. Force all sources to VGPR (except for VCC). |
2850 | // |
2851 | // TODO: Unary ops are trivially OK, so accept SGPRs? |
2852 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
2853 | const MachineOperand &Src = MI.getOperand(i); |
2854 | if (!Src.isReg()) |
2855 | continue; |
2856 | |
2857 | unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); |
2858 | unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; |
2859 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); |
2860 | } |
2861 | |
2862 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), |
2863 | MI.getNumOperands()); |
2864 | } |
2865 | |
2866 | const RegisterBankInfo::InstructionMapping & |
2867 | AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { |
2868 | const MachineFunction &MF = *MI.getParent()->getParent(); |
2869 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2870 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
2871 | |
2872 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { |
2873 | const MachineOperand &Op = MI.getOperand(I); |
2874 | if (!Op.isReg()) |
2875 | continue; |
2876 | |
2877 | unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); |
2878 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
2879 | } |
2880 | |
2881 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), |
2882 | MI.getNumOperands()); |
2883 | } |
2884 | |
2885 | const RegisterBankInfo::InstructionMapping & |
2886 | AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, |
2887 | const MachineInstr &MI, |
2888 | int RsrcIdx) const { |
2889 | // The reported argument index is relative to the IR intrinsic call arguments, |
2890 | // so we need to shift by the number of defs and the intrinsic ID. |
2891 | RsrcIdx += MI.getNumExplicitDefs() + 1; |
2892 | |
2893 | const int NumOps = MI.getNumOperands(); |
2894 | SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); |
2895 | |
2896 | // TODO: Should packed/unpacked D16 difference be reported here as part of |
2897 | // the value mapping? |
2898 | for (int I = 0; I != NumOps; ++I) { |
2899 | if (!MI.getOperand(I).isReg()) |
2900 | continue; |
2901 | |
2902 | Register OpReg = MI.getOperand(I).getReg(); |
2903 | unsigned Size = getSizeInBits(OpReg, MRI, *TRI); |
2904 | |
2905 | // FIXME: Probably need a new intrinsic register bank searchable table to |
2906 | // handle arbitrary intrinsics easily. |
2907 | // |
2908 | // If this has a sampler, it immediately follows rsrc. |
2909 | const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; |
2910 | |
2911 | if (MustBeSGPR) { |
2912 | // If this must be an SGPR, so we must report whatever it is as legal. |
2913 | unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID); |
2914 | OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); |
2915 | } else { |
2916 | // Some operands must be VGPR, and these are easy to copy to. |
2917 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
2918 | } |
2919 | } |
2920 | |
2921 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); |
2922 | } |
2923 | |
2924 | /// Return the mapping for a pointer arugment. |
2925 | const RegisterBankInfo::ValueMapping * |
2926 | AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, |
2927 | Register PtrReg) const { |
2928 | LLT PtrTy = MRI.getType(PtrReg); |
2929 | unsigned Size = PtrTy.getSizeInBits(); |
2930 | if (Subtarget.useFlatForGlobal() || |
2931 | !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) |
2932 | return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
2933 | |
2934 | // If we're using MUBUF instructions for global memory, an SGPR base register |
2935 | // is possible. Otherwise this needs to be a VGPR. |
2936 | const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); |
2937 | return AMDGPU::getValueMapping(PtrBank->getID(), Size); |
2938 | } |
2939 | |
2940 | const RegisterBankInfo::InstructionMapping & |
2941 | AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { |
2942 | |
2943 | const MachineFunction &MF = *MI.getParent()->getParent(); |
2944 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
2945 | SmallVector<const ValueMapping*, 2> OpdsMapping(2); |
2946 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
2947 | LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); |
2948 | Register PtrReg = MI.getOperand(1).getReg(); |
2949 | LLT PtrTy = MRI.getType(PtrReg); |
2950 | unsigned AS = PtrTy.getAddressSpace(); |
2951 | unsigned PtrSize = PtrTy.getSizeInBits(); |
2952 | |
2953 | const ValueMapping *ValMapping; |
2954 | const ValueMapping *PtrMapping; |
2955 | |
2956 | const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); |
2957 | |
2958 | if (PtrBank == &AMDGPU::SGPRRegBank && |
2959 | SITargetLowering::isFlatGlobalAddrSpace(AS)) { |
2960 | if (isScalarLoadLegal(MI)) { |
2961 | // We have a uniform instruction so we want to use an SMRD load |
2962 | ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
2963 | PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); |
2964 | } else { |
2965 | ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
Value stored to 'ValMapping' is never read | |
2966 | |
2967 | // If we're using MUBUF instructions for global memory, an SGPR base |
2968 | // register is possible. Otherwise this needs to be a VGPR. |
2969 | unsigned PtrBankID = Subtarget.useFlatForGlobal() ? |
2970 | AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; |
2971 | |
2972 | PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); |
2973 | ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, |
2974 | LoadTy); |
2975 | } |
2976 | } else { |
2977 | ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); |
2978 | PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); |
2979 | } |
2980 | |
2981 | OpdsMapping[0] = ValMapping; |
2982 | OpdsMapping[1] = PtrMapping; |
2983 | const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( |
2984 | 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); |
2985 | return Mapping; |
2986 | |
2987 | // FIXME: Do we want to add a mapping for FLAT load, or should we just |
2988 | // handle that during instruction selection? |
2989 | } |
2990 | |
2991 | unsigned |
2992 | AMDGPURegisterBankInfo::getRegBankID(Register Reg, |
2993 | const MachineRegisterInfo &MRI, |
2994 | const TargetRegisterInfo &TRI, |
2995 | unsigned Default) const { |
2996 | const RegisterBank *Bank = getRegBank(Reg, MRI, TRI); |
2997 | return Bank ? Bank->getID() : Default; |
2998 | } |
2999 | |
3000 | |
3001 | static unsigned regBankUnion(unsigned RB0, unsigned RB1) { |
3002 | return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ? |
3003 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
3004 | } |
3005 | |
3006 | static int regBankBoolUnion(int RB0, int RB1) { |
3007 | if (RB0 == -1) |
3008 | return RB1; |
3009 | if (RB1 == -1) |
3010 | return RB0; |
3011 | |
3012 | // vcc, vcc -> vcc |
3013 | // vcc, sgpr -> vcc |
3014 | // vcc, vgpr -> vcc |
3015 | if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) |
3016 | return AMDGPU::VCCRegBankID; |
3017 | |
3018 | // vcc, vgpr -> vgpr |
3019 | return regBankUnion(RB0, RB1); |
3020 | } |
3021 | |
3022 | const RegisterBankInfo::ValueMapping * |
3023 | AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, |
3024 | const MachineRegisterInfo &MRI, |
3025 | const TargetRegisterInfo &TRI) const { |
3026 | // Lie and claim anything is legal, even though this needs to be an SGPR |
3027 | // applyMapping will have to deal with it as a waterfall loop. |
3028 | unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID); |
3029 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3030 | return AMDGPU::getValueMapping(Bank, Size); |
3031 | } |
3032 | |
3033 | const RegisterBankInfo::ValueMapping * |
3034 | AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, |
3035 | const MachineRegisterInfo &MRI, |
3036 | const TargetRegisterInfo &TRI) const { |
3037 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3038 | return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3039 | } |
3040 | |
3041 | const RegisterBankInfo::ValueMapping * |
3042 | AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, |
3043 | const MachineRegisterInfo &MRI, |
3044 | const TargetRegisterInfo &TRI) const { |
3045 | unsigned Size = getSizeInBits(Reg, MRI, TRI); |
3046 | return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); |
3047 | } |
3048 | |
3049 | /// |
3050 | /// This function must return a legal mapping, because |
3051 | /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called |
3052 | /// in RegBankSelect::Mode::Fast. Any mapping that would cause a |
3053 | /// VGPR to SGPR generated is illegal. |
3054 | /// |
3055 | // Operands that must be SGPRs must accept potentially divergent VGPRs as |
3056 | // legal. These will be dealt with in applyMappingImpl. |
3057 | // |
3058 | const RegisterBankInfo::InstructionMapping & |
3059 | AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { |
3060 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3061 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3062 | |
3063 | if (MI.isRegSequence()) { |
3064 | // If any input is a VGPR, the result must be a VGPR. The default handling |
3065 | // assumes any copy between banks is legal. |
3066 | unsigned BankID = AMDGPU::SGPRRegBankID; |
3067 | |
3068 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
3069 | auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI); |
3070 | // It doesn't make sense to use vcc or scc banks here, so just ignore |
3071 | // them. |
3072 | if (OpBank != AMDGPU::SGPRRegBankID) { |
3073 | BankID = AMDGPU::VGPRRegBankID; |
3074 | break; |
3075 | } |
3076 | } |
3077 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
3078 | |
3079 | const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); |
3080 | return getInstructionMapping( |
3081 | 1, /*Cost*/ 1, |
3082 | /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); |
3083 | } |
3084 | |
3085 | // The default handling is broken and doesn't handle illegal SGPR->VGPR copies |
3086 | // properly. |
3087 | // |
3088 | // TODO: There are additional exec masking dependencies to analyze. |
3089 | if (MI.getOpcode() == TargetOpcode::G_PHI) { |
3090 | // TODO: Generate proper invalid bank enum. |
3091 | int ResultBank = -1; |
3092 | Register DstReg = MI.getOperand(0).getReg(); |
3093 | |
3094 | // Sometimes the result may have already been assigned a bank. |
3095 | if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) |
3096 | ResultBank = DstBank->getID(); |
3097 | |
3098 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
3099 | Register Reg = MI.getOperand(I).getReg(); |
3100 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); |
3101 | |
3102 | // FIXME: Assuming VGPR for any undetermined inputs. |
3103 | if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { |
3104 | ResultBank = AMDGPU::VGPRRegBankID; |
3105 | break; |
3106 | } |
3107 | |
3108 | // FIXME: Need to promote SGPR case to s32 |
3109 | unsigned OpBank = Bank->getID(); |
3110 | ResultBank = regBankBoolUnion(ResultBank, OpBank); |
3111 | } |
3112 | |
3113 | assert(ResultBank != -1)((ResultBank != -1) ? static_cast<void> (0) : __assert_fail ("ResultBank != -1", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3113, __PRETTY_FUNCTION__)); |
3114 | |
3115 | unsigned Size = MRI.getType(DstReg).getSizeInBits(); |
3116 | |
3117 | const ValueMapping &ValMap = |
3118 | getValueMapping(0, Size, getRegBank(ResultBank)); |
3119 | return getInstructionMapping( |
3120 | 1, /*Cost*/ 1, |
3121 | /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); |
3122 | } |
3123 | |
3124 | const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); |
3125 | if (Mapping.isValid()) |
3126 | return Mapping; |
3127 | |
3128 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); |
3129 | |
3130 | switch (MI.getOpcode()) { |
3131 | default: |
3132 | return getInvalidInstructionMapping(); |
3133 | |
3134 | case AMDGPU::G_AND: |
3135 | case AMDGPU::G_OR: |
3136 | case AMDGPU::G_XOR: { |
3137 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3138 | if (Size == 1) { |
3139 | const RegisterBank *DstBank |
3140 | = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); |
3141 | |
3142 | unsigned TargetBankID = -1; |
3143 | unsigned BankLHS = -1; |
3144 | unsigned BankRHS = -1; |
3145 | if (DstBank) { |
3146 | TargetBankID = DstBank->getID(); |
3147 | if (DstBank == &AMDGPU::VCCRegBank) { |
3148 | TargetBankID = AMDGPU::VCCRegBankID; |
3149 | BankLHS = AMDGPU::VCCRegBankID; |
3150 | BankRHS = AMDGPU::VCCRegBankID; |
3151 | } else { |
3152 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, |
3153 | AMDGPU::SGPRRegBankID); |
3154 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, |
3155 | AMDGPU::SGPRRegBankID); |
3156 | } |
3157 | } else { |
3158 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, |
3159 | AMDGPU::VCCRegBankID); |
3160 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, |
3161 | AMDGPU::VCCRegBankID); |
3162 | |
3163 | // Both inputs should be true booleans to produce a boolean result. |
3164 | if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { |
3165 | TargetBankID = AMDGPU::VGPRRegBankID; |
3166 | } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { |
3167 | TargetBankID = AMDGPU::VCCRegBankID; |
3168 | BankLHS = AMDGPU::VCCRegBankID; |
3169 | BankRHS = AMDGPU::VCCRegBankID; |
3170 | } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { |
3171 | TargetBankID = AMDGPU::SGPRRegBankID; |
3172 | } |
3173 | } |
3174 | |
3175 | OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); |
3176 | OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); |
3177 | OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); |
3178 | break; |
3179 | } |
3180 | |
3181 | if (Size == 64) { |
3182 | |
3183 | if (isSALUMapping(MI)) { |
3184 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); |
3185 | OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; |
3186 | } else { |
3187 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); |
3188 | unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/); |
3189 | OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); |
3190 | |
3191 | unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/); |
3192 | OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); |
3193 | } |
3194 | |
3195 | break; |
3196 | } |
3197 | |
3198 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; |
3199 | } |
3200 | case AMDGPU::G_PTR_ADD: |
3201 | case AMDGPU::G_ADD: |
3202 | case AMDGPU::G_SUB: |
3203 | case AMDGPU::G_MUL: |
3204 | case AMDGPU::G_SHL: |
3205 | case AMDGPU::G_LSHR: |
3206 | case AMDGPU::G_ASHR: |
3207 | case AMDGPU::G_UADDO: |
3208 | case AMDGPU::G_USUBO: |
3209 | case AMDGPU::G_UADDE: |
3210 | case AMDGPU::G_SADDE: |
3211 | case AMDGPU::G_USUBE: |
3212 | case AMDGPU::G_SSUBE: |
3213 | case AMDGPU::G_SMIN: |
3214 | case AMDGPU::G_SMAX: |
3215 | case AMDGPU::G_UMIN: |
3216 | case AMDGPU::G_UMAX: |
3217 | case AMDGPU::G_SHUFFLE_VECTOR: |
3218 | if (isSALUMapping(MI)) |
3219 | return getDefaultMappingSOP(MI); |
3220 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; |
3221 | |
3222 | case AMDGPU::G_FADD: |
3223 | case AMDGPU::G_FSUB: |
3224 | case AMDGPU::G_FPTOSI: |
3225 | case AMDGPU::G_FPTOUI: |
3226 | case AMDGPU::G_FMUL: |
3227 | case AMDGPU::G_FMA: |
3228 | case AMDGPU::G_FMAD: |
3229 | case AMDGPU::G_FSQRT: |
3230 | case AMDGPU::G_FFLOOR: |
3231 | case AMDGPU::G_FCEIL: |
3232 | case AMDGPU::G_FRINT: |
3233 | case AMDGPU::G_SITOFP: |
3234 | case AMDGPU::G_UITOFP: |
3235 | case AMDGPU::G_FPTRUNC: |
3236 | case AMDGPU::G_FPEXT: |
3237 | case AMDGPU::G_FEXP2: |
3238 | case AMDGPU::G_FLOG2: |
3239 | case AMDGPU::G_FMINNUM: |
3240 | case AMDGPU::G_FMAXNUM: |
3241 | case AMDGPU::G_FMINNUM_IEEE: |
3242 | case AMDGPU::G_FMAXNUM_IEEE: |
3243 | case AMDGPU::G_FCANONICALIZE: |
3244 | case AMDGPU::G_INTRINSIC_TRUNC: |
3245 | case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? |
3246 | case AMDGPU::G_AMDGPU_FFBH_U32: |
3247 | case AMDGPU::G_AMDGPU_FMIN_LEGACY: |
3248 | case AMDGPU::G_AMDGPU_FMAX_LEGACY: |
3249 | case AMDGPU::G_AMDGPU_RCP_IFLAG: |
3250 | return getDefaultMappingVOP(MI); |
3251 | case AMDGPU::G_UMULH: |
3252 | case AMDGPU::G_SMULH: { |
3253 | if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) |
3254 | return getDefaultMappingSOP(MI); |
3255 | return getDefaultMappingVOP(MI); |
3256 | } |
3257 | case AMDGPU::G_IMPLICIT_DEF: { |
3258 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3259 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
3260 | break; |
3261 | } |
3262 | case AMDGPU::G_FCONSTANT: |
3263 | case AMDGPU::G_CONSTANT: |
3264 | case AMDGPU::G_GLOBAL_VALUE: |
3265 | case AMDGPU::G_BLOCK_ADDR: |
3266 | case AMDGPU::G_READCYCLECOUNTER: { |
3267 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3268 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); |
3269 | break; |
3270 | } |
3271 | case AMDGPU::G_FRAME_INDEX: { |
3272 | // TODO: This should be the same as other constants, but eliminateFrameIndex |
3273 | // currently assumes VALU uses. |
3274 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3275 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3276 | break; |
3277 | } |
3278 | case AMDGPU::G_INSERT: { |
3279 | unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : |
3280 | AMDGPU::VGPRRegBankID; |
3281 | unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
3282 | unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); |
3283 | unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); |
3284 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); |
3285 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); |
3286 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); |
3287 | OpdsMapping[3] = nullptr; |
3288 | break; |
3289 | } |
3290 | case AMDGPU::G_EXTRACT: { |
3291 | unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); |
3292 | unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); |
3293 | unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); |
3294 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); |
3295 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); |
3296 | OpdsMapping[2] = nullptr; |
3297 | break; |
3298 | } |
3299 | case AMDGPU::G_BUILD_VECTOR: |
3300 | case AMDGPU::G_BUILD_VECTOR_TRUNC: { |
3301 | LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); |
3302 | if (DstTy == LLT::vector(2, 16)) { |
3303 | unsigned DstSize = DstTy.getSizeInBits(); |
3304 | unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); |
3305 | unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); |
3306 | unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); |
3307 | unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); |
3308 | |
3309 | OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); |
3310 | OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); |
3311 | OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); |
3312 | break; |
3313 | } |
3314 | |
3315 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; |
3316 | } |
3317 | case AMDGPU::G_MERGE_VALUES: |
3318 | case AMDGPU::G_CONCAT_VECTORS: { |
3319 | unsigned Bank = isSALUMapping(MI) ? |
3320 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
3321 | unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3322 | unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); |
3323 | |
3324 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); |
3325 | // Op1 and Dst should use the same register bank. |
3326 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) |
3327 | OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); |
3328 | break; |
3329 | } |
3330 | case AMDGPU::G_BITCAST: |
3331 | case AMDGPU::G_INTTOPTR: |
3332 | case AMDGPU::G_PTRTOINT: |
3333 | case AMDGPU::G_BITREVERSE: |
3334 | case AMDGPU::G_FABS: |
3335 | case AMDGPU::G_FNEG: { |
3336 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3337 | unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); |
3338 | OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
3339 | break; |
3340 | } |
3341 | case AMDGPU::G_CTLZ_ZERO_UNDEF: |
3342 | case AMDGPU::G_CTTZ_ZERO_UNDEF: |
3343 | case AMDGPU::G_CTPOP: { |
3344 | unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); |
3345 | unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); |
3346 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); |
3347 | |
3348 | // This should really be getValueMappingSGPR64Only, but allowing the generic |
3349 | // code to handle the register split just makes using LegalizerHelper more |
3350 | // difficult. |
3351 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); |
3352 | break; |
3353 | } |
3354 | case AMDGPU::G_TRUNC: { |
3355 | Register Dst = MI.getOperand(0).getReg(); |
3356 | Register Src = MI.getOperand(1).getReg(); |
3357 | unsigned Bank = getRegBankID(Src, MRI, *TRI); |
3358 | unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); |
3359 | unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); |
3360 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); |
3361 | OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); |
3362 | break; |
3363 | } |
3364 | case AMDGPU::G_ZEXT: |
3365 | case AMDGPU::G_SEXT: |
3366 | case AMDGPU::G_ANYEXT: |
3367 | case AMDGPU::G_SEXT_INREG: { |
3368 | Register Dst = MI.getOperand(0).getReg(); |
3369 | Register Src = MI.getOperand(1).getReg(); |
3370 | unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); |
3371 | unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); |
3372 | |
3373 | unsigned DstBank; |
3374 | const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); |
3375 | assert(SrcBank)((SrcBank) ? static_cast<void> (0) : __assert_fail ("SrcBank" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3375, __PRETTY_FUNCTION__)); |
3376 | switch (SrcBank->getID()) { |
3377 | case AMDGPU::SGPRRegBankID: |
3378 | DstBank = AMDGPU::SGPRRegBankID; |
3379 | break; |
3380 | default: |
3381 | DstBank = AMDGPU::VGPRRegBankID; |
3382 | break; |
3383 | } |
3384 | |
3385 | // TODO: Should anyext be split into 32-bit part as well? |
3386 | if (MI.getOpcode() == AMDGPU::G_ANYEXT) { |
3387 | OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); |
3388 | OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize); |
3389 | } else { |
3390 | // Scalar extend can use 64-bit BFE, but VGPRs require extending to |
3391 | // 32-bits, and then to 64. |
3392 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); |
3393 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), |
3394 | SrcSize); |
3395 | } |
3396 | break; |
3397 | } |
3398 | case AMDGPU::G_FCMP: { |
3399 | unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); |
3400 | unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); |
3401 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); |
3402 | OpdsMapping[1] = nullptr; // Predicate Operand. |
3403 | OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); |
3404 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3405 | break; |
3406 | } |
3407 | case AMDGPU::G_STORE: { |
3408 | assert(MI.getOperand(0).isReg())((MI.getOperand(0).isReg()) ? static_cast<void> (0) : __assert_fail ("MI.getOperand(0).isReg()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3408, __PRETTY_FUNCTION__)); |
3409 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3410 | |
3411 | // FIXME: We need to specify a different reg bank once scalar stores are |
3412 | // supported. |
3413 | const ValueMapping *ValMapping = |
3414 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); |
3415 | OpdsMapping[0] = ValMapping; |
3416 | OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); |
3417 | break; |
3418 | } |
  case AMDGPU::G_ICMP: {
    auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();

    // See if the result register has already been constrained to vcc, which may
    // happen due to control flow intrinsic lowering.
    unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);

    // A scalar compare is only usable when everything is already scalar and
    // the compare is 32-bit, or a 64-bit eq/ne on subtargets that have a
    // 64-bit scalar equality compare.
    bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
                     Op2Bank == AMDGPU::SGPRRegBankID &&
                     Op3Bank == AMDGPU::SGPRRegBankID &&
                     (Size == 32 || (Size == 64 &&
                      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
                      Subtarget.hasScalarCompareEq64()));

    DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    // TODO: Use 32-bit for scalar output size.
    // SCC results will need to be copied to a 32-bit SGPR virtual register.
    const unsigned ResultSize = 1;

    OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
    OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
    OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
    break;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // VGPR index can be used for waterfall when indexing a SGPR vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
    // The result bank is the union of the vector and index banks: a VGPR on
    // either side forces a VGPR result.
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
3465 | case AMDGPU::G_INSERT_VECTOR_ELT: { |
3466 | unsigned OutputBankID = isSALUMapping(MI) ? |
3467 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
3468 | |
3469 | unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3470 | unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); |
3471 | unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); |
3472 | unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), |
3473 | MRI, *TRI); |
3474 | unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); |
3475 | |
3476 | OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); |
3477 | OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); |
3478 | |
3479 | // This is a weird case, because we need to break down the mapping based on |
3480 | // the register bank of a different operand. |
3481 | if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { |
3482 | OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, |
3483 | InsertSize); |
3484 | } else { |
3485 | assert(InsertSize == 32 || InsertSize == 64)((InsertSize == 32 || InsertSize == 64) ? static_cast<void > (0) : __assert_fail ("InsertSize == 32 || InsertSize == 64" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3485, __PRETTY_FUNCTION__)); |
3486 | OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); |
3487 | } |
3488 | |
3489 | // The index can be either if the source vector is VGPR. |
3490 | OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); |
3491 | break; |
3492 | } |
  case AMDGPU::G_UNMERGE_VALUES: {
    // All results and the source share one bank, chosen by whether the whole
    // instruction is SALU-mappable.
    unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
      AMDGPU::VGPRRegBankID;

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  // Buffer loads/stores: data and per-lane offsets are VGPR; the resource
  // descriptor and scalar offset are SGPR.
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  // Buffer atomics: same layout as buffer stores, with an extra vdata_out
  // result and shifted operand indices.
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  // Compare-and-swap buffer atomic: like the other buffer atomics but with an
  // extra cmp operand, shifting everything after it by one.
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource of offset is
    // VGPR.
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    // Result is scalar only when both rsrc and offset ended up scalar.
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    // Plain math / conversion / packing intrinsics: mapped with the default
    // VALU (all-VGPR-result) mapping.
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
      return getDefaultMappingVOP(MI);
    // Bitfield extract has a scalar form; use the SALU mapping when all
    // register operands are already SALU-mappable.
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    // Lane-shuffle / DPP / exec-mask manipulation intrinsics: every register
    // operand mapped to VGPR.
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
      return getDefaultMappingAllVGPR(MI);
    // Intrinsics that produce a uniform value: plain SGPR result.
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      // Boolean in / boolean out, both in the VCC bank.
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      // Two results: the scaled value (VGPR) and a condition flag (VCC).
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      // FP class test: VGPR sources, per-lane boolean result in VCC.
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
3732 | case Intrinsic::amdgcn_readlane: { |
3733 | // This must be an SGPR, but accept a VGPR. |
3734 | Register IdxReg = MI.getOperand(3).getReg(); |
3735 | unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); |
3736 | unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID); |
3737 | OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); |
3738 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; |
3739 | } |
    case Intrinsic::amdgcn_readfirstlane: {
      // Uniform (SGPR) result read out of a VGPR source.
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
      // to legalize.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      // Control-flow bookkeeping: scalar in/out plus a VCC condition input.
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Data operands in VGPRs; the two select operands must be SGPR.
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      // The last operand is the M0 input; all other register operands are
      // 32-bit VGPRs.
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, *TRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    // Intrinsics that produce a uniform value: plain SGPR result.
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_fadd:
    case Intrinsic::amdgcn_ds_fmin:
    case Intrinsic::amdgcn_ds_fmax:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // VGPR result and data; operand 2 feeds M0 and defaults to SGPR.
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf:
    case Intrinsic::amdgcn_init_exec: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      // Boolean result in VCC; the wave-mask input/output are SGPR-sized.
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      // vdata(VGPR), rsrc(SGPR), voffset(VGPR), soffset(SGPR).
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      // Like the raw form, plus a VGPR vindex operand.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage)
          return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
      }

      return getInvalidInstructionMapping();
    }
    break;
  }
3980 | case AMDGPU::G_SELECT: { |
3981 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); |
3982 | unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, |
3983 | AMDGPU::SGPRRegBankID); |
3984 | unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI, |
3985 | AMDGPU::SGPRRegBankID); |
3986 | bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && |
3987 | Op3Bank == AMDGPU::SGPRRegBankID; |
3988 | |
3989 | unsigned CondBankDefault = SGPRSrcs ? |
3990 | AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
3991 | unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, |
3992 | CondBankDefault); |
3993 | if (CondBank == AMDGPU::SGPRRegBankID) |
3994 | CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; |
3995 | else if (CondBank == AMDGPU::VGPRRegBankID) |
3996 | CondBank = AMDGPU::VCCRegBankID; |
3997 | |
3998 | unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? |
3999 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; |
4000 | |
4001 | assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID)((CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID ) ? static_cast<void> (0) : __assert_fail ("CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 4001, __PRETTY_FUNCTION__)); |
4002 | |
4003 | // TODO: Should report 32-bit for scalar condition type. |
4004 | if (Size == 64) { |
4005 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); |
4006 | OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); |
4007 | OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); |
4008 | OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); |
4009 | } else { |
4010 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); |
4011 | OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); |
4012 | OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); |
4013 | OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); |
4014 | } |
4015 | |
4016 | break; |
4017 | } |
4018 | |
4019 | case AMDGPU::G_LOAD: |
4020 | case AMDGPU::G_ZEXTLOAD: |
4021 | case AMDGPU::G_SEXTLOAD: |
4022 | return getInstrMappingForLoad(MI); |
4023 | |
4024 | case AMDGPU::G_ATOMICRMW_XCHG: |
4025 | case AMDGPU::G_ATOMICRMW_ADD: |
4026 | case AMDGPU::G_ATOMICRMW_SUB: |
4027 | case AMDGPU::G_ATOMICRMW_AND: |
4028 | case AMDGPU::G_ATOMICRMW_OR: |
4029 | case AMDGPU::G_ATOMICRMW_XOR: |
4030 | case AMDGPU::G_ATOMICRMW_MAX: |
4031 | case AMDGPU::G_ATOMICRMW_MIN: |
4032 | case AMDGPU::G_ATOMICRMW_UMAX: |
4033 | case AMDGPU::G_ATOMICRMW_UMIN: |
4034 | case AMDGPU::G_ATOMICRMW_FADD: |
4035 | case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: |
4036 | case AMDGPU::G_AMDGPU_ATOMIC_INC: |
4037 | case AMDGPU::G_AMDGPU_ATOMIC_DEC: { |
4038 | OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); |
4039 | OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); |
4040 | OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); |
4041 | break; |
4042 | } |
4043 | case AMDGPU::G_ATOMIC_CMPXCHG: { |
4044 | OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); |
4045 | OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); |
4046 | OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); |
4047 | OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); |
4048 | break; |
4049 | } |
4050 | case AMDGPU::G_BRCOND: { |
4051 | unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, |
4052 | AMDGPU::SGPRRegBankID); |
4053 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1)((MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1 ) ? static_cast<void> (0) : __assert_fail ("MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1" , "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 4053, __PRETTY_FUNCTION__)); |
4054 | if (Bank != AMDGPU::SGPRRegBankID) |
4055 | Bank = AMDGPU::VCCRegBankID; |
4056 | |
4057 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); |
4058 | break; |
4059 | } |
4060 | } |
4061 | |
4062 | return getInstructionMapping(/*ID*/1, /*Cost*/1, |
4063 | getOperandsMapping(OpdsMapping), |
4064 | MI.getNumOperands()); |
4065 | } |