File: | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp |
Warning: | line 1135, column 3 Division by zero |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | /// \file | |||
9 | /// This file implements the targeting of the RegisterBankInfo class for | |||
10 | /// AMDGPU. | |||
11 | /// | |||
12 | /// \par | |||
13 | /// | |||
14 | /// AMDGPU has unique register bank constraints that require special high level | |||
15 | /// strategies to deal with. There are two main true physical register banks | |||
16 | /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a | |||
17 | /// sort of pseudo-register bank needed to represent SGPRs used in a vector | |||
18 | /// boolean context. There is also the AGPR bank, which is a special purpose | |||
19 | /// physical register bank present on some subtargets. | |||
20 | /// | |||
21 | /// Copying from VGPR to SGPR is generally illegal, unless the value is known to | |||
22 | /// be uniform. It is generally not valid to legalize operands by inserting | |||
23 | /// copies as on other targets. Operations which require uniform, SGPR operands | |||
24 | /// generally require scalarization by repeatedly executing the instruction, | |||
25 | /// activating each set of lanes using a unique set of input values. This is | |||
26 | /// referred to as a waterfall loop. | |||
27 | /// | |||
28 | /// \par Booleans | |||
29 | /// | |||
30 | /// Booleans (s1 values) requires special consideration. A vector compare result | |||
31 | /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit | |||
32 | /// register. These are represented with the VCC bank. During selection, we need | |||
33 | /// to be able to unambiguously go back from a register class to a register | |||
34 | /// bank. To distinguish whether an SGPR should use the SGPR or VCC register | |||
35 | /// bank, we need to know the use context type. An SGPR s1 value always means a | |||
36 | /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets | |||
37 | /// SCC, which is a 1-bit unaddressable register. This will need to be copied to | |||
38 | /// a 32-bit virtual register. Taken together, this means we need to adjust the | |||
39 | /// type of boolean operations to be regbank legal. All SALU booleans need to be | |||
40 | /// widened to 32-bits, and all VALU booleans need to be s1 values. | |||
41 | /// | |||
42 | /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact | |||
43 | /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc | |||
44 | /// bank. A non-boolean source (such as a truncate from a 1-bit load from | |||
45 | /// memory) will require a copy to the VCC bank which will require clearing the | |||
46 | /// high bits and inserting a compare. | |||
47 | /// | |||
48 | /// \par Constant bus restriction | |||
49 | /// | |||
50 | /// VALU instructions have a limitation known as the constant bus | |||
51 | /// restriction. Most VALU instructions can use SGPR operands, but may read at | |||
52 | /// most 1 SGPR or constant literal value (this to 2 in gfx10 for most | |||
53 | /// instructions). This is one unique SGPR, so the same SGPR may be used for | |||
54 | /// multiple operands. From a register bank perspective, any combination of | |||
55 | /// operands should be legal as an SGPR, but this is contextually dependent on | |||
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
58 | /// | |||
59 | /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_* | |||
60 | /// operation should have its source operands all mapped to VGPRs (except for | |||
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 | /// mapping. Anything beyond the simplest 1:1 instruction selection would be too | |||
63 | /// complicated to solve here. Every optimization pattern or instruction | |||
64 | /// selected to multiple outputs would have to enforce this rule, and there | |||
65 | /// would be additional complexity in tracking this rule for every G_* | |||
66 | /// operation. By forcing all inputs to VGPRs, it also simplifies the task of | |||
67 | /// picking the optimal operand combination from a post-isel optimization pass. | |||
68 | /// | |||
69 | //===----------------------------------------------------------------------===// | |||
70 | ||||
71 | #include "AMDGPURegisterBankInfo.h" | |||
72 | ||||
73 | #include "AMDGPU.h" | |||
74 | #include "AMDGPUGlobalISelUtils.h" | |||
75 | #include "AMDGPUInstrInfo.h" | |||
76 | #include "GCNSubtarget.h" | |||
77 | #include "SIMachineFunctionInfo.h" | |||
78 | #include "SIRegisterInfo.h" | |||
79 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | |||
80 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | |||
81 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | |||
82 | #include "llvm/CodeGen/GlobalISel/RegisterBank.h" | |||
83 | #include "llvm/IR/IntrinsicsAMDGPU.h" | |||
84 | ||||
85 | #define GET_TARGET_REGBANK_IMPL | |||
86 | #include "AMDGPUGenRegisterBank.inc" | |||
87 | ||||
88 | // This file will be TableGen'ed at some point. | |||
89 | #include "AMDGPUGenRegisterBankInfo.def" | |||
90 | ||||
91 | using namespace llvm; | |||
92 | using namespace MIPatternMatch; | |||
93 | ||||
94 | namespace { | |||
95 | ||||
96 | // Observer to apply a register bank to new registers created by LegalizerHelper. | |||
97 | class ApplyRegBankMapping final : public GISelChangeObserver { | |||
98 | private: | |||
99 | const AMDGPURegisterBankInfo &RBI; | |||
100 | MachineRegisterInfo &MRI; | |||
101 | const RegisterBank *NewBank; | |||
102 | SmallVector<MachineInstr *, 4> NewInsts; | |||
103 | ||||
104 | public: | |||
105 | ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_, | |||
106 | MachineRegisterInfo &MRI_, const RegisterBank *RB) | |||
107 | : RBI(RBI_), MRI(MRI_), NewBank(RB) {} | |||
108 | ||||
109 | ~ApplyRegBankMapping() { | |||
110 | for (MachineInstr *MI : NewInsts) | |||
111 | applyBank(*MI); | |||
112 | } | |||
113 | ||||
114 | /// Set any registers that don't have a set register class or bank to SALU. | |||
115 | void applyBank(MachineInstr &MI) { | |||
116 | const unsigned Opc = MI.getOpcode(); | |||
117 | if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT || | |||
118 | Opc == AMDGPU::G_SEXT) { | |||
119 | // LegalizerHelper wants to use the basic legalization artifacts when | |||
120 | // widening etc. We don't handle selection with vcc in artifact sources, | |||
121 | // so we need to use a sslect instead to handle these properly. | |||
122 | Register DstReg = MI.getOperand(0).getReg(); | |||
123 | Register SrcReg = MI.getOperand(1).getReg(); | |||
124 | const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI); | |||
125 | if (SrcBank == &AMDGPU::VCCRegBank) { | |||
126 | const LLT S32 = LLT::scalar(32); | |||
127 | assert(MRI.getType(SrcReg) == LLT::scalar(1))(static_cast <bool> (MRI.getType(SrcReg) == LLT::scalar (1)) ? void (0) : __assert_fail ("MRI.getType(SrcReg) == LLT::scalar(1)" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 127, __extension__ __PRETTY_FUNCTION__)); | |||
128 | assert(MRI.getType(DstReg) == S32)(static_cast <bool> (MRI.getType(DstReg) == S32) ? void (0) : __assert_fail ("MRI.getType(DstReg) == S32", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 128, __extension__ __PRETTY_FUNCTION__)); | |||
129 | assert(NewBank == &AMDGPU::VGPRRegBank)(static_cast <bool> (NewBank == &AMDGPU::VGPRRegBank ) ? void (0) : __assert_fail ("NewBank == &AMDGPU::VGPRRegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 129, __extension__ __PRETTY_FUNCTION__)); | |||
130 | ||||
131 | // Replace the extension with a select, which really uses the boolean | |||
132 | // source. | |||
133 | MachineIRBuilder B(MI); | |||
134 | auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1); | |||
135 | auto False = B.buildConstant(S32, 0); | |||
136 | B.buildSelect(DstReg, SrcReg, True, False); | |||
137 | MRI.setRegBank(True.getReg(0), *NewBank); | |||
138 | MRI.setRegBank(False.getReg(0), *NewBank); | |||
139 | MI.eraseFromParent(); | |||
140 | } | |||
141 | ||||
142 | assert(!MRI.getRegClassOrRegBank(DstReg))(static_cast <bool> (!MRI.getRegClassOrRegBank(DstReg)) ? void (0) : __assert_fail ("!MRI.getRegClassOrRegBank(DstReg)" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 142, __extension__ __PRETTY_FUNCTION__)); | |||
143 | MRI.setRegBank(DstReg, *NewBank); | |||
144 | return; | |||
145 | } | |||
146 | ||||
147 | #ifndef NDEBUG | |||
148 | if (Opc == AMDGPU::G_TRUNC) { | |||
149 | Register DstReg = MI.getOperand(0).getReg(); | |||
150 | const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI); | |||
151 | assert(DstBank != &AMDGPU::VCCRegBank)(static_cast <bool> (DstBank != &AMDGPU::VCCRegBank ) ? void (0) : __assert_fail ("DstBank != &AMDGPU::VCCRegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 151, __extension__ __PRETTY_FUNCTION__)); | |||
152 | } | |||
153 | #endif | |||
154 | ||||
155 | for (MachineOperand &Op : MI.operands()) { | |||
156 | if (!Op.isReg()) | |||
157 | continue; | |||
158 | ||||
159 | // We may see physical registers if building a real MI | |||
160 | Register Reg = Op.getReg(); | |||
161 | if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg)) | |||
162 | continue; | |||
163 | ||||
164 | const RegisterBank *RB = NewBank; | |||
165 | if (MRI.getType(Reg) == LLT::scalar(1)) { | |||
166 | assert(NewBank == &AMDGPU::VGPRRegBank &&(static_cast <bool> (NewBank == &AMDGPU::VGPRRegBank && "s1 operands should only be used for vector bools" ) ? void (0) : __assert_fail ("NewBank == &AMDGPU::VGPRRegBank && \"s1 operands should only be used for vector bools\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 167, __extension__ __PRETTY_FUNCTION__)) | |||
167 | "s1 operands should only be used for vector bools")(static_cast <bool> (NewBank == &AMDGPU::VGPRRegBank && "s1 operands should only be used for vector bools" ) ? void (0) : __assert_fail ("NewBank == &AMDGPU::VGPRRegBank && \"s1 operands should only be used for vector bools\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 167, __extension__ __PRETTY_FUNCTION__)); | |||
168 | assert((MI.getOpcode() != AMDGPU::G_TRUNC &&(static_cast <bool> ((MI.getOpcode() != AMDGPU::G_TRUNC && MI.getOpcode() != AMDGPU::G_ANYEXT) && "not expecting legalization artifacts here" ) ? void (0) : __assert_fail ("(MI.getOpcode() != AMDGPU::G_TRUNC && MI.getOpcode() != AMDGPU::G_ANYEXT) && \"not expecting legalization artifacts here\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 170, __extension__ __PRETTY_FUNCTION__)) | |||
169 | MI.getOpcode() != AMDGPU::G_ANYEXT) &&(static_cast <bool> ((MI.getOpcode() != AMDGPU::G_TRUNC && MI.getOpcode() != AMDGPU::G_ANYEXT) && "not expecting legalization artifacts here" ) ? void (0) : __assert_fail ("(MI.getOpcode() != AMDGPU::G_TRUNC && MI.getOpcode() != AMDGPU::G_ANYEXT) && \"not expecting legalization artifacts here\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 170, __extension__ __PRETTY_FUNCTION__)) | |||
170 | "not expecting legalization artifacts here")(static_cast <bool> ((MI.getOpcode() != AMDGPU::G_TRUNC && MI.getOpcode() != AMDGPU::G_ANYEXT) && "not expecting legalization artifacts here" ) ? void (0) : __assert_fail ("(MI.getOpcode() != AMDGPU::G_TRUNC && MI.getOpcode() != AMDGPU::G_ANYEXT) && \"not expecting legalization artifacts here\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 170, __extension__ __PRETTY_FUNCTION__)); | |||
171 | RB = &AMDGPU::VCCRegBank; | |||
172 | } | |||
173 | ||||
174 | MRI.setRegBank(Reg, *RB); | |||
175 | } | |||
176 | } | |||
177 | ||||
178 | void erasingInstr(MachineInstr &MI) override {} | |||
179 | ||||
180 | void createdInstr(MachineInstr &MI) override { | |||
181 | // At this point, the instruction was just inserted and has no operands. | |||
182 | NewInsts.push_back(&MI); | |||
183 | } | |||
184 | ||||
185 | void changingInstr(MachineInstr &MI) override {} | |||
186 | void changedInstr(MachineInstr &MI) override { | |||
187 | // FIXME: In principle we should probably add the instruction to NewInsts, | |||
188 | // but the way the LegalizerHelper uses the observer, we will always see the | |||
189 | // registers we need to set the regbank on also referenced in a new | |||
190 | // instruction. | |||
191 | } | |||
192 | }; | |||
193 | ||||
194 | } | |||
195 | AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) | |||
196 | : AMDGPUGenRegisterBankInfo(), | |||
197 | Subtarget(ST), | |||
198 | TRI(Subtarget.getRegisterInfo()), | |||
199 | TII(Subtarget.getInstrInfo()) { | |||
200 | ||||
201 | // HACK: Until this is fully tablegen'd. | |||
202 | static llvm::once_flag InitializeRegisterBankFlag; | |||
203 | ||||
204 | static auto InitializeRegisterBankOnce = [this]() { | |||
205 | assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&(static_cast <bool> (&getRegBank(AMDGPU::SGPRRegBankID ) == &AMDGPU::SGPRRegBank && &getRegBank(AMDGPU ::VGPRRegBankID) == &AMDGPU::VGPRRegBank && & getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank ) ? void (0) : __assert_fail ("&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 207, __extension__ __PRETTY_FUNCTION__)) | |||
206 | &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&(static_cast <bool> (&getRegBank(AMDGPU::SGPRRegBankID ) == &AMDGPU::SGPRRegBank && &getRegBank(AMDGPU ::VGPRRegBankID) == &AMDGPU::VGPRRegBank && & getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank ) ? void (0) : __assert_fail ("&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 207, __extension__ __PRETTY_FUNCTION__)) | |||
207 | &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank)(static_cast <bool> (&getRegBank(AMDGPU::SGPRRegBankID ) == &AMDGPU::SGPRRegBank && &getRegBank(AMDGPU ::VGPRRegBankID) == &AMDGPU::VGPRRegBank && & getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank ) ? void (0) : __assert_fail ("&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank && &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank && &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 207, __extension__ __PRETTY_FUNCTION__)); | |||
208 | (void)this; | |||
209 | }; | |||
210 | ||||
211 | llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); | |||
212 | } | |||
213 | ||||
214 | static bool isVectorRegisterBank(const RegisterBank &Bank) { | |||
215 | unsigned BankID = Bank.getID(); | |||
216 | return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; | |||
217 | } | |||
218 | ||||
219 | unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, | |||
220 | const RegisterBank &Src, | |||
221 | unsigned Size) const { | |||
222 | // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? | |||
223 | if (Dst.getID() == AMDGPU::SGPRRegBankID && | |||
224 | (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) { | |||
225 | return std::numeric_limits<unsigned>::max(); | |||
226 | } | |||
227 | ||||
228 | // Bool values are tricky, because the meaning is based on context. The SCC | |||
229 | // and VCC banks are for the natural scalar and vector conditions produced by | |||
230 | // a compare. | |||
231 | // | |||
232 | // Legalization doesn't know about the necessary context, so an s1 use may | |||
233 | // have been a truncate from an arbitrary value, in which case a copy (lowered | |||
234 | // as a compare with 0) needs to be inserted. | |||
235 | if (Size == 1 && | |||
236 | (Dst.getID() == AMDGPU::SGPRRegBankID) && | |||
237 | (isVectorRegisterBank(Src) || | |||
238 | Src.getID() == AMDGPU::SGPRRegBankID || | |||
239 | Src.getID() == AMDGPU::VCCRegBankID)) | |||
240 | return std::numeric_limits<unsigned>::max(); | |||
241 | ||||
242 | // There is no direct copy between AGPRs. | |||
243 | if (Dst.getID() == AMDGPU::AGPRRegBankID && | |||
244 | Src.getID() == AMDGPU::AGPRRegBankID) | |||
245 | return 4; | |||
246 | ||||
247 | return RegisterBankInfo::copyCost(Dst, Src, Size); | |||
248 | } | |||
249 | ||||
250 | unsigned AMDGPURegisterBankInfo::getBreakDownCost( | |||
251 | const ValueMapping &ValMapping, | |||
252 | const RegisterBank *CurBank) const { | |||
253 | // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to | |||
254 | // VGPR. | |||
255 | // FIXME: Is there a better way to do this? | |||
256 | if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) | |||
257 | return 10; // This is expensive. | |||
258 | ||||
259 | assert(ValMapping.NumBreakDowns == 2 &&(static_cast <bool> (ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown [0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank ) ? void (0) : __assert_fail ("ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown[0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 264, __extension__ __PRETTY_FUNCTION__)) | |||
260 | ValMapping.BreakDown[0].Length == 32 &&(static_cast <bool> (ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown [0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank ) ? void (0) : __assert_fail ("ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown[0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 264, __extension__ __PRETTY_FUNCTION__)) | |||
261 | ValMapping.BreakDown[0].StartIdx == 0 &&(static_cast <bool> (ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown [0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank ) ? void (0) : __assert_fail ("ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown[0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 264, __extension__ __PRETTY_FUNCTION__)) | |||
262 | ValMapping.BreakDown[1].Length == 32 &&(static_cast <bool> (ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown [0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank ) ? void (0) : __assert_fail ("ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown[0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 264, __extension__ __PRETTY_FUNCTION__)) | |||
263 | ValMapping.BreakDown[1].StartIdx == 32 &&(static_cast <bool> (ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown [0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank ) ? void (0) : __assert_fail ("ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown[0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 264, __extension__ __PRETTY_FUNCTION__)) | |||
264 | ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank)(static_cast <bool> (ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown [0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank ) ? void (0) : __assert_fail ("ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown[0].StartIdx == 0 && ValMapping.BreakDown[1].Length == 32 && ValMapping.BreakDown[1].StartIdx == 32 && ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 264, __extension__ __PRETTY_FUNCTION__)); | |||
265 | ||||
266 | // 32-bit extract of a 64-bit value is just access of a subregister, so free. | |||
267 | // TODO: Cost of 0 hits assert, though it's not clear it's what we really | |||
268 | // want. | |||
269 | ||||
270 | // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR | |||
271 | // alignment restrictions, but this probably isn't important. | |||
272 | return 1; | |||
273 | } | |||
274 | ||||
275 | const RegisterBank & | |||
276 | AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, | |||
277 | LLT Ty) const { | |||
278 | if (&RC == &AMDGPU::SReg_1RegClass) | |||
279 | return AMDGPU::VCCRegBank; | |||
280 | ||||
281 | // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a | |||
282 | // VCC-like use. | |||
283 | if (TRI->isSGPRClass(&RC)) { | |||
284 | // FIXME: This probably came from a copy from a physical register, which | |||
285 | // should be inferrrable from the copied to-type. We don't have many boolean | |||
286 | // physical register constraints so just assume a normal SGPR for now. | |||
287 | if (!Ty.isValid()) | |||
288 | return AMDGPU::SGPRRegBank; | |||
289 | ||||
290 | return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank; | |||
291 | } | |||
292 | ||||
293 | return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank; | |||
294 | } | |||
295 | ||||
296 | template <unsigned NumOps> | |||
297 | RegisterBankInfo::InstructionMappings | |||
298 | AMDGPURegisterBankInfo::addMappingFromTable( | |||
299 | const MachineInstr &MI, const MachineRegisterInfo &MRI, | |||
300 | const std::array<unsigned, NumOps> RegSrcOpIdx, | |||
301 | ArrayRef<OpRegBankEntry<NumOps>> Table) const { | |||
302 | ||||
303 | InstructionMappings AltMappings; | |||
304 | ||||
305 | SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands()); | |||
306 | ||||
307 | unsigned Sizes[NumOps]; | |||
308 | for (unsigned I = 0; I < NumOps; ++I) { | |||
309 | Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg(); | |||
310 | Sizes[I] = getSizeInBits(Reg, MRI, *TRI); | |||
311 | } | |||
312 | ||||
313 | for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) { | |||
314 | unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); | |||
315 | Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI); | |||
316 | } | |||
317 | ||||
318 | // getInstrMapping's default mapping uses ID 1, so start at 2. | |||
319 | unsigned MappingID = 2; | |||
320 | for (const auto &Entry : Table) { | |||
321 | for (unsigned I = 0; I < NumOps; ++I) { | |||
322 | int OpIdx = RegSrcOpIdx[I]; | |||
323 | Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]); | |||
324 | } | |||
325 | ||||
326 | AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost, | |||
327 | getOperandsMapping(Operands), | |||
328 | Operands.size())); | |||
329 | } | |||
330 | ||||
331 | return AltMappings; | |||
332 | } | |||
333 | ||||
334 | RegisterBankInfo::InstructionMappings | |||
335 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic( | |||
336 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { | |||
337 | switch (MI.getIntrinsicID()) { | |||
338 | case Intrinsic::amdgcn_readlane: { | |||
339 | static const OpRegBankEntry<3> Table[2] = { | |||
340 | // Perfectly legal. | |||
341 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, | |||
342 | ||||
343 | // Need a readfirstlane for the index. | |||
344 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } | |||
345 | }; | |||
346 | ||||
347 | const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; | |||
348 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); | |||
349 | } | |||
350 | case Intrinsic::amdgcn_writelane: { | |||
351 | static const OpRegBankEntry<4> Table[4] = { | |||
352 | // Perfectly legal. | |||
353 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, | |||
354 | ||||
355 | // Need readfirstlane of first op | |||
356 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, | |||
357 | ||||
358 | // Need readfirstlane of second op | |||
359 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }, | |||
360 | ||||
361 | // Need readfirstlane of both ops | |||
362 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } | |||
363 | }; | |||
364 | ||||
365 | // rsrc, voffset, offset | |||
366 | const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } }; | |||
367 | return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); | |||
368 | } | |||
369 | default: | |||
370 | return RegisterBankInfo::getInstrAlternativeMappings(MI); | |||
371 | } | |||
372 | } | |||
373 | ||||
374 | RegisterBankInfo::InstructionMappings | |||
375 | AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( | |||
376 | const MachineInstr &MI, const MachineRegisterInfo &MRI) const { | |||
377 | ||||
378 | switch (MI.getIntrinsicID()) { | |||
379 | case Intrinsic::amdgcn_s_buffer_load: { | |||
380 | static const OpRegBankEntry<2> Table[4] = { | |||
381 | // Perfectly legal. | |||
382 | { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, | |||
383 | ||||
384 | // Only need 1 register in loop | |||
385 | { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 }, | |||
386 | ||||
387 | // Have to waterfall the resource. | |||
388 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 }, | |||
389 | ||||
390 | // Have to waterfall the resource, and the offset. | |||
391 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 } | |||
392 | }; | |||
393 | ||||
394 | // rsrc, offset | |||
395 | const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } }; | |||
396 | return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); | |||
397 | } | |||
398 | case Intrinsic::amdgcn_ds_ordered_add: | |||
399 | case Intrinsic::amdgcn_ds_ordered_swap: { | |||
400 | // VGPR = M0, VGPR | |||
401 | static const OpRegBankEntry<3> Table[2] = { | |||
402 | // Perfectly legal. | |||
403 | { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 }, | |||
404 | ||||
405 | // Need a readfirstlane for m0 | |||
406 | { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 } | |||
407 | }; | |||
408 | ||||
409 | const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } }; | |||
410 | return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); | |||
411 | } | |||
412 | case Intrinsic::amdgcn_s_sendmsg: | |||
413 | case Intrinsic::amdgcn_s_sendmsghalt: { | |||
414 | // FIXME: Should have no register for immediate | |||
415 | static const OpRegBankEntry<1> Table[2] = { | |||
416 | // Perfectly legal. | |||
417 | { { AMDGPU::SGPRRegBankID }, 1 }, | |||
418 | ||||
419 | // Need readlane | |||
420 | { { AMDGPU::VGPRRegBankID }, 3 } | |||
421 | }; | |||
422 | ||||
423 | const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } }; | |||
424 | return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); | |||
425 | } | |||
426 | default: | |||
427 | return RegisterBankInfo::getInstrAlternativeMappings(MI); | |||
428 | } | |||
429 | } | |||
430 | ||||
431 | static bool memOpHasNoClobbered(const MachineMemOperand *MMO) { | |||
432 | const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue()); | |||
433 | return I && I->getMetadata("amdgpu.noclobber"); | |||
434 | } | |||
435 | ||||
436 | // FIXME: Returns uniform if there's no source value information. This is | |||
437 | // probably wrong. | |||
438 | static bool isScalarLoadLegal(const MachineInstr &MI) { | |||
439 | if (!MI.hasOneMemOperand()) | |||
440 | return false; | |||
441 | ||||
442 | const MachineMemOperand *MMO = *MI.memoperands_begin(); | |||
443 | const unsigned AS = MMO->getAddrSpace(); | |||
444 | const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS || | |||
445 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; | |||
446 | // Require 4-byte alignment. | |||
447 | return MMO->getAlign() >= Align(4) && | |||
448 | // Can't do a scalar atomic load. | |||
449 | !MMO->isAtomic() && | |||
450 | // Don't use scalar loads for volatile accesses to non-constant address | |||
451 | // spaces. | |||
452 | (IsConst || !MMO->isVolatile()) && | |||
453 | // Memory must be known constant, or not written before this load. | |||
454 | (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) && | |||
455 | AMDGPUInstrInfo::isUniformMMO(MMO); | |||
456 | } | |||
457 | ||||
/// Enumerate the legal register-bank assignments for \p MI so the generic
/// mapping cost model can choose among them. Each returned mapping pairs a
/// per-operand bank assignment with a relative cost (second field of
/// getInstructionMapping / OpRegBankEntry).
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      // Boolean constants additionally allow the VCC bank; all three options
      // cost the same.
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    // Non-boolean constants share the table below.
    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    // Materializable either as a vector or scalar value, at equal cost.
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      // Alternative: operate on lane masks in the VCC bank.
      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    // Only 64-bit needs alternatives; other sizes use the default mapping.
    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    // VGPR variant is split into two 32-bit halves, hence the higher cost (2).
    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    // A scalar (SMEM) mapping is only offered for address spaces SMEM can
    // reach, and only when the memory access itself qualifies.
    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
        2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
      2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    // Vector variant: condition lives in VCC, values in VGPRs.
    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    // Add/sub with carry: operands 1 and 4 are the 1-bit carry-out/carry-in.
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
644 | ||||
645 | void AMDGPURegisterBankInfo::split64BitValueForMapping( | |||
646 | MachineIRBuilder &B, | |||
647 | SmallVector<Register, 2> &Regs, | |||
648 | LLT HalfTy, | |||
649 | Register Reg) const { | |||
650 | assert(HalfTy.getSizeInBits() == 32)(static_cast <bool> (HalfTy.getSizeInBits() == 32) ? void (0) : __assert_fail ("HalfTy.getSizeInBits() == 32", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 650, __extension__ __PRETTY_FUNCTION__)); | |||
651 | MachineRegisterInfo *MRI = B.getMRI(); | |||
652 | Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); | |||
653 | Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); | |||
654 | const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI); | |||
655 | MRI->setRegBank(LoLHS, *Bank); | |||
656 | MRI->setRegBank(HiLHS, *Bank); | |||
657 | ||||
658 | Regs.push_back(LoLHS); | |||
659 | Regs.push_back(HiLHS); | |||
660 | ||||
661 | B.buildInstr(AMDGPU::G_UNMERGE_VALUES) | |||
662 | .addDef(LoLHS) | |||
663 | .addDef(HiLHS) | |||
664 | .addUse(Reg); | |||
665 | } | |||
666 | ||||
667 | /// Replace the current type each register in \p Regs has with \p NewTy | |||
668 | static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs, | |||
669 | LLT NewTy) { | |||
670 | for (Register Reg : Regs) { | |||
671 | assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits())(static_cast <bool> (MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()) ? void (0) : __assert_fail ("MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits()" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 671, __extension__ __PRETTY_FUNCTION__)); | |||
672 | MRI.setType(Reg, NewTy); | |||
673 | } | |||
674 | } | |||
675 | ||||
676 | static LLT getHalfSizedType(LLT Ty) { | |||
677 | if (Ty.isVector()) { | |||
678 | assert(Ty.getNumElements() % 2 == 0)(static_cast <bool> (Ty.getNumElements() % 2 == 0) ? void (0) : __assert_fail ("Ty.getNumElements() % 2 == 0", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 678, __extension__ __PRETTY_FUNCTION__)); | |||
679 | return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType()); | |||
680 | } | |||
681 | ||||
682 | assert(Ty.getSizeInBits() % 2 == 0)(static_cast <bool> (Ty.getSizeInBits() % 2 == 0) ? void (0) : __assert_fail ("Ty.getSizeInBits() % 2 == 0", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 682, __extension__ __PRETTY_FUNCTION__)); | |||
683 | return LLT::scalar(Ty.getSizeInBits() / 2); | |||
684 | } | |||
685 | ||||
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that rest of the instructions are
/// moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity to try for compare values to identify the
/// unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  SmallVector<Register, 4> ResultRegs;
  SmallVector<Register, 4> InitResultRegs;
  SmallVector<Register, 4> PhiRegs;

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  // Pick the exec-mask register and 32/64-bit scalar opcodes matching the
  // subtarget's wave size.
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg = Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  // For every live def produced inside the loop, prepare an undef initial
  // value and a phi register so the result can be threaded through the loop's
  // back edge.
  for (MachineInstr &MI : Range) {
    for (MachineOperand &Def : MI.defs()) {
      if (MRI.use_nodbg_empty(Def.getReg()))
        continue;

      LLT ResTy = MRI.getType(Def.getReg());
      const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
      ResultRegs.push_back(Def.getReg());
      Register InitReg = B.buildUndef(ResTy).getReg(0);
      Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
      InitResultRegs.push_back(InitReg);
      PhiRegs.push_back(PhiReg);
      MRI.setRegBank(PhiReg, *DefBank);
      MRI.setRegBank(InitReg, *DefBank);
    }
  }

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  // LoopBB either repeats (more lane values to cover) or falls through to
  // restore exec.
  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Exec-mask phi: initial (undef) value from the preheader, updated mask
  // from the loop's back edge.
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
    B.buildInstr(TargetOpcode::G_PHI)
      .addDef(std::get<2>(Result))
      .addReg(std::get<0>(Result)) // Initial value / implicit_def
      .addMBB(&MBB)
      .addReg(std::get<1>(Result)) // Mid-loop value.
      .addMBB(LoopBB);
  }

  const DebugLoc &DL = B.getDL();

  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  // Rewrite every use that must be uniform: readfirstlane each 32-bit piece,
  // and accumulate into CondReg a mask of lanes whose value matches the one
  // just read.
  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in the
      // sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setInstr(*I);
      }

      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(OpReg);

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the just read M0 value to all possible Idx values.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(OpReg);
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, and the conditions.
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
        unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
                                          : AMDGPU::V_CMP_EQ_U32_e64;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        // Insert the unmerge before the loop.

        B.setMBB(MBB);
        auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
        B.setInstr(*I);

        unsigned NumPieces = Unmerge->getNumOperands() - 1;
        for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
          Register UnmergePiece = Unmerge.getReg(PieceIdx);

          Register CurrentLaneOpReg;
          if (Is64) {
            // 64-bit piece: readfirstlane each 32-bit subregister, then merge
            // the halves back together.
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, so use the
              // merged pieces
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, and the conditions.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        } else {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
        }

        MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, save the original EXEC value to VCC.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}
1038 | ||||
1039 | // Return any unique registers used by \p MI at \p OpIndices that need to be | |||
1040 | // handled in a waterfall loop. Returns these registers in \p | |||
1041 | // SGPROperandRegs. Returns true if there are any operands to handle and a | |||
1042 | // waterfall loop is necessary. | |||
1043 | bool AMDGPURegisterBankInfo::collectWaterfallOperands( | |||
1044 | SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI, | |||
1045 | MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const { | |||
1046 | for (unsigned Op : OpIndices) { | |||
1047 | assert(MI.getOperand(Op).isUse())(static_cast <bool> (MI.getOperand(Op).isUse()) ? void ( 0) : __assert_fail ("MI.getOperand(Op).isUse()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1047, __extension__ __PRETTY_FUNCTION__)); | |||
1048 | Register Reg = MI.getOperand(Op).getReg(); | |||
1049 | const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); | |||
1050 | if (OpBank->getID() != AMDGPU::SGPRRegBankID) | |||
1051 | SGPROperandRegs.insert(Reg); | |||
1052 | } | |||
1053 | ||||
1054 | // No operands need to be replaced, so no need to loop. | |||
1055 | return !SGPROperandRegs.empty(); | |||
1056 | } | |||
1057 | ||||
1058 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( | |||
1059 | MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, | |||
1060 | ArrayRef<unsigned> OpIndices) const { | |||
1061 | // Use a set to avoid extra readfirstlanes in the case where multiple operands | |||
1062 | // are the same register. | |||
1063 | SmallSet<Register, 4> SGPROperandRegs; | |||
1064 | ||||
1065 | if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) | |||
1066 | return false; | |||
1067 | ||||
1068 | MachineBasicBlock::iterator I = MI.getIterator(); | |||
1069 | return executeInWaterfallLoop(B, make_range(I, std::next(I)), | |||
1070 | SGPROperandRegs, MRI); | |||
1071 | } | |||
1072 | ||||
1073 | bool AMDGPURegisterBankInfo::executeInWaterfallLoop( | |||
1074 | MachineInstr &MI, MachineRegisterInfo &MRI, | |||
1075 | ArrayRef<unsigned> OpIndices) const { | |||
1076 | MachineIRBuilder B(MI); | |||
1077 | return executeInWaterfallLoop(B, MI, MRI, OpIndices); | |||
1078 | } | |||
1079 | ||||
1080 | // Legalize an operand that must be an SGPR by inserting a readfirstlane. | |||
1081 | void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( | |||
1082 | MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const { | |||
1083 | Register Reg = MI.getOperand(OpIdx).getReg(); | |||
1084 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); | |||
1085 | if (Bank == &AMDGPU::SGPRRegBank) | |||
1086 | return; | |||
1087 | ||||
1088 | LLT Ty = MRI.getType(Reg); | |||
1089 | MachineIRBuilder B(MI); | |||
1090 | ||||
1091 | if (Bank != &AMDGPU::VGPRRegBank) { | |||
1092 | // We need to copy from AGPR to VGPR | |||
1093 | Reg = B.buildCopy(Ty, Reg).getReg(0); | |||
1094 | MRI.setRegBank(Reg, AMDGPU::VGPRRegBank); | |||
1095 | } | |||
1096 | ||||
1097 | Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); | |||
1098 | B.buildInstr(AMDGPU::V_READFIRSTLANE_B32) | |||
1099 | .addDef(SGPR) | |||
1100 | .addReg(Reg); | |||
1101 | ||||
1102 | MRI.setType(SGPR, Ty); | |||
1103 | ||||
1104 | const TargetRegisterClass *Constrained = | |||
1105 | constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI); | |||
1106 | (void)Constrained; | |||
1107 | assert(Constrained && "Failed to constrain readfirstlane src reg")(static_cast <bool> (Constrained && "Failed to constrain readfirstlane src reg" ) ? void (0) : __assert_fail ("Constrained && \"Failed to constrain readfirstlane src reg\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1107, __extension__ __PRETTY_FUNCTION__)); | |||
1108 | ||||
1109 | MI.getOperand(OpIdx).setReg(SGPR); | |||
1110 | } | |||
1111 | ||||
1112 | /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the | |||
1113 | /// rest will be in the remainder. | |||
1114 | static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { | |||
1115 | unsigned TotalSize = Ty.getSizeInBits(); | |||
1116 | if (!Ty.isVector()) | |||
1117 | return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; | |||
1118 | ||||
1119 | LLT EltTy = Ty.getElementType(); | |||
1120 | unsigned EltSize = EltTy.getSizeInBits(); | |||
1121 | assert(FirstSize % EltSize == 0)(static_cast <bool> (FirstSize % EltSize == 0) ? void ( 0) : __assert_fail ("FirstSize % EltSize == 0", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1121, __extension__ __PRETTY_FUNCTION__)); | |||
1122 | ||||
1123 | unsigned FirstPartNumElts = FirstSize / EltSize; | |||
1124 | unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; | |||
1125 | ||||
1126 | return {LLT::scalarOrVector(FirstPartNumElts, EltTy), | |||
1127 | LLT::scalarOrVector(RemainderElts, EltTy)}; | |||
1128 | } | |||
1129 | ||||
1130 | static LLT widen96To128(LLT Ty) { | |||
1131 | if (!Ty.isVector()) | |||
| ||||
1132 | return LLT::scalar(128); | |||
1133 | ||||
1134 | LLT EltTy = Ty.getElementType(); | |||
1135 | assert(128 % EltTy.getSizeInBits() == 0)(static_cast <bool> (128 % EltTy.getSizeInBits() == 0) ? void (0) : __assert_fail ("128 % EltTy.getSizeInBits() == 0" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1135, __extension__ __PRETTY_FUNCTION__)); | |||
| ||||
1136 | return LLT::vector(128 / EltTy.getSizeInBits(), EltTy); | |||
1137 | } | |||
1138 | ||||
/// Apply the register-bank mapping for a load. SGPR-destination loads of 32 or
/// 96 bits get special rewrites (sub-dword widening, 96-bit splitting or
/// widening); VGPR loads wider than 128 bits are split into 128-bit pieces.
/// Returns true if the instruction was rewritten.
bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                        MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  // Largest load the non-scalar (VGPR) path can issue in one instruction.
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
    // scalar loads should have a load size of 32 but memory access size of less
    // than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    // All registers created below belong on the SGPR bank.
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split this
      // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        // Under-aligned: emit a 64-bit and a 32-bit load and stitch the result
        // back together with inserts.
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
        auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);

        auto Undef = B.buildUndef(LoadTy);
        auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
        B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
      } else {
        // 16-byte aligned: over-read as a 128-bit load and extract the low 96.
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        B.buildExtract(MI.getOperand(0), WideLoad, 0);
      }
    }

    // The original load has been fully replaced above.
    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  // Split into MaxNonSmrdLoadSize-sized pieces via the legalizer helpers.
  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}
1243 | ||||
/// Apply the register-bank mapping for G_DYN_STACKALLOC. Only handles a
/// uniform (SGPR) allocation size; the size is scaled by the wavefront size
/// because the stack pointer is a per-wave byte offset. Returns true if the
/// instruction was rewritten.
bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
  MachineInstr &MI,
  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
  MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (SizeBank != &AMDGPU::SGPRRegBank)
    return false;

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register SPReg = Info->getStackPtrOffsetReg();
  // Everything built here is uniform, so place it on the SGPR bank.
  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Scale the per-lane size to a wave-relative size: size << log2(wave size).
  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);

  auto SPCopy = B.buildCopy(PtrTy, SPReg);
  if (Alignment > TFI.getStackAlign()) {
    // Over-aligned allocation: bump the pointer, then mask off the low bits.
    // The mask is also scaled by the wave size since SP is wave-relative.
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd,
                          Log2(Alignment) + ST.getWavefrontSizeLog2());
  } else {
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  }

  MI.eraseFromParent();
  return true;
}
1290 | ||||
1291 | bool AMDGPURegisterBankInfo::applyMappingImage( | |||
1292 | MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, | |||
1293 | MachineRegisterInfo &MRI, int RsrcIdx) const { | |||
1294 | const int NumDefs = MI.getNumExplicitDefs(); | |||
1295 | ||||
1296 | // The reported argument index is relative to the IR intrinsic call arguments, | |||
1297 | // so we need to shift by the number of defs and the intrinsic ID. | |||
1298 | RsrcIdx += NumDefs + 1; | |||
1299 | ||||
1300 | // Insert copies to VGPR arguments. | |||
1301 | applyDefaultMapping(OpdMapper); | |||
1302 | ||||
1303 | // Fixup any SGPR arguments. | |||
1304 | SmallVector<unsigned, 4> SGPRIndexes; | |||
1305 | for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { | |||
1306 | if (!MI.getOperand(I).isReg()) | |||
1307 | continue; | |||
1308 | ||||
1309 | // If this intrinsic has a sampler, it immediately follows rsrc. | |||
1310 | if (I == RsrcIdx || I == RsrcIdx + 1) | |||
1311 | SGPRIndexes.push_back(I); | |||
1312 | } | |||
1313 | ||||
1314 | executeInWaterfallLoop(MI, MRI, SGPRIndexes); | |||
1315 | return true; | |||
1316 | } | |||
1317 | ||||
1318 | static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI, | |||
1319 | Register Reg) { | |||
1320 | MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); | |||
1321 | if (!Def) | |||
1322 | return Reg; | |||
1323 | ||||
1324 | // TODO: Guard against this being an implicit def | |||
1325 | return Def->getOperand(0).getReg(); | |||
1326 | } | |||
1327 | ||||
// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset).
//
// Returns the total constant offset that was folded into soffset/instoffset
// (needed to offset the memory operand), or 0 when no such folding happened.
static unsigned setBufferOffsets(MachineIRBuilder &B,
                                 const AMDGPURegisterBankInfo &RBI,
                                 Register CombinedOffset, Register &VOffsetReg,
                                 Register &SOffsetReg, int64_t &InstOffsetVal,
                                 Align Alignment) {
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo *MRI = B.getMRI();

  // Case 1: the whole offset is a known constant. Split it between the
  // soffset register and the instruction immediate; voffset becomes 0.
  if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
    uint32_t SOffset, ImmOffset;
    if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
                                 Alignment)) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      InstOffsetVal = ImmOffset;

      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      return SOffset + ImmOffset;
    }
  }

  // Case 2: base register plus a foldable constant offset.
  Register Base;
  unsigned Offset;

  std::tie(Base, Offset) =
      AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);

  uint32_t SOffset, ImmOffset;
  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
                                                  &RBI.Subtarget, Alignment)) {
    if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
      // Divergent base goes in voffset; the constant split goes to soffset
      // and the instruction immediate.
      VOffsetReg = Base;
      SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
      B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }

    // If we have SGPR base, we can use it for soffset.
    if (SOffset == 0) {
      VOffsetReg = B.buildConstant(S32, 0).getReg(0);
      B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
      SOffsetReg = Base;
      InstOffsetVal = ImmOffset;
      return 0; // XXX - Why is this 0?
    }
  }

  // Handle the variable sgpr + vgpr case.
  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
  if (Add && (int)Offset >= 0) {
    Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
    Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());

    const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
    const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);

    if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
      VOffsetReg = Src0;
      SOffsetReg = Src1;
      return 0;
    }

    if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
      VOffsetReg = Src1;
      SOffsetReg = Src0;
      return 0;
    }
  }

  // Ensure we have a VGPR for the combined offset. This could be an issue if we
  // have an SGPR offset and a VGPR resource.
  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
    VOffsetReg = CombinedOffset;
  } else {
    VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
    B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
  }

  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
  return 0;
}
1414 | ||||
/// Apply the register-bank mapping for an s.buffer.load. A fully-uniform
/// (SGPR rsrc + SGPR offset) mapping is already legal; otherwise rewrite the
/// scalar load into one or more G_AMDGPU_BUFFER_LOADs, waterfalling the
/// resource descriptor if it is divergent.
bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  const RegisterBank *RSrcBank =
    OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank *OffsetBank =
    OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
  if (RSrcBank == &AMDGPU::SGPRRegBank &&
      OffsetBank == &AMDGPU::SGPRRegBank)
    return true; // Legal mapping

  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
  // here but don't have an MMO.

  // Wide results are emitted as multiple 128-bit buffer loads.
  unsigned LoadSize = Ty.getSizeInBits();
  int NumLoads = 1;
  if (LoadSize == 256 || LoadSize == 512) {
    NumLoads = LoadSize / 128;
    Ty = Ty.divide(NumLoads);
  }

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);

  MachineIRBuilder B(MI);
  MachineFunction &MF = B.getMF();

  Register SOffset;
  Register VOffset;
  int64_t ImmOffset = 0;

  // Split the combined offset into voffset/soffset/imm pieces; MMOOffset is
  // the constant part folded away, used to offset the memory operand.
  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
                                        VOffset, SOffset, ImmOffset, Alignment);

  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
  // can, but we need to track an MMO for that.
  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
  const Align MemAlign(4); // FIXME: ABI type alignment?
  MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    MemSize, MemAlign);
  if (MMOOffset != 0)
    BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);

  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.

  Register RSrc = MI.getOperand(1).getReg();
  Register VIndex = B.buildConstant(S32, 0).getReg(0);
  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);

  SmallVector<Register, 4> LoadParts(NumLoads);

  // Track the span of instructions we insert so the waterfall loop below can
  // wrap exactly the new loads.
  MachineBasicBlock::iterator MII = MI.getIterator();
  MachineInstrSpan Span(MII, &B.getMBB());

  for (int i = 0; i < NumLoads; ++i) {
    if (NumLoads == 1) {
      LoadParts[i] = Dst;
    } else {
      LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
      MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
    }

    MachineMemOperand *MMO = BaseMMO;
    if (i != 0)
      BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);

    B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
      .addDef(LoadParts[i])       // vdata
      .addUse(RSrc)               // rsrc
      .addUse(VIndex)             // vindex
      .addUse(VOffset)            // voffset
      .addUse(SOffset)            // soffset
      .addImm(ImmOffset + 16 * i) // offset(imm)
      .addImm(0)                  // cachepolicy, swizzled buffer(imm)
      .addImm(0)                  // idxen(imm)
      .addMemOperand(MMO);
  }

  // TODO: If only the resource is a VGPR, it may be better to execute the
  // scalar load in the waterfall loop if the resource is expected to frequently
  // be dynamically uniform.
  if (RSrcBank != &AMDGPU::SGPRRegBank) {
    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    SmallSet<Register, 4> OpsToWaterfall;

    OpsToWaterfall.insert(RSrc);
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);
  }

  // Stitch the pieces of a wide load back into the original destination.
  if (NumLoads != 1) {
    if (Ty.isVector())
      B.buildConcatVectors(Dst, LoadParts);
    else
      B.buildMerge(Dst, LoadParts);
  }

  // We removed the instruction earlier with a waterfall loop.
  if (RSrcBank == &AMDGPU::SGPRRegBank)
    MI.eraseFromParent();

  return true;
}
1533 | ||||
/// Apply the register-bank mapping for the BFE (bitfield extract) intrinsics.
/// The VGPR form is only handled for 32-bit results; the SGPR form packs the
/// offset and width operands into the single immediate-style operand that
/// S_BFE_{I,U}{32,64} expect. Returns true if the mapping was applied.
bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
  const OperandsMapper &OpdMapper, bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // TODO: 64-bit version is scalar only, so we need to expand this.
    return false;
  }

  Register SrcReg = MI.getOperand(2).getReg();
  Register OffsetReg = MI.getOperand(3).getReg();
  Register WidthReg = MI.getOperand(4).getReg();

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}
1590 | ||||
1591 | // Return a suitable opcode for extending the operands of Opc when widening. | |||
1592 | static unsigned getExtendOp(unsigned Opc) { | |||
1593 | switch (Opc) { | |||
1594 | case TargetOpcode::G_ASHR: | |||
1595 | case TargetOpcode::G_SMIN: | |||
1596 | case TargetOpcode::G_SMAX: | |||
1597 | return TargetOpcode::G_SEXT; | |||
1598 | case TargetOpcode::G_LSHR: | |||
1599 | case TargetOpcode::G_UMIN: | |||
1600 | case TargetOpcode::G_UMAX: | |||
1601 | return TargetOpcode::G_ZEXT; | |||
1602 | default: | |||
1603 | return TargetOpcode::G_ANYEXT; | |||
1604 | } | |||
1605 | } | |||
1606 | ||||
1607 | // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding | |||
1608 | // any illegal vector extend or unmerge operations. | |||
1609 | static std::pair<Register, Register> | |||
1610 | unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { | |||
1611 | const LLT S32 = LLT::scalar(32); | |||
1612 | auto Bitcast = B.buildBitcast(S32, Src); | |||
1613 | ||||
1614 | if (ExtOpcode == TargetOpcode::G_SEXT) { | |||
1615 | auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); | |||
1616 | auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); | |||
1617 | return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); | |||
1618 | } | |||
1619 | ||||
1620 | auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); | |||
1621 | if (ExtOpcode == TargetOpcode::G_ZEXT) { | |||
1622 | auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); | |||
1623 | return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); | |||
1624 | } | |||
1625 | ||||
1626 | assert(ExtOpcode == TargetOpcode::G_ANYEXT)(static_cast <bool> (ExtOpcode == TargetOpcode::G_ANYEXT ) ? void (0) : __assert_fail ("ExtOpcode == TargetOpcode::G_ANYEXT" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1626, __extension__ __PRETTY_FUNCTION__)); | |||
1627 | return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); | |||
1628 | } | |||
1629 | ||||
1630 | // For cases where only a single copy is inserted for matching register banks. | |||
1631 | // Replace the register in the instruction operand | |||
1632 | static bool substituteSimpleCopyRegs( | |||
1633 | const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { | |||
1634 | SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); | |||
1635 | if (!SrcReg.empty()) { | |||
1636 | assert(SrcReg.size() == 1)(static_cast <bool> (SrcReg.size() == 1) ? void (0) : __assert_fail ("SrcReg.size() == 1", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1636, __extension__ __PRETTY_FUNCTION__)); | |||
1637 | OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); | |||
1638 | return true; | |||
1639 | } | |||
1640 | ||||
1641 | return false; | |||
1642 | } | |||
1643 | ||||
1644 | /// Handle register layout difference for f16 images for some subtargets. | |||
1645 | Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, | |||
1646 | MachineRegisterInfo &MRI, | |||
1647 | Register Reg) const { | |||
1648 | if (!Subtarget.hasUnpackedD16VMem()) | |||
1649 | return Reg; | |||
1650 | ||||
1651 | const LLT S16 = LLT::scalar(16); | |||
1652 | LLT StoreVT = MRI.getType(Reg); | |||
1653 | if (!StoreVT.isVector() || StoreVT.getElementType() != S16) | |||
1654 | return Reg; | |||
1655 | ||||
1656 | auto Unmerge = B.buildUnmerge(S16, Reg); | |||
1657 | ||||
1658 | ||||
1659 | SmallVector<Register, 4> WideRegs; | |||
1660 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | |||
1661 | WideRegs.push_back(Unmerge.getReg(I)); | |||
1662 | ||||
1663 | const LLT S32 = LLT::scalar(32); | |||
1664 | int NumElts = StoreVT.getNumElements(); | |||
1665 | ||||
1666 | return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); | |||
1667 | } | |||
1668 | ||||
1669 | static std::pair<Register, unsigned> | |||
1670 | getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { | |||
1671 | int64_t Const; | |||
1672 | if (mi_match(Reg, MRI, m_ICst(Const))) | |||
1673 | return std::make_pair(Register(), Const); | |||
1674 | ||||
1675 | Register Base; | |||
1676 | if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) | |||
1677 | return std::make_pair(Base, Const); | |||
1678 | ||||
1679 | // TODO: Handle G_OR used for add case | |||
1680 | return std::make_pair(Reg, 0); | |||
1681 | } | |||
1682 | ||||
/// Split a buffer offset into a base voffset register and an immediate that
/// fits the MUBUF 12-bit offset field. Returns {BaseReg, ImmOffset}; BaseReg
/// is always a valid register (a materialized constant if necessary).
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  // Maximum value of the MUBUF instruction's immediate offset field.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
                                                           OrigOffset);

  unsigned C1 = 0;
  if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, put the value
    // and -4096 into the immoffset field so that the value that is copied/added
    // for the voffset field is a multiple of 4096, and it stands more chance
    // of being CSEd with the copy/add for another similar load/store.
    // However, do not do that rounding down to a multiple of 4096 if that is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      // Negative overflow: keep everything in the register part instead.
      Overflow += ImmOffset;
      ImmOffset = 0;
    }

    C1 = ImmOffset;
    if (Overflow != 0) {
      // Fold the overflowed part back into the base register.
      if (!BaseReg)
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      else {
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
      }
    }
  }

  // Callers require a register; materialize a zero if nothing was left.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, C1};
}
1726 | ||||
1727 | static bool isZero(Register Reg, MachineRegisterInfo &MRI) { | |||
1728 | int64_t C; | |||
1729 | return mi_match(Reg, MRI, m_ICst(C)) && C == 0; | |||
1730 | } | |||
1731 | ||||
/// Extract the cache-policy bits (those covered by AMDGPU::CPol::ALL) from
/// the combined cache policy immediate, dropping the swizzle bit.
static unsigned extractCPol(unsigned CachePolicy) {
  return CachePolicy & AMDGPU::CPol::ALL;
}
1735 | ||||
/// Extract the swizzle-enable flag (bit 3) from the combined cache policy
/// immediate, returning 0 or 1.
static unsigned extractSWZ(unsigned CachePolicy) {
  return (CachePolicy & 0x8u) ? 1u : 0u;
}
1739 | ||||
1740 | ||||
/// Select a raw buffer store intrinsic directly to a MUBUF store instruction,
/// waterfalling the rsrc and soffset operands if divergent. Returns the newly
/// built store. Only 32-bit-element data is supported.
MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                             MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  // Operands 2 (rsrc) and 4 (soffset) must be uniform for the hardware
  // instruction; waterfall them if needed.
  executeInWaterfallLoop(B, MI, MRI, {2, 4});

  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  int EltSize = Ty.getScalarSizeInBits();
  int Size = Ty.getSizeInBits();

  // FIXME: Broken integer truncstore.
  if (EltSize != 32)
    report_fatal_error("unhandled intrinsic store");

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  const int MemSize = (*MI.memoperands_begin())->getSize();

  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned CachePolicy = MI.getOperand(5).getImm();

  // Split the constant part of the offset into the instruction immediate.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  // Use the OFFEN form only when there is a nonzero variable offset.
  const bool Offen = !isZero(VOffset, MRI);

  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
  switch (8 * MemSize) {
  case 8:
    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
    break;
  case 16:
    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
    break;
  default:
    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
                  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
    // Widen to the multi-dword variant for >32-bit data.
    if (Size > 32)
      Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
    break;
  }

  // Set the insertion point back to the instruction in case it was moved into a
  // loop.
  B.setInstr(MI);

  MachineInstrBuilder MIB = B.buildInstr(Opc)
    .addUse(VData);

  // The OFFEN form takes the variable offset as an extra operand.
  if (Offen)
    MIB.addUse(VOffset);

  MIB.addUse(RSrc)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(extractCPol(CachePolicy))
     .addImm(0) // tfe: FIXME: Remove from inst
     .addImm(extractSWZ(CachePolicy))
     .cloneMemRefs(MI);

  // FIXME: We need a way to report failure from applyMappingImpl.
  // Insert constrain copies before inserting the loop.
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    report_fatal_error("failed to constrain selected store intrinsic");

  return MIB;
}
1817 | ||||
/// Build an SGPR->VGPR copy of \p SrcReg into \p DstReg using V_MOV_B32 so the
/// exec dependency is explicit. Handles 32-bit values directly and 64-bit
/// values as two 32-bit moves combined with REG_SEQUENCE. Returns true if the
/// registers were successfully constrained to the required classes.
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.getSizeInBits() == 32) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(DstReg)
      .addUse(SrcReg);
    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  // 64-bit case: move each 32-bit half separately, then recombine.
  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg0)
    .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
    .addDef(TmpReg1)
    .addUse(SrcReg, 0, AMDGPU::sub1);
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(DstReg)
    .addUse(TmpReg0)
    .addImm(AMDGPU::sub0)
    .addUse(TmpReg1)
    .addImm(AMDGPU::sub1);

  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}
1850 | ||||
/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
1853 | static void reinsertVectorIndexAdd(MachineIRBuilder &B, | |||
1854 | MachineInstr &IdxUseInstr, | |||
1855 | unsigned OpIdx, | |||
1856 | unsigned ConstOffset) { | |||
1857 | MachineRegisterInfo &MRI = *B.getMRI(); | |||
1858 | const LLT S32 = LLT::scalar(32); | |||
1859 | Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); | |||
1860 | B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); | |||
1861 | ||||
1862 | auto MaterializedOffset = B.buildConstant(S32, ConstOffset); | |||
1863 | ||||
1864 | auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); | |||
1865 | MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); | |||
1866 | MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); | |||
1867 | IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); | |||
1868 | } | |||
1869 | ||||
1870 | /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the | |||
1871 | /// original 32-bit source value (to be inserted in the low part of the combined | |||
1872 | /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit | |||
1873 | /// value. | |||
1874 | static void extendLow32IntoHigh32(MachineIRBuilder &B, | |||
1875 | Register Hi32Reg, Register Lo32Reg, | |||
1876 | unsigned ExtOpc, | |||
1877 | const RegisterBank &RegBank, | |||
1878 | bool IsBooleanSrc = false) { | |||
1879 | if (ExtOpc == AMDGPU::G_ZEXT) { | |||
1880 | B.buildConstant(Hi32Reg, 0); | |||
1881 | } else if (ExtOpc == AMDGPU::G_SEXT) { | |||
1882 | if (IsBooleanSrc) { | |||
1883 | // If we know the original source was an s1, the high half is the same as | |||
1884 | // the low. | |||
1885 | B.buildCopy(Hi32Reg, Lo32Reg); | |||
1886 | } else { | |||
1887 | // Replicate sign bit from 32-bit extended part. | |||
1888 | auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); | |||
1889 | B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); | |||
1890 | B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); | |||
1891 | } | |||
1892 | } else { | |||
1893 | assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension")(static_cast <bool> (ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension") ? void (0) : __assert_fail ("ExtOpc == AMDGPU::G_ANYEXT && \"not an integer extension\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 1893, __extension__ __PRETTY_FUNCTION__)); | |||
1894 | B.buildUndef(Hi32Reg); | |||
1895 | } | |||
1896 | } | |||
1897 | ||||
/// Try to lower a dynamic G_EXTRACT_VECTOR_ELT into a chain of compare+select
/// operations over the individual vector elements rather than a waterfall
/// loop. Returns true if \p MI was expanded and erased, false if the target
/// heuristic declined the expansion.
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg(); // Source vector.
  Register Idx = MI.getOperand(2).getReg();    // Dynamic element index.

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  // A non-SGPR index is treated as divergent.
  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  // Let the target heuristic decide whether a cmp/select chain is preferable
  // for this element size / element count combination.
  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

  // The compares can stay on the SALU only when every operand is uniform;
  // otherwise the condition must live in VCC.
  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  // SALU compares produce an s32 result; VCC compares produce an s1 mask.
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  // A VCC compare with a uniform index: copy the index into a VGPR first.
  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
  unsigned NumLanes = DstRegs.size();
  if (!NumLanes)
    NumLanes = 1; // Result was not split by the mapping.
  else
    EltTy = MRI.getType(DstRegs[0]); // Use the split piece type instead.

  // Break the vector apart into per-lane element pieces.
  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  // Res starts as element 0 (lanes 0..NumLanes-1 of the unmerge).
  SmallVector<Register, 2> Res(NumLanes);
  for (unsigned L = 0; L < NumLanes; ++L)
    Res[L] = UnmergeToEltTy.getReg(L);

  // Fold in each remaining element: Res = select(Idx == I, elt[I], Res).
  for (unsigned I = 1; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      auto S = B.buildSelect(EltTy, Cmp,
                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);

      // Assign DstBank to the select's result and both value inputs
      // (operands 0, 2, 3); operand 1 is the condition and keeps CCBank.
      for (unsigned N : { 0, 2, 3 })
        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);

      Res[L] = S->getOperand(0).getReg();
    }
  }

  // Copy the accumulated result into the original destination register(s).
  for (unsigned L = 0; L < NumLanes; ++L) {
    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
    B.buildCopy(DstReg, Res[L]);
    MRI.setRegBank(DstReg, DstBank);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}
1979 | ||||
1980 | // Insert a cross regbank copy for a register if it already has a bank that | |||
1981 | // differs from the one we want to set. | |||
1982 | static Register constrainRegToBank(MachineRegisterInfo &MRI, | |||
1983 | MachineIRBuilder &B, Register &Reg, | |||
1984 | const RegisterBank &Bank) { | |||
1985 | const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); | |||
1986 | if (CurrBank && *CurrBank != Bank) { | |||
1987 | Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); | |||
1988 | MRI.setRegBank(Copy, Bank); | |||
1989 | return Copy; | |||
1990 | } | |||
1991 | ||||
1992 | MRI.setRegBank(Reg, Bank); | |||
1993 | return Reg; | |||
1994 | } | |||
1995 | ||||
/// Try to lower a dynamic G_INSERT_VECTOR_ELT into per-element compare+select
/// operations rather than a waterfall loop. Returns true if \p MI was expanded
/// and erased, false if the target heuristic declined the expansion.
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    const OperandsMapper &OpdMapper) const {

  Register VecReg = MI.getOperand(1).getReg(); // Source vector.
  Register Idx = MI.getOperand(3).getReg();    // Dynamic insertion index.

  const RegisterBank &IdxBank =
    *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

  // A non-SGPR index is treated as divergent.
  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;

  LLT VecTy = MRI.getType(VecReg);
  unsigned EltSize = VecTy.getScalarSizeInBits();
  unsigned NumElem = VecTy.getNumElements();

  // Let the target heuristic decide whether a cmp/select chain is preferable
  // for this element size / element count combination.
  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
                                                  IsDivergentIdx))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank =
    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  const RegisterBank &SrcBank =
    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
  const RegisterBank &InsBank =
    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

  // The compares can stay on the SALU only when every operand (vector,
  // inserted value, and index) is uniform; otherwise use VCC.
  const RegisterBank &CCBank =
    (DstBank == AMDGPU::SGPRRegBank &&
     SrcBank == AMDGPU::SGPRRegBank &&
     InsBank == AMDGPU::SGPRRegBank &&
     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
                                     : AMDGPU::VCCRegBank;
  // SALU compares produce an s32 result; VCC compares produce an s1 mask.
  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);

  // A VCC compare with a uniform index: copy the index into a VGPR first.
  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
  }

  LLT EltTy = VecTy.getScalarType();
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
  unsigned NumLanes = InsRegs.size();
  if (!NumLanes) {
    // The inserted value was not split; use it directly as the single lane.
    NumLanes = 1;
    InsRegs.push_back(MI.getOperand(2).getReg());
  } else {
    EltTy = MRI.getType(InsRegs[0]); // Use the split piece type instead.
  }

  // Break the vector apart into per-lane element pieces.
  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
  SmallVector<Register, 16> Ops(NumElem * NumLanes);

  // For each element I: Ops[I] = select(Idx == I, inserted value, elt[I]).
  for (unsigned I = 0; I < NumElem; ++I) {
    auto IC = B.buildConstant(S32, I);
    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);

    for (unsigned L = 0; L < NumLanes; ++L) {
      // Make sure both select inputs live in DstBank, inserting cross-bank
      // copies if they were already committed elsewhere.
      Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
      Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
      Op1 = constrainRegToBank(MRI, B, Op1, DstBank);

      Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
      MRI.setRegBank(Select, DstBank);

      Ops[I * NumLanes + L] = Select;
    }
  }

  // Reassemble the selected pieces into the destination vector, bitcasting if
  // the piece type does not directly produce the destination type.
  LLT MergeTy = LLT::vector(Ops.size(), EltTy);
  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
    B.buildBuildVector(MI.getOperand(0), Ops);
  } else {
    auto Vec = B.buildBuildVector(MergeTy, Ops);
    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
    B.buildBitcast(MI.getOperand(0).getReg(), Vec);
  }

  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
  MI.eraseFromParent();

  return true;
}
2084 | ||||
2085 | void AMDGPURegisterBankInfo::applyMappingImpl( | |||
2086 | const OperandsMapper &OpdMapper) const { | |||
2087 | MachineInstr &MI = OpdMapper.getMI(); | |||
2088 | unsigned Opc = MI.getOpcode(); | |||
2089 | MachineRegisterInfo &MRI = OpdMapper.getMRI(); | |||
2090 | switch (Opc) { | |||
2091 | case AMDGPU::G_PHI: { | |||
2092 | Register DstReg = MI.getOperand(0).getReg(); | |||
2093 | LLT DstTy = MRI.getType(DstReg); | |||
2094 | if (DstTy != LLT::scalar(1)) | |||
2095 | break; | |||
2096 | ||||
2097 | const LLT S32 = LLT::scalar(32); | |||
2098 | const RegisterBank *DstBank = | |||
2099 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; | |||
2100 | if (DstBank == &AMDGPU::VCCRegBank) { | |||
2101 | applyDefaultMapping(OpdMapper); | |||
2102 | // The standard handling only considers the result register bank for | |||
2103 | // phis. For VCC, blindly inserting a copy when the phi is lowered will | |||
2104 | // produce an invalid copy. We can only copy with some kind of compare to | |||
      // get a vector boolean result. Insert a register bank copy that will be
2106 | // correctly lowered to a compare. | |||
2107 | MachineIRBuilder B(*MI.getParent()->getParent()); | |||
2108 | ||||
2109 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
2110 | Register SrcReg = MI.getOperand(I).getReg(); | |||
2111 | const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); | |||
2112 | ||||
2113 | if (SrcBank != &AMDGPU::VCCRegBank) { | |||
2114 | MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); | |||
2115 | B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); | |||
2116 | ||||
2117 | auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); | |||
2118 | MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); | |||
2119 | MI.getOperand(I).setReg(Copy.getReg(0)); | |||
2120 | } | |||
2121 | } | |||
2122 | ||||
2123 | return; | |||
2124 | } | |||
2125 | ||||
2126 | // Phi handling is strange and only considers the bank of the destination. | |||
2127 | substituteSimpleCopyRegs(OpdMapper, 0); | |||
2128 | ||||
2129 | // Promote SGPR/VGPR booleans to s32 | |||
2130 | MachineFunction *MF = MI.getParent()->getParent(); | |||
2131 | ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); | |||
2132 | MachineIRBuilder B(MI, ApplyBank); | |||
2133 | LegalizerHelper Helper(*MF, ApplyBank, B); | |||
2134 | ||||
2135 | if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) | |||
2136 | llvm_unreachable("widen scalar should have succeeded")::llvm::llvm_unreachable_internal("widen scalar should have succeeded" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2136); | |||
2137 | ||||
2138 | return; | |||
2139 | } | |||
2140 | case AMDGPU::G_ICMP: | |||
2141 | case AMDGPU::G_UADDO: | |||
2142 | case AMDGPU::G_USUBO: | |||
2143 | case AMDGPU::G_UADDE: | |||
2144 | case AMDGPU::G_SADDE: | |||
2145 | case AMDGPU::G_USUBE: | |||
2146 | case AMDGPU::G_SSUBE: { | |||
2147 | unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; | |||
2148 | Register DstReg = MI.getOperand(BoolDstOp).getReg(); | |||
2149 | ||||
2150 | const RegisterBank *DstBank = | |||
2151 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; | |||
2152 | if (DstBank != &AMDGPU::SGPRRegBank) | |||
2153 | break; | |||
2154 | ||||
2155 | const bool HasCarryIn = MI.getNumOperands() == 5; | |||
2156 | ||||
2157 | // If this is a scalar compare, promote the result to s32, as the selection | |||
2158 | // will end up using a copy to a 32-bit vreg. | |||
2159 | const LLT S32 = LLT::scalar(32); | |||
2160 | Register NewDstReg = MRI.createGenericVirtualRegister(S32); | |||
2161 | MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); | |||
2162 | MI.getOperand(BoolDstOp).setReg(NewDstReg); | |||
2163 | MachineIRBuilder B(MI); | |||
2164 | ||||
2165 | if (HasCarryIn) { | |||
2166 | Register NewSrcReg = MRI.createGenericVirtualRegister(S32); | |||
2167 | MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); | |||
2168 | B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); | |||
2169 | MI.getOperand(4).setReg(NewSrcReg); | |||
2170 | } | |||
2171 | ||||
2172 | MachineBasicBlock *MBB = MI.getParent(); | |||
2173 | B.setInsertPt(*MBB, std::next(MI.getIterator())); | |||
2174 | ||||
2175 | // If we had a constrained VCC result register, a copy was inserted to VCC | |||
2176 | // from SGPR. | |||
2177 | SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); | |||
2178 | if (DefRegs.empty()) | |||
2179 | DefRegs.push_back(DstReg); | |||
2180 | B.buildTrunc(DefRegs[0], NewDstReg); | |||
2181 | return; | |||
2182 | } | |||
2183 | case AMDGPU::G_SELECT: { | |||
2184 | Register DstReg = MI.getOperand(0).getReg(); | |||
2185 | LLT DstTy = MRI.getType(DstReg); | |||
2186 | ||||
2187 | SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); | |||
2188 | if (CondRegs.empty()) | |||
2189 | CondRegs.push_back(MI.getOperand(1).getReg()); | |||
2190 | else { | |||
2191 | assert(CondRegs.size() == 1)(static_cast <bool> (CondRegs.size() == 1) ? void (0) : __assert_fail ("CondRegs.size() == 1", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2191, __extension__ __PRETTY_FUNCTION__)); | |||
2192 | } | |||
2193 | ||||
2194 | const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); | |||
2195 | if (CondBank == &AMDGPU::SGPRRegBank) { | |||
2196 | MachineIRBuilder B(MI); | |||
2197 | const LLT S32 = LLT::scalar(32); | |||
2198 | Register NewCondReg = MRI.createGenericVirtualRegister(S32); | |||
2199 | MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); | |||
2200 | ||||
2201 | MI.getOperand(1).setReg(NewCondReg); | |||
2202 | B.buildZExt(NewCondReg, CondRegs[0]); | |||
2203 | } | |||
2204 | ||||
2205 | if (DstTy.getSizeInBits() != 64) | |||
2206 | break; | |||
2207 | ||||
2208 | MachineIRBuilder B(MI); | |||
2209 | LLT HalfTy = getHalfSizedType(DstTy); | |||
2210 | ||||
2211 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); | |||
2212 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); | |||
2213 | SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); | |||
2214 | ||||
2215 | // All inputs are SGPRs, nothing special to do. | |||
2216 | if (DefRegs.empty()) { | |||
2217 | assert(Src1Regs.empty() && Src2Regs.empty())(static_cast <bool> (Src1Regs.empty() && Src2Regs .empty()) ? void (0) : __assert_fail ("Src1Regs.empty() && Src2Regs.empty()" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2217, __extension__ __PRETTY_FUNCTION__)); | |||
2218 | break; | |||
2219 | } | |||
2220 | ||||
2221 | if (Src1Regs.empty()) | |||
2222 | split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); | |||
2223 | else { | |||
2224 | setRegsToType(MRI, Src1Regs, HalfTy); | |||
2225 | } | |||
2226 | ||||
2227 | if (Src2Regs.empty()) | |||
2228 | split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); | |||
2229 | else | |||
2230 | setRegsToType(MRI, Src2Regs, HalfTy); | |||
2231 | ||||
2232 | setRegsToType(MRI, DefRegs, HalfTy); | |||
2233 | ||||
2234 | B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); | |||
2235 | B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); | |||
2236 | ||||
2237 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); | |||
2238 | MI.eraseFromParent(); | |||
2239 | return; | |||
2240 | } | |||
2241 | case AMDGPU::G_BRCOND: { | |||
2242 | Register CondReg = MI.getOperand(0).getReg(); | |||
2243 | // FIXME: Should use legalizer helper, but should change bool ext type. | |||
2244 | const RegisterBank *CondBank = | |||
2245 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; | |||
2246 | ||||
2247 | if (CondBank == &AMDGPU::SGPRRegBank) { | |||
2248 | MachineIRBuilder B(MI); | |||
2249 | const LLT S32 = LLT::scalar(32); | |||
2250 | Register NewCondReg = MRI.createGenericVirtualRegister(S32); | |||
2251 | MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); | |||
2252 | ||||
2253 | MI.getOperand(0).setReg(NewCondReg); | |||
2254 | B.buildZExt(NewCondReg, CondReg); | |||
2255 | return; | |||
2256 | } | |||
2257 | ||||
2258 | break; | |||
2259 | } | |||
2260 | case AMDGPU::G_AND: | |||
2261 | case AMDGPU::G_OR: | |||
2262 | case AMDGPU::G_XOR: { | |||
2263 | // 64-bit and is only available on the SALU, so split into 2 32-bit ops if | |||
2264 | // there is a VGPR input. | |||
2265 | Register DstReg = MI.getOperand(0).getReg(); | |||
2266 | LLT DstTy = MRI.getType(DstReg); | |||
2267 | ||||
2268 | if (DstTy.getSizeInBits() == 1) { | |||
2269 | const RegisterBank *DstBank = | |||
2270 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; | |||
2271 | if (DstBank == &AMDGPU::VCCRegBank) | |||
2272 | break; | |||
2273 | ||||
2274 | MachineFunction *MF = MI.getParent()->getParent(); | |||
2275 | ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); | |||
2276 | MachineIRBuilder B(MI, ApplyBank); | |||
2277 | LegalizerHelper Helper(*MF, ApplyBank, B); | |||
2278 | ||||
2279 | if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != | |||
2280 | LegalizerHelper::Legalized) | |||
2281 | llvm_unreachable("widen scalar should have succeeded")::llvm::llvm_unreachable_internal("widen scalar should have succeeded" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2281); | |||
2282 | return; | |||
2283 | } | |||
2284 | ||||
2285 | if (DstTy.getSizeInBits() != 64) | |||
2286 | break; | |||
2287 | ||||
2288 | LLT HalfTy = getHalfSizedType(DstTy); | |||
2289 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); | |||
2290 | SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); | |||
2291 | SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); | |||
2292 | ||||
2293 | // All inputs are SGPRs, nothing special to do. | |||
2294 | if (DefRegs.empty()) { | |||
2295 | assert(Src0Regs.empty() && Src1Regs.empty())(static_cast <bool> (Src0Regs.empty() && Src1Regs .empty()) ? void (0) : __assert_fail ("Src0Regs.empty() && Src1Regs.empty()" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2295, __extension__ __PRETTY_FUNCTION__)); | |||
2296 | break; | |||
2297 | } | |||
2298 | ||||
2299 | assert(DefRegs.size() == 2)(static_cast <bool> (DefRegs.size() == 2) ? void (0) : __assert_fail ("DefRegs.size() == 2", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2299, __extension__ __PRETTY_FUNCTION__)); | |||
2300 | assert(Src0Regs.size() == Src1Regs.size() &&(static_cast <bool> (Src0Regs.size() == Src1Regs.size() && (Src0Regs.empty() || Src0Regs.size() == 2)) ? void (0) : __assert_fail ("Src0Regs.size() == Src1Regs.size() && (Src0Regs.empty() || Src0Regs.size() == 2)" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2301, __extension__ __PRETTY_FUNCTION__)) | |||
2301 | (Src0Regs.empty() || Src0Regs.size() == 2))(static_cast <bool> (Src0Regs.size() == Src1Regs.size() && (Src0Regs.empty() || Src0Regs.size() == 2)) ? void (0) : __assert_fail ("Src0Regs.size() == Src1Regs.size() && (Src0Regs.empty() || Src0Regs.size() == 2)" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2301, __extension__ __PRETTY_FUNCTION__)); | |||
2302 | ||||
2303 | // Depending on where the source registers came from, the generic code may | |||
2304 | // have decided to split the inputs already or not. If not, we still need to | |||
2305 | // extract the values. | |||
2306 | MachineIRBuilder B(MI); | |||
2307 | ||||
2308 | if (Src0Regs.empty()) | |||
2309 | split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); | |||
2310 | else | |||
2311 | setRegsToType(MRI, Src0Regs, HalfTy); | |||
2312 | ||||
2313 | if (Src1Regs.empty()) | |||
2314 | split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); | |||
2315 | else | |||
2316 | setRegsToType(MRI, Src1Regs, HalfTy); | |||
2317 | ||||
2318 | setRegsToType(MRI, DefRegs, HalfTy); | |||
2319 | ||||
2320 | B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); | |||
2321 | B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); | |||
2322 | ||||
2323 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); | |||
2324 | MI.eraseFromParent(); | |||
2325 | return; | |||
2326 | } | |||
2327 | case AMDGPU::G_ABS: { | |||
2328 | Register SrcReg = MI.getOperand(1).getReg(); | |||
2329 | const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); | |||
2330 | ||||
2331 | // There is no VALU abs instruction so we need to replace it with a sub and | |||
2332 | // max combination. | |||
2333 | if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) { | |||
2334 | MachineFunction *MF = MI.getParent()->getParent(); | |||
2335 | ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank); | |||
2336 | MachineIRBuilder B(MI, Apply); | |||
2337 | LegalizerHelper Helper(*MF, Apply, B); | |||
2338 | ||||
2339 | if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized) | |||
2340 | llvm_unreachable("lowerAbsToMaxNeg should have succeeded")::llvm::llvm_unreachable_internal("lowerAbsToMaxNeg should have succeeded" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2340); | |||
2341 | return; | |||
2342 | } | |||
2343 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | |||
2344 | } | |||
2345 | case AMDGPU::G_ADD: | |||
2346 | case AMDGPU::G_SUB: | |||
2347 | case AMDGPU::G_MUL: | |||
2348 | case AMDGPU::G_SHL: | |||
2349 | case AMDGPU::G_LSHR: | |||
2350 | case AMDGPU::G_ASHR: | |||
2351 | case AMDGPU::G_SMIN: | |||
2352 | case AMDGPU::G_SMAX: | |||
2353 | case AMDGPU::G_UMIN: | |||
2354 | case AMDGPU::G_UMAX: { | |||
2355 | Register DstReg = MI.getOperand(0).getReg(); | |||
2356 | LLT DstTy = MRI.getType(DstReg); | |||
2357 | ||||
2358 | // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. | |||
2359 | // Packed 16-bit operations need to be scalarized and promoted. | |||
2360 | if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16)) | |||
2361 | break; | |||
2362 | ||||
2363 | const RegisterBank *DstBank = | |||
2364 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; | |||
2365 | if (DstBank == &AMDGPU::VGPRRegBank) | |||
2366 | break; | |||
2367 | ||||
2368 | const LLT S32 = LLT::scalar(32); | |||
2369 | MachineBasicBlock *MBB = MI.getParent(); | |||
2370 | MachineFunction *MF = MBB->getParent(); | |||
2371 | ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); | |||
2372 | MachineIRBuilder B(MI, ApplySALU); | |||
2373 | ||||
2374 | if (DstTy.isVector()) { | |||
2375 | Register WideSrc0Lo, WideSrc0Hi; | |||
2376 | Register WideSrc1Lo, WideSrc1Hi; | |||
2377 | ||||
2378 | unsigned ExtendOp = getExtendOp(MI.getOpcode()); | |||
2379 | std::tie(WideSrc0Lo, WideSrc0Hi) | |||
2380 | = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); | |||
2381 | std::tie(WideSrc1Lo, WideSrc1Hi) | |||
2382 | = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); | |||
2383 | auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); | |||
2384 | auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); | |||
2385 | B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); | |||
2386 | MI.eraseFromParent(); | |||
2387 | } else { | |||
2388 | LegalizerHelper Helper(*MF, ApplySALU, B); | |||
2389 | ||||
2390 | if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) | |||
2391 | llvm_unreachable("widen scalar should have succeeded")::llvm::llvm_unreachable_internal("widen scalar should have succeeded" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2391); | |||
2392 | ||||
2393 | // FIXME: s16 shift amounts should be legal. | |||
2394 | if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || | |||
2395 | Opc == AMDGPU::G_ASHR) { | |||
2396 | B.setInsertPt(*MBB, MI.getIterator()); | |||
2397 | if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) | |||
2398 | llvm_unreachable("widen scalar should have succeeded")::llvm::llvm_unreachable_internal("widen scalar should have succeeded" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2398); | |||
2399 | } | |||
2400 | } | |||
2401 | ||||
2402 | return; | |||
2403 | } | |||
2404 | case AMDGPU::G_SEXT_INREG: { | |||
2405 | SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); | |||
2406 | if (SrcRegs.empty()) | |||
2407 | break; // Nothing to repair | |||
2408 | ||||
2409 | const LLT S32 = LLT::scalar(32); | |||
2410 | MachineIRBuilder B(MI); | |||
2411 | ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); | |||
2412 | GISelObserverWrapper Observer(&O); | |||
2413 | B.setChangeObserver(Observer); | |||
2414 | ||||
2415 | // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs | |||
2416 | // we would need to further expand, and doesn't let us directly set the | |||
2417 | // result registers. | |||
2418 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); | |||
2419 | ||||
2420 | int Amt = MI.getOperand(2).getImm(); | |||
2421 | if (Amt <= 32) { | |||
2422 | if (Amt == 32) { | |||
2423 | // The low bits are unchanged. | |||
2424 | B.buildCopy(DstRegs[0], SrcRegs[0]); | |||
2425 | } else { | |||
2426 | // Extend in the low bits and propagate the sign bit to the high half. | |||
2427 | B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt); | |||
2428 | } | |||
2429 | ||||
2430 | B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); | |||
2431 | } else { | |||
2432 | // The low bits are unchanged, and extend in the high bits. | |||
2433 | B.buildCopy(DstRegs[0], SrcRegs[0]); | |||
2434 | B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); | |||
2435 | } | |||
2436 | ||||
2437 | Register DstReg = MI.getOperand(0).getReg(); | |||
2438 | MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); | |||
2439 | MI.eraseFromParent(); | |||
2440 | return; | |||
2441 | } | |||
2442 | case AMDGPU::G_CTPOP: | |||
2443 | case AMDGPU::G_BITREVERSE: | |||
2444 | case AMDGPU::G_CTLZ_ZERO_UNDEF: | |||
2445 | case AMDGPU::G_CTTZ_ZERO_UNDEF: { | |||
2446 | const RegisterBank *DstBank = | |||
2447 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; | |||
2448 | if (DstBank == &AMDGPU::SGPRRegBank) | |||
2449 | break; | |||
2450 | ||||
2451 | Register SrcReg = MI.getOperand(1).getReg(); | |||
2452 | const LLT S32 = LLT::scalar(32); | |||
2453 | LLT Ty = MRI.getType(SrcReg); | |||
2454 | if (Ty == S32) | |||
2455 | break; | |||
2456 | ||||
2457 | ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); | |||
2458 | MachineIRBuilder B(MI, ApplyVALU); | |||
2459 | ||||
2460 | MachineFunction &MF = B.getMF(); | |||
2461 | LegalizerHelper Helper(MF, ApplyVALU, B); | |||
2462 | ||||
2463 | if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) | |||
2464 | llvm_unreachable("narrowScalar should have succeeded")::llvm::llvm_unreachable_internal("narrowScalar should have succeeded" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2464); | |||
2465 | return; | |||
2466 | } | |||
2467 | case AMDGPU::G_SEXT: | |||
2468 | case AMDGPU::G_ZEXT: | |||
2469 | case AMDGPU::G_ANYEXT: { | |||
2470 | Register SrcReg = MI.getOperand(1).getReg(); | |||
2471 | LLT SrcTy = MRI.getType(SrcReg); | |||
2472 | const bool Signed = Opc == AMDGPU::G_SEXT; | |||
2473 | ||||
2474 | assert(empty(OpdMapper.getVRegs(1)))(static_cast <bool> (empty(OpdMapper.getVRegs(1))) ? void (0) : __assert_fail ("empty(OpdMapper.getVRegs(1))", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2474, __extension__ __PRETTY_FUNCTION__)); | |||
2475 | ||||
2476 | MachineIRBuilder B(MI); | |||
2477 | const RegisterBank *SrcBank = | |||
2478 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; | |||
2479 | ||||
2480 | Register DstReg = MI.getOperand(0).getReg(); | |||
2481 | LLT DstTy = MRI.getType(DstReg); | |||
2482 | if (DstTy.isScalar() && | |||
2483 | SrcBank != &AMDGPU::SGPRRegBank && | |||
2484 | SrcBank != &AMDGPU::VCCRegBank && | |||
2485 | // FIXME: Should handle any type that round to s64 when irregular | |||
2486 | // breakdowns supported. | |||
2487 | DstTy.getSizeInBits() == 64 && | |||
2488 | SrcTy.getSizeInBits() <= 32) { | |||
2489 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); | |||
2490 | ||||
2491 | // Extend to 32-bit, and then extend the low half. | |||
2492 | if (Signed) { | |||
2493 | // TODO: Should really be buildSExtOrCopy | |||
2494 | B.buildSExtOrTrunc(DefRegs[0], SrcReg); | |||
2495 | } else if (Opc == AMDGPU::G_ZEXT) { | |||
2496 | B.buildZExtOrTrunc(DefRegs[0], SrcReg); | |||
2497 | } else { | |||
2498 | B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); | |||
2499 | } | |||
2500 | ||||
2501 | extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); | |||
2502 | MRI.setRegBank(DstReg, *SrcBank); | |||
2503 | MI.eraseFromParent(); | |||
2504 | return; | |||
2505 | } | |||
2506 | ||||
2507 | if (SrcTy != LLT::scalar(1)) | |||
2508 | return; | |||
2509 | ||||
2510 | // It is not legal to have a legalization artifact with a VCC source. Rather | |||
2511 | // than introducing a copy, insert the select we would have to select the | |||
2512 | // copy to. | |||
2513 | if (SrcBank == &AMDGPU::VCCRegBank) { | |||
2514 | SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); | |||
2515 | ||||
2516 | const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; | |||
2517 | ||||
2518 | unsigned DstSize = DstTy.getSizeInBits(); | |||
2519 | // 64-bit select is SGPR only | |||
2520 | const bool UseSel64 = DstSize > 32 && | |||
2521 | SrcBank->getID() == AMDGPU::SGPRRegBankID; | |||
2522 | ||||
2523 | // TODO: Should s16 select be legal? | |||
2524 | LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); | |||
2525 | auto True = B.buildConstant(SelType, Signed ? -1 : 1); | |||
2526 | auto False = B.buildConstant(SelType, 0); | |||
2527 | ||||
2528 | MRI.setRegBank(True.getReg(0), *DstBank); | |||
2529 | MRI.setRegBank(False.getReg(0), *DstBank); | |||
2530 | MRI.setRegBank(DstReg, *DstBank); | |||
2531 | ||||
2532 | if (DstSize > 32) { | |||
2533 | B.buildSelect(DefRegs[0], SrcReg, True, False); | |||
2534 | extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); | |||
2535 | } else if (DstSize < 32) { | |||
2536 | auto Sel = B.buildSelect(SelType, SrcReg, True, False); | |||
2537 | MRI.setRegBank(Sel.getReg(0), *DstBank); | |||
2538 | B.buildTrunc(DstReg, Sel); | |||
2539 | } else { | |||
2540 | B.buildSelect(DstReg, SrcReg, True, False); | |||
2541 | } | |||
2542 | ||||
2543 | MI.eraseFromParent(); | |||
2544 | return; | |||
2545 | } | |||
2546 | ||||
2547 | break; | |||
2548 | } | |||
2549 | case AMDGPU::G_BUILD_VECTOR: | |||
2550 | case AMDGPU::G_BUILD_VECTOR_TRUNC: { | |||
2551 | Register DstReg = MI.getOperand(0).getReg(); | |||
2552 | LLT DstTy = MRI.getType(DstReg); | |||
2553 | if (DstTy != LLT::vector(2, 16)) | |||
2554 | break; | |||
2555 | ||||
2556 | assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty())(static_cast <bool> (MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()) ? void (0) : __assert_fail ("MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2556, __extension__ __PRETTY_FUNCTION__)); | |||
2557 | substituteSimpleCopyRegs(OpdMapper, 1); | |||
2558 | substituteSimpleCopyRegs(OpdMapper, 2); | |||
2559 | ||||
2560 | const RegisterBank *DstBank = | |||
2561 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; | |||
2562 | if (DstBank == &AMDGPU::SGPRRegBank) | |||
2563 | break; // Can use S_PACK_* instructions. | |||
2564 | ||||
2565 | MachineIRBuilder B(MI); | |||
2566 | ||||
2567 | Register Lo = MI.getOperand(1).getReg(); | |||
2568 | Register Hi = MI.getOperand(2).getReg(); | |||
2569 | const LLT S32 = LLT::scalar(32); | |||
2570 | ||||
2571 | const RegisterBank *BankLo = | |||
2572 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; | |||
2573 | const RegisterBank *BankHi = | |||
2574 | OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; | |||
2575 | ||||
2576 | Register ZextLo; | |||
2577 | Register ShiftHi; | |||
2578 | ||||
2579 | if (Opc == AMDGPU::G_BUILD_VECTOR) { | |||
2580 | ZextLo = B.buildZExt(S32, Lo).getReg(0); | |||
2581 | MRI.setRegBank(ZextLo, *BankLo); | |||
2582 | ||||
2583 | Register ZextHi = B.buildZExt(S32, Hi).getReg(0); | |||
2584 | MRI.setRegBank(ZextHi, *BankHi); | |||
2585 | ||||
2586 | auto ShiftAmt = B.buildConstant(S32, 16); | |||
2587 | MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); | |||
2588 | ||||
2589 | ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); | |||
2590 | MRI.setRegBank(ShiftHi, *BankHi); | |||
2591 | } else { | |||
2592 | Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); | |||
2593 | MRI.setRegBank(MaskLo, *BankLo); | |||
2594 | ||||
2595 | auto ShiftAmt = B.buildConstant(S32, 16); | |||
2596 | MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); | |||
2597 | ||||
2598 | ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); | |||
2599 | MRI.setRegBank(ShiftHi, *BankHi); | |||
2600 | ||||
2601 | ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); | |||
2602 | MRI.setRegBank(ZextLo, *BankLo); | |||
2603 | } | |||
2604 | ||||
2605 | auto Or = B.buildOr(S32, ZextLo, ShiftHi); | |||
2606 | MRI.setRegBank(Or.getReg(0), *DstBank); | |||
2607 | ||||
2608 | B.buildBitcast(DstReg, Or); | |||
2609 | MI.eraseFromParent(); | |||
2610 | return; | |||
2611 | } | |||
2612 | case AMDGPU::G_EXTRACT_VECTOR_ELT: { | |||
2613 | SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); | |||
2614 | ||||
2615 | assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty())(static_cast <bool> (OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2615, __extension__ __PRETTY_FUNCTION__)); | |||
2616 | ||||
2617 | Register DstReg = MI.getOperand(0).getReg(); | |||
2618 | Register SrcReg = MI.getOperand(1).getReg(); | |||
2619 | ||||
2620 | const LLT S32 = LLT::scalar(32); | |||
2621 | LLT DstTy = MRI.getType(DstReg); | |||
2622 | LLT SrcTy = MRI.getType(SrcReg); | |||
2623 | ||||
2624 | if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) | |||
2625 | return; | |||
2626 | ||||
2627 | MachineIRBuilder B(MI); | |||
2628 | ||||
2629 | const ValueMapping &DstMapping | |||
2630 | = OpdMapper.getInstrMapping().getOperandMapping(0); | |||
2631 | const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; | |||
2632 | const RegisterBank *SrcBank = | |||
2633 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; | |||
2634 | const RegisterBank *IdxBank = | |||
2635 | OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; | |||
2636 | ||||
2637 | Register BaseIdxReg; | |||
2638 | unsigned ConstOffset; | |||
2639 | std::tie(BaseIdxReg, ConstOffset) = | |||
2640 | AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); | |||
2641 | ||||
2642 | // See if the index is an add of a constant which will be foldable by moving | |||
2643 | // the base register of the index later if this is going to be executed in a | |||
2644 | // waterfall loop. This is essentially to reassociate the add of a constant | |||
2645 | // with the readfirstlane. | |||
2646 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && | |||
2647 | ConstOffset > 0 && | |||
2648 | ConstOffset < SrcTy.getNumElements(); | |||
2649 | ||||
2650 | // Move the base register. We'll re-insert the add later. | |||
2651 | if (ShouldMoveIndexIntoLoop) | |||
2652 | MI.getOperand(2).setReg(BaseIdxReg); | |||
2653 | ||||
2654 | // If this is a VGPR result only because the index was a VGPR result, the | |||
2655 | // actual indexing will be done on the SGPR source vector, which will | |||
2656 | // produce a scalar result. We need to copy to the VGPR result inside the | |||
2657 | // waterfall loop. | |||
2658 | const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && | |||
2659 | SrcBank == &AMDGPU::SGPRRegBank; | |||
2660 | if (DstRegs.empty()) { | |||
2661 | applyDefaultMapping(OpdMapper); | |||
2662 | ||||
2663 | executeInWaterfallLoop(MI, MRI, { 2 }); | |||
2664 | ||||
2665 | if (NeedCopyToVGPR) { | |||
2666 | // We don't want a phi for this temporary reg. | |||
2667 | Register TmpReg = MRI.createGenericVirtualRegister(DstTy); | |||
2668 | MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); | |||
2669 | MI.getOperand(0).setReg(TmpReg); | |||
2670 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | |||
2671 | ||||
2672 | // Use a v_mov_b32 here to make the exec dependency explicit. | |||
2673 | buildVCopy(B, DstReg, TmpReg); | |||
2674 | } | |||
2675 | ||||
2676 | // Re-insert the constant offset add inside the waterfall loop. | |||
2677 | if (ShouldMoveIndexIntoLoop) | |||
2678 | reinsertVectorIndexAdd(B, MI, 2, ConstOffset); | |||
2679 | ||||
2680 | return; | |||
2681 | } | |||
2682 | ||||
2683 | assert(DstTy.getSizeInBits() == 64)(static_cast <bool> (DstTy.getSizeInBits() == 64) ? void (0) : __assert_fail ("DstTy.getSizeInBits() == 64", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2683, __extension__ __PRETTY_FUNCTION__)); | |||
2684 | ||||
2685 | LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); | |||
2686 | ||||
2687 | auto CastSrc = B.buildBitcast(Vec32, SrcReg); | |||
2688 | auto One = B.buildConstant(S32, 1); | |||
2689 | ||||
2690 | MachineBasicBlock::iterator MII = MI.getIterator(); | |||
2691 | ||||
2692 | // Split the vector index into 32-bit pieces. Prepare to move all of the | |||
2693 | // new instructions into a waterfall loop if necessary. | |||
2694 | // | |||
2695 | // Don't put the bitcast or constant in the loop. | |||
2696 | MachineInstrSpan Span(MII, &B.getMBB()); | |||
2697 | ||||
2698 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). | |||
2699 | auto IdxLo = B.buildShl(S32, BaseIdxReg, One); | |||
2700 | auto IdxHi = B.buildAdd(S32, IdxLo, One); | |||
2701 | ||||
2702 | auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); | |||
2703 | auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); | |||
2704 | ||||
2705 | MRI.setRegBank(DstReg, *DstBank); | |||
2706 | MRI.setRegBank(CastSrc.getReg(0), *SrcBank); | |||
2707 | MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); | |||
2708 | MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); | |||
2709 | MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); | |||
2710 | ||||
2711 | SmallSet<Register, 4> OpsToWaterfall; | |||
2712 | if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { | |||
2713 | MI.eraseFromParent(); | |||
2714 | return; | |||
2715 | } | |||
2716 | ||||
2717 | // Remove the original instruction to avoid potentially confusing the | |||
2718 | // waterfall loop logic. | |||
2719 | B.setInstr(*Span.begin()); | |||
2720 | MI.eraseFromParent(); | |||
2721 | executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), | |||
2722 | OpsToWaterfall, MRI); | |||
2723 | ||||
2724 | if (NeedCopyToVGPR) { | |||
2725 | MachineBasicBlock *LoopBB = Extract1->getParent(); | |||
2726 | Register TmpReg0 = MRI.createGenericVirtualRegister(S32); | |||
2727 | Register TmpReg1 = MRI.createGenericVirtualRegister(S32); | |||
2728 | MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); | |||
2729 | MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); | |||
2730 | ||||
2731 | Extract0->getOperand(0).setReg(TmpReg0); | |||
2732 | Extract1->getOperand(0).setReg(TmpReg1); | |||
2733 | ||||
2734 | B.setInsertPt(*LoopBB, ++Extract1->getIterator()); | |||
2735 | ||||
2736 | buildVCopy(B, DstRegs[0], TmpReg0); | |||
2737 | buildVCopy(B, DstRegs[1], TmpReg1); | |||
2738 | } | |||
2739 | ||||
2740 | if (ShouldMoveIndexIntoLoop) | |||
2741 | reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); | |||
2742 | ||||
2743 | return; | |||
2744 | } | |||
2745 | case AMDGPU::G_INSERT_VECTOR_ELT: { | |||
2746 | SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); | |||
2747 | ||||
2748 | Register DstReg = MI.getOperand(0).getReg(); | |||
2749 | LLT VecTy = MRI.getType(DstReg); | |||
2750 | ||||
2751 | assert(OpdMapper.getVRegs(0).empty())(static_cast <bool> (OpdMapper.getVRegs(0).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(0).empty()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2751, __extension__ __PRETTY_FUNCTION__)); | |||
2752 | assert(OpdMapper.getVRegs(3).empty())(static_cast <bool> (OpdMapper.getVRegs(3).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(3).empty()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2752, __extension__ __PRETTY_FUNCTION__)); | |||
2753 | ||||
2754 | if (substituteSimpleCopyRegs(OpdMapper, 1)) | |||
2755 | MRI.setType(MI.getOperand(1).getReg(), VecTy); | |||
2756 | ||||
2757 | if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) | |||
2758 | return; | |||
2759 | ||||
2760 | const RegisterBank *IdxBank = | |||
2761 | OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; | |||
2762 | ||||
2763 | Register SrcReg = MI.getOperand(1).getReg(); | |||
2764 | Register InsReg = MI.getOperand(2).getReg(); | |||
2765 | LLT InsTy = MRI.getType(InsReg); | |||
2766 | (void)InsTy; | |||
2767 | ||||
2768 | Register BaseIdxReg; | |||
2769 | unsigned ConstOffset; | |||
2770 | std::tie(BaseIdxReg, ConstOffset) = | |||
2771 | AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); | |||
2772 | ||||
2773 | // See if the index is an add of a constant which will be foldable by moving | |||
2774 | // the base register of the index later if this is going to be executed in a | |||
2775 | // waterfall loop. This is essentially to reassociate the add of a constant | |||
2776 | // with the readfirstlane. | |||
2777 | bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && | |||
2778 | ConstOffset > 0 && | |||
2779 | ConstOffset < VecTy.getNumElements(); | |||
2780 | ||||
2781 | // Move the base register. We'll re-insert the add later. | |||
2782 | if (ShouldMoveIndexIntoLoop) | |||
2783 | MI.getOperand(3).setReg(BaseIdxReg); | |||
2784 | ||||
2785 | ||||
2786 | if (InsRegs.empty()) { | |||
2787 | executeInWaterfallLoop(MI, MRI, { 3 }); | |||
2788 | ||||
2789 | // Re-insert the constant offset add inside the waterfall loop. | |||
2790 | if (ShouldMoveIndexIntoLoop) { | |||
2791 | MachineIRBuilder B(MI); | |||
2792 | reinsertVectorIndexAdd(B, MI, 3, ConstOffset); | |||
2793 | } | |||
2794 | ||||
2795 | return; | |||
2796 | } | |||
2797 | ||||
2798 | ||||
2799 | assert(InsTy.getSizeInBits() == 64)(static_cast <bool> (InsTy.getSizeInBits() == 64) ? void (0) : __assert_fail ("InsTy.getSizeInBits() == 64", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2799, __extension__ __PRETTY_FUNCTION__)); | |||
2800 | ||||
2801 | const LLT S32 = LLT::scalar(32); | |||
2802 | LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); | |||
2803 | ||||
2804 | MachineIRBuilder B(MI); | |||
2805 | auto CastSrc = B.buildBitcast(Vec32, SrcReg); | |||
2806 | auto One = B.buildConstant(S32, 1); | |||
2807 | ||||
2808 | // Split the vector index into 32-bit pieces. Prepare to move all of the | |||
2809 | // new instructions into a waterfall loop if necessary. | |||
2810 | // | |||
2811 | // Don't put the bitcast or constant in the loop. | |||
2812 | MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); | |||
2813 | ||||
2814 | // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). | |||
2815 | auto IdxLo = B.buildShl(S32, BaseIdxReg, One); | |||
2816 | auto IdxHi = B.buildAdd(S32, IdxLo, One); | |||
2817 | ||||
2818 | auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); | |||
2819 | auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); | |||
2820 | ||||
2821 | const RegisterBank *DstBank = | |||
2822 | OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; | |||
2823 | const RegisterBank *SrcBank = | |||
2824 | OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; | |||
2825 | const RegisterBank *InsSrcBank = | |||
2826 | OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; | |||
2827 | ||||
2828 | MRI.setRegBank(InsReg, *InsSrcBank); | |||
2829 | MRI.setRegBank(CastSrc.getReg(0), *SrcBank); | |||
2830 | MRI.setRegBank(InsLo.getReg(0), *DstBank); | |||
2831 | MRI.setRegBank(InsHi.getReg(0), *DstBank); | |||
2832 | MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); | |||
2833 | MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); | |||
2834 | MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); | |||
2835 | ||||
2836 | ||||
2837 | SmallSet<Register, 4> OpsToWaterfall; | |||
2838 | if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { | |||
2839 | B.setInsertPt(B.getMBB(), MI); | |||
2840 | B.buildBitcast(DstReg, InsHi); | |||
2841 | MI.eraseFromParent(); | |||
2842 | return; | |||
2843 | } | |||
2844 | ||||
2845 | B.setInstr(*Span.begin()); | |||
2846 | MI.eraseFromParent(); | |||
2847 | ||||
2848 | // Figure out the point after the waterfall loop before mangling the control | |||
2849 | // flow. | |||
2850 | executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), | |||
2851 | OpsToWaterfall, MRI); | |||
2852 | ||||
2853 | // The insertion point is now right after the original instruction. | |||
2854 | // | |||
2855 | // Keep the bitcast to the original vector type out of the loop. Doing this | |||
2856 | // saved an extra phi we don't need inside the loop. | |||
2857 | B.buildBitcast(DstReg, InsHi); | |||
2858 | ||||
2859 | // Re-insert the constant offset add inside the waterfall loop. | |||
2860 | if (ShouldMoveIndexIntoLoop) | |||
2861 | reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); | |||
2862 | ||||
2863 | return; | |||
2864 | } | |||
2865 | case AMDGPU::G_AMDGPU_BUFFER_LOAD: | |||
2866 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: | |||
2867 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: | |||
2868 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: | |||
2869 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: | |||
2870 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: | |||
2871 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: | |||
2872 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: | |||
2873 | case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: | |||
2874 | case AMDGPU::G_AMDGPU_BUFFER_STORE: | |||
2875 | case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: | |||
2876 | case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: | |||
2877 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: | |||
2878 | case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: | |||
2879 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: | |||
2880 | case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { | |||
2881 | applyDefaultMapping(OpdMapper); | |||
2882 | executeInWaterfallLoop(MI, MRI, {1, 4}); | |||
2883 | return; | |||
2884 | } | |||
2885 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: | |||
2886 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: | |||
2887 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: | |||
2888 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: | |||
2889 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: | |||
2890 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: | |||
2891 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: | |||
2892 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: | |||
2893 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: | |||
2894 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: | |||
2895 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: | |||
2896 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { | |||
2897 | applyDefaultMapping(OpdMapper); | |||
2898 | executeInWaterfallLoop(MI, MRI, {2, 5}); | |||
2899 | return; | |||
2900 | } | |||
2901 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: | |||
2902 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: | |||
2903 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { | |||
2904 | applyDefaultMapping(OpdMapper); | |||
2905 | executeInWaterfallLoop(MI, MRI, {2, 5}); | |||
2906 | return; | |||
2907 | } | |||
2908 | case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { | |||
2909 | applyDefaultMapping(OpdMapper); | |||
2910 | executeInWaterfallLoop(MI, MRI, {3, 6}); | |||
2911 | return; | |||
2912 | } | |||
2913 | case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { | |||
2914 | applyMappingSBufferLoad(OpdMapper); | |||
2915 | return; | |||
2916 | } | |||
2917 | case AMDGPU::G_INTRINSIC: { | |||
2918 | switch (MI.getIntrinsicID()) { | |||
2919 | case Intrinsic::amdgcn_readlane: { | |||
2920 | substituteSimpleCopyRegs(OpdMapper, 2); | |||
2921 | ||||
2922 | assert(OpdMapper.getVRegs(0).empty())(static_cast <bool> (OpdMapper.getVRegs(0).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(0).empty()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2922, __extension__ __PRETTY_FUNCTION__)); | |||
2923 | assert(OpdMapper.getVRegs(3).empty())(static_cast <bool> (OpdMapper.getVRegs(3).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(3).empty()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2923, __extension__ __PRETTY_FUNCTION__)); | |||
2924 | ||||
2925 | // Make sure the index is an SGPR. It doesn't make sense to run this in a | |||
2926 | // waterfall loop, so assume it's a uniform value. | |||
2927 | constrainOpWithReadfirstlane(MI, MRI, 3); // Index | |||
2928 | return; | |||
2929 | } | |||
2930 | case Intrinsic::amdgcn_writelane: { | |||
2931 | assert(OpdMapper.getVRegs(0).empty())(static_cast <bool> (OpdMapper.getVRegs(0).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(0).empty()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2931, __extension__ __PRETTY_FUNCTION__)); | |||
2932 | assert(OpdMapper.getVRegs(2).empty())(static_cast <bool> (OpdMapper.getVRegs(2).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(2).empty()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2932, __extension__ __PRETTY_FUNCTION__)); | |||
2933 | assert(OpdMapper.getVRegs(3).empty())(static_cast <bool> (OpdMapper.getVRegs(3).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(3).empty()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2933, __extension__ __PRETTY_FUNCTION__)); | |||
2934 | ||||
2935 | substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val | |||
2936 | constrainOpWithReadfirstlane(MI, MRI, 2); // Source value | |||
2937 | constrainOpWithReadfirstlane(MI, MRI, 3); // Index | |||
2938 | return; | |||
2939 | } | |||
2940 | case Intrinsic::amdgcn_interp_p1: | |||
2941 | case Intrinsic::amdgcn_interp_p2: | |||
2942 | case Intrinsic::amdgcn_interp_mov: | |||
2943 | case Intrinsic::amdgcn_interp_p1_f16: | |||
2944 | case Intrinsic::amdgcn_interp_p2_f16: { | |||
2945 | applyDefaultMapping(OpdMapper); | |||
2946 | ||||
2947 | // Readlane for m0 value, which is always the last operand. | |||
2948 | // FIXME: Should this be a waterfall loop instead? | |||
2949 | constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index | |||
2950 | return; | |||
2951 | } | |||
2952 | case Intrinsic::amdgcn_permlane16: | |||
2953 | case Intrinsic::amdgcn_permlanex16: { | |||
2954 | // Doing a waterfall loop over these wouldn't make any sense. | |||
2955 | substituteSimpleCopyRegs(OpdMapper, 2); | |||
2956 | substituteSimpleCopyRegs(OpdMapper, 3); | |||
2957 | constrainOpWithReadfirstlane(MI, MRI, 4); | |||
2958 | constrainOpWithReadfirstlane(MI, MRI, 5); | |||
2959 | return; | |||
2960 | } | |||
2961 | case Intrinsic::amdgcn_sbfe: | |||
2962 | applyMappingBFEIntrinsic(OpdMapper, true); | |||
2963 | return; | |||
2964 | case Intrinsic::amdgcn_ubfe: | |||
2965 | applyMappingBFEIntrinsic(OpdMapper, false); | |||
2966 | return; | |||
2967 | case Intrinsic::amdgcn_ballot: | |||
2968 | // Use default handling and insert copy to vcc source. | |||
2969 | break; | |||
2970 | } | |||
2971 | break; | |||
2972 | } | |||
2973 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: | |||
2974 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { | |||
2975 | const AMDGPU::RsrcIntrinsic *RSrcIntrin | |||
2976 | = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID()); | |||
2977 | assert(RSrcIntrin && RSrcIntrin->IsImage)(static_cast <bool> (RSrcIntrin && RSrcIntrin-> IsImage) ? void (0) : __assert_fail ("RSrcIntrin && RSrcIntrin->IsImage" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2977, __extension__ __PRETTY_FUNCTION__)); | |||
2978 | // Non-images can have complications from operands that allow both SGPR | |||
2979 | // and VGPR. For now it's too complicated to figure out the final opcode | |||
2980 | // to derive the register bank from the MCInstrDesc. | |||
2981 | applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); | |||
2982 | return; | |||
2983 | } | |||
2984 | case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { | |||
2985 | unsigned N = MI.getNumExplicitOperands() - 2; | |||
2986 | executeInWaterfallLoop(MI, MRI, { N }); | |||
2987 | return; | |||
2988 | } | |||
2989 | case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { | |||
2990 | auto IntrID = MI.getIntrinsicID(); | |||
2991 | switch (IntrID) { | |||
2992 | case Intrinsic::amdgcn_ds_ordered_add: | |||
2993 | case Intrinsic::amdgcn_ds_ordered_swap: { | |||
2994 | // This is only allowed to execute with 1 lane, so readfirstlane is safe. | |||
2995 | assert(OpdMapper.getVRegs(0).empty())(static_cast <bool> (OpdMapper.getVRegs(0).empty()) ? void (0) : __assert_fail ("OpdMapper.getVRegs(0).empty()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 2995, __extension__ __PRETTY_FUNCTION__)); | |||
2996 | substituteSimpleCopyRegs(OpdMapper, 3); | |||
2997 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 | |||
2998 | return; | |||
2999 | } | |||
3000 | case Intrinsic::amdgcn_ds_gws_init: | |||
3001 | case Intrinsic::amdgcn_ds_gws_barrier: | |||
3002 | case Intrinsic::amdgcn_ds_gws_sema_br: { | |||
3003 | // Only the first lane is executes, so readfirstlane is safe. | |||
3004 | substituteSimpleCopyRegs(OpdMapper, 1); | |||
3005 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 | |||
3006 | return; | |||
3007 | } | |||
3008 | case Intrinsic::amdgcn_ds_gws_sema_v: | |||
3009 | case Intrinsic::amdgcn_ds_gws_sema_p: | |||
3010 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { | |||
3011 | // Only the first lane is executes, so readfirstlane is safe. | |||
3012 | constrainOpWithReadfirstlane(MI, MRI, 1); // M0 | |||
3013 | return; | |||
3014 | } | |||
3015 | case Intrinsic::amdgcn_ds_append: | |||
3016 | case Intrinsic::amdgcn_ds_consume: { | |||
3017 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 | |||
3018 | return; | |||
3019 | } | |||
3020 | case Intrinsic::amdgcn_s_sendmsg: | |||
3021 | case Intrinsic::amdgcn_s_sendmsghalt: { | |||
3022 | // FIXME: Should this use a waterfall loop? | |||
3023 | constrainOpWithReadfirstlane(MI, MRI, 2); // M0 | |||
3024 | return; | |||
3025 | } | |||
3026 | case Intrinsic::amdgcn_s_setreg: { | |||
3027 | constrainOpWithReadfirstlane(MI, MRI, 2); | |||
3028 | return; | |||
3029 | } | |||
3030 | default: { | |||
3031 | if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = | |||
3032 | AMDGPU::lookupRsrcIntrinsic(IntrID)) { | |||
3033 | // Non-images can have complications from operands that allow both SGPR | |||
3034 | // and VGPR. For now it's too complicated to figure out the final opcode | |||
3035 | // to derive the register bank from the MCInstrDesc. | |||
3036 | if (RSrcIntrin->IsImage) { | |||
3037 | applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); | |||
3038 | return; | |||
3039 | } | |||
3040 | } | |||
3041 | ||||
3042 | break; | |||
3043 | } | |||
3044 | } | |||
3045 | break; | |||
3046 | } | |||
3047 | case AMDGPU::G_LOAD: | |||
3048 | case AMDGPU::G_ZEXTLOAD: | |||
3049 | case AMDGPU::G_SEXTLOAD: { | |||
3050 | if (applyMappingLoad(MI, OpdMapper, MRI)) | |||
3051 | return; | |||
3052 | break; | |||
3053 | } | |||
3054 | case AMDGPU::G_DYN_STACKALLOC: | |||
3055 | applyMappingDynStackAlloc(MI, OpdMapper, MRI); | |||
3056 | return; | |||
3057 | default: | |||
3058 | break; | |||
3059 | } | |||
3060 | ||||
3061 | return applyDefaultMapping(OpdMapper); | |||
3062 | } | |||
3063 | ||||
3064 | // vgpr, sgpr -> vgpr | |||
3065 | // vgpr, agpr -> vgpr | |||
3066 | // agpr, agpr -> agpr | |||
3067 | // agpr, sgpr -> vgpr | |||
3068 | static unsigned regBankUnion(unsigned RB0, unsigned RB1) { | |||
3069 | if (RB0 == AMDGPU::InvalidRegBankID) | |||
3070 | return RB1; | |||
3071 | if (RB1 == AMDGPU::InvalidRegBankID) | |||
3072 | return RB0; | |||
3073 | ||||
3074 | if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) | |||
3075 | return AMDGPU::SGPRRegBankID; | |||
3076 | ||||
3077 | if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) | |||
3078 | return AMDGPU::AGPRRegBankID; | |||
3079 | ||||
3080 | return AMDGPU::VGPRRegBankID; | |||
3081 | } | |||
3082 | ||||
3083 | static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { | |||
3084 | if (RB0 == AMDGPU::InvalidRegBankID) | |||
3085 | return RB1; | |||
3086 | if (RB1 == AMDGPU::InvalidRegBankID) | |||
3087 | return RB0; | |||
3088 | ||||
3089 | // vcc, vcc -> vcc | |||
3090 | // vcc, sgpr -> vcc | |||
3091 | // vcc, vgpr -> vcc | |||
3092 | if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) | |||
3093 | return AMDGPU::VCCRegBankID; | |||
3094 | ||||
3095 | // vcc, vgpr -> vgpr | |||
3096 | return regBankUnion(RB0, RB1); | |||
3097 | } | |||
3098 | ||||
3099 | unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, | |||
3100 | const MachineInstr &MI) const { | |||
3101 | unsigned RegBank = AMDGPU::InvalidRegBankID; | |||
3102 | ||||
3103 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { | |||
3104 | if (!MI.getOperand(i).isReg()) | |||
3105 | continue; | |||
3106 | Register Reg = MI.getOperand(i).getReg(); | |||
3107 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { | |||
3108 | RegBank = regBankUnion(RegBank, Bank->getID()); | |||
3109 | if (RegBank == AMDGPU::VGPRRegBankID) | |||
3110 | break; | |||
3111 | } | |||
3112 | } | |||
3113 | ||||
3114 | return RegBank; | |||
3115 | } | |||
3116 | ||||
3117 | bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { | |||
3118 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
3119 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
3120 | for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { | |||
3121 | if (!MI.getOperand(i).isReg()) | |||
3122 | continue; | |||
3123 | Register Reg = MI.getOperand(i).getReg(); | |||
3124 | if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { | |||
3125 | if (Bank->getID() != AMDGPU::SGPRRegBankID) | |||
3126 | return false; | |||
3127 | } | |||
3128 | } | |||
3129 | return true; | |||
3130 | } | |||
3131 | ||||
3132 | const RegisterBankInfo::InstructionMapping & | |||
3133 | AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { | |||
3134 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
3135 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
3136 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); | |||
3137 | ||||
3138 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { | |||
3139 | const MachineOperand &SrcOp = MI.getOperand(i); | |||
3140 | if (!SrcOp.isReg()) | |||
3141 | continue; | |||
3142 | ||||
3143 | unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); | |||
3144 | OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); | |||
3145 | } | |||
3146 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), | |||
3147 | MI.getNumOperands()); | |||
3148 | } | |||
3149 | ||||
3150 | const RegisterBankInfo::InstructionMapping & | |||
3151 | AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { | |||
3152 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
3153 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
3154 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); | |||
3155 | ||||
3156 | // Even though we technically could use SGPRs, this would require knowledge of | |||
3157 | // the constant bus restriction. Force all sources to VGPR (except for VCC). | |||
3158 | // | |||
3159 | // TODO: Unary ops are trivially OK, so accept SGPRs? | |||
3160 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { | |||
3161 | const MachineOperand &Src = MI.getOperand(i); | |||
3162 | if (!Src.isReg()) | |||
3163 | continue; | |||
3164 | ||||
3165 | unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); | |||
3166 | unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; | |||
3167 | OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); | |||
3168 | } | |||
3169 | ||||
3170 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), | |||
3171 | MI.getNumOperands()); | |||
3172 | } | |||
3173 | ||||
3174 | const RegisterBankInfo::InstructionMapping & | |||
3175 | AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { | |||
3176 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
3177 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
3178 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); | |||
3179 | ||||
3180 | for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { | |||
3181 | const MachineOperand &Op = MI.getOperand(I); | |||
3182 | if (!Op.isReg()) | |||
3183 | continue; | |||
3184 | ||||
3185 | unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); | |||
3186 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); | |||
3187 | } | |||
3188 | ||||
3189 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), | |||
3190 | MI.getNumOperands()); | |||
3191 | } | |||
3192 | ||||
3193 | const RegisterBankInfo::InstructionMapping & | |||
3194 | AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, | |||
3195 | const MachineInstr &MI, | |||
3196 | int RsrcIdx) const { | |||
3197 | // The reported argument index is relative to the IR intrinsic call arguments, | |||
3198 | // so we need to shift by the number of defs and the intrinsic ID. | |||
3199 | RsrcIdx += MI.getNumExplicitDefs() + 1; | |||
3200 | ||||
3201 | const int NumOps = MI.getNumOperands(); | |||
3202 | SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); | |||
3203 | ||||
3204 | // TODO: Should packed/unpacked D16 difference be reported here as part of | |||
3205 | // the value mapping? | |||
3206 | for (int I = 0; I != NumOps; ++I) { | |||
3207 | if (!MI.getOperand(I).isReg()) | |||
3208 | continue; | |||
3209 | ||||
3210 | Register OpReg = MI.getOperand(I).getReg(); | |||
3211 | // We replace some dead address operands with $noreg | |||
3212 | if (!OpReg) | |||
3213 | continue; | |||
3214 | ||||
3215 | unsigned Size = getSizeInBits(OpReg, MRI, *TRI); | |||
3216 | ||||
3217 | // FIXME: Probably need a new intrinsic register bank searchable table to | |||
3218 | // handle arbitrary intrinsics easily. | |||
3219 | // | |||
3220 | // If this has a sampler, it immediately follows rsrc. | |||
3221 | const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; | |||
3222 | ||||
3223 | if (MustBeSGPR) { | |||
3224 | // If this must be an SGPR, so we must report whatever it is as legal. | |||
3225 | unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID); | |||
3226 | OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); | |||
3227 | } else { | |||
3228 | // Some operands must be VGPR, and these are easy to copy to. | |||
3229 | OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); | |||
3230 | } | |||
3231 | } | |||
3232 | ||||
3233 | return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); | |||
3234 | } | |||
3235 | ||||
3236 | /// Return the mapping for a pointer arugment. | |||
3237 | const RegisterBankInfo::ValueMapping * | |||
3238 | AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, | |||
3239 | Register PtrReg) const { | |||
3240 | LLT PtrTy = MRI.getType(PtrReg); | |||
3241 | unsigned Size = PtrTy.getSizeInBits(); | |||
3242 | if (Subtarget.useFlatForGlobal() || | |||
3243 | !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) | |||
3244 | return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); | |||
3245 | ||||
3246 | // If we're using MUBUF instructions for global memory, an SGPR base register | |||
3247 | // is possible. Otherwise this needs to be a VGPR. | |||
3248 | const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); | |||
3249 | return AMDGPU::getValueMapping(PtrBank->getID(), Size); | |||
3250 | } | |||
3251 | ||||
/// Compute the mapping for a G_LOAD: operand 0 is the loaded value, operand 1
/// is the pointer. A uniform (SGPR) pointer into a flat/global address space
/// may use a scalar (SMRD) load when legal; otherwise the value is a VGPR and
/// the pointer bank depends on whether MUBUF addressing is available.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AS = PtrTy.getAddressSpace();
  unsigned PtrSize = PtrTy.getSizeInBits();

  const ValueMapping *ValMapping;
  const ValueMapping *PtrMapping;

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
    if (isScalarLoadLegal(MI)) {
      // We have a uniform instruction so we want to use an SMRD load
      ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
    } else {
      // Uniform pointer, but the load itself can't be scalar (e.g. wrong
      // size/alignment or unsupported memory), so the result is a VGPR.
      ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);

      // If we're using MUBUF instructions for global memory, an SGPR base
      // register is possible. Otherwise this needs to be a VGPR.
      unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
        AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;

      PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
    }
  } else {
    // Divergent (or unassigned) pointer: both the value and pointer are VGPRs.
    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  OpdsMapping[0] = ValMapping;
  OpdsMapping[1] = PtrMapping;
  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
      1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
  return Mapping;

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
}
3298 | ||||
3299 | unsigned | |||
3300 | AMDGPURegisterBankInfo::getRegBankID(Register Reg, | |||
3301 | const MachineRegisterInfo &MRI, | |||
3302 | unsigned Default) const { | |||
3303 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); | |||
3304 | return Bank ? Bank->getID() : Default; | |||
3305 | } | |||
3306 | ||||
3307 | const RegisterBankInfo::ValueMapping * | |||
3308 | AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, | |||
3309 | const MachineRegisterInfo &MRI, | |||
3310 | const TargetRegisterInfo &TRI) const { | |||
3311 | // Lie and claim anything is legal, even though this needs to be an SGPR | |||
3312 | // applyMapping will have to deal with it as a waterfall loop. | |||
3313 | unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); | |||
3314 | unsigned Size = getSizeInBits(Reg, MRI, TRI); | |||
3315 | return AMDGPU::getValueMapping(Bank, Size); | |||
3316 | } | |||
3317 | ||||
3318 | const RegisterBankInfo::ValueMapping * | |||
3319 | AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, | |||
3320 | const MachineRegisterInfo &MRI, | |||
3321 | const TargetRegisterInfo &TRI) const { | |||
3322 | unsigned Size = getSizeInBits(Reg, MRI, TRI); | |||
3323 | return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); | |||
3324 | } | |||
3325 | ||||
3326 | const RegisterBankInfo::ValueMapping * | |||
3327 | AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, | |||
3328 | const MachineRegisterInfo &MRI, | |||
3329 | const TargetRegisterInfo &TRI) const { | |||
3330 | unsigned Size = getSizeInBits(Reg, MRI, TRI); | |||
3331 | return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); | |||
3332 | } | |||
3333 | ||||
3334 | /// | |||
3335 | /// This function must return a legal mapping, because | |||
3336 | /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called | |||
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR-to-SGPR copy to be generated is illegal.
3339 | /// | |||
3340 | // Operands that must be SGPRs must accept potentially divergent VGPRs as | |||
3341 | // legal. These will be dealt with in applyMappingImpl. | |||
3342 | // | |||
3343 | const RegisterBankInfo::InstructionMapping & | |||
3344 | AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { | |||
3345 | const MachineFunction &MF = *MI.getParent()->getParent(); | |||
3346 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
3347 | ||||
3348 | if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { | |||
3349 | // The default logic bothers to analyze impossible alternative mappings. We | |||
3350 | // want the most straightforward mapping, so just directly handle this. | |||
3351 | const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI, | |||
3352 | *TRI); | |||
3353 | const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI, | |||
3354 | *TRI); | |||
3355 | assert(SrcBank && "src bank should have been assigned already")(static_cast <bool> (SrcBank && "src bank should have been assigned already" ) ? void (0) : __assert_fail ("SrcBank && \"src bank should have been assigned already\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3355, __extension__ __PRETTY_FUNCTION__)); | |||
3356 | if (!DstBank) | |||
3357 | DstBank = SrcBank; | |||
3358 | ||||
3359 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); | |||
3360 | if (cannotCopy(*DstBank, *SrcBank, Size)) | |||
3361 | return getInvalidInstructionMapping(); | |||
3362 | ||||
3363 | const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); | |||
3364 | unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; | |||
3365 | SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); | |||
3366 | OpdsMapping[0] = &ValMap; | |||
3367 | if (MI.getOpcode() == AMDGPU::G_FREEZE) | |||
3368 | OpdsMapping[1] = &ValMap; | |||
3369 | ||||
3370 | return getInstructionMapping( | |||
3371 | 1, /*Cost*/ 1, | |||
3372 | /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize); | |||
3373 | } | |||
3374 | ||||
3375 | if (MI.isRegSequence()) { | |||
3376 | // If any input is a VGPR, the result must be a VGPR. The default handling | |||
3377 | // assumes any copy between banks is legal. | |||
3378 | unsigned BankID = AMDGPU::SGPRRegBankID; | |||
3379 | ||||
3380 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
3381 | auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI); | |||
3382 | // It doesn't make sense to use vcc or scc banks here, so just ignore | |||
3383 | // them. | |||
3384 | if (OpBank != AMDGPU::SGPRRegBankID) { | |||
3385 | BankID = AMDGPU::VGPRRegBankID; | |||
3386 | break; | |||
3387 | } | |||
3388 | } | |||
3389 | unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); | |||
3390 | ||||
3391 | const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); | |||
3392 | return getInstructionMapping( | |||
3393 | 1, /*Cost*/ 1, | |||
3394 | /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); | |||
3395 | } | |||
3396 | ||||
3397 | // The default handling is broken and doesn't handle illegal SGPR->VGPR copies | |||
3398 | // properly. | |||
3399 | // | |||
3400 | // TODO: There are additional exec masking dependencies to analyze. | |||
3401 | if (MI.getOpcode() == TargetOpcode::G_PHI) { | |||
3402 | unsigned ResultBank = AMDGPU::InvalidRegBankID; | |||
3403 | Register DstReg = MI.getOperand(0).getReg(); | |||
3404 | ||||
3405 | // Sometimes the result may have already been assigned a bank. | |||
3406 | if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) | |||
3407 | ResultBank = DstBank->getID(); | |||
3408 | ||||
3409 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { | |||
3410 | Register Reg = MI.getOperand(I).getReg(); | |||
3411 | const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); | |||
3412 | ||||
3413 | // FIXME: Assuming VGPR for any undetermined inputs. | |||
3414 | if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { | |||
3415 | ResultBank = AMDGPU::VGPRRegBankID; | |||
3416 | break; | |||
3417 | } | |||
3418 | ||||
3419 | // FIXME: Need to promote SGPR case to s32 | |||
3420 | unsigned OpBank = Bank->getID(); | |||
3421 | ResultBank = regBankBoolUnion(ResultBank, OpBank); | |||
3422 | } | |||
3423 | ||||
3424 | assert(ResultBank != AMDGPU::InvalidRegBankID)(static_cast <bool> (ResultBank != AMDGPU::InvalidRegBankID ) ? void (0) : __assert_fail ("ResultBank != AMDGPU::InvalidRegBankID" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3424, __extension__ __PRETTY_FUNCTION__)); | |||
3425 | ||||
3426 | unsigned Size = MRI.getType(DstReg).getSizeInBits(); | |||
3427 | ||||
3428 | const ValueMapping &ValMap = | |||
3429 | getValueMapping(0, Size, getRegBank(ResultBank)); | |||
3430 | return getInstructionMapping( | |||
3431 | 1, /*Cost*/ 1, | |||
3432 | /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); | |||
3433 | } | |||
3434 | ||||
3435 | const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); | |||
3436 | if (Mapping.isValid()) | |||
3437 | return Mapping; | |||
3438 | ||||
3439 | SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); | |||
3440 | ||||
3441 | switch (MI.getOpcode()) { | |||
3442 | default: | |||
3443 | return getInvalidInstructionMapping(); | |||
3444 | ||||
3445 | case AMDGPU::G_AND: | |||
3446 | case AMDGPU::G_OR: | |||
3447 | case AMDGPU::G_XOR: { | |||
3448 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
3449 | if (Size == 1) { | |||
3450 | const RegisterBank *DstBank | |||
3451 | = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); | |||
3452 | ||||
3453 | unsigned TargetBankID = AMDGPU::InvalidRegBankID; | |||
3454 | unsigned BankLHS = AMDGPU::InvalidRegBankID; | |||
3455 | unsigned BankRHS = AMDGPU::InvalidRegBankID; | |||
3456 | if (DstBank) { | |||
3457 | TargetBankID = DstBank->getID(); | |||
3458 | if (DstBank == &AMDGPU::VCCRegBank) { | |||
3459 | TargetBankID = AMDGPU::VCCRegBankID; | |||
3460 | BankLHS = AMDGPU::VCCRegBankID; | |||
3461 | BankRHS = AMDGPU::VCCRegBankID; | |||
3462 | } else { | |||
3463 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, | |||
3464 | AMDGPU::SGPRRegBankID); | |||
3465 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, | |||
3466 | AMDGPU::SGPRRegBankID); | |||
3467 | } | |||
3468 | } else { | |||
3469 | BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, | |||
3470 | AMDGPU::VCCRegBankID); | |||
3471 | BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, | |||
3472 | AMDGPU::VCCRegBankID); | |||
3473 | ||||
3474 | // Both inputs should be true booleans to produce a boolean result. | |||
3475 | if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { | |||
3476 | TargetBankID = AMDGPU::VGPRRegBankID; | |||
3477 | } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { | |||
3478 | TargetBankID = AMDGPU::VCCRegBankID; | |||
3479 | BankLHS = AMDGPU::VCCRegBankID; | |||
3480 | BankRHS = AMDGPU::VCCRegBankID; | |||
3481 | } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { | |||
3482 | TargetBankID = AMDGPU::SGPRRegBankID; | |||
3483 | } | |||
3484 | } | |||
3485 | ||||
3486 | OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); | |||
3487 | OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); | |||
3488 | OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); | |||
3489 | break; | |||
3490 | } | |||
3491 | ||||
3492 | if (Size == 64) { | |||
3493 | ||||
3494 | if (isSALUMapping(MI)) { | |||
3495 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); | |||
3496 | OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; | |||
3497 | } else { | |||
3498 | OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); | |||
3499 | unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); | |||
3500 | OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); | |||
3501 | ||||
3502 | unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); | |||
3503 | OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); | |||
3504 | } | |||
3505 | ||||
3506 | break; | |||
3507 | } | |||
3508 | ||||
3509 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | |||
3510 | } | |||
3511 | case AMDGPU::G_PTR_ADD: | |||
3512 | case AMDGPU::G_PTRMASK: | |||
3513 | case AMDGPU::G_ADD: | |||
3514 | case AMDGPU::G_SUB: | |||
3515 | case AMDGPU::G_MUL: | |||
3516 | case AMDGPU::G_SHL: | |||
3517 | case AMDGPU::G_LSHR: | |||
3518 | case AMDGPU::G_ASHR: | |||
3519 | case AMDGPU::G_UADDO: | |||
3520 | case AMDGPU::G_USUBO: | |||
3521 | case AMDGPU::G_UADDE: | |||
3522 | case AMDGPU::G_SADDE: | |||
3523 | case AMDGPU::G_USUBE: | |||
3524 | case AMDGPU::G_SSUBE: | |||
3525 | case AMDGPU::G_SMIN: | |||
3526 | case AMDGPU::G_SMAX: | |||
3527 | case AMDGPU::G_UMIN: | |||
3528 | case AMDGPU::G_UMAX: | |||
3529 | case AMDGPU::G_ABS: | |||
3530 | case AMDGPU::G_SHUFFLE_VECTOR: | |||
3531 | if (isSALUMapping(MI)) | |||
3532 | return getDefaultMappingSOP(MI); | |||
3533 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | |||
3534 | ||||
3535 | case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU | |||
3536 | case AMDGPU::G_SSUBSAT: | |||
3537 | case AMDGPU::G_UADDSAT: | |||
3538 | case AMDGPU::G_USUBSAT: | |||
3539 | case AMDGPU::G_FADD: | |||
3540 | case AMDGPU::G_FSUB: | |||
3541 | case AMDGPU::G_FPTOSI: | |||
3542 | case AMDGPU::G_FPTOUI: | |||
3543 | case AMDGPU::G_FMUL: | |||
3544 | case AMDGPU::G_FMA: | |||
3545 | case AMDGPU::G_FMAD: | |||
3546 | case AMDGPU::G_FSQRT: | |||
3547 | case AMDGPU::G_FFLOOR: | |||
3548 | case AMDGPU::G_FCEIL: | |||
3549 | case AMDGPU::G_FRINT: | |||
3550 | case AMDGPU::G_SITOFP: | |||
3551 | case AMDGPU::G_UITOFP: | |||
3552 | case AMDGPU::G_FPTRUNC: | |||
3553 | case AMDGPU::G_FPEXT: | |||
3554 | case AMDGPU::G_FEXP2: | |||
3555 | case AMDGPU::G_FLOG2: | |||
3556 | case AMDGPU::G_FMINNUM: | |||
3557 | case AMDGPU::G_FMAXNUM: | |||
3558 | case AMDGPU::G_FMINNUM_IEEE: | |||
3559 | case AMDGPU::G_FMAXNUM_IEEE: | |||
3560 | case AMDGPU::G_FCANONICALIZE: | |||
3561 | case AMDGPU::G_INTRINSIC_TRUNC: | |||
3562 | case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? | |||
3563 | case AMDGPU::G_FSHR: // TODO: Expand for scalar | |||
3564 | case AMDGPU::G_AMDGPU_FFBH_U32: | |||
3565 | case AMDGPU::G_AMDGPU_FMIN_LEGACY: | |||
3566 | case AMDGPU::G_AMDGPU_FMAX_LEGACY: | |||
3567 | case AMDGPU::G_AMDGPU_RCP_IFLAG: | |||
3568 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: | |||
3569 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: | |||
3570 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: | |||
3571 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: | |||
3572 | case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: | |||
3573 | case AMDGPU::G_AMDGPU_SMED3: | |||
3574 | return getDefaultMappingVOP(MI); | |||
3575 | case AMDGPU::G_UMULH: | |||
3576 | case AMDGPU::G_SMULH: { | |||
3577 | if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) | |||
3578 | return getDefaultMappingSOP(MI); | |||
3579 | return getDefaultMappingVOP(MI); | |||
3580 | } | |||
3581 | case AMDGPU::G_IMPLICIT_DEF: { | |||
3582 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
3583 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); | |||
3584 | break; | |||
3585 | } | |||
3586 | case AMDGPU::G_FCONSTANT: | |||
3587 | case AMDGPU::G_CONSTANT: | |||
3588 | case AMDGPU::G_GLOBAL_VALUE: | |||
3589 | case AMDGPU::G_BLOCK_ADDR: | |||
3590 | case AMDGPU::G_READCYCLECOUNTER: { | |||
3591 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
3592 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); | |||
3593 | break; | |||
3594 | } | |||
3595 | case AMDGPU::G_FRAME_INDEX: { | |||
3596 | // TODO: This should be the same as other constants, but eliminateFrameIndex | |||
3597 | // currently assumes VALU uses. | |||
3598 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
3599 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); | |||
3600 | break; | |||
3601 | } | |||
3602 | case AMDGPU::G_DYN_STACKALLOC: { | |||
3603 | // Result is always uniform, and a wave reduction is needed for the source. | |||
3604 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); | |||
3605 | unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); | |||
3606 | OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); | |||
3607 | break; | |||
3608 | } | |||
3609 | case AMDGPU::G_INSERT: { | |||
3610 | unsigned BankID = getMappingType(MRI, MI); | |||
3611 | unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); | |||
3612 | unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); | |||
3613 | unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); | |||
3614 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); | |||
3615 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); | |||
3616 | OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); | |||
3617 | OpdsMapping[3] = nullptr; | |||
3618 | break; | |||
3619 | } | |||
3620 | case AMDGPU::G_EXTRACT: { | |||
3621 | unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); | |||
3622 | unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); | |||
3623 | unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); | |||
3624 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); | |||
3625 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); | |||
3626 | OpdsMapping[2] = nullptr; | |||
3627 | break; | |||
3628 | } | |||
3629 | case AMDGPU::G_BUILD_VECTOR: | |||
3630 | case AMDGPU::G_BUILD_VECTOR_TRUNC: { | |||
3631 | LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); | |||
3632 | if (DstTy == LLT::vector(2, 16)) { | |||
3633 | unsigned DstSize = DstTy.getSizeInBits(); | |||
3634 | unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); | |||
3635 | unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); | |||
3636 | unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); | |||
3637 | unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); | |||
3638 | ||||
3639 | OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); | |||
3640 | OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); | |||
3641 | OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); | |||
3642 | break; | |||
3643 | } | |||
3644 | ||||
3645 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | |||
3646 | } | |||
3647 | case AMDGPU::G_MERGE_VALUES: | |||
3648 | case AMDGPU::G_CONCAT_VECTORS: { | |||
3649 | unsigned Bank = getMappingType(MRI, MI); | |||
3650 | unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
3651 | unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); | |||
3652 | ||||
3653 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); | |||
3654 | // Op1 and Dst should use the same register bank. | |||
3655 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) | |||
3656 | OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); | |||
3657 | break; | |||
3658 | } | |||
3659 | case AMDGPU::G_BITREVERSE: | |||
3660 | case AMDGPU::G_BITCAST: | |||
3661 | case AMDGPU::G_INTTOPTR: | |||
3662 | case AMDGPU::G_PTRTOINT: | |||
3663 | case AMDGPU::G_FABS: | |||
3664 | case AMDGPU::G_FNEG: { | |||
3665 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
3666 | unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); | |||
3667 | OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); | |||
3668 | break; | |||
3669 | } | |||
3670 | case AMDGPU::G_CTLZ_ZERO_UNDEF: | |||
3671 | case AMDGPU::G_CTTZ_ZERO_UNDEF: | |||
3672 | case AMDGPU::G_CTPOP: { | |||
3673 | unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); | |||
3674 | unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); | |||
3675 | OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); | |||
3676 | ||||
3677 | // This should really be getValueMappingSGPR64Only, but allowing the generic | |||
3678 | // code to handle the register split just makes using LegalizerHelper more | |||
3679 | // difficult. | |||
3680 | OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); | |||
3681 | break; | |||
3682 | } | |||
3683 | case AMDGPU::G_TRUNC: { | |||
3684 | Register Dst = MI.getOperand(0).getReg(); | |||
3685 | Register Src = MI.getOperand(1).getReg(); | |||
3686 | unsigned Bank = getRegBankID(Src, MRI); | |||
3687 | unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); | |||
3688 | unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); | |||
3689 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); | |||
3690 | OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); | |||
3691 | break; | |||
3692 | } | |||
3693 | case AMDGPU::G_ZEXT: | |||
3694 | case AMDGPU::G_SEXT: | |||
3695 | case AMDGPU::G_ANYEXT: | |||
3696 | case AMDGPU::G_SEXT_INREG: { | |||
3697 | Register Dst = MI.getOperand(0).getReg(); | |||
3698 | Register Src = MI.getOperand(1).getReg(); | |||
3699 | unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); | |||
3700 | unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); | |||
3701 | ||||
3702 | unsigned DstBank; | |||
3703 | const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); | |||
3704 | assert(SrcBank)(static_cast <bool> (SrcBank) ? void (0) : __assert_fail ("SrcBank", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3704, __extension__ __PRETTY_FUNCTION__)); | |||
3705 | switch (SrcBank->getID()) { | |||
3706 | case AMDGPU::SGPRRegBankID: | |||
3707 | DstBank = AMDGPU::SGPRRegBankID; | |||
3708 | break; | |||
3709 | default: | |||
3710 | DstBank = AMDGPU::VGPRRegBankID; | |||
3711 | break; | |||
3712 | } | |||
3713 | ||||
3714 | // Scalar extend can use 64-bit BFE, but VGPRs require extending to | |||
3715 | // 32-bits, and then to 64. | |||
3716 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); | |||
3717 | OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), | |||
3718 | SrcSize); | |||
3719 | break; | |||
3720 | } | |||
3721 | case AMDGPU::G_FCMP: { | |||
3722 | unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); | |||
3723 | unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); | |||
3724 | OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); | |||
3725 | OpdsMapping[1] = nullptr; // Predicate Operand. | |||
3726 | OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); | |||
3727 | OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); | |||
3728 | break; | |||
3729 | } | |||
3730 | case AMDGPU::G_STORE: { | |||
3731 | assert(MI.getOperand(0).isReg())(static_cast <bool> (MI.getOperand(0).isReg()) ? void ( 0) : __assert_fail ("MI.getOperand(0).isReg()", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3731, __extension__ __PRETTY_FUNCTION__)); | |||
3732 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
3733 | ||||
3734 | // FIXME: We need to specify a different reg bank once scalar stores are | |||
3735 | // supported. | |||
3736 | const ValueMapping *ValMapping = | |||
3737 | AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); | |||
3738 | OpdsMapping[0] = ValMapping; | |||
3739 | OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); | |||
3740 | break; | |||
3741 | } | |||
3742 | case AMDGPU::G_ICMP: { | |||
3743 | auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); | |||
3744 | unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); | |||
3745 | ||||
3746 | // See if the result register has already been constrained to vcc, which may | |||
3747 | // happen due to control flow intrinsic lowering. | |||
3748 | unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, | |||
3749 | AMDGPU::SGPRRegBankID); | |||
3750 | unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); | |||
3751 | unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); | |||
3752 | ||||
3753 | bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && | |||
3754 | Op2Bank == AMDGPU::SGPRRegBankID && | |||
3755 | Op3Bank == AMDGPU::SGPRRegBankID && | |||
3756 | (Size == 32 || (Size == 64 && | |||
3757 | (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && | |||
3758 | Subtarget.hasScalarCompareEq64())); | |||
3759 | ||||
3760 | DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; | |||
3761 | unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; | |||
3762 | ||||
3763 | // TODO: Use 32-bit for scalar output size. | |||
3764 | // SCC results will need to be copied to a 32-bit SGPR virtual register. | |||
3765 | const unsigned ResultSize = 1; | |||
3766 | ||||
3767 | OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); | |||
3768 | OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); | |||
3769 | OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); | |||
3770 | break; | |||
3771 | } | |||
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    // VGPR index can be used for waterfall when indexing a SGPR vector.
    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
    unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
    unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
    // The result is divergent if either the source vector or the index is.
    unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);

    OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);

    // The index can be either if the source vector is VGPR.
    OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
    break;
  }
3788 | case AMDGPU::G_INSERT_VECTOR_ELT: { | |||
3789 | unsigned OutputBankID = isSALUMapping(MI) ? | |||
3790 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; | |||
3791 | ||||
3792 | unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
3793 | unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); | |||
3794 | unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); | |||
3795 | unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI); | |||
3796 | unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI); | |||
3797 | ||||
3798 | OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); | |||
3799 | OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); | |||
3800 | ||||
3801 | // This is a weird case, because we need to break down the mapping based on | |||
3802 | // the register bank of a different operand. | |||
3803 | if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { | |||
3804 | OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, | |||
3805 | InsertSize); | |||
3806 | } else { | |||
3807 | assert(InsertSize == 32 || InsertSize == 64)(static_cast <bool> (InsertSize == 32 || InsertSize == 64 ) ? void (0) : __assert_fail ("InsertSize == 32 || InsertSize == 64" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 3807, __extension__ __PRETTY_FUNCTION__)); | |||
3808 | OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); | |||
3809 | } | |||
3810 | ||||
3811 | // The index can be either if the source vector is VGPR. | |||
3812 | OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); | |||
3813 | break; | |||
3814 | } | |||
  case AMDGPU::G_UNMERGE_VALUES: {
    // All destinations and the single source share one bank, chosen by
    // getMappingType.
    unsigned Bank = getMappingType(MRI, MI);

    // Op1 and Dst should use the same register bank.
    // FIXME: Shouldn't this be the default? Why do we need to handle this?
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
      OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
    // All buffer loads/stores share the operand layout:
    // vdata, rsrc (SGPR), vindex (VGPR), voffset (VGPR), soffset (SGPR).
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    // Like the buffer loads/stores above, but with an extra vdata_in operand.
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    // Same as the other buffer atomics, plus a compare value operand.
    // vdata_out
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

    // vdata_in
    OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // cmp
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // rsrc
    OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);

    // vindex
    OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);

    // voffset
    OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);

    // soffset
    OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);

    // Any remaining operands are immediates and were correctly null
    // initialized.
    break;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
    OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

    // We need to convert this to a MUBUF if either the resource or offset is
    // VGPR, so the result bank is the union of the two operand banks.
    unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
    unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
    unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);

    unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
    break;
  }
  case AMDGPU::G_INTRINSIC: {
    // Side-effect-free intrinsics; dispatch on the intrinsic ID.
    switch (MI.getIntrinsicID()) {
    default:
      return getInvalidInstructionMapping();
    // Plain VALU math intrinsics: every register operand is a VGPR.
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fma_legacy:
    case Intrinsic::amdgcn_ldexp:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_frexp_exp:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_sffbh:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi:
    case Intrinsic::amdgcn_mul_u24:
    case Intrinsic::amdgcn_mul_i24:
    case Intrinsic::amdgcn_lerp:
    case Intrinsic::amdgcn_sad_u8:
    case Intrinsic::amdgcn_msad_u8:
    case Intrinsic::amdgcn_sad_hi_u8:
    case Intrinsic::amdgcn_sad_u16:
    case Intrinsic::amdgcn_qsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_pk_u16_u8:
    case Intrinsic::amdgcn_mqsad_u32_u8:
    case Intrinsic::amdgcn_cvt_pk_u8_f32:
    case Intrinsic::amdgcn_alignbit:
    case Intrinsic::amdgcn_alignbyte:
    case Intrinsic::amdgcn_perm:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_sdot2:
    case Intrinsic::amdgcn_udot2:
    case Intrinsic::amdgcn_sdot4:
    case Intrinsic::amdgcn_udot4:
    case Intrinsic::amdgcn_sdot8:
    case Intrinsic::amdgcn_udot8:
      return getDefaultMappingVOP(MI);
    // Bitfield extracts have a scalar form, so keep them scalar when all
    // operands are already SGPR.
    case Intrinsic::amdgcn_sbfe:
    case Intrinsic::amdgcn_ubfe:
      if (isSALUMapping(MI))
        return getDefaultMappingSOP(MI);
      return getDefaultMappingVOP(MI);
    // Lane-swizzle / whole-wave-mode intrinsics: all operands forced to VGPR.
    case Intrinsic::amdgcn_ds_swizzle:
    case Intrinsic::amdgcn_ds_permute:
    case Intrinsic::amdgcn_ds_bpermute:
    case Intrinsic::amdgcn_update_dpp:
    case Intrinsic::amdgcn_mov_dpp8:
    case Intrinsic::amdgcn_mov_dpp:
    case Intrinsic::amdgcn_strict_wwm:
    case Intrinsic::amdgcn_wwm:
    case Intrinsic::amdgcn_strict_wqm:
    case Intrinsic::amdgcn_wqm:
    case Intrinsic::amdgcn_softwqm:
    case Intrinsic::amdgcn_set_inactive:
      return getDefaultMappingAllVGPR(MI);
    // Uniform queries: only the result needs a mapping, always SGPR.
    case Intrinsic::amdgcn_kernarg_segment_ptr:
    case Intrinsic::amdgcn_s_getpc:
    case Intrinsic::amdgcn_groupstaticsize:
    case Intrinsic::amdgcn_reloc_constant:
    case Intrinsic::returnaddress: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_wqm_vote: {
      // Boolean in, boolean out; both live in VCC.
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = OpdsMapping[2]
        = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ps_live: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_div_scale: {
      // Two results: the scaled value (VGPR) and a condition flag (VCC).
      unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);

      unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_class: {
      // Float-class test: boolean result in VCC, sources in VGPRs.
      Register Src0Reg = MI.getOperand(2).getReg();
      Register Src1Reg = MI.getOperand(3).getReg();
      unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
      unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
      break;
    }
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      // This is not VCCRegBank because this is not used in boolean contexts.
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
      break;
    }
4066 | case Intrinsic::amdgcn_readlane: { | |||
4067 | // This must be an SGPR, but accept a VGPR. | |||
4068 | Register IdxReg = MI.getOperand(3).getReg(); | |||
4069 | unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); | |||
4070 | unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); | |||
4071 | OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); | |||
4072 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | |||
4073 | } | |||
    case Intrinsic::amdgcn_readfirstlane: {
      // Reads a VGPR lane into a uniform SGPR result.
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_writelane: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      Register SrcReg = MI.getOperand(2).getReg();
      unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
      unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
      Register IdxReg = MI.getOperand(3).getReg();
      unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
      unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

      // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
      // to legalize.
      OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
      break;
    }
    case Intrinsic::amdgcn_if_break: {
      // Control-flow mask manipulation: boolean condition in VCC, mask
      // operands are wave-sized SGPRs.
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Data operands are VGPRs; the two lane-select operands must be SGPR.
      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      // The trailing operand is read through M0 and must end up in an SGPR.
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      // Boolean input in VCC, wave-mask result in an SGPR.
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
4183 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: | |||
4184 | case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { | |||
4185 | auto IntrID = MI.getIntrinsicID(); | |||
4186 | const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); | |||
4187 | assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic")(static_cast <bool> (RSrcIntrin && "missing RsrcIntrinsic for image intrinsic" ) ? void (0) : __assert_fail ("RSrcIntrin && \"missing RsrcIntrinsic for image intrinsic\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 4187, __extension__ __PRETTY_FUNCTION__)); | |||
4188 | // Non-images can have complications from operands that allow both SGPR | |||
4189 | // and VGPR. For now it's too complicated to figure out the final opcode | |||
4190 | // to derive the register bank from the MCInstrDesc. | |||
4191 | assert(RSrcIntrin->IsImage)(static_cast <bool> (RSrcIntrin->IsImage) ? void (0) : __assert_fail ("RSrcIntrin->IsImage", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 4191, __extension__ __PRETTY_FUNCTION__)); | |||
4192 | return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); | |||
4193 | } | |||
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    // N is the index of the last register operand (the node pointer / SGPR
    // descriptor); the two trailing operands after it are not registers.
    unsigned N = MI.getNumExplicitOperands() - 2;
    // 128-bit VGPR result.
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    // All ray operands between the result and the descriptor are 32-bit VGPRs.
    for (unsigned I = 2; I < N; ++I)
      OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    // Intrinsics with side effects; dispatch on the intrinsic ID.
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    // Uniform scalar reads: result goes in an SGPR.
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    // Global/flat atomics: every register operand is a VGPR.
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      // Operand 2 is read through M0 and should be an SGPR, but accept a VGPR.
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      // Two packed 32-bit export sources, both VGPRs.
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      // Exec-mask operand is a wave-sized SGPR.
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      // Boolean result in VCC; saved/continue masks are wave-sized SGPRs.
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      // Boolean condition lives in VCC.
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      // vdata (VGPR), rsrc (SGPR), voffset (VGPR), soffset (SGPR).
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
      // Like the raw form with an extra vindex (VGPR) operand.
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
4352 | case AMDGPU::G_SELECT: { | |||
4353 | unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); | |||
4354 | unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, | |||
4355 | AMDGPU::SGPRRegBankID); | |||
4356 | unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, | |||
4357 | AMDGPU::SGPRRegBankID); | |||
4358 | bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && | |||
4359 | Op3Bank == AMDGPU::SGPRRegBankID; | |||
4360 | ||||
4361 | unsigned CondBankDefault = SGPRSrcs ? | |||
4362 | AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; | |||
4363 | unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, | |||
4364 | CondBankDefault); | |||
4365 | if (CondBank == AMDGPU::SGPRRegBankID) | |||
4366 | CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; | |||
4367 | else if (CondBank == AMDGPU::VGPRRegBankID) | |||
4368 | CondBank = AMDGPU::VCCRegBankID; | |||
4369 | ||||
4370 | unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? | |||
4371 | AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; | |||
4372 | ||||
4373 | assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID)(static_cast <bool> (CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID) ? void (0) : __assert_fail ("CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 4373, __extension__ __PRETTY_FUNCTION__)); | |||
4374 | ||||
4375 | // TODO: Should report 32-bit for scalar condition type. | |||
4376 | if (Size == 64) { | |||
4377 | OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); | |||
4378 | OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); | |||
4379 | OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); | |||
4380 | OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); | |||
4381 | } else { | |||
4382 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); | |||
4383 | OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); | |||
4384 | OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); | |||
4385 | OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); | |||
4386 | } | |||
4387 | ||||
4388 | break; | |||
4389 | } | |||
4390 | ||||
4391 | case AMDGPU::G_LOAD: | |||
4392 | case AMDGPU::G_ZEXTLOAD: | |||
4393 | case AMDGPU::G_SEXTLOAD: | |||
4394 | return getInstrMappingForLoad(MI); | |||
4395 | ||||
4396 | case AMDGPU::G_ATOMICRMW_XCHG: | |||
4397 | case AMDGPU::G_ATOMICRMW_ADD: | |||
4398 | case AMDGPU::G_ATOMICRMW_SUB: | |||
4399 | case AMDGPU::G_ATOMICRMW_AND: | |||
4400 | case AMDGPU::G_ATOMICRMW_OR: | |||
4401 | case AMDGPU::G_ATOMICRMW_XOR: | |||
4402 | case AMDGPU::G_ATOMICRMW_MAX: | |||
4403 | case AMDGPU::G_ATOMICRMW_MIN: | |||
4404 | case AMDGPU::G_ATOMICRMW_UMAX: | |||
4405 | case AMDGPU::G_ATOMICRMW_UMIN: | |||
4406 | case AMDGPU::G_ATOMICRMW_FADD: | |||
4407 | case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: | |||
4408 | case AMDGPU::G_AMDGPU_ATOMIC_INC: | |||
4409 | case AMDGPU::G_AMDGPU_ATOMIC_DEC: | |||
4410 | case AMDGPU::G_AMDGPU_ATOMIC_FMIN: | |||
4411 | case AMDGPU::G_AMDGPU_ATOMIC_FMAX: { | |||
4412 | OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); | |||
4413 | OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); | |||
4414 | OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); | |||
4415 | break; | |||
4416 | } | |||
4417 | case AMDGPU::G_ATOMIC_CMPXCHG: { | |||
4418 | OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); | |||
4419 | OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); | |||
4420 | OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); | |||
4421 | OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); | |||
4422 | break; | |||
4423 | } | |||
4424 | case AMDGPU::G_BRCOND: { | |||
4425 | unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, | |||
4426 | AMDGPU::SGPRRegBankID); | |||
4427 | assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1)(static_cast <bool> (MRI.getType(MI.getOperand(0).getReg ()).getSizeInBits() == 1) ? void (0) : __assert_fail ("MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp" , 4427, __extension__ __PRETTY_FUNCTION__)); | |||
4428 | if (Bank != AMDGPU::SGPRRegBankID) | |||
4429 | Bank = AMDGPU::VCCRegBankID; | |||
4430 | ||||
4431 | OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); | |||
4432 | break; | |||
4433 | } | |||
4434 | } | |||
4435 | ||||
4436 | return getInstructionMapping(/*ID*/1, /*Cost*/1, | |||
4437 | getOperandsMapping(OpdsMapping), | |||
4438 | MI.getNumOperands()); | |||
4439 | } |
1 | //== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// Implement a low-level type suitable for MachineInstr level instruction |
10 | /// selection. |
11 | /// |
12 | /// For a type attached to a MachineInstr, we only care about 2 details: total |
13 | /// size and the number of vector lanes (if any). Accordingly, there are 4 |
14 | /// possible valid type-kinds: |
15 | /// |
/// * `sN` for scalars and aggregates
/// * `<N x sM>` for vectors, which must have at least 2 elements.
/// * `pN` for pointers
/// * `<N x pM>` for vectors of pointers
19 | /// |
20 | /// Other information required for correct selection is expected to be carried |
21 | /// by the opcode, or non-type flags. For example the distinction between G_ADD |
22 | /// and G_FADD for int/float or fast-math flags. |
23 | /// |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H |
27 | #define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H |
28 | |
29 | #include "llvm/ADT/DenseMapInfo.h" |
30 | #include "llvm/Support/Debug.h" |
31 | #include "llvm/Support/MachineValueType.h" |
32 | #include <cassert> |
33 | |
34 | namespace llvm { |
35 | |
36 | class DataLayout; |
37 | class Type; |
38 | class raw_ostream; |
39 | |
40 | class LLT { |
41 | public: |
42 | /// Get a low-level scalar or aggregate "bag of bits". |
43 | static LLT scalar(unsigned SizeInBits) { |
44 | assert(SizeInBits > 0 && "invalid scalar size")(static_cast <bool> (SizeInBits > 0 && "invalid scalar size" ) ? void (0) : __assert_fail ("SizeInBits > 0 && \"invalid scalar size\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 44, __extension__ __PRETTY_FUNCTION__)); |
45 | return LLT{/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0, |
46 | SizeInBits, /*AddressSpace=*/0}; |
47 | } |
48 | |
49 | /// Get a low-level pointer in the given address space. |
50 | static LLT pointer(unsigned AddressSpace, unsigned SizeInBits) { |
51 | assert(SizeInBits > 0 && "invalid pointer size")(static_cast <bool> (SizeInBits > 0 && "invalid pointer size" ) ? void (0) : __assert_fail ("SizeInBits > 0 && \"invalid pointer size\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 51, __extension__ __PRETTY_FUNCTION__)); |
52 | return LLT{/*isPointer=*/true, /*isVector=*/false, /*NumElements=*/0, |
53 | SizeInBits, AddressSpace}; |
54 | } |
55 | |
56 | /// Get a low-level vector of some number of elements and element width. |
57 | /// \p NumElements must be at least 2. |
58 | static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) { |
59 | assert(NumElements > 1 && "invalid number of vector elements")(static_cast <bool> (NumElements > 1 && "invalid number of vector elements" ) ? void (0) : __assert_fail ("NumElements > 1 && \"invalid number of vector elements\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 59, __extension__ __PRETTY_FUNCTION__)); |
60 | assert(ScalarSizeInBits > 0 && "invalid vector element size")(static_cast <bool> (ScalarSizeInBits > 0 && "invalid vector element size") ? void (0) : __assert_fail ("ScalarSizeInBits > 0 && \"invalid vector element size\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 60, __extension__ __PRETTY_FUNCTION__)); |
61 | return LLT{/*isPointer=*/false, /*isVector=*/true, NumElements, |
62 | ScalarSizeInBits, /*AddressSpace=*/0}; |
63 | } |
64 | |
65 | /// Get a low-level vector of some number of elements and element type. |
66 | static LLT vector(uint16_t NumElements, LLT ScalarTy) { |
67 | assert(NumElements > 1 && "invalid number of vector elements")(static_cast <bool> (NumElements > 1 && "invalid number of vector elements" ) ? void (0) : __assert_fail ("NumElements > 1 && \"invalid number of vector elements\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 67, __extension__ __PRETTY_FUNCTION__)); |
68 | assert(!ScalarTy.isVector() && "invalid vector element type")(static_cast <bool> (!ScalarTy.isVector() && "invalid vector element type" ) ? void (0) : __assert_fail ("!ScalarTy.isVector() && \"invalid vector element type\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 68, __extension__ __PRETTY_FUNCTION__)); |
69 | return LLT{ScalarTy.isPointer(), /*isVector=*/true, NumElements, |
70 | ScalarTy.getSizeInBits(), |
71 | ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0}; |
72 | } |
73 | |
74 | static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy) { |
75 | return NumElements == 1 ? ScalarTy : LLT::vector(NumElements, ScalarTy); |
76 | } |
77 | |
78 | static LLT scalarOrVector(uint16_t NumElements, unsigned ScalarSize) { |
79 | return scalarOrVector(NumElements, LLT::scalar(ScalarSize)); |
80 | } |
81 | |
82 | explicit LLT(bool isPointer, bool isVector, uint16_t NumElements, |
83 | unsigned SizeInBits, unsigned AddressSpace) { |
84 | init(isPointer, isVector, NumElements, SizeInBits, AddressSpace); |
85 | } |
86 | explicit LLT() : IsPointer(false), IsVector(false), RawData(0) {} |
87 | |
88 | explicit LLT(MVT VT); |
89 | |
90 | bool isValid() const { return RawData != 0; } |
91 | |
92 | bool isScalar() const { return isValid() && !IsPointer && !IsVector; } |
93 | |
94 | bool isPointer() const { return isValid() && IsPointer && !IsVector; } |
95 | |
96 | bool isVector() const { return isValid() && IsVector; } |
97 | |
98 | /// Returns the number of elements in a vector LLT. Must only be called on |
99 | /// vector types. |
100 | uint16_t getNumElements() const { |
101 | assert(IsVector && "cannot get number of elements on scalar/aggregate")(static_cast <bool> (IsVector && "cannot get number of elements on scalar/aggregate" ) ? void (0) : __assert_fail ("IsVector && \"cannot get number of elements on scalar/aggregate\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 101, __extension__ __PRETTY_FUNCTION__)); |
102 | if (!IsPointer) |
103 | return getFieldValue(VectorElementsFieldInfo); |
104 | else |
105 | return getFieldValue(PointerVectorElementsFieldInfo); |
106 | } |
107 | |
108 | /// Returns the total size of the type. Must only be called on sized types. |
109 | unsigned getSizeInBits() const { |
110 | if (isPointer() || isScalar()) |
111 | return getScalarSizeInBits(); |
112 | return getScalarSizeInBits() * getNumElements(); |
113 | } |
114 | |
115 | /// Returns the total size of the type in bytes, i.e. number of whole bytes |
116 | /// needed to represent the size in bits. Must only be called on sized types. |
117 | unsigned getSizeInBytes() const { |
118 | return (getSizeInBits() + 7) / 8; |
119 | } |
120 | |
121 | LLT getScalarType() const { |
122 | return isVector() ? getElementType() : *this; |
123 | } |
124 | |
125 | /// If this type is a vector, return a vector with the same number of elements |
126 | /// but the new element type. Otherwise, return the new element type. |
127 | LLT changeElementType(LLT NewEltTy) const { |
128 | return isVector() ? LLT::vector(getNumElements(), NewEltTy) : NewEltTy; |
129 | } |
130 | |
131 | /// If this type is a vector, return a vector with the same number of elements |
132 | /// but the new element size. Otherwise, return the new element type. Invalid |
133 | /// for pointer types. For pointer types, use changeElementType. |
134 | LLT changeElementSize(unsigned NewEltSize) const { |
135 | assert(!getScalarType().isPointer() &&(static_cast <bool> (!getScalarType().isPointer() && "invalid to directly change element size for pointers") ? void (0) : __assert_fail ("!getScalarType().isPointer() && \"invalid to directly change element size for pointers\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 136, __extension__ __PRETTY_FUNCTION__)) |
136 | "invalid to directly change element size for pointers")(static_cast <bool> (!getScalarType().isPointer() && "invalid to directly change element size for pointers") ? void (0) : __assert_fail ("!getScalarType().isPointer() && \"invalid to directly change element size for pointers\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 136, __extension__ __PRETTY_FUNCTION__)); |
137 | return isVector() ? LLT::vector(getNumElements(), NewEltSize) |
138 | : LLT::scalar(NewEltSize); |
139 | } |
140 | |
141 | /// Return a vector or scalar with the same element type and the new number of |
142 | /// elements. |
143 | LLT changeNumElements(unsigned NewNumElts) const { |
144 | return LLT::scalarOrVector(NewNumElts, getScalarType()); |
145 | } |
146 | |
147 | /// Return a type that is \p Factor times smaller. Reduces the number of |
148 | /// elements if this is a vector, or the bitwidth for scalar/pointers. Does |
149 | /// not attempt to handle cases that aren't evenly divisible. |
150 | LLT divide(int Factor) const { |
151 | assert(Factor != 1)(static_cast <bool> (Factor != 1) ? void (0) : __assert_fail ("Factor != 1", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 151, __extension__ __PRETTY_FUNCTION__)); |
152 | if (isVector()) { |
153 | assert(getNumElements() % Factor == 0)(static_cast <bool> (getNumElements() % Factor == 0) ? void (0) : __assert_fail ("getNumElements() % Factor == 0", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 153, __extension__ __PRETTY_FUNCTION__)); |
154 | return scalarOrVector(getNumElements() / Factor, getElementType()); |
155 | } |
156 | |
157 | assert(getSizeInBits() % Factor == 0)(static_cast <bool> (getSizeInBits() % Factor == 0) ? void (0) : __assert_fail ("getSizeInBits() % Factor == 0", "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 157, __extension__ __PRETTY_FUNCTION__)); |
158 | return scalar(getSizeInBits() / Factor); |
159 | } |
160 | |
161 | bool isByteSized() const { return (getSizeInBits() & 7) == 0; } |
162 | |
163 | unsigned getScalarSizeInBits() const { |
164 | assert(RawData != 0 && "Invalid Type")(static_cast <bool> (RawData != 0 && "Invalid Type" ) ? void (0) : __assert_fail ("RawData != 0 && \"Invalid Type\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 164, __extension__ __PRETTY_FUNCTION__)); |
165 | if (!IsVector) { |
166 | if (!IsPointer) |
167 | return getFieldValue(ScalarSizeFieldInfo); |
168 | else |
169 | return getFieldValue(PointerSizeFieldInfo); |
170 | } else { |
171 | if (!IsPointer) |
172 | return getFieldValue(VectorSizeFieldInfo); |
173 | else |
174 | return getFieldValue(PointerVectorSizeFieldInfo); |
175 | } |
176 | } |
177 | |
178 | unsigned getAddressSpace() const { |
179 | assert(RawData != 0 && "Invalid Type")(static_cast <bool> (RawData != 0 && "Invalid Type" ) ? void (0) : __assert_fail ("RawData != 0 && \"Invalid Type\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 179, __extension__ __PRETTY_FUNCTION__)); |
180 | assert(IsPointer && "cannot get address space of non-pointer type")(static_cast <bool> (IsPointer && "cannot get address space of non-pointer type" ) ? void (0) : __assert_fail ("IsPointer && \"cannot get address space of non-pointer type\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 180, __extension__ __PRETTY_FUNCTION__)); |
181 | if (!IsVector) |
182 | return getFieldValue(PointerAddressSpaceFieldInfo); |
183 | else |
184 | return getFieldValue(PointerVectorAddressSpaceFieldInfo); |
185 | } |
186 | |
187 | /// Returns the vector's element type. Only valid for vector types. |
188 | LLT getElementType() const { |
189 | assert(isVector() && "cannot get element type of scalar/aggregate")(static_cast <bool> (isVector() && "cannot get element type of scalar/aggregate" ) ? void (0) : __assert_fail ("isVector() && \"cannot get element type of scalar/aggregate\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 189, __extension__ __PRETTY_FUNCTION__)); |
190 | if (IsPointer) |
191 | return pointer(getAddressSpace(), getScalarSizeInBits()); |
192 | else |
193 | return scalar(getScalarSizeInBits()); |
194 | } |
195 | |
196 | void print(raw_ostream &OS) const; |
197 | |
198 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
199 | LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) void dump() const { |
200 | print(dbgs()); |
201 | dbgs() << '\n'; |
202 | } |
203 | #endif |
204 | |
205 | bool operator==(const LLT &RHS) const { |
206 | return IsPointer == RHS.IsPointer && IsVector == RHS.IsVector && |
207 | RHS.RawData == RawData; |
208 | } |
209 | |
210 | bool operator!=(const LLT &RHS) const { return !(*this == RHS); } |
211 | |
212 | friend struct DenseMapInfo<LLT>; |
213 | friend class GISelInstProfileBuilder; |
214 | |
215 | private: |
216 | /// LLT is packed into 64 bits as follows: |
217 | /// isPointer : 1 |
218 | /// isVector : 1 |
219 | /// with 62 bits remaining for Kind-specific data, packed in bitfields |
220 | /// as described below. As there isn't a simple portable way to pack bits |
221 | /// into bitfields, here the different fields in the packed structure is |
222 | /// described in static const *Field variables. Each of these variables |
223 | /// is a 2-element array, with the first element describing the bitfield size |
224 | /// and the second element describing the bitfield offset. |
225 | typedef int BitFieldInfo[2]; |
226 | /// |
227 | /// This is how the bitfields are packed per Kind: |
228 | /// * Invalid: |
229 | /// gets encoded as RawData == 0, as that is an invalid encoding, since for |
230 | /// valid encodings, SizeInBits/SizeOfElement must be larger than 0. |
231 | /// * Non-pointer scalar (isPointer == 0 && isVector == 0): |
232 | /// SizeInBits: 32; |
233 | static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 0}; |
234 | /// * Pointer (isPointer == 1 && isVector == 0): |
235 | /// SizeInBits: 16; |
236 | /// AddressSpace: 24; |
237 | static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 0}; |
238 | static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{ |
239 | 24, PointerSizeFieldInfo[0] + PointerSizeFieldInfo[1]}; |
240 | /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1): |
241 | /// NumElements: 16; |
242 | /// SizeOfElement: 32; |
243 | static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 0}; |
244 | static const constexpr BitFieldInfo VectorSizeFieldInfo{ |
245 | 32, VectorElementsFieldInfo[0] + VectorElementsFieldInfo[1]}; |
246 | /// * Vector-of-pointer (isPointer == 1 && isVector == 1): |
247 | /// NumElements: 16; |
248 | /// SizeOfElement: 16; |
249 | /// AddressSpace: 24; |
250 | static const constexpr BitFieldInfo PointerVectorElementsFieldInfo{16, 0}; |
251 | static const constexpr BitFieldInfo PointerVectorSizeFieldInfo{ |
252 | 16, |
253 | PointerVectorElementsFieldInfo[1] + PointerVectorElementsFieldInfo[0]}; |
254 | static const constexpr BitFieldInfo PointerVectorAddressSpaceFieldInfo{ |
255 | 24, PointerVectorSizeFieldInfo[1] + PointerVectorSizeFieldInfo[0]}; |
256 | |
257 | uint64_t IsPointer : 1; |
258 | uint64_t IsVector : 1; |
259 | uint64_t RawData : 62; |
260 | |
261 | static uint64_t getMask(const BitFieldInfo FieldInfo) { |
262 | const int FieldSizeInBits = FieldInfo[0]; |
263 | return (((uint64_t)1) << FieldSizeInBits) - 1; |
264 | } |
265 | static uint64_t maskAndShift(uint64_t Val, uint64_t Mask, uint8_t Shift) { |
266 | assert(Val <= Mask && "Value too large for field")(static_cast <bool> (Val <= Mask && "Value too large for field" ) ? void (0) : __assert_fail ("Val <= Mask && \"Value too large for field\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 266, __extension__ __PRETTY_FUNCTION__)); |
267 | return (Val & Mask) << Shift; |
268 | } |
269 | static uint64_t maskAndShift(uint64_t Val, const BitFieldInfo FieldInfo) { |
270 | return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]); |
271 | } |
272 | uint64_t getFieldValue(const BitFieldInfo FieldInfo) const { |
273 | return getMask(FieldInfo) & (RawData >> FieldInfo[1]); |
274 | } |
275 | |
276 | void init(bool IsPointer, bool IsVector, uint16_t NumElements, |
277 | unsigned SizeInBits, unsigned AddressSpace) { |
278 | this->IsPointer = IsPointer; |
279 | this->IsVector = IsVector; |
280 | if (!IsVector) { |
281 | if (!IsPointer) |
282 | RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo); |
283 | else |
284 | RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) | |
285 | maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo); |
286 | } else { |
287 | assert(NumElements > 1 && "invalid number of vector elements")(static_cast <bool> (NumElements > 1 && "invalid number of vector elements" ) ? void (0) : __assert_fail ("NumElements > 1 && \"invalid number of vector elements\"" , "/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h" , 287, __extension__ __PRETTY_FUNCTION__)); |
288 | if (!IsPointer) |
289 | RawData = maskAndShift(NumElements, VectorElementsFieldInfo) | |
290 | maskAndShift(SizeInBits, VectorSizeFieldInfo); |
291 | else |
292 | RawData = |
293 | maskAndShift(NumElements, PointerVectorElementsFieldInfo) | |
294 | maskAndShift(SizeInBits, PointerVectorSizeFieldInfo) | |
295 | maskAndShift(AddressSpace, PointerVectorAddressSpaceFieldInfo); |
296 | } |
297 | } |
298 | |
299 | uint64_t getUniqueRAWLLTData() const { |
300 | return ((uint64_t)RawData) << 2 | ((uint64_t)IsPointer) << 1 | |
301 | ((uint64_t)IsVector); |
302 | } |
303 | }; |
304 | |
305 | inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) { |
306 | Ty.print(OS); |
307 | return OS; |
308 | } |
309 | |
310 | template<> struct DenseMapInfo<LLT> { |
311 | static inline LLT getEmptyKey() { |
312 | LLT Invalid; |
313 | Invalid.IsPointer = true; |
314 | return Invalid; |
315 | } |
316 | static inline LLT getTombstoneKey() { |
317 | LLT Invalid; |
318 | Invalid.IsVector = true; |
319 | return Invalid; |
320 | } |
321 | static inline unsigned getHashValue(const LLT &Ty) { |
322 | uint64_t Val = Ty.getUniqueRAWLLTData(); |
323 | return DenseMapInfo<uint64_t>::getHashValue(Val); |
324 | } |
325 | static bool isEqual(const LLT &LHS, const LLT &RHS) { |
326 | return LHS == RHS; |
327 | } |
328 | }; |
329 | |
330 | } |
331 | |
332 | #endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H |