Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Warning: line 2965, column 7
Value stored to 'ValMapping' is never read

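The deadcode.DeadStores checker reports this when a value assigned to a variable is overwritten, or the variable goes out of scope, before the stored value is ever read. A minimal sketch of the pattern follows; the function and variable names are illustrative only and do not reflect the code at line 2965.

// Minimal sketch of a dead store (names invented for illustration).
static int pickMapping(bool UseVGPR) {
  int ValMapping = 0;   // warning: value stored to 'ValMapping' is never read
  if (UseVGPR)
    ValMapping = 1;     // every path overwrites the initial store...
  else
    ValMapping = 2;
  return ValMapping;    // ...so the first assignment is dead
}
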
Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPURegisterBankInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347=. -ferror-limit 19 -fmessage-length 0 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-03-09-184146-41876-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the RegisterBankInfo class for
10/// AMDGPU.
11///
12/// \par
13///
14/// AMDGPU has unique register bank constraints that require special high level
15/// strategies to deal with. There are two main true physical register banks
16/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17/// sort of pseudo-register bank needed to represent SGPRs used in a vector
18/// boolean context. There is also the AGPR bank, which is a special purpose
19/// physical register bank present on some subtargets.
20///
21/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22/// be uniform. It is generally not valid to legalize operands by inserting
23/// copies as on other targets. Operations which require uniform, SGPR operands
24/// generally require scalarization by repeatedly executing the instruction,
25/// activating each set of lanes using a unique set of input values. This is
26/// referred to as a waterfall loop.
27///
28/// \par Booleans
29///
30/// Booleans (s1 values) require special consideration. A vector compare result
31/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32/// register. These are represented with the VCC bank. During selection, we need
33/// to be able to unambiguously go back from a register class to a register
34/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35/// bank, we need to know the use context type. An SGPR s1 value always means a
36/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38/// a 32-bit virtual register. Taken together, this means we need to adjust the
39/// type of boolean operations to be regbank legal. All SALU booleans need to be
40/// widened to 32-bits, and all VALU booleans need to be s1 values.
41///
42/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44/// bank. A non-boolean source (such as a truncate from a 1-bit load from
45/// memory) will require a copy to the VCC bank which will require clearing the
46/// high bits and inserting a compare.
47///
48/// \par Constant bus restriction
49///
50/// VALU instructions have a limitation known as the constant bus
51/// restriction. Most VALU instructions can use SGPR operands, but may read at
52/// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for most
53/// instructions). The limit is on unique SGPRs, so the same SGPR may be used for
54/// multiple operands. From a register bank perspective, any combination of
55/// operands should be legal as an SGPR, but this is contextually dependent on
56/// the SGPR operands all being the same register. It is therefore optimal to
57/// choose the SGPR with the most uses to minimize the number of copies.
58///
59/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60/// operation should have its source operands all mapped to VGPRs (except for
61/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63/// complicated to solve here. Every optimization pattern or instruction
64/// selected to multiple outputs would have to enforce this rule, and there
65/// would be additional complexity in tracking this rule for every G_*
66/// operation. Forcing all inputs to VGPRs also simplifies the task of
67/// picking the optimal operand combination from a post-isel optimization pass.
68///
69//===----------------------------------------------------------------------===//
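As a reading aid only (not part of the source file), here is a minimal host-side C++ sketch of the waterfall-loop control flow described in the header comment above: save the execution mask, read the value of the first active lane, enable exactly the lanes sharing that value, run the operation, retire those lanes, and repeat. All names, types, and the 64-lane assumption are invented for the example.

#include <cstdint>
#include <vector>

// Execute Op once per unique value in PerLaneValue, with only the matching
// lanes enabled, mirroring save-exec / readfirstlane / compare / restore-exec.
template <typename OpFn>
void waterfall(const std::vector<uint32_t> &PerLaneValue, uint64_t ExecMask,
               OpFn Op) {
  uint64_t Remaining = ExecMask;                      // "save" the execution mask
  while (Remaining != 0) {
    unsigned FirstLane = __builtin_ctzll(Remaining);  // first still-active lane
    uint32_t Uniform = PerLaneValue[FirstLane];       // like V_READFIRSTLANE
    uint64_t Matching = 0;
    for (unsigned Lane = 0; Lane < PerLaneValue.size(); ++Lane)
      if (((Remaining >> Lane) & 1) && PerLaneValue[Lane] == Uniform)
        Matching |= uint64_t(1) << Lane;              // lanes sharing this value
    Op(Uniform, Matching);                            // run the op for these lanes
    Remaining &= ~Matching;                           // retire the finished lanes
  }
  // On hardware, the original EXEC mask is restored after the loop exits.
}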
70
71#include "AMDGPURegisterBankInfo.h"
72
73#include "AMDGPUGlobalISelUtils.h"
74#include "AMDGPUInstrInfo.h"
75#include "AMDGPUSubtarget.h"
76#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
77#include "SIMachineFunctionInfo.h"
78#include "SIRegisterInfo.h"
79#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
80#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
84#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
85#include "llvm/CodeGen/TargetRegisterInfo.h"
86#include "llvm/CodeGen/TargetSubtargetInfo.h"
87#include "llvm/IR/Constants.h"
88
89#define GET_TARGET_REGBANK_IMPL
90#include "AMDGPUGenRegisterBank.inc"
91
92// This file will be TableGen'ed at some point.
93#include "AMDGPUGenRegisterBankInfo.def"
94
95using namespace llvm;
96using namespace MIPatternMatch;
97
98namespace {
99
100// Observer to apply a register bank to new registers created by LegalizerHelper.
101class ApplyRegBankMapping final : public GISelChangeObserver {
102private:
103 const AMDGPURegisterBankInfo &RBI;
104 MachineRegisterInfo &MRI;
105 const RegisterBank *NewBank;
106 SmallVector<MachineInstr *, 4> NewInsts;
107
108public:
109 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
110 MachineRegisterInfo &MRI_, const RegisterBank *RB)
111 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
112
113 ~ApplyRegBankMapping() {
114 for (MachineInstr *MI : NewInsts)
115 applyBank(*MI);
116 }
117
118 /// Set any registers that don't have a set register class or bank to SALU.
119 void applyBank(MachineInstr &MI) {
120 const unsigned Opc = MI.getOpcode();
121 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
122 Opc == AMDGPU::G_SEXT) {
123 // LegalizerHelper wants to use the basic legalization artifacts when
124 // widening etc. We don't handle selection with vcc in artifact sources,
125 // so we need to use a select instead to handle these properly.
126 Register DstReg = MI.getOperand(0).getReg();
127 Register SrcReg = MI.getOperand(1).getReg();
128 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
129 if (SrcBank == &AMDGPU::VCCRegBank) {
130 const LLT S32 = LLT::scalar(32);
131 assert(MRI.getType(SrcReg) == LLT::scalar(1));
132 assert(MRI.getType(DstReg) == S32);
133 assert(NewBank == &AMDGPU::VGPRRegBank);
134
135 // Replace the extension with a select, which really uses the boolean
136 // source.
137 MachineIRBuilder B(MI);
138 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
139 auto False = B.buildConstant(S32, 0);
140 B.buildSelect(DstReg, SrcReg, True, False);
141 MRI.setRegBank(True.getReg(0), *NewBank);
142 MRI.setRegBank(False.getReg(0), *NewBank);
143 MI.eraseFromParent();
144 }
145
146 assert(!MRI.getRegClassOrRegBank(DstReg));
147 MRI.setRegBank(DstReg, *NewBank);
148 return;
149 }
150
151#ifndef NDEBUG
152 if (Opc == AMDGPU::G_TRUNC) {
153 Register DstReg = MI.getOperand(0).getReg();
154 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
155 assert(DstBank != &AMDGPU::VCCRegBank);
156 }
157#endif
158
159 for (MachineOperand &Op : MI.operands()) {
160 if (!Op.isReg())
161 continue;
162
163 // We may see physical registers if building a real MI
164 Register Reg = Op.getReg();
165 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
166 continue;
167
168 const RegisterBank *RB = NewBank;
169 if (MRI.getType(Reg) == LLT::scalar(1)) {
170 assert(NewBank == &AMDGPU::VGPRRegBank &&
171 "s1 operands should only be used for vector bools");
172 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
173 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
174 "not expecting legalization artifacts here");
175 RB = &AMDGPU::VCCRegBank;
176 }
177
178 MRI.setRegBank(Reg, *RB);
179 }
180 }
181
182 void erasingInstr(MachineInstr &MI) override {}
183
184 void createdInstr(MachineInstr &MI) override {
185 // At this point, the instruction was just inserted and has no operands.
186 NewInsts.push_back(&MI);
187 }
188
189 void changingInstr(MachineInstr &MI) override {}
190 void changedInstr(MachineInstr &MI) override {}
191};
192
193}
194AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
195 : AMDGPUGenRegisterBankInfo(),
196 Subtarget(ST),
197 TRI(Subtarget.getRegisterInfo()),
198 TII(Subtarget.getInstrInfo()) {
199
200 // HACK: Until this is fully tablegen'd.
201 static llvm::once_flag InitializeRegisterBankFlag;
202
203 static auto InitializeRegisterBankOnce = [this]() {
204 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207 (void)this;
208 };
209
210 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211}
212
213static bool isVectorRegisterBank(const RegisterBank &Bank) {
214 unsigned BankID = Bank.getID();
215 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216}
217
218unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219 const RegisterBank &Src,
220 unsigned Size) const {
221 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223 isVectorRegisterBank(Src)) {
224 return std::numeric_limits<unsigned>::max();
225 }
226
227 // Bool values are tricky, because the meaning is based on context. The SCC
228 // and VCC banks are for the natural scalar and vector conditions produced by
229 // a compare.
230 //
231 // Legalization doesn't know about the necessary context, so an s1 use may
232 // have been a truncate from an arbitrary value, in which case a copy (lowered
233 // as a compare with 0) needs to be inserted.
234 if (Size == 1 &&
235 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236 (isVectorRegisterBank(Src) ||
237 Src.getID() == AMDGPU::SGPRRegBankID ||
238 Src.getID() == AMDGPU::VCCRegBankID))
239 return std::numeric_limits<unsigned>::max();
240
241 if (Src.getID() == AMDGPU::VCCRegBankID)
242 return std::numeric_limits<unsigned>::max();
243
244 // There is no direct copy between AGPRs.
245 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
246 Src.getID() == AMDGPU::AGPRRegBankID)
247 return 4;
248
249 return RegisterBankInfo::copyCost(Dst, Src, Size);
250}
251
252unsigned AMDGPURegisterBankInfo::getBreakDownCost(
253 const ValueMapping &ValMapping,
254 const RegisterBank *CurBank) const {
255 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
256 // VGPR.
257 // FIXME: Is there a better way to do this?
258 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
259 return 10; // This is expensive.
260
261 assert(ValMapping.NumBreakDowns == 2 &&
262 ValMapping.BreakDown[0].Length == 32 &&
263 ValMapping.BreakDown[0].StartIdx == 0 &&
264 ValMapping.BreakDown[1].Length == 32 &&
265 ValMapping.BreakDown[1].StartIdx == 32 &&
266 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
267
268 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
269 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
270 // want.
271
272 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
273 // alignment restrictions, but this probably isn't important.
274 return 1;
275}
276
277const RegisterBank &
278AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
279 LLT Ty) const {
280 if (&RC == &AMDGPU::SReg_1RegClass)
281 return AMDGPU::VCCRegBank;
282
283 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
284 // VCC-like use.
285 if (TRI->isSGPRClass(&RC)) {
286 // FIXME: This probably came from a copy from a physical register, which
287 // should be inferable from the copied-to type. We don't have many boolean
288 // physical register constraints so just assume a normal SGPR for now.
289 if (!Ty.isValid())
290 return AMDGPU::SGPRRegBank;
291
292 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
293 }
294
295 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
296}
297
298template <unsigned NumOps>
299RegisterBankInfo::InstructionMappings
300AMDGPURegisterBankInfo::addMappingFromTable(
301 const MachineInstr &MI, const MachineRegisterInfo &MRI,
302 const std::array<unsigned, NumOps> RegSrcOpIdx,
303 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
304
305 InstructionMappings AltMappings;
306
307 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
308
309 unsigned Sizes[NumOps];
310 for (unsigned I = 0; I < NumOps; ++I) {
311 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
312 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
313 }
314
315 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
316 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
317 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
318 }
319
320 // getInstrMapping's default mapping uses ID 1, so start at 2.
321 unsigned MappingID = 2;
322 for (const auto &Entry : Table) {
323 for (unsigned I = 0; I < NumOps; ++I) {
324 int OpIdx = RegSrcOpIdx[I];
325 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
326 }
327
328 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
329 getOperandsMapping(Operands),
330 Operands.size()));
331 }
332
333 return AltMappings;
334}
335
336RegisterBankInfo::InstructionMappings
337AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
338 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
339 switch (MI.getIntrinsicID()) {
340 case Intrinsic::amdgcn_readlane: {
341 static const OpRegBankEntry<3> Table[2] = {
342 // Perfectly legal.
343 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
344
345 // Need a readfirstlane for the index.
346 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
347 };
348
349 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
350 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
351 }
352 case Intrinsic::amdgcn_writelane: {
353 static const OpRegBankEntry<4> Table[4] = {
354 // Perfectly legal.
355 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
356
357 // Need readfirstlane of first op
358 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359
360 // Need readfirstlane of second op
361 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
362
363 // Need readfirstlane of both ops
364 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
365 };
366
367 // dst, data, lane select, old value (vdst_in)
368 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
369 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
370 }
371 default:
372 return RegisterBankInfo::getInstrAlternativeMappings(MI);
373 }
374}
375
376RegisterBankInfo::InstructionMappings
377AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
378 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
379
380 switch (MI.getIntrinsicID()) {
381 case Intrinsic::amdgcn_s_buffer_load: {
382 static const OpRegBankEntry<2> Table[4] = {
383 // Perfectly legal.
384 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
385
386 // Only need 1 register in loop
387 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
388
389 // Have to waterfall the resource.
390 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
391
392 // Have to waterfall the resource, and the offset.
393 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
394 };
395
396 // rsrc, offset
397 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
398 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
399 }
400 case Intrinsic::amdgcn_ds_ordered_add:
401 case Intrinsic::amdgcn_ds_ordered_swap: {
402 // VGPR = M0, VGPR
403 static const OpRegBankEntry<3> Table[2] = {
404 // Perfectly legal.
405 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
406
407 // Need a readfirstlane for m0
408 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
409 };
410
411 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
412 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
413 }
414 case Intrinsic::amdgcn_s_sendmsg:
415 case Intrinsic::amdgcn_s_sendmsghalt: {
416 // FIXME: Should have no register for immediate
417 static const OpRegBankEntry<1> Table[2] = {
418 // Perfectly legal.
419 { { AMDGPU::SGPRRegBankID }, 1 },
420
421 // Need readlane
422 { { AMDGPU::VGPRRegBankID }, 3 }
423 };
424
425 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
426 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
427 }
428 default:
429 return RegisterBankInfo::getInstrAlternativeMappings(MI);
430 }
431}
432
433static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
434 const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
435 return I && I->getMetadata("amdgpu.noclobber");
436}
437
438// FIXME: Returns uniform if there's no source value information. This is
439// probably wrong.
440static bool isScalarLoadLegal(const MachineInstr &MI) {
441 if (!MI.hasOneMemOperand())
442 return false;
443
444 const MachineMemOperand *MMO = *MI.memoperands_begin();
445 const unsigned AS = MMO->getAddrSpace();
446 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
447 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
448
449 // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
450 return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
451 // Can't do a scalar atomic load.
452 !MMO->isAtomic() &&
453 // Don't use scalar loads for volatile accesses to non-constant address
454 // spaces.
455 (IsConst || !MMO->isVolatile()) &&
456 // Memory must be known constant, or not written before this load.
457 (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
458 AMDGPUInstrInfo::isUniformMMO(MMO);
459}
460
461RegisterBankInfo::InstructionMappings
462AMDGPURegisterBankInfo::getInstrAlternativeMappings(
463 const MachineInstr &MI) const {
464
465 const MachineFunction &MF = *MI.getParent()->getParent();
466 const MachineRegisterInfo &MRI = MF.getRegInfo();
467
468
469 InstructionMappings AltMappings;
470 switch (MI.getOpcode()) {
471 case TargetOpcode::G_CONSTANT: {
472 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
473 if (Size == 1) {
474 static const OpRegBankEntry<1> Table[3] = {
475 { { AMDGPU::VGPRRegBankID }, 1 },
476 { { AMDGPU::SGPRRegBankID }, 1 },
477 { { AMDGPU::VCCRegBankID }, 1 }
478 };
479
480 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
481 }
482
483 LLVM_FALLTHROUGH;
484 }
485 case TargetOpcode::G_FCONSTANT:
486 case TargetOpcode::G_FRAME_INDEX:
487 case TargetOpcode::G_GLOBAL_VALUE: {
488 static const OpRegBankEntry<1> Table[2] = {
489 { { AMDGPU::VGPRRegBankID }, 1 },
490 { { AMDGPU::SGPRRegBankID }, 1 }
491 };
492
493 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
494 }
495 case TargetOpcode::G_AND:
496 case TargetOpcode::G_OR:
497 case TargetOpcode::G_XOR: {
498 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
499
500 if (Size == 1) {
501 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
502 const InstructionMapping &SCCMapping = getInstructionMapping(
503 1, 1, getOperandsMapping(
504 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
505 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
506 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
507 3); // Num Operands
508 AltMappings.push_back(&SCCMapping);
509
510 const InstructionMapping &VCCMapping0 = getInstructionMapping(
511 2, 1, getOperandsMapping(
512 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
513 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
514 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
515 3); // Num Operands
516 AltMappings.push_back(&VCCMapping0);
517 return AltMappings;
518 }
519
520 if (Size != 64)
521 break;
522
523 const InstructionMapping &SSMapping = getInstructionMapping(
524 1, 1, getOperandsMapping(
525 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
526 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
527 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
528 3); // Num Operands
529 AltMappings.push_back(&SSMapping);
530
531 const InstructionMapping &VVMapping = getInstructionMapping(
532 2, 2, getOperandsMapping(
533 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
534 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
535 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
536 3); // Num Operands
537 AltMappings.push_back(&VVMapping);
538 break;
539 }
540 case TargetOpcode::G_LOAD:
541 case TargetOpcode::G_ZEXTLOAD:
542 case TargetOpcode::G_SEXTLOAD: {
543 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
544 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
545 unsigned PtrSize = PtrTy.getSizeInBits();
546 unsigned AS = PtrTy.getAddressSpace();
547 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
548
549 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
550 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
551 isScalarLoadLegal(MI)) {
552 const InstructionMapping &SSMapping = getInstructionMapping(
553 1, 1, getOperandsMapping(
554 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
555 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
556 2); // Num Operands
557 AltMappings.push_back(&SSMapping);
558 }
559
560 const InstructionMapping &VVMapping = getInstructionMapping(
561 2, 1, getOperandsMapping(
562 {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
563 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
564 2); // Num Operands
565 AltMappings.push_back(&VVMapping);
566
567 // It may be possible to have a vgpr = load sgpr mapping here, because
568 // the mubuf instructions support this kind of load, but probably only for
569 // gfx7 and older. However, the addressing mode matching in the instruction
570 // selector should be able to do a better job of detecting and selecting
571 // these kinds of loads from the vgpr = load vgpr mapping.
572
573 return AltMappings;
574
575 }
576 case TargetOpcode::G_SELECT: {
577 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
578 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
579 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
580 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
581 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
582 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
583 4); // Num Operands
584 AltMappings.push_back(&SSMapping);
585
586 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
587 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
588 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
589 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
590 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
591 4); // Num Operands
592 AltMappings.push_back(&VVMapping);
593
594 return AltMappings;
595 }
596 case TargetOpcode::G_SMIN:
597 case TargetOpcode::G_SMAX:
598 case TargetOpcode::G_UMIN:
599 case TargetOpcode::G_UMAX: {
600 static const OpRegBankEntry<3> Table[2] = {
601 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
602
603 // Scalar requires cmp+select, and extends if 16-bit.
604 // FIXME: Should there be separate costs for 32 and 16-bit?
605 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
606 };
607
608 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
609 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
610 }
611 case TargetOpcode::G_UADDE:
612 case TargetOpcode::G_USUBE:
613 case TargetOpcode::G_SADDE:
614 case TargetOpcode::G_SSUBE: {
615 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
616 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
617 getOperandsMapping(
618 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
619 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
620 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
621 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
622 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
623 5); // Num Operands
624 AltMappings.push_back(&SSMapping);
625
626 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
627 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
628 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
629 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
630 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
631 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
632 5); // Num Operands
633 AltMappings.push_back(&VVMapping);
634 return AltMappings;
635 }
636 case AMDGPU::G_BRCOND: {
637 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
638
639 // TODO: Change type to 32 for scalar
640 const InstructionMapping &SMapping = getInstructionMapping(
641 1, 1, getOperandsMapping(
642 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
643 2); // Num Operands
644 AltMappings.push_back(&SMapping);
645
646 const InstructionMapping &VMapping = getInstructionMapping(
647 1, 1, getOperandsMapping(
648 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
649 2); // Num Operands
650 AltMappings.push_back(&VMapping);
651 return AltMappings;
652 }
653 case AMDGPU::G_INTRINSIC:
654 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
655 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
656 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
657 default:
658 break;
659 }
660 return RegisterBankInfo::getInstrAlternativeMappings(MI);
661}
662
663void AMDGPURegisterBankInfo::split64BitValueForMapping(
664 MachineIRBuilder &B,
665 SmallVector<Register, 2> &Regs,
666 LLT HalfTy,
667 Register Reg) const {
668 assert(HalfTy.getSizeInBits() == 32);
669 MachineRegisterInfo *MRI = B.getMRI();
670 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
671 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
672 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
673 MRI->setRegBank(LoLHS, *Bank);
674 MRI->setRegBank(HiLHS, *Bank);
675
676 Regs.push_back(LoLHS);
677 Regs.push_back(HiLHS);
678
679 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
680 .addDef(LoLHS)
681 .addDef(HiLHS)
682 .addUse(Reg);
683}
684
685/// Replace the current type each register in \p Regs has with \p NewTy
686static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
687 LLT NewTy) {
688 for (Register Reg : Regs) {
689 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
690 MRI.setType(Reg, NewTy);
691 }
692}
693
694static LLT getHalfSizedType(LLT Ty) {
695 if (Ty.isVector()) {
696 assert(Ty.getNumElements() % 2 == 0);
697 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
698 }
699
700 assert(Ty.getSizeInBits() % 2 == 0);
701 return LLT::scalar(Ty.getSizeInBits() / 2);
702}
703
704/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
705/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
706/// execute the instruction for each unique combination of values in all lanes
707/// in the wave. The block will be split such that the rest of the instructions are
708/// moved to a new block.
709///
710/// Essentially performs this loop:
711//
712/// Save Execution Mask
713/// For (Lane : Wavefront) {
714/// Enable Lane, Disable all other lanes
715/// SGPR = read SGPR value for current lane from VGPR
716/// VGPRResult[Lane] = use_op SGPR
717/// }
718/// Restore Execution Mask
719///
720/// There is additional complexity in comparing the values to identify the
721/// unique values used.
722bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
723 MachineIRBuilder &B,
724 iterator_range<MachineBasicBlock::iterator> Range,
725 SmallSet<Register, 4> &SGPROperandRegs,
726 MachineRegisterInfo &MRI) const {
727 SmallVector<Register, 4> ResultRegs;
728 SmallVector<Register, 4> InitResultRegs;
729 SmallVector<Register, 4> PhiRegs;
730
731 // Track use registers which have already been expanded with a readfirstlane
732 // sequence. This may have multiple uses if moving a sequence.
733 DenseMap<Register, Register> WaterfalledRegMap;
734
735 MachineBasicBlock &MBB = B.getMBB();
736 MachineFunction *MF = &B.getMF();
737
738 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
739 const unsigned WaveAndOpc = Subtarget.isWave32() ?
740 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
741 const unsigned MovTermOpc = Subtarget.isWave32() ?
742 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
743 const unsigned XorTermOpc = Subtarget.isWave32() ?
744 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
745 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
746 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
747 const unsigned ExecReg = Subtarget.isWave32() ?
748 AMDGPU::EXEC_LO : AMDGPU::EXEC;
749
750#ifndef NDEBUG
751 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
752#endif
753
754 for (MachineInstr &MI : Range) {
755 for (MachineOperand &Def : MI.defs()) {
756 LLT ResTy = MRI.getType(Def.getReg());
757 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
758 ResultRegs.push_back(Def.getReg());
759 Register InitReg = B.buildUndef(ResTy).getReg(0);
760 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
761 InitResultRegs.push_back(InitReg);
762 PhiRegs.push_back(PhiReg);
763 MRI.setRegBank(PhiReg, *DefBank);
764 MRI.setRegBank(InitReg, *DefBank);
765 }
766 }
767
768 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
769 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
770
771 // Don't bother using generic instructions/registers for the exec mask.
772 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
773 .addDef(InitSaveExecReg);
774
775 Register PhiExec = MRI.createVirtualRegister(WaveRC);
776 Register NewExec = MRI.createVirtualRegister(WaveRC);
777
778 // To insert the loop we need to split the block. Move everything before this
779 // point to a new block, and insert a new empty block before this instruction.
780 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
781 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
782 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
783 MachineFunction::iterator MBBI(MBB);
784 ++MBBI;
785 MF->insert(MBBI, LoopBB);
786 MF->insert(MBBI, RestoreExecBB);
787 MF->insert(MBBI, RemainderBB);
788
789 LoopBB->addSuccessor(RestoreExecBB);
790 LoopBB->addSuccessor(LoopBB);
791
792 // Move the rest of the block into a new block.
793 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
794 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
795
796 MBB.addSuccessor(LoopBB);
797 RestoreExecBB->addSuccessor(RemainderBB);
798
799 B.setInsertPt(*LoopBB, LoopBB->end());
800
801 B.buildInstr(TargetOpcode::PHI)
802 .addDef(PhiExec)
803 .addReg(InitSaveExecReg)
804 .addMBB(&MBB)
805 .addReg(NewExec)
806 .addMBB(LoopBB);
807
808 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
809 B.buildInstr(TargetOpcode::G_PHI)
810 .addDef(std::get<2>(Result))
811 .addReg(std::get<0>(Result)) // Initial value / implicit_def
812 .addMBB(&MBB)
813 .addReg(std::get<1>(Result)) // Mid-loop value.
814 .addMBB(LoopBB);
815 }
816
817 const DebugLoc &DL = B.getDL();
818
819 MachineInstr &FirstInst = *Range.begin();
820
821 // Move the instruction into the loop. Note we moved everything after
822 // Range.end() already into a new block, so Range.end() is no longer valid.
823 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
824
825 // Figure out the iterator range after splicing the instructions.
826 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
827 auto NewEnd = LoopBB->end();
828
829 MachineBasicBlock::iterator I = Range.begin();
830 B.setInsertPt(*LoopBB, I);
831
832 Register CondReg;
833
834 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
835
836 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
837 for (MachineOperand &Op : MI.uses()) {
838 if (!Op.isReg() || Op.isDef())
839 continue;
840
841 Register OldReg = Op.getReg();
842 if (!SGPROperandRegs.count(OldReg))
843 continue;
844
845 // See if we already processed this register in another instruction in the
846 // sequence.
847 auto OldVal = WaterfalledRegMap.find(OldReg);
848 if (OldVal != WaterfalledRegMap.end()) {
849 Op.setReg(OldVal->second);
850 continue;
851 }
852
853 LLT OpTy = MRI.getType(Op.getReg());
854 unsigned OpSize = OpTy.getSizeInBits();
855
856 // Can only do a readlane of 32-bit pieces.
857 if (OpSize == 32) {
858 // Avoid extra copies in the simple case of one 32-bit register.
859 Register CurrentLaneOpReg
860 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
861 MRI.setType(CurrentLaneOpReg, OpTy);
862
863 constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
864 // Read the next variant <- also loop target.
865 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
866 CurrentLaneOpReg)
867 .addReg(Op.getReg());
868
869 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
870 bool First = CondReg == AMDGPU::NoRegister;
871 if (First)
872 CondReg = NewCondReg;
873
874 // Compare the just read M0 value to all possible Idx values.
875 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
876 .addDef(NewCondReg)
877 .addReg(CurrentLaneOpReg)
878 .addReg(Op.getReg());
879 Op.setReg(CurrentLaneOpReg);
880
881 if (!First) {
882 Register AndReg = MRI.createVirtualRegister(WaveRC);
883
884 // If there are multiple operands to consider, AND the conditions together.
885 B.buildInstr(WaveAndOpc)
886 .addDef(AndReg)
887 .addReg(NewCondReg)
888 .addReg(CondReg);
889 CondReg = AndReg;
890 }
891 } else {
892 LLT S32 = LLT::scalar(32);
893 SmallVector<Register, 8> ReadlanePieces;
894
895 // The compares can be done as 64-bit, but the extract needs to be done
896 // in 32-bit pieces.
897
898 bool Is64 = OpSize % 64 == 0;
899
900 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
901 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
902 : AMDGPU::V_CMP_EQ_U32_e64;
903
904 // The compares can be done as 64-bit, but the extract needs to be done
905 // in 32-bit pieces.
906
907 // Insert the unmerge before the loop.
908
909 B.setMBB(MBB);
910 auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
911 B.setInstr(*I);
912
913 unsigned NumPieces = Unmerge->getNumOperands() - 1;
914 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
915 Register UnmergePiece = Unmerge.getReg(PieceIdx);
916
917 Register CurrentLaneOpReg;
918 if (Is64) {
919 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
920 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
921
922 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
923 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
924 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
925
926 // Read the next variant <- also loop target.
927 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
928 CurrentLaneOpRegLo)
929 .addReg(UnmergePiece, 0, AMDGPU::sub0);
930
931 // Read the next variant <- also loop target.
932 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
933 CurrentLaneOpRegHi)
934 .addReg(UnmergePiece, 0, AMDGPU::sub1);
935
936 CurrentLaneOpReg =
937 B.buildMerge(LLT::scalar(64),
938 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
939 .getReg(0);
940
941 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
942
943 if (OpTy.getScalarSizeInBits() == 64) {
943 // If we need to produce a 64-bit element vector, use the
944 // merged pieces.
946 ReadlanePieces.push_back(CurrentLaneOpReg);
947 } else {
948 // 32-bit element type.
949 ReadlanePieces.push_back(CurrentLaneOpRegLo);
950 ReadlanePieces.push_back(CurrentLaneOpRegHi);
951 }
952 } else {
953 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
954 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
955 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
956
957 // Read the next variant <- also loop target.
958 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
959 CurrentLaneOpReg)
960 .addReg(UnmergePiece);
961 ReadlanePieces.push_back(CurrentLaneOpReg);
962 }
963
964 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
965 bool First = CondReg == AMDGPU::NoRegister;
966 if (First)
967 CondReg = NewCondReg;
968
969 B.buildInstr(CmpOp)
970 .addDef(NewCondReg)
971 .addReg(CurrentLaneOpReg)
972 .addReg(UnmergePiece);
973
974 if (!First) {
975 Register AndReg = MRI.createVirtualRegister(WaveRC);
976
977 // If there are multiple operands to consider, AND the conditions together.
978 B.buildInstr(WaveAndOpc)
979 .addDef(AndReg)
980 .addReg(NewCondReg)
981 .addReg(CondReg);
982 CondReg = AndReg;
983 }
984 }
985
986 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
987 // BUILD_VECTOR
988 if (OpTy.isVector()) {
989 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
990 Op.setReg(Merge.getReg(0));
991 } else {
992 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
993 Op.setReg(Merge.getReg(0));
994 }
995
996 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
997 }
998
999 // Make sure we don't re-process this register again.
1000 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
1001 }
1002 }
1003
1004 B.setInsertPt(*LoopBB, LoopBB->end());
1005
1006 // Update EXEC, save the original EXEC value to VCC.
1007 B.buildInstr(AndSaveExecOpc)
1008 .addDef(NewExec)
1009 .addReg(CondReg, RegState::Kill);
1010
1011 MRI.setSimpleHint(NewExec, CondReg);
1012
1013 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1014 B.buildInstr(XorTermOpc)
1015 .addDef(ExecReg)
1016 .addReg(ExecReg)
1017 .addReg(NewExec);
1018
1019 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1020 // s_cbranch_scc0?
1021
1022 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1023 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1024 .addMBB(LoopBB);
1025
1026 // Save the EXEC mask before the loop.
1027 BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1028 .addReg(ExecReg);
1029
1030 // Restore the EXEC mask after the loop.
1031 B.setMBB(*RestoreExecBB);
1032 B.buildInstr(MovTermOpc)
1033 .addDef(ExecReg)
1034 .addReg(SaveExecReg);
1035
1036 // Set the insert point after the original instruction, so any new
1037 // instructions will be in the remainder.
1038 B.setInsertPt(*RemainderBB, RemainderBB->begin());
1039
1040 return true;
1041}
1042
1043// Return any unique registers used by \p MI at \p OpIndices that need to be
1044// handled in a waterfall loop. Returns these registers in \p
1045 // SGPROperandRegs. Returns true if there are any operands to handle and a
1046// waterfall loop is necessary.
1047bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1048 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1049 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1050 for (unsigned Op : OpIndices) {
1051 assert(MI.getOperand(Op).isUse());
1052 Register Reg = MI.getOperand(Op).getReg();
1053 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1054 if (OpBank->getID() == AMDGPU::VGPRRegBankID)
1055 SGPROperandRegs.insert(Reg);
1056 }
1057
1058 // No operands need to be replaced, so no need to loop.
1059 return !SGPROperandRegs.empty();
1060}
1061
1062bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1063 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1064 ArrayRef<unsigned> OpIndices) const {
1065 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1066 // are the same register.
1067 SmallSet<Register, 4> SGPROperandRegs;
1068
1069 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1070 return false;
1071
1072 MachineBasicBlock::iterator I = MI.getIterator();
1073 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1074 SGPROperandRegs, MRI);
1075}
1076
1077bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1078 MachineInstr &MI, MachineRegisterInfo &MRI,
1079 ArrayRef<unsigned> OpIndices) const {
1080 MachineIRBuilder B(MI);
1081 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1082}
1083
1084// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1085void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1086 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1087 Register Reg = MI.getOperand(OpIdx).getReg();
1088 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1089 if (Bank != &AMDGPU::VGPRRegBank)
1090 return;
1091
1092 MachineIRBuilder B(MI);
1093 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1094 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1095 .addDef(SGPR)
1096 .addReg(Reg);
1097
1098 MRI.setType(SGPR, MRI.getType(Reg));
1099
1100 const TargetRegisterClass *Constrained =
1101 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1102 (void)Constrained;
1103 assert(Constrained && "Failed to constrain readfirstlane src reg");
1104
1105 MI.getOperand(OpIdx).setReg(SGPR);
1106}
1107
1108// When regbankselect repairs registers, it will insert a repair instruction
1109// which defines the repaired register. Then it calls applyMapping and expects
1110 // that the targets will either delete or rewrite the instructions that originally
1111 // wrote to the repaired registers. Because of this, we end up in a situation where
1112// we have 2 instructions defining the same registers.
1113static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
1114 Register Reg,
1115 const MachineInstr &MI) {
1116 // Is there some way we can assert that there are exactly 2 def instructions?
1117 for (MachineInstr &Other : MRI.def_instructions(Reg)) {
1118 if (&Other != &MI)
1119 return &Other;
1120 }
1121
1122 return nullptr;
1123}
1124
1125bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
1126 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1127 MachineRegisterInfo &MRI) const {
1128 Register DstReg = MI.getOperand(0).getReg();
1129 const LLT LoadTy = MRI.getType(DstReg);
1130 unsigned LoadSize = LoadTy.getSizeInBits();
1131 const unsigned MaxNonSmrdLoadSize = 128;
1132 // 128-bit loads are supported for all instruction types.
1133 if (LoadSize <= MaxNonSmrdLoadSize)
1134 return false;
1135
1136 SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
1137 SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
1138
1139 // If the pointer is an SGPR, we have nothing to do.
1140 if (SrcRegs.empty()) {
1141 const RegisterBank *PtrBank =
1142 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1143 if (PtrBank == &AMDGPU::SGPRRegBank)
1144 return false;
1145 SrcRegs.push_back(MI.getOperand(1).getReg());
1146 }
1147
1148  assert(LoadSize % MaxNonSmrdLoadSize == 0);
1149
1150 // We want to get the repair instruction now, because it will help us
1151 // determine which instruction the legalizer inserts that will also
1152 // write to DstReg.
1153 MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
1154
1155 // RegBankSelect only emits scalar types, so we need to reset the pointer
1156 // operand to a pointer type.
1157 Register BasePtrReg = SrcRegs[0];
1158 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1159 MRI.setType(BasePtrReg, PtrTy);
1160
1161 MachineIRBuilder B(MI);
1162
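  // Split the too-wide load into 128-bit pieces; e.g. a <8 x s32> load becomes
  // two <4 x s32> loads, and the legalizer then inserts an instruction that
  // recombines the pieces.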
1163 unsigned SplitElts =
1164 MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
1165 const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
1166 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
1167 GISelObserverWrapper Observer(&O);
1168 B.setChangeObserver(Observer);
1169 LegalizerHelper Helper(B.getMF(), Observer, B);
1170 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1171 return false;
1172
1173 // At this point, the legalizer has split the original load into smaller
1174 // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
1175 // that combines the outputs of the lower loads and writes it to DstReg.
1176 // The register bank selector has also added the RepairInst which writes to
1177 // DstReg as well.
1178
1179 MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
1180
1181 // Replace the output of the LegalizedInst with a temporary register, since
1182 // RepairInst already defines DstReg.
1183 Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
1184 LegalizedInst->getOperand(0).setReg(TmpReg);
1185 B.setInsertPt(*RepairInst->getParent(), RepairInst);
1186
1187 for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
1188 Register IdxReg = B.buildConstant(LLT::scalar(32), DefIdx).getReg(0);
1189 MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
1190 B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
1191 }
1192
1193 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1194 return true;
1195}
1196
1197bool AMDGPURegisterBankInfo::applyMappingImage(
1198 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1199 MachineRegisterInfo &MRI, int RsrcIdx) const {
1200 const int NumDefs = MI.getNumExplicitDefs();
1201
1202 // The reported argument index is relative to the IR intrinsic call arguments,
1203 // so we need to shift by the number of defs and the intrinsic ID.
1204 RsrcIdx += NumDefs + 1;
1205
1206 // Insert copies to VGPR arguments.
1207 applyDefaultMapping(OpdMapper);
1208
1209 // Fixup any SGPR arguments.
1210 SmallVector<unsigned, 4> SGPRIndexes;
1211 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1212 if (!MI.getOperand(I).isReg())
1213 continue;
1214
1215 // If this intrinsic has a sampler, it immediately follows rsrc.
1216 if (I == RsrcIdx || I == RsrcIdx + 1)
1217 SGPRIndexes.push_back(I);
1218 }
1219
1220 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1221 return true;
1222}
1223
1224static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1225 Register Reg) {
1226 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1227 if (!Def)
1228 return Reg;
1229
1230 // TODO: Guard against this being an implicit def
1231 return Def->getOperand(0).getReg();
1232}
1233
1234// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1235// the three offsets (voffset, soffset and instoffset)
1236static unsigned setBufferOffsets(MachineIRBuilder &B,
1237 const AMDGPURegisterBankInfo &RBI,
1238 Register CombinedOffset,
1239 Register &VOffsetReg,
1240 Register &SOffsetReg,
1241 int64_t &InstOffsetVal,
1242 unsigned Align) {
1243 const LLT S32 = LLT::scalar(32);
1244 MachineRegisterInfo *MRI = B.getMRI();
1245
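  // If the combined offset is a known constant, try to split it directly into
  // an SGPR soffset plus an immediate instruction offset.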
1246 if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
1247 uint32_t SOffset, ImmOffset;
1248 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset,
1249 &RBI.Subtarget, Align)) {
1250 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1251 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1252 InstOffsetVal = ImmOffset;
1253
1254 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1255 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1256 return SOffset + ImmOffset;
1257 }
1258 }
1259
1260 Register Base;
1261 unsigned Offset;
1262 MachineInstr *Unused;
1263
1264 std::tie(Base, Offset, Unused)
1265 = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1266
1267 uint32_t SOffset, ImmOffset;
1268 if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1269 &RBI.Subtarget, Align)) {
1270 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1271 VOffsetReg = Base;
1272 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1273 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1274 InstOffsetVal = ImmOffset;
1275 return 0; // XXX - Why is this 0?
1276 }
1277
1278    // If we have an SGPR base, we can use it for soffset.
1279 if (SOffset == 0) {
1280 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1281 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1282 SOffsetReg = Base;
1283 InstOffsetVal = ImmOffset;
1284 return 0; // XXX - Why is this 0?
1285 }
1286 }
1287
1288 // Handle the variable sgpr + vgpr case.
1289 if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
1290 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1291 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1292
1293 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1294 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1295
1296 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1297 VOffsetReg = Src0;
1298 SOffsetReg = Src1;
1299 return 0;
1300 }
1301
1302 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1303 VOffsetReg = Src1;
1304 SOffsetReg = Src0;
1305 return 0;
1306 }
1307 }
1308
1309 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1310 // have an SGPR offset and a VGPR resource.
1311 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1312 VOffsetReg = CombinedOffset;
1313 } else {
1314 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1315 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1316 }
1317
1318 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1319 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1320 return 0;
1321}
1322
1323static LLT divideLLT(LLT Ty, int Factor) {
1324 if (Ty.isVector())
1325 return LLT::vector(Ty.getNumElements() / Factor, Ty.getElementType());
1326 return LLT::scalar(Ty.getSizeInBits() / Factor);
1327}
1328
1329bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1330 const OperandsMapper &OpdMapper) const {
1331 MachineInstr &MI = OpdMapper.getMI();
1332 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1333
1334 const LLT S32 = LLT::scalar(32);
1335 Register Dst = MI.getOperand(0).getReg();
1336 LLT Ty = MRI.getType(Dst);
1337
1338 const RegisterBank *RSrcBank =
1339 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1340 const RegisterBank *OffsetBank =
1341 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1342 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1343 OffsetBank == &AMDGPU::SGPRRegBank)
1344 return true; // Legal mapping
1345
1346  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1347 // here but don't have an MMO.
1348
1349 unsigned LoadSize = Ty.getSizeInBits();
1350 int NumLoads = 1;
1351 if (LoadSize == 256 || LoadSize == 512) {
1352 NumLoads = LoadSize / 128;
1353 Ty = divideLLT(Ty, NumLoads);
1354 }
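  // Wide (256/512-bit) results are loaded as multiple 128-bit parts and
  // concatenated back together once all of the loads have been emitted.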
1355
1356 // Use the alignment to ensure that the required offsets will fit into the
1357 // immediate offsets.
1358 const unsigned Align = NumLoads > 1 ? 16 * NumLoads : 1;
1359
1360 MachineIRBuilder B(MI);
1361 MachineFunction &MF = B.getMF();
1362
1363 Register SOffset;
1364 Register VOffset;
1365 int64_t ImmOffset = 0;
1366
1367 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1368 VOffset, SOffset, ImmOffset, Align);
1369
1370 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1371  // can, but we need to track an MMO for that.
1372 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1373 const unsigned MemAlign = 4; // FIXME: ABI type alignment?
1374 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1375 MachinePointerInfo(),
1376 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1377 MachineMemOperand::MOInvariant,
1378 MemSize, MemAlign);
1379 if (MMOOffset != 0)
1380 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1381
1382 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1383 // assume that the buffer is unswizzled.
1384
1385 Register RSrc = MI.getOperand(1).getReg();
1386 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1387 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1388
1389 SmallVector<Register, 4> LoadParts(NumLoads);
1390
1391 MachineBasicBlock::iterator MII = MI.getIterator();
1392 MachineInstrSpan Span(MII, &B.getMBB());
1393
1394 for (int i = 0; i < NumLoads; ++i) {
1395 if (NumLoads == 1) {
1396 LoadParts[i] = Dst;
1397 } else {
1398 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1399 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1400 }
1401
1402 MachineMemOperand *MMO = BaseMMO;
1403 if (i != 0)
1404 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1405
1406 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1407 .addDef(LoadParts[i]) // vdata
1408 .addUse(RSrc) // rsrc
1409 .addUse(VIndex) // vindex
1410 .addUse(VOffset) // voffset
1411 .addUse(SOffset) // soffset
1412 .addImm(ImmOffset + 16 * i) // offset(imm)
1413 .addImm(0) // cachepolicy, swizzled buffer(imm)
1414 .addImm(0) // idxen(imm)
1415 .addMemOperand(MMO);
1416 }
1417
1418 // TODO: If only the resource is a VGPR, it may be better to execute the
1419 // scalar load in the waterfall loop if the resource is expected to frequently
1420 // be dynamically uniform.
1421 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1422 // Remove the original instruction to avoid potentially confusing the
1423 // waterfall loop logic.
1424 B.setInstr(*Span.begin());
1425 MI.eraseFromParent();
1426
1427 SmallSet<Register, 4> OpsToWaterfall;
1428
1429 OpsToWaterfall.insert(RSrc);
1430 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1431 OpsToWaterfall, MRI);
1432 }
1433
1434 if (NumLoads != 1) {
1435 if (Ty.isVector())
1436 B.buildConcatVectors(Dst, LoadParts);
1437 else
1438 B.buildMerge(Dst, LoadParts);
1439 }
1440
1441  // The waterfall loop path above already erased the instruction; otherwise erase it now.
1442 if (RSrcBank == &AMDGPU::SGPRRegBank)
1443 MI.eraseFromParent();
1444
1445 return true;
1446}
1447
1448bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
1449 const OperandsMapper &OpdMapper, bool Signed) const {
1450 MachineInstr &MI = OpdMapper.getMI();
1451 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1452
1453 // Insert basic copies
1454 applyDefaultMapping(OpdMapper);
1455
1456 Register DstReg = MI.getOperand(0).getReg();
1457 LLT Ty = MRI.getType(DstReg);
1458
1459 const LLT S32 = LLT::scalar(32);
1460
1461 const RegisterBank *DstBank =
1462 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1463 if (DstBank == &AMDGPU::VGPRRegBank) {
1464 if (Ty == S32)
1465 return true;
1466
1467 // TODO: 64-bit version is scalar only, so we need to expand this.
1468 return false;
1469 }
1470
1471 Register SrcReg = MI.getOperand(2).getReg();
1472 Register OffsetReg = MI.getOperand(3).getReg();
1473 Register WidthReg = MI.getOperand(4).getReg();
1474
1475 // The scalar form packs the offset and width in a single operand.
1476
1477 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1478 GISelObserverWrapper Observer(&ApplyBank);
1479 MachineIRBuilder B(MI);
1480 B.setChangeObserver(Observer);
1481
1482 // Ensure the high bits are clear to insert the offset.
1483 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1484 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1485
1486 // Zeros out the low bits, so don't bother clamping the input value.
1487 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1488
1489 // Transformation function, pack the offset and width of a BFE into
1490 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1491 // source, bits [5:0] contain the offset and bits [22:16] the width.
1492 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1493
1494 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1495 // register class constraints.
1496 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1497 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1498
1499 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1500 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1501    llvm_unreachable("failed to constrain BFE");
1502
1503 MI.eraseFromParent();
1504 return true;
1505}
1506
1507// FIXME: Duplicated from LegalizerHelper
1508static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
1509 switch (Opc) {
1510 case TargetOpcode::G_SMIN:
1511 return CmpInst::ICMP_SLT;
1512 case TargetOpcode::G_SMAX:
1513 return CmpInst::ICMP_SGT;
1514 case TargetOpcode::G_UMIN:
1515 return CmpInst::ICMP_ULT;
1516 case TargetOpcode::G_UMAX:
1517 return CmpInst::ICMP_UGT;
1518 default:
1519    llvm_unreachable("not in integer min/max");
1520 }
1521}
1522
1523static unsigned minMaxToExtend(unsigned Opc) {
1524 switch (Opc) {
1525 case TargetOpcode::G_SMIN:
1526 case TargetOpcode::G_SMAX:
1527 return TargetOpcode::G_SEXT;
1528 case TargetOpcode::G_UMIN:
1529 case TargetOpcode::G_UMAX:
1530 return TargetOpcode::G_ZEXT;
1531 default:
1532    llvm_unreachable("not in integer min/max");
1533 }
1534}
1535
1536// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1537// any illegal vector extend or unmerge operations.
1538static std::pair<Register, Register>
1539unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1540 const LLT S32 = LLT::scalar(32);
1541 auto Bitcast = B.buildBitcast(S32, Src);
1542
1543 if (ExtOpcode == TargetOpcode::G_SEXT) {
1544 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1545 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1546 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1547 }
1548
1549 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1550 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1551 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1552 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1553 }
1554
1555  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1556 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1557}
1558
1559static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
1560 CmpInst::Predicate Pred,
1561 Register Dst, Register Src0,
1562 Register Src1) {
1563 const LLT CmpType = LLT::scalar(32);
1564 auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
1565 return B.buildSelect(Dst, Cmp, Src0, Src1);
1566}
1567
1568// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
1569void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
1570 MachineInstr &MI) const {
1571 Register Dst = MI.getOperand(0).getReg();
1572 Register Src0 = MI.getOperand(1).getReg();
1573 Register Src1 = MI.getOperand(2).getReg();
1574
1575 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
1576 MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
1577
1578 Register CmpReg = Sel->getOperand(1).getReg();
1579 B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
1580 MI.eraseFromParent();
1581}
1582
1583// For cases where only a single copy is inserted for matching register banks,
1584// replace the register in the instruction operand.
1585static bool substituteSimpleCopyRegs(
1586 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1587 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1588 if (!SrcReg.empty()) {
1589    assert(SrcReg.size() == 1);
1590 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1591 return true;
1592 }
1593
1594 return false;
1595}
1596
1597/// Handle register layout difference for f16 images for some subtargets.
1598Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1599 MachineRegisterInfo &MRI,
1600 Register Reg) const {
1601 if (!Subtarget.hasUnpackedD16VMem())
1602 return Reg;
1603
1604 const LLT S16 = LLT::scalar(16);
1605 LLT StoreVT = MRI.getType(Reg);
1606 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1607 return Reg;
1608
1609 auto Unmerge = B.buildUnmerge(S16, Reg);
1610
1611
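  // With unpacked D16, each 16-bit component is expected to occupy its own
  // 32-bit register, so rebuild the value as a vector of 32-bit elements.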
1612 SmallVector<Register, 4> WideRegs;
1613 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1614 WideRegs.push_back(Unmerge.getReg(I));
1615
1616 const LLT S32 = LLT::scalar(32);
1617 int NumElts = StoreVT.getNumElements();
1618
1619 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1620}
1621
1622static std::pair<Register, unsigned>
1623getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1624 int64_t Const;
1625 if (mi_match(Reg, MRI, m_ICst(Const)))
1626 return std::make_pair(Register(), Const);
1627
1628 Register Base;
1629 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1630 return std::make_pair(Base, Const);
1631
1632 // TODO: Handle G_OR used for add case
1633 return std::make_pair(Reg, 0);
1634}
1635
1636std::pair<Register, unsigned>
1637AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1638 Register OrigOffset) const {
1639 const unsigned MaxImm = 4095;
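  // The buffer instruction immediate offset field is a 12-bit unsigned value,
  // so at most 4095 can stay in the instruction. For example, a constant
  // offset of 5000 is split into a base of 4096 and an immediate of 904.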
1640 Register BaseReg;
1641 unsigned ImmOffset;
1642 const LLT S32 = LLT::scalar(32);
1643
1644 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1645 OrigOffset);
1646
1647 unsigned C1 = 0;
1648 if (ImmOffset != 0) {
1649 // If the immediate value is too big for the immoffset field, put the value
1650 // and -4096 into the immoffset field so that the value that is copied/added
1651 // for the voffset field is a multiple of 4096, and it stands more chance
1652 // of being CSEd with the copy/add for another similar load/store.
1653 // However, do not do that rounding down to a multiple of 4096 if that is a
1654 // negative number, as it appears to be illegal to have a negative offset
1655 // in the vgpr, even if adding the immediate offset makes it positive.
1656 unsigned Overflow = ImmOffset & ~MaxImm;
1657 ImmOffset -= Overflow;
1658 if ((int32_t)Overflow < 0) {
1659 Overflow += ImmOffset;
1660 ImmOffset = 0;
1661 }
1662
1663 C1 = ImmOffset;
1664 if (Overflow != 0) {
1665 if (!BaseReg)
1666 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1667 else {
1668 auto OverflowVal = B.buildConstant(S32, Overflow);
1669 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1670 }
1671 }
1672 }
1673
1674 if (!BaseReg)
1675 BaseReg = B.buildConstant(S32, 0).getReg(0);
1676
1677 return {BaseReg, C1};
1678}
1679
1680static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1681 int64_t C;
1682 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1683}
1684
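// The cachepolicy immediate packs glc in bit 0, slc in bit 1 and dlc in bit 2;
// e.g. a cachepolicy of 5 (0b101) extracts to glc=1, slc=0, dlc=1.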
1685static unsigned extractGLC(unsigned CachePolicy) {
1686 return CachePolicy & 1;
1687}
1688
1689static unsigned extractSLC(unsigned CachePolicy) {
1690 return (CachePolicy >> 1) & 1;
1691}
1692
1693static unsigned extractDLC(unsigned CachePolicy) {
1694 return (CachePolicy >> 2) & 1;
1695}
1696
1697MachineInstr *
1698AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1699 MachineInstr &MI) const {
1700 MachineRegisterInfo &MRI = *B.getMRI();
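  // Operand 2 (rsrc) and operand 4 (soffset) must be uniform, so run the store
  // inside a waterfall loop if either one currently lives in a VGPR.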
1701 executeInWaterfallLoop(B, MI, MRI, {2, 4});
1702
1703 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1704
1705 Register VData = MI.getOperand(1).getReg();
1706 LLT Ty = MRI.getType(VData);
1707
1708 int EltSize = Ty.getScalarSizeInBits();
1709 int Size = Ty.getSizeInBits();
1710
1711 // FIXME: Broken integer truncstore.
1712 if (EltSize != 32)
1713 report_fatal_error("unhandled intrinsic store");
1714
1715 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1716 const int MemSize = (*MI.memoperands_begin())->getSize();
1717
1718
1719 Register RSrc = MI.getOperand(2).getReg();
1720 Register VOffset = MI.getOperand(3).getReg();
1721 Register SOffset = MI.getOperand(4).getReg();
1722 unsigned CachePolicy = MI.getOperand(5).getImm();
1723
1724 unsigned ImmOffset;
1725 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1726
1727 const bool Offen = !isZero(VOffset, MRI);
1728
1729 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1730 switch (8 * MemSize) {
1731 case 8:
1732 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1733 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1734 break;
1735 case 16:
1736 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1737 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1738 break;
1739 default:
1740 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1741 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1742 if (Size > 32)
1743 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1744 break;
1745 }
1746
1747
1748 // Set the insertion point back to the instruction in case it was moved into a
1749 // loop.
1750 B.setInstr(MI);
1751
1752 MachineInstrBuilder MIB = B.buildInstr(Opc)
1753 .addUse(VData);
1754
1755 if (Offen)
1756 MIB.addUse(VOffset);
1757
1758 MIB.addUse(RSrc)
1759 .addUse(SOffset)
1760 .addImm(ImmOffset)
1761 .addImm(extractGLC(CachePolicy))
1762 .addImm(extractSLC(CachePolicy))
1763 .addImm(0) // tfe: FIXME: Remove from inst
1764 .addImm(extractDLC(CachePolicy))
1765 .cloneMemRefs(MI);
1766
1767 // FIXME: We need a way to report failure from applyMappingImpl.
1768 // Insert constrain copies before inserting the loop.
1769 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1770 report_fatal_error("failed to constrain selected store intrinsic");
1771
1772 return MIB;
1773}
1774
1775bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1776 Register SrcReg) const {
1777 MachineRegisterInfo &MRI = *B.getMRI();
1778 LLT SrcTy = MRI.getType(SrcReg);
1779 if (SrcTy.getSizeInBits() == 32) {
1780 // Use a v_mov_b32 here to make the exec dependency explicit.
1781 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1782 .addDef(DstReg)
1783 .addUse(SrcReg);
1784 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1785 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1786 }
1787
1788 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1789 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1790
1791 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1792 .addDef(TmpReg0)
1793 .addUse(SrcReg, 0, AMDGPU::sub0);
1794 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1795 .addDef(TmpReg1)
1796 .addUse(SrcReg, 0, AMDGPU::sub1);
1797 B.buildInstr(AMDGPU::REG_SEQUENCE)
1798 .addDef(DstReg)
1799 .addUse(TmpReg0)
1800 .addImm(AMDGPU::sub0)
1801 .addUse(TmpReg1)
1802 .addImm(AMDGPU::sub1);
1803
1804 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1805 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1806}
1807
1808/// Utility function for pushing dynamic vector indexes with a constant offset
1809/// into waterfall loops.
1810static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1811 MachineInstr &IdxUseInstr,
1812 unsigned OpIdx,
1813 unsigned ConstOffset) {
1814 MachineRegisterInfo &MRI = *B.getMRI();
1815 const LLT S32 = LLT::scalar(32);
1816 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1817 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1818
1819 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1820
1821 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1822 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1823 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1824 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1825}
1826
1827void AMDGPURegisterBankInfo::applyMappingImpl(
1828 const OperandsMapper &OpdMapper) const {
1829 MachineInstr &MI = OpdMapper.getMI();
1830 unsigned Opc = MI.getOpcode();
1831 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1832 switch (Opc) {
1833 case AMDGPU::G_PHI: {
1834 Register DstReg = MI.getOperand(0).getReg();
1835 LLT DstTy = MRI.getType(DstReg);
1836 if (DstTy != LLT::scalar(1))
1837 break;
1838
1839 const LLT S32 = LLT::scalar(32);
1840 const RegisterBank *DstBank =
1841 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1842 if (DstBank == &AMDGPU::VCCRegBank) {
1843 applyDefaultMapping(OpdMapper);
1844 // The standard handling only considers the result register bank for
1845 // phis. For VCC, blindly inserting a copy when the phi is lowered will
1846 // produce an invalid copy. We can only copy with some kind of compare to
1847      // get a vector boolean result. Insert a register bank copy that will be
1848 // correctly lowered to a compare.
1849 MachineIRBuilder B(*MI.getParent()->getParent());
1850
1851 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1852 Register SrcReg = MI.getOperand(I).getReg();
1853 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1854
1855 if (SrcBank != &AMDGPU::VCCRegBank) {
1856 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
1857 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
1858
1859 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
1860 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
1861 MI.getOperand(I).setReg(Copy.getReg(0));
1862 }
1863 }
1864
1865 return;
1866 }
1867
1868 // Phi handling is strange and only considers the bank of the destination.
1869 substituteSimpleCopyRegs(OpdMapper, 0);
1870
1871 // Promote SGPR/VGPR booleans to s32
1872 MachineFunction *MF = MI.getParent()->getParent();
1873 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1874 GISelObserverWrapper Observer(&ApplyBank);
1875 MachineIRBuilder B(MI);
1876 LegalizerHelper Helper(*MF, Observer, B);
1877
1878 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1879      llvm_unreachable("widen scalar should have succeeded");
1880
1881 return;
1882 }
1883 case AMDGPU::G_ICMP:
1884 case AMDGPU::G_UADDO:
1885 case AMDGPU::G_USUBO:
1886 case AMDGPU::G_UADDE:
1887 case AMDGPU::G_SADDE:
1888 case AMDGPU::G_USUBE:
1889 case AMDGPU::G_SSUBE: {
1890 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
1891 Register DstReg = MI.getOperand(BoolDstOp).getReg();
1892
1893 const RegisterBank *DstBank =
1894 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1895 if (DstBank != &AMDGPU::SGPRRegBank)
1896 break;
1897
1898 const bool HasCarryIn = MI.getNumOperands() == 5;
1899
1900 // If this is a scalar compare, promote the result to s32, as the selection
1901 // will end up using a copy to a 32-bit vreg.
1902 const LLT S32 = LLT::scalar(32);
1903 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
1904 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
1905 MI.getOperand(BoolDstOp).setReg(NewDstReg);
1906 MachineIRBuilder B(MI);
1907
1908 if (HasCarryIn) {
1909 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
1910 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
1911 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
1912 MI.getOperand(4).setReg(NewSrcReg);
1913 }
1914
1915 MachineBasicBlock *MBB = MI.getParent();
1916 B.setInsertPt(*MBB, std::next(MI.getIterator()));
1917
1918 // If we had a constrained VCC result register, a copy was inserted to VCC
1919 // from SGPR.
1920 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
1921 if (DefRegs.empty())
1922 DefRegs.push_back(DstReg);
1923 B.buildTrunc(DefRegs[0], NewDstReg);
1924 return;
1925 }
1926 case AMDGPU::G_SELECT: {
1927 Register DstReg = MI.getOperand(0).getReg();
1928 LLT DstTy = MRI.getType(DstReg);
1929
1930 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
1931 if (CondRegs.empty())
1932 CondRegs.push_back(MI.getOperand(1).getReg());
1933 else {
1934      assert(CondRegs.size() == 1);
1935 }
1936
1937 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
1938 if (CondBank == &AMDGPU::SGPRRegBank) {
1939 MachineIRBuilder B(MI);
1940 const LLT S32 = LLT::scalar(32);
1941 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
1942 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
1943
1944 MI.getOperand(1).setReg(NewCondReg);
1945 B.buildZExt(NewCondReg, CondRegs[0]);
1946 }
1947
1948 if (DstTy.getSizeInBits() != 64)
1949 break;
1950
1951 MachineIRBuilder B(MI);
1952 LLT HalfTy = getHalfSizedType(DstTy);
1953
1954 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1955 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1956 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
1957
1958 // All inputs are SGPRs, nothing special to do.
1959 if (DefRegs.empty()) {
1960      assert(Src1Regs.empty() && Src2Regs.empty());
1961 break;
1962 }
1963
1964 if (Src1Regs.empty())
1965 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1966 else {
1967 setRegsToType(MRI, Src1Regs, HalfTy);
1968 }
1969
1970 if (Src2Regs.empty())
1971 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
1972 else
1973 setRegsToType(MRI, Src2Regs, HalfTy);
1974
1975 setRegsToType(MRI, DefRegs, HalfTy);
1976
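    // Emit one 32-bit select per half; both halves share the same condition.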
1977 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
1978 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
1979
1980 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1981 MI.eraseFromParent();
1982 return;
1983 }
1984 case AMDGPU::G_BRCOND: {
1985 Register CondReg = MI.getOperand(0).getReg();
1986 // FIXME: Should use legalizer helper, but should change bool ext type.
1987 const RegisterBank *CondBank =
1988 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1989
1990 if (CondBank == &AMDGPU::SGPRRegBank) {
1991 MachineIRBuilder B(MI);
1992 const LLT S32 = LLT::scalar(32);
1993 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
1994 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
1995
1996 MI.getOperand(0).setReg(NewCondReg);
1997 B.buildZExt(NewCondReg, CondReg);
1998 return;
1999 }
2000
2001 break;
2002 }
2003 case AMDGPU::G_AND:
2004 case AMDGPU::G_OR:
2005 case AMDGPU::G_XOR: {
2006 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2007 // there is a VGPR input.
2008 Register DstReg = MI.getOperand(0).getReg();
2009 LLT DstTy = MRI.getType(DstReg);
2010
2011 if (DstTy.getSizeInBits() == 1) {
2012 const RegisterBank *DstBank =
2013 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2014 if (DstBank == &AMDGPU::VCCRegBank)
2015 break;
2016
2017 MachineFunction *MF = MI.getParent()->getParent();
2018 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2019 GISelObserverWrapper Observer(&ApplyBank);
2020 MachineIRBuilder B(MI);
2021 LegalizerHelper Helper(*MF, Observer, B);
2022
2023 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2024 LegalizerHelper::Legalized)
2025        llvm_unreachable("widen scalar should have succeeded");
2026 return;
2027 }
2028
2029 if (DstTy.getSizeInBits() != 64)
2030 break;
2031
2032 LLT HalfTy = getHalfSizedType(DstTy);
2033 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2034 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2035 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2036
2037 // All inputs are SGPRs, nothing special to do.
2038 if (DefRegs.empty()) {
2039      assert(Src0Regs.empty() && Src1Regs.empty());
2040 break;
2041 }
2042
2043    assert(DefRegs.size() == 2);
2044    assert(Src0Regs.size() == Src1Regs.size() &&
2045           (Src0Regs.empty() || Src0Regs.size() == 2));
2046
2047 // Depending on where the source registers came from, the generic code may
2048 // have decided to split the inputs already or not. If not, we still need to
2049 // extract the values.
2050 MachineIRBuilder B(MI);
2051
2052 if (Src0Regs.empty())
2053 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2054 else
2055 setRegsToType(MRI, Src0Regs, HalfTy);
2056
2057 if (Src1Regs.empty())
2058 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2059 else
2060 setRegsToType(MRI, Src1Regs, HalfTy);
2061
2062 setRegsToType(MRI, DefRegs, HalfTy);
2063
2064 B.buildInstr(Opc)
2065 .addDef(DefRegs[0])
2066 .addUse(Src0Regs[0])
2067 .addUse(Src1Regs[0]);
2068
2069 B.buildInstr(Opc)
2070 .addDef(DefRegs[1])
2071 .addUse(Src0Regs[1])
2072 .addUse(Src1Regs[1]);
2073
2074 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2075 MI.eraseFromParent();
2076 return;
2077 }
2078 case AMDGPU::G_ADD:
2079 case AMDGPU::G_SUB:
2080 case AMDGPU::G_MUL: {
2081 Register DstReg = MI.getOperand(0).getReg();
2082 LLT DstTy = MRI.getType(DstReg);
2083 if (DstTy != LLT::scalar(16))
2084 break;
2085
2086 const RegisterBank *DstBank =
2087 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2088 if (DstBank == &AMDGPU::VGPRRegBank)
2089 break;
2090
2091 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2092 MachineFunction *MF = MI.getParent()->getParent();
2093 MachineIRBuilder B(MI);
2094 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2095 GISelObserverWrapper Observer(&ApplySALU);
2096 LegalizerHelper Helper(*MF, Observer, B);
2097
2098 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2099 LegalizerHelper::Legalized)
2100      llvm_unreachable("widen scalar should have succeeded");
2101 return;
2102 }
2103 case AMDGPU::G_SMIN:
2104 case AMDGPU::G_SMAX:
2105 case AMDGPU::G_UMIN:
2106 case AMDGPU::G_UMAX: {
2107 Register DstReg = MI.getOperand(0).getReg();
2108 const RegisterBank *DstBank =
2109 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2110 if (DstBank == &AMDGPU::VGPRRegBank)
2111 break;
2112
2113 MachineFunction *MF = MI.getParent()->getParent();
2114 MachineIRBuilder B(MI);
2115
2116 // Turn scalar min/max into a compare and select.
2117 LLT Ty = MRI.getType(DstReg);
2118 const LLT S32 = LLT::scalar(32);
2119 const LLT S16 = LLT::scalar(16);
2120 const LLT V2S16 = LLT::vector(2, 16);
2121
2122 if (Ty == V2S16) {
2123 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2124 GISelObserverWrapper Observer(&ApplySALU);
2125 B.setChangeObserver(Observer);
2126
2127 // Need to widen to s32, and expand as cmp + select, and avoid producing
2128 // illegal vector extends or unmerges that would need further
2129 // legalization.
2130 //
2131 // TODO: Should we just readfirstlane? That should probably be handled
2132 // with a UniformVGPR register bank that wouldn't need special
2133 // consideration here.
2134
2135 Register Dst = MI.getOperand(0).getReg();
2136 Register Src0 = MI.getOperand(1).getReg();
2137 Register Src1 = MI.getOperand(2).getReg();
2138
2139 Register WideSrc0Lo, WideSrc0Hi;
2140 Register WideSrc1Lo, WideSrc1Hi;
2141
2142 unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
2143
2144 std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
2145 std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
2146
2147 Register Lo = MRI.createGenericVirtualRegister(S32);
2148 Register Hi = MRI.createGenericVirtualRegister(S32);
2149 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
2150 buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
2151 buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
2152
2153 B.buildBuildVectorTrunc(Dst, {Lo, Hi});
2154 MI.eraseFromParent();
2155 } else if (Ty == S16) {
2156 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2157 GISelObserverWrapper Observer(&ApplySALU);
2158 LegalizerHelper Helper(*MF, Observer, B);
2159
2160 // Need to widen to s32, and expand as cmp + select.
2161 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2162        llvm_unreachable("widenScalar should have succeeded");
2163
2164 // FIXME: This is relying on widenScalar leaving MI in place.
2165 lowerScalarMinMax(B, MI);
2166 } else
2167 lowerScalarMinMax(B, MI);
2168
2169 return;
2170 }
2171 case AMDGPU::G_SEXT_INREG: {
2172 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2173 if (SrcRegs.empty())
2174 break; // Nothing to repair
2175
2176 const LLT S32 = LLT::scalar(32);
2177 MachineIRBuilder B(MI);
2178 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2179 GISelObserverWrapper Observer(&O);
2180 B.setChangeObserver(Observer);
2181
2182 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2183 // we would need to further expand, and doesn't let us directly set the
2184 // result registers.
2185 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2186
2187 int Amt = MI.getOperand(2).getImm();
2188 if (Amt <= 32) {
2189 if (Amt == 32) {
2190 // The low bits are unchanged.
2191 B.buildCopy(DstRegs[0], SrcRegs[0]);
2192 } else {
2193 // Extend in the low bits and propagate the sign bit to the high half.
2194 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2195 }
2196
2197 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2198 } else {
2199 // The low bits are unchanged, and extend in the high bits.
2200 B.buildCopy(DstRegs[0], SrcRegs[0]);
2201 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2202 }
2203
2204 Register DstReg = MI.getOperand(0).getReg();
2205 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2206 MI.eraseFromParent();
2207 return;
2208 }
2209 case AMDGPU::G_CTPOP:
2210 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2211 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2212 MachineIRBuilder B(MI);
2213 MachineFunction &MF = B.getMF();
2214
2215 const RegisterBank *DstBank =
2216 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2217 if (DstBank == &AMDGPU::SGPRRegBank)
2218 break;
2219
2220 Register SrcReg = MI.getOperand(1).getReg();
2221 const LLT S32 = LLT::scalar(32);
2222 LLT Ty = MRI.getType(SrcReg);
2223 if (Ty == S32)
2224 break;
2225
2226 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2227 GISelObserverWrapper Observer(&ApplyVALU);
2228 LegalizerHelper Helper(MF, Observer, B);
2229
2230 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2231      llvm_unreachable("narrowScalar should have succeeded");
2232 return;
2233 }
2234 case AMDGPU::G_SEXT:
2235 case AMDGPU::G_ZEXT: {
2236 Register SrcReg = MI.getOperand(1).getReg();
2237 LLT SrcTy = MRI.getType(SrcReg);
2238 bool Signed = Opc == AMDGPU::G_SEXT;
2239
2240 MachineIRBuilder B(MI);
2241 const RegisterBank *SrcBank =
2242 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2243
2244 Register DstReg = MI.getOperand(0).getReg();
2245 LLT DstTy = MRI.getType(DstReg);
2246 if (DstTy.isScalar() &&
2247 SrcBank != &AMDGPU::SGPRRegBank &&
2248 SrcBank != &AMDGPU::VCCRegBank &&
2249        // FIXME: Should handle any type that rounds to s64 once irregular
2250        // breakdowns are supported.
2251 DstTy.getSizeInBits() == 64 &&
2252 SrcTy.getSizeInBits() <= 32) {
2253 const LLT S32 = LLT::scalar(32);
2254 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2255
2256 // Extend to 32-bit, and then extend the low half.
2257 if (Signed) {
2258 // TODO: Should really be buildSExtOrCopy
2259 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2260
2261 // Replicate sign bit from 32-bit extended part.
2262 auto ShiftAmt = B.buildConstant(S32, 31);
2263 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
2264 B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
2265 } else {
2266 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2267 B.buildConstant(DefRegs[1], 0);
2268 }
2269
2270 MRI.setRegBank(DstReg, *SrcBank);
2271 MI.eraseFromParent();
2272 return;
2273 }
2274
2275 if (SrcTy != LLT::scalar(1))
2276 return;
2277
2278 if (SrcBank == &AMDGPU::VCCRegBank) {
2279 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2280
2281 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2282
2283 unsigned DstSize = DstTy.getSizeInBits();
2284 // 64-bit select is SGPR only
2285 const bool UseSel64 = DstSize > 32 &&
2286 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2287
2288 // TODO: Should s16 select be legal?
2289 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
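      // A true bool sign-extends to all ones (-1) and zero-extends to 1.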
2290 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2291 auto False = B.buildConstant(SelType, 0);
2292
2293 MRI.setRegBank(True.getReg(0), *DstBank);
2294 MRI.setRegBank(False.getReg(0), *DstBank);
2295 MRI.setRegBank(DstReg, *DstBank);
2296
2297 if (DstSize > 32) {
2298 B.buildSelect(DefRegs[0], SrcReg, True, False);
2299 B.buildCopy(DefRegs[1], DefRegs[0]);
2300 } else if (DstSize < 32) {
2301 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2302 MRI.setRegBank(Sel.getReg(0), *DstBank);
2303 B.buildTrunc(DstReg, Sel);
2304 } else {
2305 B.buildSelect(DstReg, SrcReg, True, False);
2306 }
2307
2308 MI.eraseFromParent();
2309 return;
2310 }
2311
2312 // Fixup the case with an s1 src that isn't a condition register. Use shifts
2313 // instead of introducing a compare to avoid an unnecessary condition
2314    // register (and since there are no scalar 16-bit compares).
2315 auto Ext = B.buildAnyExt(DstTy, SrcReg);
2316 auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
2317 auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
2318
2319 if (MI.getOpcode() == AMDGPU::G_SEXT)
2320 B.buildAShr(DstReg, Shl, ShiftAmt);
2321 else
2322 B.buildLShr(DstReg, Shl, ShiftAmt);
2323
2324 MRI.setRegBank(DstReg, *SrcBank);
2325 MRI.setRegBank(Ext.getReg(0), *SrcBank);
2326 MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
2327 MRI.setRegBank(Shl.getReg(0), *SrcBank);
2328 MI.eraseFromParent();
2329 return;
2330 }
2331 case AMDGPU::G_BUILD_VECTOR:
2332 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2333 Register DstReg = MI.getOperand(0).getReg();
2334 LLT DstTy = MRI.getType(DstReg);
2335 if (DstTy != LLT::vector(2, 16))
2336 break;
2337
2338    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2339 substituteSimpleCopyRegs(OpdMapper, 1);
2340 substituteSimpleCopyRegs(OpdMapper, 2);
2341
2342 const RegisterBank *DstBank =
2343 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2344 if (DstBank == &AMDGPU::SGPRRegBank)
2345 break; // Can use S_PACK_* instructions.
2346
2347 MachineIRBuilder B(MI);
2348
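    // The SGPR case was handled above with S_PACK_*; for the VGPR case, assemble
    // the result by hand: low element in bits [15:0], high element shifted into
    // bits [31:16], OR them together and bitcast back to <2 x s16>.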
2349 Register Lo = MI.getOperand(1).getReg();
2350 Register Hi = MI.getOperand(2).getReg();
2351 const LLT S32 = LLT::scalar(32);
2352
2353 const RegisterBank *BankLo =
2354 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2355 const RegisterBank *BankHi =
2356 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2357
2358 Register ZextLo;
2359 Register ShiftHi;
2360
2361 if (Opc == AMDGPU::G_BUILD_VECTOR) {
2362 ZextLo = B.buildZExt(S32, Lo).getReg(0);
2363 MRI.setRegBank(ZextLo, *BankLo);
2364
2365 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2366 MRI.setRegBank(ZextHi, *BankHi);
2367
2368 auto ShiftAmt = B.buildConstant(S32, 16);
2369 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2370
2371 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2372 MRI.setRegBank(ShiftHi, *BankHi);
2373 } else {
2374 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2375 MRI.setRegBank(MaskLo, *BankLo);
2376
2377 auto ShiftAmt = B.buildConstant(S32, 16);
2378 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2379
2380 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2381 MRI.setRegBank(ShiftHi, *BankHi);
2382
2383 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2384 MRI.setRegBank(ZextLo, *BankLo);
2385 }
2386
2387 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2388 MRI.setRegBank(Or.getReg(0), *DstBank);
2389
2390 B.buildBitcast(DstReg, Or);
2391 MI.eraseFromParent();
2392 return;
2393 }
2394 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2395 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2396
2397    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2398
2399 Register DstReg = MI.getOperand(0).getReg();
2400 Register SrcReg = MI.getOperand(1).getReg();
2401
2402 const LLT S32 = LLT::scalar(32);
2403 LLT DstTy = MRI.getType(DstReg);
2404 LLT SrcTy = MRI.getType(SrcReg);
2405
2406 MachineIRBuilder B(MI);
2407
2408 const ValueMapping &DstMapping
2409 = OpdMapper.getInstrMapping().getOperandMapping(0);
2410 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2411 const RegisterBank *SrcBank =
2412 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2413 const RegisterBank *IdxBank =
2414 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2415
2416 Register BaseIdxReg;
2417 unsigned ConstOffset;
2418 MachineInstr *OffsetDef;
2419 std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
2420 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2421
2422 // See if the index is an add of a constant which will be foldable by moving
2423 // the base register of the index later if this is going to be executed in a
2424 // waterfall loop. This is essentially to reassociate the add of a constant
2425 // with the readfirstlane.
2426 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2427 ConstOffset > 0 &&
2428 ConstOffset < SrcTy.getNumElements();
2429
2430 // Move the base register. We'll re-insert the add later.
2431 if (ShouldMoveIndexIntoLoop)
2432 MI.getOperand(2).setReg(BaseIdxReg);
2433
2434 // If this is a VGPR result only because the index was a VGPR result, the
2435 // actual indexing will be done on the SGPR source vector, which will
2436 // produce a scalar result. We need to copy to the VGPR result inside the
2437 // waterfall loop.
2438 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2439 SrcBank == &AMDGPU::SGPRRegBank;
2440 if (DstRegs.empty()) {
2441 applyDefaultMapping(OpdMapper);
2442
2443 executeInWaterfallLoop(MI, MRI, { 2 });
2444
2445 if (NeedCopyToVGPR) {
2446 // We don't want a phi for this temporary reg.
2447 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2448 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2449 MI.getOperand(0).setReg(TmpReg);
2450 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2451
2452 // Use a v_mov_b32 here to make the exec dependency explicit.
2453 buildVCopy(B, DstReg, TmpReg);
2454 }
2455
2456 // Re-insert the constant offset add inside the waterfall loop.
2457 if (ShouldMoveIndexIntoLoop)
2458 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2459
2460 return;
2461 }
2462
2463    assert(DstTy.getSizeInBits() == 64);
2464
2465 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2466
2467 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2468 auto One = B.buildConstant(S32, 1);
2469
2470 MachineBasicBlock::iterator MII = MI.getIterator();
2471
2472 // Split the vector index into 32-bit pieces. Prepare to move all of the
2473 // new instructions into a waterfall loop if necessary.
2474 //
2475 // Don't put the bitcast or constant in the loop.
2476 MachineInstrSpan Span(MII, &B.getMBB());
2477
2478 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2479 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2480 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2481
2482 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2483 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2484
2485 MRI.setRegBank(DstReg, *DstBank);
2486 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2487 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2488 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2489 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2490
2491 SmallSet<Register, 4> OpsToWaterfall;
2492 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2493 MI.eraseFromParent();
2494 return;
2495 }
2496
2497 // Remove the original instruction to avoid potentially confusing the
2498 // waterfall loop logic.
2499 B.setInstr(*Span.begin());
2500 MI.eraseFromParent();
2501 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2502 OpsToWaterfall, MRI);
2503
2504 if (NeedCopyToVGPR) {
2505 MachineBasicBlock *LoopBB = Extract1->getParent();
2506 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2507 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2508 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2509 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2510
2511 Extract0->getOperand(0).setReg(TmpReg0);
2512 Extract1->getOperand(0).setReg(TmpReg1);
2513
2514 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2515
2516 buildVCopy(B, DstRegs[0], TmpReg0);
2517 buildVCopy(B, DstRegs[1], TmpReg1);
2518 }
2519
2520 if (ShouldMoveIndexIntoLoop)
2521 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2522
2523 return;
2524 }
2525 case AMDGPU::G_INSERT_VECTOR_ELT: {
2526 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2527
2528 Register DstReg = MI.getOperand(0).getReg();
2529 LLT VecTy = MRI.getType(DstReg);
2530
2531    assert(OpdMapper.getVRegs(0).empty());
2532    assert(OpdMapper.getVRegs(3).empty());
2533
2534 const RegisterBank *IdxBank =
2535 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2536
2537 if (substituteSimpleCopyRegs(OpdMapper, 1))
2538 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2539
2540 Register SrcReg = MI.getOperand(1).getReg();
2541 Register InsReg = MI.getOperand(2).getReg();
2542 LLT InsTy = MRI.getType(InsReg);
2543 (void)InsTy;
2544
2545 Register BaseIdxReg;
2546 unsigned ConstOffset;
2547 MachineInstr *OffsetDef;
2548 std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
2549 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2550
2551 // See if the index is an add of a constant which will be foldable by moving
2552 // the base register of the index later if this is going to be executed in a
2553 // waterfall loop. This is essentially to reassociate the add of a constant
2554 // with the readfirstlane.
2555 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2556 ConstOffset > 0 &&
2557 ConstOffset < VecTy.getNumElements();
2558
2559 // Move the base register. We'll re-insert the add later.
2560 if (ShouldMoveIndexIntoLoop)
2561 MI.getOperand(3).setReg(BaseIdxReg);
2562
2563
2564 if (InsRegs.empty()) {
2565 executeInWaterfallLoop(MI, MRI, { 3 });
2566
2567 // Re-insert the constant offset add inside the waterfall loop.
2568 if (ShouldMoveIndexIntoLoop) {
2569 MachineIRBuilder B(MI);
2570 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2571 }
2572
2573 return;
2574 }
2575
2576
2577 assert(InsTy.getSizeInBits() == 64);
2578
2579 const LLT S32 = LLT::scalar(32);
2580 LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
2581
2582 MachineIRBuilder B(MI);
2583 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2584 auto One = B.buildConstant(S32, 1);
2585
2586 // Split the vector index into 32-bit pieces. Prepare to move all of the
2587 // new instructions into a waterfall loop if necessary.
2588 //
2589 // Don't put the bitcast or constant in the loop.
2590 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2591
2592 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2593 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2594 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2595
2596 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2597 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2598
2599 const RegisterBank *DstBank =
2600 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2601 const RegisterBank *SrcBank =
2602 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2603 const RegisterBank *InsSrcBank =
2604 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2605
2606 MRI.setRegBank(InsReg, *InsSrcBank);
2607 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2608 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2609 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2610 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2611 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2612 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2613
2614
2615 SmallSet<Register, 4> OpsToWaterfall;
2616 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2617 B.setInsertPt(B.getMBB(), MI);
2618 B.buildBitcast(DstReg, InsHi);
2619 MI.eraseFromParent();
2620 return;
2621 }
2622
2623 B.setInstr(*Span.begin());
2624 MI.eraseFromParent();
2625
2626 // Figure out the point after the waterfall loop before mangling the control
2627 // flow.
2628 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2629 OpsToWaterfall, MRI);
2630
2631 // The insertion point is now right after the original instruction.
2632 //
2633 // Keep the bitcast to the original vector type out of the loop. Doing this
2634 // saves an extra phi we don't need inside the loop.
2635 B.buildBitcast(DstReg, InsHi);
2636
2637 // Re-insert the constant offset add inside the waterfall loop.
2638 if (ShouldMoveIndexIntoLoop)
2639 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2640
2641 return;
2642 }
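// A minimal MIR sketch (illustrative only; names are hypothetical) of the
// 64-bit insert decomposition handled above: the vector is bitcast to 32-bit
// elements, both halves of the value are inserted at 2*Idx and 2*Idx+1, and
// the final bitcast back to the original type stays outside the waterfall loop:
//   %idx_lo:sgpr(s32) = G_SHL %base_idx, %one
//   %idx_hi:sgpr(s32) = G_ADD %idx_lo, %one
//   %ins_lo:_(<2N x s32>) = G_INSERT_VECTOR_ELT %cast_src, %val_lo, %idx_lo
//   %ins_hi:_(<2N x s32>) = G_INSERT_VECTOR_ELT %ins_lo, %val_hi, %idx_hi
//   %dst:_(<N x s64>) = G_BITCAST %ins_hi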
2643 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2644 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2645 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2646 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2647 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2648 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2649 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2650 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2651 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2652 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2653 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2654 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2655 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2656 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2657 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2658 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2659 applyDefaultMapping(OpdMapper);
2660 executeInWaterfallLoop(MI, MRI, {1, 4});
2661 return;
2662 }
2663 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2664 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2665 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2666 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2667 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2668 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2669 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2670 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2671 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2672 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2673 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2674 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2675 applyDefaultMapping(OpdMapper);
2676 executeInWaterfallLoop(MI, MRI, {2, 5});
2677 return;
2678 }
2679 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2680 applyDefaultMapping(OpdMapper);
2681 executeInWaterfallLoop(MI, MRI, {3, 6});
2682 return;
2683 }
2684 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2685 applyMappingSBufferLoad(OpdMapper);
2686 return;
2687 }
2688 case AMDGPU::G_INTRINSIC: {
2689 switch (MI.getIntrinsicID()) {
2690 case Intrinsic::amdgcn_readlane: {
2691 substituteSimpleCopyRegs(OpdMapper, 2);
2692
2693 assert(OpdMapper.getVRegs(0).empty());
2694 assert(OpdMapper.getVRegs(3).empty());
2695
2696 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2697 // waterfall loop, so assume it's a uniform value.
2698 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2699 return;
2700 }
2701 case Intrinsic::amdgcn_writelane: {
2702 assert(OpdMapper.getVRegs(0).empty());
2703 assert(OpdMapper.getVRegs(2).empty());
2704 assert(OpdMapper.getVRegs(3).empty());
2705
2706 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2707 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2708 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2709 return;
2710 }
2711 case Intrinsic::amdgcn_interp_p1:
2712 case Intrinsic::amdgcn_interp_p2:
2713 case Intrinsic::amdgcn_interp_mov:
2714 case Intrinsic::amdgcn_interp_p1_f16:
2715 case Intrinsic::amdgcn_interp_p2_f16: {
2716 applyDefaultMapping(OpdMapper);
2717
2718 // Readlane for m0 value, which is always the last operand.
2719 // FIXME: Should this be a waterfall loop instead?
2720 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2721 return;
2722 }
2723 case Intrinsic::amdgcn_permlane16:
2724 case Intrinsic::amdgcn_permlanex16: {
2725 // Doing a waterfall loop over these wouldn't make any sense.
2726 substituteSimpleCopyRegs(OpdMapper, 2);
2727 substituteSimpleCopyRegs(OpdMapper, 3);
2728 constrainOpWithReadfirstlane(MI, MRI, 4);
2729 constrainOpWithReadfirstlane(MI, MRI, 5);
2730 return;
2731 }
2732 case Intrinsic::amdgcn_sbfe:
2733 applyMappingBFEIntrinsic(OpdMapper, true);
2734 return;
2735 case Intrinsic::amdgcn_ubfe:
2736 applyMappingBFEIntrinsic(OpdMapper, false);
2737 return;
2738 }
2739 break;
2740 }
2741 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2742 auto IntrID = MI.getIntrinsicID();
2743 switch (IntrID) {
2744 case Intrinsic::amdgcn_ds_ordered_add:
2745 case Intrinsic::amdgcn_ds_ordered_swap: {
2746 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2747 assert(OpdMapper.getVRegs(0).empty());
2748 substituteSimpleCopyRegs(OpdMapper, 3);
2749 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2750 return;
2751 }
2752 case Intrinsic::amdgcn_ds_gws_init:
2753 case Intrinsic::amdgcn_ds_gws_barrier:
2754 case Intrinsic::amdgcn_ds_gws_sema_br: {
2755 // Only the first lane executes, so readfirstlane is safe.
2756 substituteSimpleCopyRegs(OpdMapper, 1);
2757 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2758 return;
2759 }
2760 case Intrinsic::amdgcn_ds_gws_sema_v:
2761 case Intrinsic::amdgcn_ds_gws_sema_p:
2762 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2763 // Only the first lane executes, so readfirstlane is safe.
2764 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2765 return;
2766 }
2767 case Intrinsic::amdgcn_ds_append:
2768 case Intrinsic::amdgcn_ds_consume: {
2769 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2770 return;
2771 }
2772 case Intrinsic::amdgcn_s_sendmsg:
2773 case Intrinsic::amdgcn_s_sendmsghalt: {
2774 // FIXME: Should this use a waterfall loop?
2775 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2776 return;
2777 }
2778 default: {
2779 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2780 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2781 // Non-images can have complications from operands that allow both SGPR
2782 // and VGPR. For now it's too complicated to figure out the final opcode
2783 // to derive the register bank from the MCInstrDesc.
2784 if (RSrcIntrin->IsImage) {
2785 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2786 return;
2787 }
2788 }
2789
2790 break;
2791 }
2792 }
2793 break;
2794 }
2795 case AMDGPU::G_LOAD:
2796 case AMDGPU::G_ZEXTLOAD:
2797 case AMDGPU::G_SEXTLOAD: {
2798 if (applyMappingWideLoad(MI, OpdMapper, MRI))
2799 return;
2800 break;
2801 }
2802 default:
2803 break;
2804 }
2805
2806 return applyDefaultMapping(OpdMapper);
2807}
2808
2809bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
2810 const MachineFunction &MF = *MI.getParent()->getParent();
2811 const MachineRegisterInfo &MRI = MF.getRegInfo();
2812 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2813 if (!MI.getOperand(i).isReg())
2814 continue;
2815 Register Reg = MI.getOperand(i).getReg();
2816 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
2817 if (Bank->getID() != AMDGPU::SGPRRegBankID)
2818 return false;
2819 }
2820 }
2821 return true;
2822}
2823
2824const RegisterBankInfo::InstructionMapping &
2825AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
2826 const MachineFunction &MF = *MI.getParent()->getParent();
2827 const MachineRegisterInfo &MRI = MF.getRegInfo();
2828 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2829
2830 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2831 const MachineOperand &SrcOp = MI.getOperand(i);
2832 if (!SrcOp.isReg())
2833 continue;
2834
2835 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
2836 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2837 }
2838 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2839 MI.getNumOperands());
2840}
2841
2842const RegisterBankInfo::InstructionMapping &
2843AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
2844 const MachineFunction &MF = *MI.getParent()->getParent();
2845 const MachineRegisterInfo &MRI = MF.getRegInfo();
2846 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2847
2848 // Even though we technically could use SGPRs, this would require knowledge of
2849 // the constant bus restriction. Force all sources to VGPR (except for VCC).
2850 //
2851 // TODO: Unary ops are trivially OK, so accept SGPRs?
2852 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2853 const MachineOperand &Src = MI.getOperand(i);
2854 if (!Src.isReg())
2855 continue;
2856
2857 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
2858 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
2859 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
2860 }
2861
2862 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2863 MI.getNumOperands());
2864}
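// Illustration (a sketch, not taken from the source): for
//   %d:_(s32) = G_FADD %a:_(s32), %b:_(s32)
// getDefaultMappingVOP maps %d, %a and %b to the VGPR bank, while a 1-bit
// operand (such as the carry of G_UADDE) would be mapped to VCC instead.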
2865
2866const RegisterBankInfo::InstructionMapping &
2867AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
2868 const MachineFunction &MF = *MI.getParent()->getParent();
2869 const MachineRegisterInfo &MRI = MF.getRegInfo();
2870 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
2871
2872 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
2873 const MachineOperand &Op = MI.getOperand(I);
2874 if (!Op.isReg())
2875 continue;
2876
2877 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
2878 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2879 }
2880
2881 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
2882 MI.getNumOperands());
2883}
2884
2885const RegisterBankInfo::InstructionMapping &
2886AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
2887 const MachineInstr &MI,
2888 int RsrcIdx) const {
2889 // The reported argument index is relative to the IR intrinsic call arguments,
2890 // so we need to shift by the number of defs and the intrinsic ID.
2891 RsrcIdx += MI.getNumExplicitDefs() + 1;
2892
2893 const int NumOps = MI.getNumOperands();
2894 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
2895
2896 // TODO: Should packed/unpacked D16 difference be reported here as part of
2897 // the value mapping?
2898 for (int I = 0; I != NumOps; ++I) {
2899 if (!MI.getOperand(I).isReg())
2900 continue;
2901
2902 Register OpReg = MI.getOperand(I).getReg();
2903 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
2904
2905 // FIXME: Probably need a new intrinsic register bank searchable table to
2906 // handle arbitrary intrinsics easily.
2907 //
2908 // If this has a sampler, it immediately follows rsrc.
2909 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
2910
2911 if (MustBeSGPR) {
2912 // This must be an SGPR, so we must report whatever it is as legal.
2913 unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2914 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
2915 } else {
2916 // Some operands must be VGPR, and these are easy to copy to.
2917 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2918 }
2919 }
2920
2921 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
2922}
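// Worked example (illustrative): with one explicit def and the intrinsic ID as
// the next operand, an IR-level rsrc argument index of 2 becomes machine
// operand index 2 + 1 + 1 = 4, so operands 4 (rsrc) and 5 (sampler, if
// present) are the ones reported with their current bank and fixed up to
// SGPRs later.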
2923
2924 /// Return the mapping for a pointer argument.
2925const RegisterBankInfo::ValueMapping *
2926AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
2927 Register PtrReg) const {
2928 LLT PtrTy = MRI.getType(PtrReg);
2929 unsigned Size = PtrTy.getSizeInBits();
2930 if (Subtarget.useFlatForGlobal() ||
2931 !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
2932 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
2933
2934 // If we're using MUBUF instructions for global memory, an SGPR base register
2935 // is possible. Otherwise this needs to be a VGPR.
2936 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
2937 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
2938}
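// Illustration (restating the rule above): a pointer in a flat-global address
// space keeps whatever bank it already has, since a MUBUF access can take an
// SGPR base; useFlatForGlobal() or any other address space forces the VGPR
// bank.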
2939
2940const RegisterBankInfo::InstructionMapping &
2941AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
2942
2943 const MachineFunction &MF = *MI.getParent()->getParent();
2944 const MachineRegisterInfo &MRI = MF.getRegInfo();
2945 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
2946 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2947 LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
2948 Register PtrReg = MI.getOperand(1).getReg();
2949 LLT PtrTy = MRI.getType(PtrReg);
2950 unsigned AS = PtrTy.getAddressSpace();
2951 unsigned PtrSize = PtrTy.getSizeInBits();
2952
2953 const ValueMapping *ValMapping;
2954 const ValueMapping *PtrMapping;
2955
2956 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
2957
2958 if (PtrBank == &AMDGPU::SGPRRegBank &&
2959 SITargetLowering::isFlatGlobalAddrSpace(AS)) {
2960 if (isScalarLoadLegal(MI)) {
2961 // We have a uniform instruction so we want to use an SMRD load
2962 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2963 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
2964 } else {
2965 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
Value stored to 'ValMapping' is never read
2966
2967 // If we're using MUBUF instructions for global memory, an SGPR base
2968 // register is possible. Otherwise this needs to be a VGPR.
2969 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
2970 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
2971
2972 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
2973 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID,
2974 LoadTy);
2975 }
2976 } else {
2977 ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
2978 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
2979 }
2980
2981 OpdsMapping[0] = ValMapping;
2982 OpdsMapping[1] = PtrMapping;
2983 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
2984 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
2985 return Mapping;
2986
2987 // FIXME: Do we want to add a mapping for FLAT load, or should we just
2988 // handle that during instruction selection?
2989}
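// Summary of the cases above (a sketch, no new behavior):
//   SGPR pointer, flat/global AS, legal scalar load -> SGPR value, SGPR pointer (SMRD)
//   SGPR pointer, flat/global AS, otherwise         -> VGPR value; SGPR pointer
//                                                      unless useFlatForGlobal()
//   anything else                                   -> VGPR value, VGPR pointer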
2990
2991unsigned
2992AMDGPURegisterBankInfo::getRegBankID(Register Reg,
2993 const MachineRegisterInfo &MRI,
2994 const TargetRegisterInfo &TRI,
2995 unsigned Default) const {
2996 const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
2997 return Bank ? Bank->getID() : Default;
2998}
2999
3000
3001static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3002 return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
3003 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3004}
3005
3006static int regBankBoolUnion(int RB0, int RB1) {
3007 if (RB0 == -1)
3008 return RB1;
3009 if (RB1 == -1)
3010 return RB0;
3011
3012 // vcc, vcc -> vcc
3013 // vcc, sgpr -> vcc
3014 // vcc, vgpr -> vcc
3015 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3016 return AMDGPU::VCCRegBankID;
3017
3018 // sgpr, vgpr -> vgpr
3019 return regBankUnion(RB0, RB1);
3020}
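// Quick reference for the two helpers above (illustrative restatement):
//   regBankUnion(SGPR, SGPR) == SGPR; any VGPR input yields VGPR.
//   regBankBoolUnion treats -1 as "not yet known" and lets VCC dominate over
//   SGPR and VGPR inputs.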
3021
3022const RegisterBankInfo::ValueMapping *
3023AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3024 const MachineRegisterInfo &MRI,
3025 const TargetRegisterInfo &TRI) const {
3026 // Lie and claim anything is legal, even though this needs to be an SGPR.
3027 // applyMapping will have to deal with it as a waterfall loop.
3028 unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
3029 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3030 return AMDGPU::getValueMapping(Bank, Size);
3031}
3032
3033const RegisterBankInfo::ValueMapping *
3034AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3035 const MachineRegisterInfo &MRI,
3036 const TargetRegisterInfo &TRI) const {
3037 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3038 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3039}
3040
3041const RegisterBankInfo::ValueMapping *
3042AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3043 const MachineRegisterInfo &MRI,
3044 const TargetRegisterInfo &TRI) const {
3045 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3046 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3047}
3048
3049///
3050/// This function must return a legal mapping, because
3051/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3052/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3053 /// VGPR-to-SGPR copy to be generated is illegal.
3054///
3055// Operands that must be SGPRs must accept potentially divergent VGPRs as
3056// legal. These will be dealt with in applyMappingImpl.
3057//
3058const RegisterBankInfo::InstructionMapping &
3059AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3060 const MachineFunction &MF = *MI.getParent()->getParent();
3061 const MachineRegisterInfo &MRI = MF.getRegInfo();
3062
3063 if (MI.isRegSequence()) {
3064 // If any input is a VGPR, the result must be a VGPR. The default handling
3065 // assumes any copy between banks is legal.
3066 unsigned BankID = AMDGPU::SGPRRegBankID;
3067
3068 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3069 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
3070 // It doesn't make sense to use vcc or scc banks here, so just ignore
3071 // them.
3072 if (OpBank != AMDGPU::SGPRRegBankID) {
3073 BankID = AMDGPU::VGPRRegBankID;
3074 break;
3075 }
3076 }
3077 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3078
3079 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3080 return getInstructionMapping(
3081 1, /*Cost*/ 1,
3082 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3083 }
3084
3085 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3086 // properly.
3087 //
3088 // TODO: There are additional exec masking dependencies to analyze.
3089 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3090 // TODO: Generate proper invalid bank enum.
3091 int ResultBank = -1;
3092 Register DstReg = MI.getOperand(0).getReg();
3093
3094 // Sometimes the result may have already been assigned a bank.
3095 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3096 ResultBank = DstBank->getID();
3097
3098 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3099 Register Reg = MI.getOperand(I).getReg();
3100 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3101
3102 // FIXME: Assuming VGPR for any undetermined inputs.
3103 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3104 ResultBank = AMDGPU::VGPRRegBankID;
3105 break;
3106 }
3107
3108 // FIXME: Need to promote SGPR case to s32
3109 unsigned OpBank = Bank->getID();
3110 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3111 }
3112
3113 assert(ResultBank != -1);
3114
3115 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3116
3117 const ValueMapping &ValMap =
3118 getValueMapping(0, Size, getRegBank(ResultBank));
3119 return getInstructionMapping(
3120 1, /*Cost*/ 1,
3121 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3122 }
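// Illustration (hypothetical example): for
//   %d = G_PHI %a(sgpr), %bb0, %b(vgpr), %bb1
// any VGPR or unknown-bank input forces the result to the VGPR bank; a mix of
// VCC and SGPR inputs resolves to VCC through regBankBoolUnion; only all-SGPR
// inputs keep the result on the SGPR bank.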
3123
3124 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3125 if (Mapping.isValid())
3126 return Mapping;
3127
3128 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3129
3130 switch (MI.getOpcode()) {
3131 default:
3132 return getInvalidInstructionMapping();
3133
3134 case AMDGPU::G_AND:
3135 case AMDGPU::G_OR:
3136 case AMDGPU::G_XOR: {
3137 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3138 if (Size == 1) {
3139 const RegisterBank *DstBank
3140 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3141
3142 unsigned TargetBankID = -1;
3143 unsigned BankLHS = -1;
3144 unsigned BankRHS = -1;
3145 if (DstBank) {
3146 TargetBankID = DstBank->getID();
3147 if (DstBank == &AMDGPU::VCCRegBank) {
3148 TargetBankID = AMDGPU::VCCRegBankID;
3149 BankLHS = AMDGPU::VCCRegBankID;
3150 BankRHS = AMDGPU::VCCRegBankID;
3151 } else {
3152 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
3153 AMDGPU::SGPRRegBankID);
3154 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3155 AMDGPU::SGPRRegBankID);
3156 }
3157 } else {
3158 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
3159 AMDGPU::VCCRegBankID);
3160 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3161 AMDGPU::VCCRegBankID);
3162
3163 // Both inputs should be true booleans to produce a boolean result.
3164 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3165 TargetBankID = AMDGPU::VGPRRegBankID;
3166 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3167 TargetBankID = AMDGPU::VCCRegBankID;
3168 BankLHS = AMDGPU::VCCRegBankID;
3169 BankRHS = AMDGPU::VCCRegBankID;
3170 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3171 TargetBankID = AMDGPU::SGPRRegBankID;
3172 }
3173 }
3174
3175 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3176 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3177 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3178 break;
3179 }
3180
3181 if (Size == 64) {
3182
3183 if (isSALUMapping(MI)) {
3184 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3185 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3186 } else {
3187 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3188 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
3189 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3190
3191 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
3192 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3193 }
3194
3195 break;
3196 }
3197
3198 LLVM_FALLTHROUGH;
3199 }
3200 case AMDGPU::G_PTR_ADD:
3201 case AMDGPU::G_ADD:
3202 case AMDGPU::G_SUB:
3203 case AMDGPU::G_MUL:
3204 case AMDGPU::G_SHL:
3205 case AMDGPU::G_LSHR:
3206 case AMDGPU::G_ASHR:
3207 case AMDGPU::G_UADDO:
3208 case AMDGPU::G_USUBO:
3209 case AMDGPU::G_UADDE:
3210 case AMDGPU::G_SADDE:
3211 case AMDGPU::G_USUBE:
3212 case AMDGPU::G_SSUBE:
3213 case AMDGPU::G_SMIN:
3214 case AMDGPU::G_SMAX:
3215 case AMDGPU::G_UMIN:
3216 case AMDGPU::G_UMAX:
3217 case AMDGPU::G_SHUFFLE_VECTOR:
3218 if (isSALUMapping(MI))
3219 return getDefaultMappingSOP(MI);
3220 LLVM_FALLTHROUGH;
3221
3222 case AMDGPU::G_FADD:
3223 case AMDGPU::G_FSUB:
3224 case AMDGPU::G_FPTOSI:
3225 case AMDGPU::G_FPTOUI:
3226 case AMDGPU::G_FMUL:
3227 case AMDGPU::G_FMA:
3228 case AMDGPU::G_FMAD:
3229 case AMDGPU::G_FSQRT:
3230 case AMDGPU::G_FFLOOR:
3231 case AMDGPU::G_FCEIL:
3232 case AMDGPU::G_FRINT:
3233 case AMDGPU::G_SITOFP:
3234 case AMDGPU::G_UITOFP:
3235 case AMDGPU::G_FPTRUNC:
3236 case AMDGPU::G_FPEXT:
3237 case AMDGPU::G_FEXP2:
3238 case AMDGPU::G_FLOG2:
3239 case AMDGPU::G_FMINNUM:
3240 case AMDGPU::G_FMAXNUM:
3241 case AMDGPU::G_FMINNUM_IEEE:
3242 case AMDGPU::G_FMAXNUM_IEEE:
3243 case AMDGPU::G_FCANONICALIZE:
3244 case AMDGPU::G_INTRINSIC_TRUNC:
3245 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3246 case AMDGPU::G_AMDGPU_FFBH_U32:
3247 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3248 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3249 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3250 return getDefaultMappingVOP(MI);
3251 case AMDGPU::G_UMULH:
3252 case AMDGPU::G_SMULH: {
3253 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3254 return getDefaultMappingSOP(MI);
3255 return getDefaultMappingVOP(MI);
3256 }
3257 case AMDGPU::G_IMPLICIT_DEF: {
3258 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3259 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3260 break;
3261 }
3262 case AMDGPU::G_FCONSTANT:
3263 case AMDGPU::G_CONSTANT:
3264 case AMDGPU::G_GLOBAL_VALUE:
3265 case AMDGPU::G_BLOCK_ADDR:
3266 case AMDGPU::G_READCYCLECOUNTER: {
3267 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3268 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3269 break;
3270 }
3271 case AMDGPU::G_FRAME_INDEX: {
3272 // TODO: This should be the same as other constants, but eliminateFrameIndex
3273 // currently assumes VALU uses.
3274 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3275 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3276 break;
3277 }
3278 case AMDGPU::G_INSERT: {
3279 unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
3280 AMDGPU::VGPRRegBankID;
3281 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3282 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3283 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3284 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3285 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3286 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3287 OpdsMapping[3] = nullptr;
3288 break;
3289 }
3290 case AMDGPU::G_EXTRACT: {
3291 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3292 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3293 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3294 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3295 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3296 OpdsMapping[2] = nullptr;
3297 break;
3298 }
3299 case AMDGPU::G_BUILD_VECTOR:
3300 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3301 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3302 if (DstTy == LLT::vector(2, 16)) {
3303 unsigned DstSize = DstTy.getSizeInBits();
3304 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3305 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3306 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3307 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3308
3309 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3310 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3311 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3312 break;
3313 }
3314
3315 LLVM_FALLTHROUGH;
3316 }
3317 case AMDGPU::G_MERGE_VALUES:
3318 case AMDGPU::G_CONCAT_VECTORS: {
3319 unsigned Bank = isSALUMapping(MI) ?
3320 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3321 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3322 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3323
3324 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3325 // Op1 and Dst should use the same register bank.
3326 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3327 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3328 break;
3329 }
3330 case AMDGPU::G_BITCAST:
3331 case AMDGPU::G_INTTOPTR:
3332 case AMDGPU::G_PTRTOINT:
3333 case AMDGPU::G_BITREVERSE:
3334 case AMDGPU::G_FABS:
3335 case AMDGPU::G_FNEG: {
3336 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3337 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3338 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3339 break;
3340 }
3341 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3342 case AMDGPU::G_CTTZ_ZERO_UNDEF:
3343 case AMDGPU::G_CTPOP: {
3344 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3345 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3346 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3347
3348 // This should really be getValueMappingSGPR64Only, but allowing the generic
3349 // code to handle the register split just makes using LegalizerHelper more
3350 // difficult.
3351 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3352 break;
3353 }
3354 case AMDGPU::G_TRUNC: {
3355 Register Dst = MI.getOperand(0).getReg();
3356 Register Src = MI.getOperand(1).getReg();
3357 unsigned Bank = getRegBankID(Src, MRI, *TRI);
3358 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3359 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3360 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3361 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3362 break;
3363 }
3364 case AMDGPU::G_ZEXT:
3365 case AMDGPU::G_SEXT:
3366 case AMDGPU::G_ANYEXT:
3367 case AMDGPU::G_SEXT_INREG: {
3368 Register Dst = MI.getOperand(0).getReg();
3369 Register Src = MI.getOperand(1).getReg();
3370 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3371 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3372
3373 unsigned DstBank;
3374 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3375 assert(SrcBank);
3376 switch (SrcBank->getID()) {
3377 case AMDGPU::SGPRRegBankID:
3378 DstBank = AMDGPU::SGPRRegBankID;
3379 break;
3380 default:
3381 DstBank = AMDGPU::VGPRRegBankID;
3382 break;
3383 }
3384
3385 // TODO: Should anyext be split into 32-bit part as well?
3386 if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
3387 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
3388 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
3389 } else {
3390 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3391 // 32-bits, and then to 64.
3392 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3393 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3394 SrcSize);
3395 }
3396 break;
3397 }
3398 case AMDGPU::G_FCMP: {
3399 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3400 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3401 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3402 OpdsMapping[1] = nullptr; // Predicate Operand.
3403 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3404 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3405 break;
3406 }
3407 case AMDGPU::G_STORE: {
3408 assert(MI.getOperand(0).isReg());
3409 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3410
3411 // FIXME: We need to specify a different reg bank once scalar stores are
3412 // supported.
3413 const ValueMapping *ValMapping =
3414 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3415 OpdsMapping[0] = ValMapping;
3416 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3417 break;
3418 }
3419 case AMDGPU::G_ICMP: {
3420 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3421 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3422
3423 // See if the result register has already been constrained to vcc, which may
3424 // happen due to control flow intrinsic lowering.
3425 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
3426 AMDGPU::SGPRRegBankID);
3427 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3428 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
3429
3430 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3431 Op2Bank == AMDGPU::SGPRRegBankID &&
3432 Op3Bank == AMDGPU::SGPRRegBankID &&
3433 (Size == 32 || (Size == 64 &&
3434 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3435 Subtarget.hasScalarCompareEq64()));
3436
3437 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3438 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3439
3440 // TODO: Use 32-bit for scalar output size.
3441 // SCC results will need to be copied to a 32-bit SGPR virtual register.
3442 const unsigned ResultSize = 1;
3443
3444 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3445 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3446 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3447 break;
3448 }
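// Illustration (hypothetical example): a 64-bit G_ICMP eq whose destination
// and both sources are already on the SGPR bank can use an SCC-producing
// scalar compare when the subtarget supports scalar 64-bit eq/ne; any other
// combination falls back to a VCC result with VGPR sources.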
3449 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3450 // VGPR index can be used for waterfall when indexing a SGPR vector.
3451 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
3452 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3453 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3454 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3455 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
3456 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3457
3458 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3459 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3460
3461 // The index can be on either bank if the source vector is a VGPR.
3462 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3463 break;
3464 }
3465 case AMDGPU::G_INSERT_VECTOR_ELT: {
3466 unsigned OutputBankID = isSALUMapping(MI) ?
3467 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3468
3469 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3470 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3471 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3472 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
3473 MRI, *TRI);
3474 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
3475
3476 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3477 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3478
3479 // This is a weird case, because we need to break down the mapping based on
3480 // the register bank of a different operand.
3481 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3482 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3483 InsertSize);
3484 } else {
3485 assert(InsertSize == 32 || InsertSize == 64);
3486 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3487 }
3488
3489 // The index can be on either bank if the source vector is a VGPR.
3490 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3491 break;
3492 }
3493 case AMDGPU::G_UNMERGE_VALUES: {
3494 unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
3495 AMDGPU::VGPRRegBankID;
3496
3497 // Op1 and Dst should use the same register bank.
3498 // FIXME: Shouldn't this be the default? Why do we need to handle this?
3499 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3500 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3501 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3502 }
3503 break;
3504 }
3505 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3506 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3507 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3508 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3509 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3510 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3511 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3512 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3513 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3514 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3515 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3516 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3517 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3518 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3519 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3520
3521 // rsrc
3522 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3523
3524 // vindex
3525 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3526
3527 // voffset
3528 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3529
3530 // soffset
3531 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3532
3533 // Any remaining operands are immediates and were correctly null
3534 // initialized.
3535 break;
3536 }
3537 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3538 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3539 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3540 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3541 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3542 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3543 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3544 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3545 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3546 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3547 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3548 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
3549 // vdata_out
3550 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3551
3552 // vdata_in
3553 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3554
3555 // rsrc
3556 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3557
3558 // vindex
3559 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3560
3561 // voffset
3562 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3563
3564 // soffset
3565 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3566
3567 // Any remaining operands are immediates and were correctly null
3568 // initialized.
3569 break;
3570 }
3571 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3572 // vdata_out
3573 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3574
3575 // vdata_in
3576 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3577
3578 // cmp
3579 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3580
3581 // rsrc
3582 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3583
3584 // vindex
3585 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3586
3587 // voffset
3588 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3589
3590 // soffset
3591 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3592
3593 // Any remaining operands are immediates and were correctly null
3594 // initialized.
3595 break;
3596 }
3597 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3598 // Lie and claim everything is legal, even though some need to be
3599 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3600 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3601 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3602
3603 // We need to convert this to a MUBUF if either the resource or offset is
3604 // VGPR.
3605 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3606 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3607 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3608
3609 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3610 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3611 break;
3612 }
3613 case AMDGPU::G_INTRINSIC: {
3614 switch (MI.getIntrinsicID()) {
3615 default:
3616 return getInvalidInstructionMapping();
3617 case Intrinsic::amdgcn_div_fmas:
3618 case Intrinsic::amdgcn_div_fixup:
3619 case Intrinsic::amdgcn_trig_preop:
3620 case Intrinsic::amdgcn_sin:
3621 case Intrinsic::amdgcn_cos:
3622 case Intrinsic::amdgcn_log_clamp:
3623 case Intrinsic::amdgcn_rcp:
3624 case Intrinsic::amdgcn_rcp_legacy:
3625 case Intrinsic::amdgcn_rsq:
3626 case Intrinsic::amdgcn_rsq_legacy:
3627 case Intrinsic::amdgcn_rsq_clamp:
3628 case Intrinsic::amdgcn_fmul_legacy:
3629 case Intrinsic::amdgcn_ldexp:
3630 case Intrinsic::amdgcn_frexp_mant:
3631 case Intrinsic::amdgcn_frexp_exp:
3632 case Intrinsic::amdgcn_fract:
3633 case Intrinsic::amdgcn_cvt_pkrtz:
3634 case Intrinsic::amdgcn_cvt_pknorm_i16:
3635 case Intrinsic::amdgcn_cvt_pknorm_u16:
3636 case Intrinsic::amdgcn_cvt_pk_i16:
3637 case Intrinsic::amdgcn_cvt_pk_u16:
3638 case Intrinsic::amdgcn_fmed3:
3639 case Intrinsic::amdgcn_cubeid:
3640 case Intrinsic::amdgcn_cubema:
3641 case Intrinsic::amdgcn_cubesc:
3642 case Intrinsic::amdgcn_cubetc:
3643 case Intrinsic::amdgcn_sffbh:
3644 case Intrinsic::amdgcn_fmad_ftz:
3645 case Intrinsic::amdgcn_mbcnt_lo:
3646 case Intrinsic::amdgcn_mbcnt_hi:
3647 case Intrinsic::amdgcn_mul_u24:
3648 case Intrinsic::amdgcn_mul_i24:
3649 case Intrinsic::amdgcn_lerp:
3650 case Intrinsic::amdgcn_sad_u8:
3651 case Intrinsic::amdgcn_msad_u8:
3652 case Intrinsic::amdgcn_sad_hi_u8:
3653 case Intrinsic::amdgcn_sad_u16:
3654 case Intrinsic::amdgcn_qsad_pk_u16_u8:
3655 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
3656 case Intrinsic::amdgcn_mqsad_u32_u8:
3657 case Intrinsic::amdgcn_cvt_pk_u8_f32:
3658 case Intrinsic::amdgcn_alignbit:
3659 case Intrinsic::amdgcn_alignbyte:
3660 case Intrinsic::amdgcn_fdot2:
3661 case Intrinsic::amdgcn_sdot2:
3662 case Intrinsic::amdgcn_udot2:
3663 case Intrinsic::amdgcn_sdot4:
3664 case Intrinsic::amdgcn_udot4:
3665 case Intrinsic::amdgcn_sdot8:
3666 case Intrinsic::amdgcn_udot8:
3667 return getDefaultMappingVOP(MI);
3668 case Intrinsic::amdgcn_sbfe:
3669 case Intrinsic::amdgcn_ubfe:
3670 if (isSALUMapping(MI))
3671 return getDefaultMappingSOP(MI);
3672 return getDefaultMappingVOP(MI);
3673 case Intrinsic::amdgcn_ds_swizzle:
3674 case Intrinsic::amdgcn_ds_permute:
3675 case Intrinsic::amdgcn_ds_bpermute:
3676 case Intrinsic::amdgcn_update_dpp:
3677 case Intrinsic::amdgcn_mov_dpp8:
3678 case Intrinsic::amdgcn_mov_dpp:
3679 case Intrinsic::amdgcn_wwm:
3680 case Intrinsic::amdgcn_wqm:
3681 case Intrinsic::amdgcn_softwqm:
3682 return getDefaultMappingAllVGPR(MI);
3683 case Intrinsic::amdgcn_kernarg_segment_ptr:
3684 case Intrinsic::amdgcn_s_getpc:
3685 case Intrinsic::amdgcn_groupstaticsize: {
3686 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3687 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3688 break;
3689 }
3690 case Intrinsic::amdgcn_wqm_vote: {
3691 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3692 OpdsMapping[0] = OpdsMapping[2]
3693 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
3694 break;
3695 }
3696 case Intrinsic::amdgcn_ps_live: {
3697 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3698 break;
3699 }
3700 case Intrinsic::amdgcn_div_scale: {
3701 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3702 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3703 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
3704 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
3705
3706 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3707 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3708 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3709 break;
3710 }
3711 case Intrinsic::amdgcn_class: {
3712 Register Src0Reg = MI.getOperand(2).getReg();
3713 Register Src1Reg = MI.getOperand(3).getReg();
3714 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
3715 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
3716 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3717 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3718 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
3719 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
3720 break;
3721 }
3722 case Intrinsic::amdgcn_icmp:
3723 case Intrinsic::amdgcn_fcmp: {
3724 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3725 // This is not VCCRegBank because this is not used in boolean contexts.
3726 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
3727 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3728 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
3729 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
3730 break;
3731 }
3732 case Intrinsic::amdgcn_readlane: {
3733 // This must be an SGPR, but accept a VGPR.
3734 Register IdxReg = MI.getOperand(3).getReg();
3735 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
3736 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3737 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3738 LLVM_FALLTHROUGH;
3739 }
3740 case Intrinsic::amdgcn_readfirstlane: {
3741 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3742 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3743 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
3744 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3745 break;
3746 }
3747 case Intrinsic::amdgcn_writelane: {
3748 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3749 Register SrcReg = MI.getOperand(2).getReg();
3750 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
3751 unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3752 Register IdxReg = MI.getOperand(3).getReg();
3753 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
3754 unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3755 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
3756
3757 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
3758 // to legalize.
3759 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
3760 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3761 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3762 break;
3763 }
3764 case Intrinsic::amdgcn_if_break: {
3765 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3766 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3767 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3768 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3769 break;
3770 }
3771 case Intrinsic::amdgcn_permlane16:
3772 case Intrinsic::amdgcn_permlanex16: {
3773 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3774 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3775 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3776 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3777 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3778 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3779 break;
3780 }
3781 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
3782 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
3783 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
3784 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
3785 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
3786 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
3787 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
3788 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
3789 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
3790 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
3791 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
3792 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
3793 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
3794 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
3795 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
3796 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
3797 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
3798 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
3799 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
3800 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
3801 // Default for MAI intrinsics.
3802 // srcC can also be an immediate which can be folded later.
3803 // FIXME: Should we eventually add an alternative mapping with AGPR src
3804 // for srcA/srcB?
3805 //
3806 // vdst, srcA, srcB, srcC
3807 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3808 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3809 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3810 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3811 break;
3812 }
3813 case Intrinsic::amdgcn_interp_p1:
3814 case Intrinsic::amdgcn_interp_p2:
3815 case Intrinsic::amdgcn_interp_mov:
3816 case Intrinsic::amdgcn_interp_p1_f16:
3817 case Intrinsic::amdgcn_interp_p2_f16: {
3818 const int M0Idx = MI.getNumOperands() - 1;
3819 Register M0Reg = MI.getOperand(M0Idx).getReg();
3820 unsigned M0Bank = getRegBankID(M0Reg, MRI, *TRI, AMDGPU::SGPRRegBankID);
3821 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3822
3823 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
3824 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
3825 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3826
3827 // Must be SGPR, but we must take whatever the original bank is and fix it
3828 // later.
3829 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
3830 break;
3831 }
3832 }
3833 break;
3834 }
3835 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3836 auto IntrID = MI.getIntrinsicID();
3837 switch (IntrID) {
3838 case Intrinsic::amdgcn_s_getreg:
3839 case Intrinsic::amdgcn_s_memtime:
3840 case Intrinsic::amdgcn_s_memrealtime:
3841 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
3842 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3843 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3844 break;
3845 }
3846 case Intrinsic::amdgcn_ds_fadd:
3847 case Intrinsic::amdgcn_ds_fmin:
3848 case Intrinsic::amdgcn_ds_fmax:
3849 return getDefaultMappingAllVGPR(MI);
3850 case Intrinsic::amdgcn_ds_ordered_add:
3851 case Intrinsic::amdgcn_ds_ordered_swap: {
3852 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3853 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
3854 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3855 AMDGPU::SGPRRegBankID);
3856 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
3857 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3858 break;
3859 }
3860 case Intrinsic::amdgcn_ds_append:
3861 case Intrinsic::amdgcn_ds_consume: {
3862 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3863 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
3864 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3865 break;
3866 }
3867 case Intrinsic::amdgcn_exp_compr:
3868 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3869 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3870 break;
3871 case Intrinsic::amdgcn_exp:
3872 // FIXME: Could we support packed types here?
3873 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3874 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3875 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3876 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3877 break;
3878 case Intrinsic::amdgcn_s_sendmsg:
3879 case Intrinsic::amdgcn_s_sendmsghalt: {
3880 // This must be an SGPR, but accept a VGPR.
3881 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3882 AMDGPU::SGPRRegBankID);
3883 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
3884 break;
3885 }
3886 case Intrinsic::amdgcn_end_cf:
3887 case Intrinsic::amdgcn_init_exec: {
3888 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3889 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3890 break;
3891 }
3892 case Intrinsic::amdgcn_else: {
3893 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3894 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3895 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
3896 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
3897 break;
3898 }
3899 case Intrinsic::amdgcn_kill: {
3900 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3901 break;
3902 }
3903 case Intrinsic::amdgcn_raw_buffer_load:
3904 case Intrinsic::amdgcn_raw_tbuffer_load: {
3905 // FIXME: Should make intrinsic ID the last operand of the instruction,
3906 // then this would be the same as store
3907 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3908 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3909 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3910 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3911 break;
3912 }
3913 case Intrinsic::amdgcn_raw_buffer_store:
3914 case Intrinsic::amdgcn_raw_buffer_store_format:
3915 case Intrinsic::amdgcn_raw_tbuffer_store: {
3916 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3917 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3918 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3919 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3920 break;
3921 }
3922 case Intrinsic::amdgcn_struct_buffer_load:
3923 case Intrinsic::amdgcn_struct_tbuffer_load: {
3924 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3925 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3926 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3927 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3928 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3929 break;
3930 }
3931 case Intrinsic::amdgcn_struct_buffer_store:
3932 case Intrinsic::amdgcn_struct_tbuffer_store: {
3933 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3934 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3935 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3936 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3937 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3938 break;
3939 }
3940 case Intrinsic::amdgcn_init_exec_from_input: {
3941 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3942 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3943 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3944 break;
3945 }
3946 case Intrinsic::amdgcn_ds_gws_init:
3947 case Intrinsic::amdgcn_ds_gws_barrier:
3948 case Intrinsic::amdgcn_ds_gws_sema_br: {
3949 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3950
3951 // This must be an SGPR, but accept a VGPR.
3952 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3953 AMDGPU::SGPRRegBankID);
3954 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
3955 break;
3956 }
3957 case Intrinsic::amdgcn_ds_gws_sema_v:
3958 case Intrinsic::amdgcn_ds_gws_sema_p:
3959 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3960 // This must be an SGPR, but accept a VGPR.
3961 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
3962 AMDGPU::SGPRRegBankID);
3963 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
3964 break;
3965 }
3966 default:
3967 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3968 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3969 // Non-images can have complications from operands that allow both SGPR
3970 // and VGPR. For now it's too complicated to figure out the final opcode
3971 // to derive the register bank from the MCInstrDesc.
3972 if (RSrcIntrin->IsImage)
3973 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
3974 }
3975
3976 return getInvalidInstructionMapping();
3977 }
3978 break;
3979 }
3980 case AMDGPU::G_SELECT: {
3981 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3982 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
3983 AMDGPU::SGPRRegBankID);
3984 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
3985 AMDGPU::SGPRRegBankID);
3986 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
3987 Op3Bank == AMDGPU::SGPRRegBankID;
3988
3989 unsigned CondBankDefault = SGPRSrcs ?
3990 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3991 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
3992 CondBankDefault);
3993 if (CondBank == AMDGPU::SGPRRegBankID)
3994 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3995 else if (CondBank == AMDGPU::VGPRRegBankID)
3996 CondBank = AMDGPU::VCCRegBankID;
3997
3998 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
3999 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4000
4001 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4002
4003 // TODO: Should report 32-bit for scalar condition type.
4004 if (Size == 64) {
4005 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4006 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4007 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4008 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4009 } else {
4010 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4011 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4012 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4013 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4014 }
4015
4016 break;
4017 }
4018
4019 case AMDGPU::G_LOAD:
4020 case AMDGPU::G_ZEXTLOAD:
4021 case AMDGPU::G_SEXTLOAD:
4022 return getInstrMappingForLoad(MI);
4023
4024 case AMDGPU::G_ATOMICRMW_XCHG:
4025 case AMDGPU::G_ATOMICRMW_ADD:
4026 case AMDGPU::G_ATOMICRMW_SUB:
4027 case AMDGPU::G_ATOMICRMW_AND:
4028 case AMDGPU::G_ATOMICRMW_OR:
4029 case AMDGPU::G_ATOMICRMW_XOR:
4030 case AMDGPU::G_ATOMICRMW_MAX:
4031 case AMDGPU::G_ATOMICRMW_MIN:
4032 case AMDGPU::G_ATOMICRMW_UMAX:
4033 case AMDGPU::G_ATOMICRMW_UMIN:
4034 case AMDGPU::G_ATOMICRMW_FADD:
4035 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4036 case AMDGPU::G_AMDGPU_ATOMIC_INC:
4037 case AMDGPU::G_AMDGPU_ATOMIC_DEC: {
4038 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4039 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4040 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4041 break;
4042 }
4043 case AMDGPU::G_ATOMIC_CMPXCHG: {
4044 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4045 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4046 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4047 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4048 break;
4049 }
4050 case AMDGPU::G_BRCOND: {
4051 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
4052 AMDGPU::SGPRRegBankID);
4053 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4054 if (Bank != AMDGPU::SGPRRegBankID)
4055 Bank = AMDGPU::VCCRegBankID;
4056
4057 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4058 break;
4059 }
4060 }
4061
4062 return getInstructionMapping(/*ID*/1, /*Cost*/1,
4063 getOperandsMapping(OpdsMapping),
4064 MI.getNumOperands());
4065}
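
The G_SELECT case above (lines 3980-4017) chooses one register bank for the result and both value operands and a separate bank for the condition: the select stays scalar only when both value operands are SGPR and the condition resolves to SGPR; otherwise the values are mapped to VGPRs and the condition to VCC. The following is a minimal standalone sketch of that decision rule only. It does not use LLVM's RegisterBankInfo API; the RegBank enum, SelectMapping struct, and mapSelect function are hypothetical names introduced for illustration.

// Toy model of the G_SELECT bank-selection rule (not LLVM's API).
#include <cassert>
#include <cstdio>

enum RegBank { SGPR, VGPR, VCC };

struct SelectMapping {
  RegBank ValueBank; // bank for the result and both value operands
  RegBank CondBank;  // bank for the condition operand
};

// Mirrors the logic at source lines 3986-3999: keep the select scalar only
// when both value operands are SGPR and the condition ends up SGPR;
// otherwise use VGPR values with a VCC condition.
SelectMapping mapSelect(RegBank Op2Bank, RegBank Op3Bank, RegBank CondBank) {
  bool SGPRSrcs = Op2Bank == SGPR && Op3Bank == SGPR;
  if (CondBank == VGPR)
    CondBank = VCC;                 // a VGPR condition is treated as a lane mask
  else if (CondBank == SGPR && !SGPRSrcs)
    CondBank = VCC;                 // scalar condition but vector sources
  assert(CondBank == SGPR || CondBank == VCC);
  RegBank ValueBank = (SGPRSrcs && CondBank == SGPR) ? SGPR : VGPR;
  return {ValueBank, CondBank};
}

int main() {
  // One VGPR source forces a vector select: expect VGPR values, VCC cond.
  SelectMapping M = mapSelect(SGPR, VGPR, SGPR);
  std::printf("value bank %d, cond bank %d\n", M.ValueBank, M.CondBank);
  assert(M.ValueBank == VGPR && M.CondBank == VCC);
  return 0;
}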