Bug Summary

File: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Warning: line 1121, column 3
Division by zero
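
The path notes later in this report show the analyzer assuming that the element type's getSizeInBits() returns 0 at source line 1120, so the modulo evaluated inside the assert at line 1121 divides by zero. The standalone sketch below only reproduces that reasoning; FakeLLT and the chosen sizes are illustrative assumptions, not the real llvm::LLT API.

    // Minimal sketch of the reported path, assuming the element size can be 0.
    // FakeLLT stands in for llvm::LLT here; it is not the real class.
    #include <cassert>
    #include <cstdio>

    struct FakeLLT {
      unsigned Bits = 0;                      // analyzer assumes a 0-bit element type
      unsigned getSizeInBits() const { return Bits; }
    };

    int main() {
      FakeLLT EltTy;                          // step 9: 'EltSize' initialized to 0
      unsigned EltSize = EltTy.getSizeInBits();
      unsigned FirstSize = 96;                // arbitrary split size, for illustration only
      // The report flags the expression inside the assert at line 1121:
      // with EltSize == 0, FirstSize % EltSize divides by zero.
      if (EltSize != 0)
        assert(FirstSize % EltSize == 0);
      else
        std::fprintf(stderr, "element size is 0; FirstSize %% EltSize would divide by zero\n");
      return 0;
    }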

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPURegisterBankInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-06-21-164211-33944-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the RegisterBankInfo class for
10/// AMDGPU.
11///
12/// \par
13///
14/// AMDGPU has unique register bank constraints that require special high level
15/// strategies to deal with. There are two main true physical register banks
16/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17/// sort of pseudo-register bank needed to represent SGPRs used in a vector
18/// boolean context. There is also the AGPR bank, which is a special purpose
19/// physical register bank present on some subtargets.
20///
21/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22/// be uniform. It is generally not valid to legalize operands by inserting
23/// copies as on other targets. Operations which require uniform, SGPR operands
24/// generally require scalarization by repeatedly executing the instruction,
25/// activating each set of lanes using a unique set of input values. This is
26/// referred to as a waterfall loop.
27///
28/// \par Booleans
29///
30/// Booleans (s1 values) require special consideration. A vector compare result
31/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32/// register. These are represented with the VCC bank. During selection, we need
33/// to be able to unambiguously go back from a register class to a register
34/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35/// bank, we need to know the use context type. An SGPR s1 value always means a
36/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38/// a 32-bit virtual register. Taken together, this means we need to adjust the
39/// type of boolean operations to be regbank legal. All SALU booleans need to be
40/// widened to 32-bits, and all VALU booleans need to be s1 values.
41///
42/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44/// bank. A non-boolean source (such as a truncate from a 1-bit load from
45/// memory) will require a copy to the VCC bank which will require clearing the
46/// high bits and inserting a compare.
47///
48/// \par Constant bus restriction
49///
50/// VALU instructions have a limitation known as the constant bus
51/// restriction. Most VALU instructions can use SGPR operands, but may read at
52/// most 1 SGPR or constant literal value (this to 2 in gfx10 for most
53/// instructions). This is one unique SGPR, so the same SGPR may be used for
54/// multiple operands. From a register bank perspective, any combination of
55/// operands should be legal as an SGPR, but this is contextually dependent on
56/// the SGPR operands all being the same register. It is therefore optimal to
57/// choose the SGPR with the most uses to minimize the number of copies.
58///
59/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60/// operation should have its source operands all mapped to VGPRs (except for
61/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63/// complicated to solve here. Every optimization pattern or instruction
64/// selected to multiple outputs would have to enforce this rule, and there
65/// would be additional complexity in tracking this rule for every G_*
66/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67/// picking the optimal operand combination from a post-isel optimization pass.
68///
69//===----------------------------------------------------------------------===//
70
71#include "AMDGPURegisterBankInfo.h"
72
73#include "AMDGPU.h"
74#include "AMDGPUGlobalISelUtils.h"
75#include "AMDGPUInstrInfo.h"
76#include "GCNSubtarget.h"
77#include "SIMachineFunctionInfo.h"
78#include "SIRegisterInfo.h"
79#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83#include "llvm/IR/IntrinsicsAMDGPU.h"
84
85#define GET_TARGET_REGBANK_IMPL
86#include "AMDGPUGenRegisterBank.inc"
87
88// This file will be TableGen'ed at some point.
89#include "AMDGPUGenRegisterBankInfo.def"
90
91using namespace llvm;
92using namespace MIPatternMatch;
93
94namespace {
95
96// Observer to apply a register bank to new registers created by LegalizerHelper.
97class ApplyRegBankMapping final : public GISelChangeObserver {
98private:
99 const AMDGPURegisterBankInfo &RBI;
100 MachineRegisterInfo &MRI;
101 const RegisterBank *NewBank;
102 SmallVector<MachineInstr *, 4> NewInsts;
103
104public:
105 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106 MachineRegisterInfo &MRI_, const RegisterBank *RB)
107 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
108
109 ~ApplyRegBankMapping() {
110 for (MachineInstr *MI : NewInsts)
111 applyBank(*MI);
112 }
113
114 /// Set any registers that don't have a set register class or bank to SALU.
115 void applyBank(MachineInstr &MI) {
116 const unsigned Opc = MI.getOpcode();
117 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118 Opc == AMDGPU::G_SEXT) {
119 // LegalizerHelper wants to use the basic legalization artifacts when
120 // widening etc. We don't handle selection with vcc in artifact sources,
121 // so we need to use a select instead to handle these properly.
122 Register DstReg = MI.getOperand(0).getReg();
123 Register SrcReg = MI.getOperand(1).getReg();
124 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125 if (SrcBank == &AMDGPU::VCCRegBank) {
126 const LLT S32 = LLT::scalar(32);
127 assert(MRI.getType(SrcReg) == LLT::scalar(1));
128 assert(MRI.getType(DstReg) == S32);
129 assert(NewBank == &AMDGPU::VGPRRegBank);
130
131 // Replace the extension with a select, which really uses the boolean
132 // source.
133 MachineIRBuilder B(MI);
134 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135 auto False = B.buildConstant(S32, 0);
136 B.buildSelect(DstReg, SrcReg, True, False);
137 MRI.setRegBank(True.getReg(0), *NewBank);
138 MRI.setRegBank(False.getReg(0), *NewBank);
139 MI.eraseFromParent();
140 }
141
142 assert(!MRI.getRegClassOrRegBank(DstReg));
143 MRI.setRegBank(DstReg, *NewBank);
144 return;
145 }
146
147#ifndef NDEBUG
148 if (Opc == AMDGPU::G_TRUNC) {
149 Register DstReg = MI.getOperand(0).getReg();
150 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151 assert(DstBank != &AMDGPU::VCCRegBank);
152 }
153#endif
154
155 for (MachineOperand &Op : MI.operands()) {
156 if (!Op.isReg())
157 continue;
158
159 // We may see physical registers if building a real MI
160 Register Reg = Op.getReg();
161 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162 continue;
163
164 const RegisterBank *RB = NewBank;
165 if (MRI.getType(Reg) == LLT::scalar(1)) {
166 assert(NewBank == &AMDGPU::VGPRRegBank &&
167 "s1 operands should only be used for vector bools");
168 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170 "not expecting legalization artifacts here");
171 RB = &AMDGPU::VCCRegBank;
172 }
173
174 MRI.setRegBank(Reg, *RB);
175 }
176 }
177
178 void erasingInstr(MachineInstr &MI) override {}
179
180 void createdInstr(MachineInstr &MI) override {
181 // At this point, the instruction was just inserted and has no operands.
182 NewInsts.push_back(&MI);
183 }
184
185 void changingInstr(MachineInstr &MI) override {}
186 void changedInstr(MachineInstr &MI) override {
187 // FIXME: In principle we should probably add the instruction to NewInsts,
188 // but the way the LegalizerHelper uses the observer, we will always see the
189 // registers we need to set the regbank on also referenced in a new
190 // instruction.
191 }
192};
193
194}
195AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196 : AMDGPUGenRegisterBankInfo(),
197 Subtarget(ST),
198 TRI(Subtarget.getRegisterInfo()),
199 TII(Subtarget.getInstrInfo()) {
200
201 // HACK: Until this is fully tablegen'd.
202 static llvm::once_flag InitializeRegisterBankFlag;
203
204 static auto InitializeRegisterBankOnce = [this]() {
205 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208 (void)this;
209 };
210
211 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
212}
213
214static bool isVectorRegisterBank(const RegisterBank &Bank) {
215 unsigned BankID = Bank.getID();
216 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
217}
218
219unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220 const RegisterBank &Src,
221 unsigned Size) const {
222 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225 return std::numeric_limits<unsigned>::max();
226 }
227
228 // Bool values are tricky, because the meaning is based on context. The SCC
229 // and VCC banks are for the natural scalar and vector conditions produced by
230 // a compare.
231 //
232 // Legalization doesn't know about the necessary context, so an s1 use may
233 // have been a truncate from an arbitrary value, in which case a copy (lowered
234 // as a compare with 0) needs to be inserted.
235 if (Size == 1 &&
236 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237 (isVectorRegisterBank(Src) ||
238 Src.getID() == AMDGPU::SGPRRegBankID ||
239 Src.getID() == AMDGPU::VCCRegBankID))
240 return std::numeric_limits<unsigned>::max();
241
242 // There is no direct copy between AGPRs.
243 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244 Src.getID() == AMDGPU::AGPRRegBankID)
245 return 4;
246
247 return RegisterBankInfo::copyCost(Dst, Src, Size);
248}
249
250unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251 const ValueMapping &ValMapping,
252 const RegisterBank *CurBank) const {
253 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254 // VGPR.
255 // FIXME: Is there a better way to do this?
256 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257 return 10; // This is expensive.
258
259 assert(ValMapping.NumBreakDowns == 2 &&
260 ValMapping.BreakDown[0].Length == 32 &&
261 ValMapping.BreakDown[0].StartIdx == 0 &&
262 ValMapping.BreakDown[1].Length == 32 &&
263 ValMapping.BreakDown[1].StartIdx == 32 &&
264 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
265
266 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268 // want.
269
270 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271 // alignment restrictions, but this probably isn't important.
272 return 1;
273}
274
275const RegisterBank &
276AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277 LLT Ty) const {
278 if (&RC == &AMDGPU::SReg_1RegClass)
279 return AMDGPU::VCCRegBank;
280
281 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282 // VCC-like use.
283 if (TRI->isSGPRClass(&RC)) {
284 // FIXME: This probably came from a copy from a physical register, which
285 // should be inferable from the copied to-type. We don't have many boolean
286 // physical register constraints so just assume a normal SGPR for now.
287 if (!Ty.isValid())
288 return AMDGPU::SGPRRegBank;
289
290 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
291 }
292
293 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
294}
295
296template <unsigned NumOps>
297RegisterBankInfo::InstructionMappings
298AMDGPURegisterBankInfo::addMappingFromTable(
299 const MachineInstr &MI, const MachineRegisterInfo &MRI,
300 const std::array<unsigned, NumOps> RegSrcOpIdx,
301 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
302
303 InstructionMappings AltMappings;
304
305 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
306
307 unsigned Sizes[NumOps];
308 for (unsigned I = 0; I < NumOps; ++I) {
309 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
311 }
312
313 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
316 }
317
318 // getInstrMapping's default mapping uses ID 1, so start at 2.
319 unsigned MappingID = 2;
320 for (const auto &Entry : Table) {
321 for (unsigned I = 0; I < NumOps; ++I) {
322 int OpIdx = RegSrcOpIdx[I];
323 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
324 }
325
326 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327 getOperandsMapping(Operands),
328 Operands.size()));
329 }
330
331 return AltMappings;
332}
333
334RegisterBankInfo::InstructionMappings
335AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337 switch (MI.getIntrinsicID()) {
338 case Intrinsic::amdgcn_readlane: {
339 static const OpRegBankEntry<3> Table[2] = {
340 // Perfectly legal.
341 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
342
343 // Need a readfirstlane for the index.
344 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
345 };
346
347 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
349 }
350 case Intrinsic::amdgcn_writelane: {
351 static const OpRegBankEntry<4> Table[4] = {
352 // Perfectly legal.
353 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
354
355 // Need readfirstlane of first op
356 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
357
358 // Need readfirstlane of second op
359 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360
361 // Need readfirstlane of both ops
362 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
363 };
364
365 // rsrc, voffset, offset
366 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368 }
369 default:
370 return RegisterBankInfo::getInstrAlternativeMappings(MI);
371 }
372}
373
374RegisterBankInfo::InstructionMappings
375AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
377
378 switch (MI.getIntrinsicID()) {
379 case Intrinsic::amdgcn_s_buffer_load: {
380 static const OpRegBankEntry<2> Table[4] = {
381 // Perfectly legal.
382 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
383
384 // Only need 1 register in loop
385 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
386
387 // Have to waterfall the resource.
388 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
389
390 // Have to waterfall the resource, and the offset.
391 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
392 };
393
394 // rsrc, offset
395 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
397 }
398 case Intrinsic::amdgcn_ds_ordered_add:
399 case Intrinsic::amdgcn_ds_ordered_swap: {
400 // VGPR = M0, VGPR
401 static const OpRegBankEntry<3> Table[2] = {
402 // Perfectly legal.
403 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
404
405 // Need a readfirstlane for m0
406 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
407 };
408
409 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
411 }
412 case Intrinsic::amdgcn_s_sendmsg:
413 case Intrinsic::amdgcn_s_sendmsghalt: {
414 // FIXME: Should have no register for immediate
415 static const OpRegBankEntry<1> Table[2] = {
416 // Perfectly legal.
417 { { AMDGPU::SGPRRegBankID }, 1 },
418
419 // Need readlane
420 { { AMDGPU::VGPRRegBankID }, 3 }
421 };
422
423 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
425 }
426 default:
427 return RegisterBankInfo::getInstrAlternativeMappings(MI);
428 }
429}
430
431static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432 const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433 return I && I->getMetadata("amdgpu.noclobber");
434}
435
436// FIXME: Returns uniform if there's no source value information. This is
437// probably wrong.
438static bool isScalarLoadLegal(const MachineInstr &MI) {
439 if (!MI.hasOneMemOperand())
440 return false;
441
442 const MachineMemOperand *MMO = *MI.memoperands_begin();
443 const unsigned AS = MMO->getAddrSpace();
444 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446 // Require 4-byte alignment.
447 return MMO->getAlign() >= Align(4) &&
448 // Can't do a scalar atomic load.
449 !MMO->isAtomic() &&
450 // Don't use scalar loads for volatile accesses to non-constant address
451 // spaces.
452 (IsConst || !MMO->isVolatile()) &&
453 // Memory must be known constant, or not written before this load.
454 (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
455 AMDGPUInstrInfo::isUniformMMO(MMO);
456}
457
458RegisterBankInfo::InstructionMappings
459AMDGPURegisterBankInfo::getInstrAlternativeMappings(
460 const MachineInstr &MI) const {
461
462 const MachineFunction &MF = *MI.getParent()->getParent();
463 const MachineRegisterInfo &MRI = MF.getRegInfo();
464
465
466 InstructionMappings AltMappings;
467 switch (MI.getOpcode()) {
468 case TargetOpcode::G_CONSTANT: {
469 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
470 if (Size == 1) {
471 static const OpRegBankEntry<1> Table[3] = {
472 { { AMDGPU::VGPRRegBankID }, 1 },
473 { { AMDGPU::SGPRRegBankID }, 1 },
474 { { AMDGPU::VCCRegBankID }, 1 }
475 };
476
477 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
478 }
479
480 LLVM_FALLTHROUGH[[gnu::fallthrough]];
481 }
482 case TargetOpcode::G_FCONSTANT:
483 case TargetOpcode::G_FRAME_INDEX:
484 case TargetOpcode::G_GLOBAL_VALUE: {
485 static const OpRegBankEntry<1> Table[2] = {
486 { { AMDGPU::VGPRRegBankID }, 1 },
487 { { AMDGPU::SGPRRegBankID }, 1 }
488 };
489
490 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
491 }
492 case TargetOpcode::G_AND:
493 case TargetOpcode::G_OR:
494 case TargetOpcode::G_XOR: {
495 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
496
497 if (Size == 1) {
498 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
499 const InstructionMapping &SCCMapping = getInstructionMapping(
500 1, 1, getOperandsMapping(
501 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
502 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
504 3); // Num Operands
505 AltMappings.push_back(&SCCMapping);
506
507 const InstructionMapping &VCCMapping0 = getInstructionMapping(
508 2, 1, getOperandsMapping(
509 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
510 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
512 3); // Num Operands
513 AltMappings.push_back(&VCCMapping0);
514 return AltMappings;
515 }
516
517 if (Size != 64)
518 break;
519
520 const InstructionMapping &SSMapping = getInstructionMapping(
521 1, 1, getOperandsMapping(
522 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
523 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
525 3); // Num Operands
526 AltMappings.push_back(&SSMapping);
527
528 const InstructionMapping &VVMapping = getInstructionMapping(
529 2, 2, getOperandsMapping(
530 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
531 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
533 3); // Num Operands
534 AltMappings.push_back(&VVMapping);
535 break;
536 }
537 case TargetOpcode::G_LOAD:
538 case TargetOpcode::G_ZEXTLOAD:
539 case TargetOpcode::G_SEXTLOAD: {
540 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
541 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
542 unsigned PtrSize = PtrTy.getSizeInBits();
543 unsigned AS = PtrTy.getAddressSpace();
544
545 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
546 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
547 isScalarLoadLegal(MI)) {
548 const InstructionMapping &SSMapping = getInstructionMapping(
549 1, 1, getOperandsMapping(
550 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
551 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
552 2); // Num Operands
553 AltMappings.push_back(&SSMapping);
554 }
555
556 const InstructionMapping &VVMapping = getInstructionMapping(
557 2, 1,
558 getOperandsMapping(
559 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
560 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
561 2); // Num Operands
562 AltMappings.push_back(&VVMapping);
563
564 // It may be possible to have a vgpr = load sgpr mapping here, because
565 // the mubuf instructions support this kind of load, but probably for only
566 // gfx7 and older. However, the addressing mode matching in the instruction
567 // selector should be able to do a better job of detecting and selecting
568 // these kinds of loads from the vgpr = load vgpr mapping.
569
570 return AltMappings;
571
572 }
573 case TargetOpcode::G_SELECT: {
574 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
575 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
576 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
577 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
578 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
579 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
580 4); // Num Operands
581 AltMappings.push_back(&SSMapping);
582
583 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
584 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
585 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
586 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
587 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
588 4); // Num Operands
589 AltMappings.push_back(&VVMapping);
590
591 return AltMappings;
592 }
593 case TargetOpcode::G_UADDE:
594 case TargetOpcode::G_USUBE:
595 case TargetOpcode::G_SADDE:
596 case TargetOpcode::G_SSUBE: {
597 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
598 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
599 getOperandsMapping(
600 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
601 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
602 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
603 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
604 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
605 5); // Num Operands
606 AltMappings.push_back(&SSMapping);
607
608 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
609 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
610 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
611 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
612 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
613 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
614 5); // Num Operands
615 AltMappings.push_back(&VVMapping);
616 return AltMappings;
617 }
618 case AMDGPU::G_BRCOND: {
619 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
620
621 // TODO: Change type to 32 for scalar
622 const InstructionMapping &SMapping = getInstructionMapping(
623 1, 1, getOperandsMapping(
624 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
625 2); // Num Operands
626 AltMappings.push_back(&SMapping);
627
628 const InstructionMapping &VMapping = getInstructionMapping(
629 1, 1, getOperandsMapping(
630 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
631 2); // Num Operands
632 AltMappings.push_back(&VMapping);
633 return AltMappings;
634 }
635 case AMDGPU::G_INTRINSIC:
636 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
637 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
638 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
639 default:
640 break;
641 }
642 return RegisterBankInfo::getInstrAlternativeMappings(MI);
643}
644
645void AMDGPURegisterBankInfo::split64BitValueForMapping(
646 MachineIRBuilder &B,
647 SmallVector<Register, 2> &Regs,
648 LLT HalfTy,
649 Register Reg) const {
650 assert(HalfTy.getSizeInBits() == 32);
651 MachineRegisterInfo *MRI = B.getMRI();
652 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
653 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
654 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
655 MRI->setRegBank(LoLHS, *Bank);
656 MRI->setRegBank(HiLHS, *Bank);
657
658 Regs.push_back(LoLHS);
659 Regs.push_back(HiLHS);
660
661 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
662 .addDef(LoLHS)
663 .addDef(HiLHS)
664 .addUse(Reg);
665}
666
667/// Replace the current type each register in \p Regs has with \p NewTy
668static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
669 LLT NewTy) {
670 for (Register Reg : Regs) {
671 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
672 MRI.setType(Reg, NewTy);
673 }
674}
675
676static LLT getHalfSizedType(LLT Ty) {
677 if (Ty.isVector()) {
678 assert(Ty.getNumElements() % 2 == 0);
679 return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
680 }
681
682 assert(Ty.getSizeInBits() % 2 == 0);
683 return LLT::scalar(Ty.getSizeInBits() / 2);
684}
685
686/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
687/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
688/// execute the instruction for each unique combination of values in all lanes
689/// in the wave. The block will be split such that rest of the instructions are
690/// moved to a new block.
691///
692/// Essentially performs this loop:
693///
694/// Save Execution Mask
695/// For (Lane : Wavefront) {
696/// Enable Lane, Disable all other lanes
697/// SGPR = read SGPR value for current lane from VGPR
698/// VGPRResult[Lane] = use_op SGPR
699/// }
700/// Restore Execution Mask
701///
702/// There is additional complexity in the compares needed to identify the
703/// unique values used.
704bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
705 MachineIRBuilder &B,
706 iterator_range<MachineBasicBlock::iterator> Range,
707 SmallSet<Register, 4> &SGPROperandRegs,
708 MachineRegisterInfo &MRI) const {
709 SmallVector<Register, 4> ResultRegs;
710 SmallVector<Register, 4> InitResultRegs;
711 SmallVector<Register, 4> PhiRegs;
712
713 // Track use registers which have already been expanded with a readfirstlane
714 // sequence. This may have multiple uses if moving a sequence.
715 DenseMap<Register, Register> WaterfalledRegMap;
716
717 MachineBasicBlock &MBB = B.getMBB();
718 MachineFunction *MF = &B.getMF();
719
720 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
721 const unsigned WaveAndOpc = Subtarget.isWave32() ?
722 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
723 const unsigned MovTermOpc = Subtarget.isWave32() ?
724 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
725 const unsigned XorTermOpc = Subtarget.isWave32() ?
726 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
727 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
728 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
729 const unsigned ExecReg = Subtarget.isWave32() ?
730 AMDGPU::EXEC_LO : AMDGPU::EXEC;
731
732#ifndef NDEBUG
733 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
734#endif
735
736 for (MachineInstr &MI : Range) {
737 for (MachineOperand &Def : MI.defs()) {
738 if (MRI.use_nodbg_empty(Def.getReg()))
739 continue;
740
741 LLT ResTy = MRI.getType(Def.getReg());
742 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
743 ResultRegs.push_back(Def.getReg());
744 Register InitReg = B.buildUndef(ResTy).getReg(0);
745 Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
746 InitResultRegs.push_back(InitReg);
747 PhiRegs.push_back(PhiReg);
748 MRI.setRegBank(PhiReg, *DefBank);
749 MRI.setRegBank(InitReg, *DefBank);
750 }
751 }
752
753 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
754 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
755
756 // Don't bother using generic instructions/registers for the exec mask.
757 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
758 .addDef(InitSaveExecReg);
759
760 Register PhiExec = MRI.createVirtualRegister(WaveRC);
761 Register NewExec = MRI.createVirtualRegister(WaveRC);
762
763 // To insert the loop we need to split the block. Move everything before this
764 // point to a new block, and insert a new empty block before this instruction.
765 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
766 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
767 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
768 MachineFunction::iterator MBBI(MBB);
769 ++MBBI;
770 MF->insert(MBBI, LoopBB);
771 MF->insert(MBBI, RestoreExecBB);
772 MF->insert(MBBI, RemainderBB);
773
774 LoopBB->addSuccessor(RestoreExecBB);
775 LoopBB->addSuccessor(LoopBB);
776
777 // Move the rest of the block into a new block.
778 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
779 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
780
781 MBB.addSuccessor(LoopBB);
782 RestoreExecBB->addSuccessor(RemainderBB);
783
784 B.setInsertPt(*LoopBB, LoopBB->end());
785
786 B.buildInstr(TargetOpcode::PHI)
787 .addDef(PhiExec)
788 .addReg(InitSaveExecReg)
789 .addMBB(&MBB)
790 .addReg(NewExec)
791 .addMBB(LoopBB);
792
793 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
794 B.buildInstr(TargetOpcode::G_PHI)
795 .addDef(std::get<2>(Result))
796 .addReg(std::get<0>(Result)) // Initial value / implicit_def
797 .addMBB(&MBB)
798 .addReg(std::get<1>(Result)) // Mid-loop value.
799 .addMBB(LoopBB);
800 }
801
802 const DebugLoc &DL = B.getDL();
803
804 MachineInstr &FirstInst = *Range.begin();
805
806 // Move the instruction into the loop. Note we moved everything after
807 // Range.end() already into a new block, so Range.end() is no longer valid.
808 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
809
810 // Figure out the iterator range after splicing the instructions.
811 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
812 auto NewEnd = LoopBB->end();
813
814 MachineBasicBlock::iterator I = Range.begin();
815 B.setInsertPt(*LoopBB, I);
816
817 Register CondReg;
818
819 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
820
821 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
822 for (MachineOperand &Op : MI.uses()) {
823 if (!Op.isReg() || Op.isDef())
824 continue;
825
826 Register OldReg = Op.getReg();
827 if (!SGPROperandRegs.count(OldReg))
828 continue;
829
830 // See if we already processed this register in another instruction in the
831 // sequence.
832 auto OldVal = WaterfalledRegMap.find(OldReg);
833 if (OldVal != WaterfalledRegMap.end()) {
834 Op.setReg(OldVal->second);
835 continue;
836 }
837
838 Register OpReg = Op.getReg();
839 LLT OpTy = MRI.getType(OpReg);
840
841 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
842 if (OpBank != &AMDGPU::VGPRRegBank) {
843 // Insert copy from AGPR to VGPR before the loop.
844 B.setMBB(MBB);
845 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
846 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
847 B.setInstr(*I);
848 }
849
850 unsigned OpSize = OpTy.getSizeInBits();
851
852 // Can only do a readlane of 32-bit pieces.
853 if (OpSize == 32) {
854 // Avoid extra copies in the simple case of one 32-bit register.
855 Register CurrentLaneOpReg
856 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
857 MRI.setType(CurrentLaneOpReg, OpTy);
858
859 constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
860 // Read the next variant <- also loop target.
861 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
862 CurrentLaneOpReg)
863 .addReg(OpReg);
864
865 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
866 bool First = CondReg == AMDGPU::NoRegister;
867 if (First)
868 CondReg = NewCondReg;
869
870 // Compare the just read M0 value to all possible Idx values.
871 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
872 .addDef(NewCondReg)
873 .addReg(CurrentLaneOpReg)
874 .addReg(OpReg);
875 Op.setReg(CurrentLaneOpReg);
876
877 if (!First) {
878 Register AndReg = MRI.createVirtualRegister(WaveRC);
879
880 // If there are multiple operands to consider, and the conditions.
881 B.buildInstr(WaveAndOpc)
882 .addDef(AndReg)
883 .addReg(NewCondReg)
884 .addReg(CondReg);
885 CondReg = AndReg;
886 }
887 } else {
888 LLT S32 = LLT::scalar(32);
889 SmallVector<Register, 8> ReadlanePieces;
890
891 // The compares can be done as 64-bit, but the extract needs to be done
892 // in 32-bit pieces.
893
894 bool Is64 = OpSize % 64 == 0;
895
896 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
897 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
898 : AMDGPU::V_CMP_EQ_U32_e64;
899
900 // The compares can be done as 64-bit, but the extract needs to be done
901 // in 32-bit pieces.
902
903 // Insert the unmerge before the loop.
904
905 B.setMBB(MBB);
906 auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
907 B.setInstr(*I);
908
909 unsigned NumPieces = Unmerge->getNumOperands() - 1;
910 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
911 Register UnmergePiece = Unmerge.getReg(PieceIdx);
912
913 Register CurrentLaneOpReg;
914 if (Is64) {
915 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
916 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
917
918 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
919 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
920 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
921
922 // Read the next variant <- also loop target.
923 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
924 CurrentLaneOpRegLo)
925 .addReg(UnmergePiece, 0, AMDGPU::sub0);
926
927 // Read the next variant <- also loop target.
928 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
929 CurrentLaneOpRegHi)
930 .addReg(UnmergePiece, 0, AMDGPU::sub1);
931
932 CurrentLaneOpReg =
933 B.buildMerge(LLT::scalar(64),
934 {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
935 .getReg(0);
936
937 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
938
939 if (OpTy.getScalarSizeInBits() == 64) {
940 // We need to produce a 64-bit element vector, so use the
941 // merged pieces.
942 ReadlanePieces.push_back(CurrentLaneOpReg);
943 } else {
944 // 32-bit element type.
945 ReadlanePieces.push_back(CurrentLaneOpRegLo);
946 ReadlanePieces.push_back(CurrentLaneOpRegHi);
947 }
948 } else {
949 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
950 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
951 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
952
953 // Read the next variant <- also loop target.
954 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
955 CurrentLaneOpReg)
956 .addReg(UnmergePiece);
957 ReadlanePieces.push_back(CurrentLaneOpReg);
958 }
959
960 Register NewCondReg = MRI.createVirtualRegister(WaveRC);
961 bool First = CondReg == AMDGPU::NoRegister;
962 if (First)
963 CondReg = NewCondReg;
964
965 B.buildInstr(CmpOp)
966 .addDef(NewCondReg)
967 .addReg(CurrentLaneOpReg)
968 .addReg(UnmergePiece);
969
970 if (!First) {
971 Register AndReg = MRI.createVirtualRegister(WaveRC);
972
973 // If there are multiple operands to consider, and the conditions.
974 B.buildInstr(WaveAndOpc)
975 .addDef(AndReg)
976 .addReg(NewCondReg)
977 .addReg(CondReg);
978 CondReg = AndReg;
979 }
980 }
981
982 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
983 // BUILD_VECTOR
984 if (OpTy.isVector()) {
985 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
986 Op.setReg(Merge.getReg(0));
987 } else {
988 auto Merge = B.buildMerge(OpTy, ReadlanePieces);
989 Op.setReg(Merge.getReg(0));
990 }
991
992 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
993 }
994
995 // Make sure we don't re-process this register again.
996 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
997 }
998 }
999
1000 B.setInsertPt(*LoopBB, LoopBB->end());
1001
1002 // Update EXEC, save the original EXEC value to VCC.
1003 B.buildInstr(AndSaveExecOpc)
1004 .addDef(NewExec)
1005 .addReg(CondReg, RegState::Kill);
1006
1007 MRI.setSimpleHint(NewExec, CondReg);
1008
1009 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1010 B.buildInstr(XorTermOpc)
1011 .addDef(ExecReg)
1012 .addReg(ExecReg)
1013 .addReg(NewExec);
1014
1015 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1016 // s_cbranch_scc0?
1017
1018 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1019 B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1020 .addMBB(LoopBB);
1021
1022 // Save the EXEC mask before the loop.
1023 BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1024 .addReg(ExecReg);
1025
1026 // Restore the EXEC mask after the loop.
1027 B.setMBB(*RestoreExecBB);
1028 B.buildInstr(MovTermOpc)
1029 .addDef(ExecReg)
1030 .addReg(SaveExecReg);
1031
1032 // Set the insert point after the original instruction, so any new
1033 // instructions will be in the remainder.
1034 B.setInsertPt(*RemainderBB, RemainderBB->begin());
1035
1036 return true;
1037}
1038
1039// Return any unique registers used by \p MI at \p OpIndices that need to be
1040// handled in a waterfall loop. Returns these registers in \p
1041// SGPROperandRegs. Returns true if there are any operands to handle and a
1042// waterfall loop is necessary.
1043bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1044 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1045 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1046 for (unsigned Op : OpIndices) {
1047 assert(MI.getOperand(Op).isUse());
1048 Register Reg = MI.getOperand(Op).getReg();
1049 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1050 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1051 SGPROperandRegs.insert(Reg);
1052 }
1053
1054 // No operands need to be replaced, so no need to loop.
1055 return !SGPROperandRegs.empty();
1056}
1057
1058bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1059 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1060 ArrayRef<unsigned> OpIndices) const {
1061 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1062 // are the same register.
1063 SmallSet<Register, 4> SGPROperandRegs;
1064
1065 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1066 return false;
1067
1068 MachineBasicBlock::iterator I = MI.getIterator();
1069 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1070 SGPROperandRegs, MRI);
1071}
1072
1073bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1074 MachineInstr &MI, MachineRegisterInfo &MRI,
1075 ArrayRef<unsigned> OpIndices) const {
1076 MachineIRBuilder B(MI);
1077 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1078}
1079
1080// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1081void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1082 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1083 Register Reg = MI.getOperand(OpIdx).getReg();
1084 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1085 if (Bank == &AMDGPU::SGPRRegBank)
1086 return;
1087
1088 LLT Ty = MRI.getType(Reg);
1089 MachineIRBuilder B(MI);
1090
1091 if (Bank != &AMDGPU::VGPRRegBank) {
1092 // We need to copy from AGPR to VGPR
1093 Reg = B.buildCopy(Ty, Reg).getReg(0);
1094 MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1095 }
1096
1097 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1098 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1099 .addDef(SGPR)
1100 .addReg(Reg);
1101
1102 MRI.setType(SGPR, Ty);
1103
1104 const TargetRegisterClass *Constrained =
1105 constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1106 (void)Constrained;
1107 assert(Constrained && "Failed to constrain readfirstlane src reg");
1108
1109 MI.getOperand(OpIdx).setReg(SGPR);
1110}
1111
1112/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1113/// rest will be in the remainder.
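/// For example, splitting <3 x s32> (96 bits) at FirstSize = 64 yields
/// {<2 x s32>, s32}: EltSize = 32, so FirstPartNumElts = 2 and RemainderElts = 1.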
1114static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1115 unsigned TotalSize = Ty.getSizeInBits();
1116 if (!Ty.isVector())
(1) Calling 'LLT::isVector'
(3) Returning from 'LLT::isVector'
(4) Taking false branch
1117 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1118
1119 LLT EltTy = Ty.getElementType();
1120 unsigned EltSize = EltTy.getSizeInBits();
(5) Calling 'LLT::getSizeInBits'
(8) Returning from 'LLT::getSizeInBits'
(9) 'EltSize' initialized to 0
1121 assert(FirstSize % EltSize == 0);
(10) Division by zero
1122
1123 unsigned FirstPartNumElts = FirstSize / EltSize;
1124 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1125
1126 return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
1127 LLT::scalarOrVector(RemainderElts, EltTy)};
1128}
1129
1130static LLT widen96To128(LLT Ty) {
1131 if (!Ty.isVector())
1132 return LLT::scalar(128);
1133
1134 LLT EltTy = Ty.getElementType();
1135 assert(128 % EltTy.getSizeInBits() == 0);
1136 return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
1137}
1138
1139bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1140 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1141 MachineRegisterInfo &MRI) const {
1142 Register DstReg = MI.getOperand(0).getReg();
1143 const LLT LoadTy = MRI.getType(DstReg);
1144 unsigned LoadSize = LoadTy.getSizeInBits();
1145 const unsigned MaxNonSmrdLoadSize = 128;
1146
1147 const RegisterBank *DstBank =
1148 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1149 if (DstBank == &AMDGPU::SGPRRegBank) {
1150 // There are some special cases that we need to look at for 32-bit and
1151 // 96-bit SGPR loads; otherwise we have nothing to do.
1152 if (LoadSize != 32 && LoadSize != 96)
1153 return false;
1154
1155 MachineMemOperand *MMO = *MI.memoperands_begin();
1156 const unsigned MemSize = 8 * MMO->getSize();
1157 // Scalar loads of size 8 or 16 bits with proper alignment may be widened
1158 // to 32 bits. Check to see if we need to widen the memory access: 8- or
1159 // 16-bit scalar loads should have a load size of 32 but a memory access
1160 // size of less than 32.
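// (For example, a G_SEXTLOAD from an s8 value is legalized to LoadSize == 32
// with MemSize == 8.)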
1161 if (LoadSize == 32 &&
1162 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1163 return false;
1164
1165 Register PtrReg = MI.getOperand(1).getReg();
1166
1167 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1168 MachineIRBuilder B(MI, O);
1169
1170 if (LoadSize == 32) {
1171 // This is an extending load from a sub-dword size. Widen the memory
1172 // access size to 4 bytes and clear the extra high bits appropriately
1173 const LLT S32 = LLT::scalar(32);
1174 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1175 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1176 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1177 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1178 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1179 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1180 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1181 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1182 } else
1183 // We do not need to touch the higher bits for regular loads.
1184 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1185 } else {
1186 // 96-bit loads are only available for vector loads. We need to split this
1187 // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1188 if (MMO->getAlign() < Align(16)) {
1189 LLT Part64, Part32;
1190 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1191 auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1192 auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1193
1194 auto Undef = B.buildUndef(LoadTy);
1195 auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1196 B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1197 } else {
1198 LLT WiderTy = widen96To128(LoadTy);
1199 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1200 B.buildExtract(MI.getOperand(0), WideLoad, 0);
1201 }
1202 }
1203
1204 MI.eraseFromParent();
1205 return true;
1206 }
1207
1208 // 128-bit loads are supported for all instruction types.
1209 if (LoadSize <= MaxNonSmrdLoadSize)
1210 return false;
1211
1212 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1213 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1214
1215 if (SrcRegs.empty())
1216 SrcRegs.push_back(MI.getOperand(1).getReg());
1217
1218 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1219
1220 // RegBankSelect only emits scalar types, so we need to reset the pointer
1221 // operand to a pointer type.
1222 Register BasePtrReg = SrcRegs[0];
1223 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1224 MRI.setType(BasePtrReg, PtrTy);
1225
1226 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1227 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1228 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1229 MachineIRBuilder B(MI, Observer);
1230 LegalizerHelper Helper(B.getMF(), Observer, B);
1231
1232 if (LoadTy.isVector()) {
1233 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1234 return false;
1235 } else {
1236 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1237 return false;
1238 }
1239
1240 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1241 return true;
1242}
1243
1244bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1245 MachineInstr &MI,
1246 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1247 MachineRegisterInfo &MRI) const {
1248 const MachineFunction &MF = *MI.getMF();
1249 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1250 const auto &TFI = *ST.getFrameLowering();
1251
1252 // Guard in case the stack growth direction ever changes with scratch
1253 // instructions.
1254 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1255 return false;
1256
1257 Register Dst = MI.getOperand(0).getReg();
1258 Register AllocSize = MI.getOperand(1).getReg();
1259 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1260
1261 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1262
1263 // TODO: Need to emit a wave reduction to get the maximum size.
1264 if (SizeBank != &AMDGPU::SGPRRegBank)
1265 return false;
1266
1267 LLT PtrTy = MRI.getType(Dst);
1268 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1269
1270 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1271 Register SPReg = Info->getStackPtrOffsetReg();
1272 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1273 MachineIRBuilder B(MI, ApplyBank);
1274
1275 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1276 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
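// For example, on a wave64 subtarget getWavefrontSizeLog2() is 6, so the
// requested per-lane size is scaled by 64 here.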
1277
1278 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1279 if (Alignment > TFI.getStackAlign()) {
1280 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1281 B.buildMaskLowPtrBits(Dst, PtrAdd,
1282 Log2(Alignment) + ST.getWavefrontSizeLog2());
1283 } else {
1284 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1285 }
1286
1287 MI.eraseFromParent();
1288 return true;
1289}
1290
1291bool AMDGPURegisterBankInfo::applyMappingImage(
1292 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1293 MachineRegisterInfo &MRI, int RsrcIdx) const {
1294 const int NumDefs = MI.getNumExplicitDefs();
1295
1296 // The reported argument index is relative to the IR intrinsic call arguments,
1297 // so we need to shift by the number of defs and the intrinsic ID.
1298 RsrcIdx += NumDefs + 1;
1299
1300 // Insert copies to VGPR arguments.
1301 applyDefaultMapping(OpdMapper);
1302
1303 // Fixup any SGPR arguments.
1304 SmallVector<unsigned, 4> SGPRIndexes;
1305 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1306 if (!MI.getOperand(I).isReg())
1307 continue;
1308
1309 // If this intrinsic has a sampler, it immediately follows rsrc.
1310 if (I == RsrcIdx || I == RsrcIdx + 1)
1311 SGPRIndexes.push_back(I);
1312 }
1313
1314 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1315 return true;
1316}
1317
1318static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1319 Register Reg) {
1320 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1321 if (!Def)
1322 return Reg;
1323
1324 // TODO: Guard against this being an implicit def
1325 return Def->getOperand(0).getReg();
1326}
1327
1328// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1329// the three offsets (voffset, soffset and instoffset)
1330static unsigned setBufferOffsets(MachineIRBuilder &B,
1331 const AMDGPURegisterBankInfo &RBI,
1332 Register CombinedOffset, Register &VOffsetReg,
1333 Register &SOffsetReg, int64_t &InstOffsetVal,
1334 Align Alignment) {
1335 const LLT S32 = LLT::scalar(32);
1336 MachineRegisterInfo *MRI = B.getMRI();
1337
1338 if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
1339 uint32_t SOffset, ImmOffset;
1340 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1341 Alignment)) {
1342 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1343 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1344 InstOffsetVal = ImmOffset;
1345
1346 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1347 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1348 return SOffset + ImmOffset;
1349 }
1350 }
1351
1352 Register Base;
1353 unsigned Offset;
1354
1355 std::tie(Base, Offset) =
1356 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1357
1358 uint32_t SOffset, ImmOffset;
1359 if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1360 &RBI.Subtarget, Alignment)) {
1361 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1362 VOffsetReg = Base;
1363 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1364 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1365 InstOffsetVal = ImmOffset;
1366 return 0; // XXX - Why is this 0?
1367 }
1368
1369 // If we have SGPR base, we can use it for soffset.
1370 if (SOffset == 0) {
1371 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1372 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1373 SOffsetReg = Base;
1374 InstOffsetVal = ImmOffset;
1375 return 0; // XXX - Why is this 0?
1376 }
1377 }
1378
1379 // Handle the variable sgpr + vgpr case.
1380 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1381 if (Add && (int)Offset >= 0) {
1382 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1383 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1384
1385 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1386 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1387
1388 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1389 VOffsetReg = Src0;
1390 SOffsetReg = Src1;
1391 return 0;
1392 }
1393
1394 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1395 VOffsetReg = Src1;
1396 SOffsetReg = Src0;
1397 return 0;
1398 }
1399 }
1400
1401 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1402 // have an SGPR offset and a VGPR resource.
1403 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1404 VOffsetReg = CombinedOffset;
1405 } else {
1406 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1407 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1408 }
1409
1410 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1411 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1412 return 0;
1413}
1414
1415bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1416 const OperandsMapper &OpdMapper) const {
1417 MachineInstr &MI = OpdMapper.getMI();
1418 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1419
1420 const LLT S32 = LLT::scalar(32);
1421 Register Dst = MI.getOperand(0).getReg();
1422 LLT Ty = MRI.getType(Dst);
1423
1424 const RegisterBank *RSrcBank =
1425 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1426 const RegisterBank *OffsetBank =
1427 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1428 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1429 OffsetBank == &AMDGPU::SGPRRegBank)
1430 return true; // Legal mapping
1431
1432 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1433 // here but don't have an MMO.
1434
1435 unsigned LoadSize = Ty.getSizeInBits();
1436 int NumLoads = 1;
1437 if (LoadSize == 256 || LoadSize == 512) {
1438 NumLoads = LoadSize / 128;
1439 Ty = Ty.divide(NumLoads);
1440 }
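// For example, a 512-bit load is split into NumLoads = 4 pieces of 128 bits,
// fetched at byte offsets 0, 16, 32 and 48 via the 16 * i terms below.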
1441
1442 // Use the alignment to ensure that the required offsets will fit into the
1443 // immediate offsets.
1444 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1445
1446 MachineIRBuilder B(MI);
1447 MachineFunction &MF = B.getMF();
1448
1449 Register SOffset;
1450 Register VOffset;
1451 int64_t ImmOffset = 0;
1452
1453 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1454 VOffset, SOffset, ImmOffset, Alignment);
1455
1456 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1457 // can, but we need to track an MMO for that.
1458 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1459 const Align MemAlign(4); // FIXME: ABI type alignment?
1460 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1461 MachinePointerInfo(),
1462 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1463 MachineMemOperand::MOInvariant,
1464 MemSize, MemAlign);
1465 if (MMOOffset != 0)
1466 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1467
1468 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1469 // assume that the buffer is unswizzled.
1470
1471 Register RSrc = MI.getOperand(1).getReg();
1472 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1473 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1474
1475 SmallVector<Register, 4> LoadParts(NumLoads);
1476
1477 MachineBasicBlock::iterator MII = MI.getIterator();
1478 MachineInstrSpan Span(MII, &B.getMBB());
1479
1480 for (int i = 0; i < NumLoads; ++i) {
1481 if (NumLoads == 1) {
1482 LoadParts[i] = Dst;
1483 } else {
1484 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1485 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1486 }
1487
1488 MachineMemOperand *MMO = BaseMMO;
1489 if (i != 0)
1490 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1491
1492 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1493 .addDef(LoadParts[i]) // vdata
1494 .addUse(RSrc) // rsrc
1495 .addUse(VIndex) // vindex
1496 .addUse(VOffset) // voffset
1497 .addUse(SOffset) // soffset
1498 .addImm(ImmOffset + 16 * i) // offset(imm)
1499 .addImm(0) // cachepolicy, swizzled buffer(imm)
1500 .addImm(0) // idxen(imm)
1501 .addMemOperand(MMO);
1502 }
1503
1504 // TODO: If only the resource is a VGPR, it may be better to execute the
1505 // scalar load in the waterfall loop if the resource is expected to frequently
1506 // be dynamically uniform.
1507 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1508 // Remove the original instruction to avoid potentially confusing the
1509 // waterfall loop logic.
1510 B.setInstr(*Span.begin());
1511 MI.eraseFromParent();
1512
1513 SmallSet<Register, 4> OpsToWaterfall;
1514
1515 OpsToWaterfall.insert(RSrc);
1516 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1517 OpsToWaterfall, MRI);
1518 }
1519
1520 if (NumLoads != 1) {
1521 if (Ty.isVector())
1522 B.buildConcatVectors(Dst, LoadParts);
1523 else
1524 B.buildMerge(Dst, LoadParts);
1525 }
1526
1527 // We removed the instruction earlier with a waterfall loop.
1528 if (RSrcBank == &AMDGPU::SGPRRegBank)
1529 MI.eraseFromParent();
1530
1531 return true;
1532}
1533
1534bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
1535 const OperandsMapper &OpdMapper, bool Signed) const {
1536 MachineInstr &MI = OpdMapper.getMI();
1537 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1538
1539 // Insert basic copies
1540 applyDefaultMapping(OpdMapper);
1541
1542 Register DstReg = MI.getOperand(0).getReg();
1543 LLT Ty = MRI.getType(DstReg);
1544
1545 const LLT S32 = LLT::scalar(32);
1546
1547 const RegisterBank *DstBank =
1548 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1549 if (DstBank == &AMDGPU::VGPRRegBank) {
1550 if (Ty == S32)
1551 return true;
1552
1553 // TODO: 64-bit version is scalar only, so we need to expand this.
1554 return false;
1555 }
1556
1557 Register SrcReg = MI.getOperand(2).getReg();
1558 Register OffsetReg = MI.getOperand(3).getReg();
1559 Register WidthReg = MI.getOperand(4).getReg();
1560
1561 // The scalar form packs the offset and width in a single operand.
1562
1563 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1564 MachineIRBuilder B(MI, ApplyBank);
1565
1566 // Ensure the high bits are clear to insert the offset.
1567 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1568 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1569
1570 // Zeros out the low bits, so don't bother clamping the input value.
1571 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1572
1573 // Transformation function: pack the offset and width of a BFE into
1574 // the format expected by S_BFE_I32 / S_BFE_U32. In the second
1575 // source, bits [5:0] contain the offset and bits [22:16] the width.
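// For example, offset = 5 and width = 8 are packed as (8 << 16) | (5 & 0x3f)
// = 0x80005.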
1576 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1577
1578 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1579 // register class constraints.
1580 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1581 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1582
1583 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1584 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1585 llvm_unreachable("failed to constrain BFE");
1586
1587 MI.eraseFromParent();
1588 return true;
1589}
1590
1591// Return a suitable opcode for extending the operands of Opc when widening.
1592static unsigned getExtendOp(unsigned Opc) {
1593 switch (Opc) {
1594 case TargetOpcode::G_ASHR:
1595 case TargetOpcode::G_SMIN:
1596 case TargetOpcode::G_SMAX:
1597 return TargetOpcode::G_SEXT;
1598 case TargetOpcode::G_LSHR:
1599 case TargetOpcode::G_UMIN:
1600 case TargetOpcode::G_UMAX:
1601 return TargetOpcode::G_ZEXT;
1602 default:
1603 return TargetOpcode::G_ANYEXT;
1604 }
1605}
1606
1607// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1608// any illegal vector extend or unmerge operations.
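// For example, with G_SEXT a source bit pattern of 0xFFFF0005 (<2 x s16> {5, -1})
// unpacks to the pair {0x00000005, 0xFFFFFFFF}.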
1609static std::pair<Register, Register>
1610unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1611 const LLT S32 = LLT::scalar(32);
1612 auto Bitcast = B.buildBitcast(S32, Src);
1613
1614 if (ExtOpcode == TargetOpcode::G_SEXT) {
1615 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1616 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1617 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1618 }
1619
1620 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1621 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1622 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1623 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1624 }
1625
1626 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1627 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1628}
1629
1630 // For cases where only a single copy is inserted for matching register banks,
1631 // replace the register in the instruction operand.
1632static bool substituteSimpleCopyRegs(
1633 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1634 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1635 if (!SrcReg.empty()) {
1636 assert(SrcReg.size() == 1);
1637 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1638 return true;
1639 }
1640
1641 return false;
1642}
1643
1644/// Handle register layout difference for f16 images for some subtargets.
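/// On subtargets with unpacked D16 VMEM, each 16-bit element occupies its own
/// 32-bit register, so <N x s16> store data is rebuilt as <N x s32> below.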
1645Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1646 MachineRegisterInfo &MRI,
1647 Register Reg) const {
1648 if (!Subtarget.hasUnpackedD16VMem())
1649 return Reg;
1650
1651 const LLT S16 = LLT::scalar(16);
1652 LLT StoreVT = MRI.getType(Reg);
1653 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1654 return Reg;
1655
1656 auto Unmerge = B.buildUnmerge(S16, Reg);
1657
1658
1659 SmallVector<Register, 4> WideRegs;
1660 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1661 WideRegs.push_back(Unmerge.getReg(I));
1662
1663 const LLT S32 = LLT::scalar(32);
1664 int NumElts = StoreVT.getNumElements();
1665
1666 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1667}
1668
1669static std::pair<Register, unsigned>
1670getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1671 int64_t Const;
1672 if (mi_match(Reg, MRI, m_ICst(Const)))
1673 return std::make_pair(Register(), Const);
1674
1675 Register Base;
1676 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1677 return std::make_pair(Base, Const);
1678
1679 // TODO: Handle G_OR used for add case
1680 return std::make_pair(Reg, 0);
1681}
1682
1683std::pair<Register, unsigned>
1684AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1685 Register OrigOffset) const {
1686 const unsigned MaxImm = 4095;
1687 Register BaseReg;
1688 unsigned ImmOffset;
1689 const LLT S32 = LLT::scalar(32);
1690
1691 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1692 OrigOffset);
1693
1694 unsigned C1 = 0;
1695 if (ImmOffset != 0) {
1696 // If the immediate value is too big for the immoffset field, put the value
1697 // and -4096 into the immoffset field so that the value that is copied/added
1698 // for the voffset field is a multiple of 4096, and it stands more chance
1699 // of being CSEd with the copy/add for another similar load/store.
1700 // However, do not do that rounding down to a multiple of 4096 if that is a
1701 // negative number, as it appears to be illegal to have a negative offset
1702 // in the vgpr, even if adding the immediate offset makes it positive.
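// For example, with ImmOffset = 5000: Overflow = 5000 & ~4095 = 4096 is added
// to the base (voffset) register and the remaining 904 stays in the immediate.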
1703 unsigned Overflow = ImmOffset & ~MaxImm;
1704 ImmOffset -= Overflow;
1705 if ((int32_t)Overflow < 0) {
1706 Overflow += ImmOffset;
1707 ImmOffset = 0;
1708 }
1709
1710 C1 = ImmOffset;
1711 if (Overflow != 0) {
1712 if (!BaseReg)
1713 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1714 else {
1715 auto OverflowVal = B.buildConstant(S32, Overflow);
1716 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1717 }
1718 }
1719 }
1720
1721 if (!BaseReg)
1722 BaseReg = B.buildConstant(S32, 0).getReg(0);
1723
1724 return {BaseReg, C1};
1725}
1726
1727static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1728 int64_t C;
1729 return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1730}
1731
1732static unsigned extractCPol(unsigned CachePolicy) {
1733 return CachePolicy & AMDGPU::CPol::ALL;
1734}
1735
1736static unsigned extractSWZ(unsigned CachePolicy) {
1737 return (CachePolicy >> 3) & 1;
1738}
1739
1740
1741MachineInstr *
1742AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1743 MachineInstr &MI) const {
1744 MachineRegisterInfo &MRI = *B.getMRI();
1745 executeInWaterfallLoop(B, MI, MRI, {2, 4});
1746
1747 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1748
1749 Register VData = MI.getOperand(1).getReg();
1750 LLT Ty = MRI.getType(VData);
1751
1752 int EltSize = Ty.getScalarSizeInBits();
1753 int Size = Ty.getSizeInBits();
1754
1755 // FIXME: Broken integer truncstore.
1756 if (EltSize != 32)
1757 report_fatal_error("unhandled intrinsic store");
1758
1759 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1760 const int MemSize = (*MI.memoperands_begin())->getSize();
1761
1762
1763 Register RSrc = MI.getOperand(2).getReg();
1764 Register VOffset = MI.getOperand(3).getReg();
1765 Register SOffset = MI.getOperand(4).getReg();
1766 unsigned CachePolicy = MI.getOperand(5).getImm();
1767
1768 unsigned ImmOffset;
1769 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1770
1771 const bool Offen = !isZero(VOffset, MRI);
1772
1773 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1774 switch (8 * MemSize) {
1775 case 8:
1776 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1777 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1778 break;
1779 case 16:
1780 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1781 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1782 break;
1783 default:
1784 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1785 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1786 if (Size > 32)
1787 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1788 break;
1789 }
1790
1791
1792 // Set the insertion point back to the instruction in case it was moved into a
1793 // loop.
1794 B.setInstr(MI);
1795
1796 MachineInstrBuilder MIB = B.buildInstr(Opc)
1797 .addUse(VData);
1798
1799 if (Offen)
1800 MIB.addUse(VOffset);
1801
1802 MIB.addUse(RSrc)
1803 .addUse(SOffset)
1804 .addImm(ImmOffset)
1805 .addImm(extractCPol(CachePolicy))
1806 .addImm(0) // tfe: FIXME: Remove from inst
1807 .addImm(extractSWZ(CachePolicy))
1808 .cloneMemRefs(MI);
1809
1810 // FIXME: We need a way to report failure from applyMappingImpl.
1811 // Insert constrain copies before inserting the loop.
1812 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1813 report_fatal_error("failed to constrain selected store intrinsic");
1814
1815 return MIB;
1816}
1817
1818bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1819 Register SrcReg) const {
1820 MachineRegisterInfo &MRI = *B.getMRI();
1821 LLT SrcTy = MRI.getType(SrcReg);
1822 if (SrcTy.getSizeInBits() == 32) {
1823 // Use a v_mov_b32 here to make the exec dependency explicit.
1824 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1825 .addDef(DstReg)
1826 .addUse(SrcReg);
1827 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1828 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1829 }
1830
1831 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1832 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1833
1834 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1835 .addDef(TmpReg0)
1836 .addUse(SrcReg, 0, AMDGPU::sub0);
1837 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1838 .addDef(TmpReg1)
1839 .addUse(SrcReg, 0, AMDGPU::sub1);
1840 B.buildInstr(AMDGPU::REG_SEQUENCE)
1841 .addDef(DstReg)
1842 .addUse(TmpReg0)
1843 .addImm(AMDGPU::sub0)
1844 .addUse(TmpReg1)
1845 .addImm(AMDGPU::sub1);
1846
1847 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1848 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1849}
1850
1851/// Utility function for pushing dynamic vector indexes with a constant offset
1852 /// into waterfall loops.
1853static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1854 MachineInstr &IdxUseInstr,
1855 unsigned OpIdx,
1856 unsigned ConstOffset) {
1857 MachineRegisterInfo &MRI = *B.getMRI();
1858 const LLT S32 = LLT::scalar(32);
1859 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1860 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1861
1862 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1863
1864 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1865 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1866 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1867 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1868}
1869
1870/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1871/// original 32-bit source value (to be inserted in the low part of the combined
1872/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1873/// value.
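/// For example, with G_SEXT and Lo32Reg = 0xFFFFFFFE (-2), Hi32Reg becomes
/// ashr(Lo32Reg, 31) = 0xFFFFFFFF, giving the combined 64-bit value -2.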
1874static void extendLow32IntoHigh32(MachineIRBuilder &B,
1875 Register Hi32Reg, Register Lo32Reg,
1876 unsigned ExtOpc,
1877 const RegisterBank &RegBank,
1878 bool IsBooleanSrc = false) {
1879 if (ExtOpc == AMDGPU::G_ZEXT) {
1880 B.buildConstant(Hi32Reg, 0);
1881 } else if (ExtOpc == AMDGPU::G_SEXT) {
1882 if (IsBooleanSrc) {
1883 // If we know the original source was an s1, the high half is the same as
1884 // the low.
1885 B.buildCopy(Hi32Reg, Lo32Reg);
1886 } else {
1887 // Replicate sign bit from 32-bit extended part.
1888 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1889 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1890 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1891 }
1892 } else {
1893 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1894 B.buildUndef(Hi32Reg);
1895 }
1896}
1897
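// Lower a dynamically indexed G_EXTRACT_VECTOR_ELT into a chain of compares
// against each constant index and selects, when
// SITargetLowering::shouldExpandVectorDynExt says the dynamic access should
// be expanded.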
1898bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1899 MachineInstr &MI, MachineRegisterInfo &MRI,
1900 const OperandsMapper &OpdMapper) const {
1901
1902 Register VecReg = MI.getOperand(1).getReg();
1903 Register Idx = MI.getOperand(2).getReg();
1904
1905 const RegisterBank &IdxBank =
1906 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1907
1908 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1909
1910 LLT VecTy = MRI.getType(VecReg);
1911 unsigned EltSize = VecTy.getScalarSizeInBits();
1912 unsigned NumElem = VecTy.getNumElements();
1913
1914 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1915 IsDivergentIdx))
1916 return false;
1917
1918 MachineIRBuilder B(MI);
1919 LLT S32 = LLT::scalar(32);
1920
1921 const RegisterBank &DstBank =
1922 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1923 const RegisterBank &SrcBank =
1924 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1925
1926 const RegisterBank &CCBank =
1927 (DstBank == AMDGPU::SGPRRegBank &&
1928 SrcBank == AMDGPU::SGPRRegBank &&
1929 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1930 : AMDGPU::VCCRegBank;
1931 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1932
1933 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1934 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1935 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1936 }
1937
1938 LLT EltTy = VecTy.getScalarType();
1939 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1940 unsigned NumLanes = DstRegs.size();
1941 if (!NumLanes)
1942 NumLanes = 1;
1943 else
1944 EltTy = MRI.getType(DstRegs[0]);
1945
1946 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1947 SmallVector<Register, 2> Res(NumLanes);
1948 for (unsigned L = 0; L < NumLanes; ++L)
1949 Res[L] = UnmergeToEltTy.getReg(L);
1950
1951 for (unsigned I = 1; I < NumElem; ++I) {
1952 auto IC = B.buildConstant(S32, I);
1953 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1954 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1955 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1956
1957 for (unsigned L = 0; L < NumLanes; ++L) {
1958 auto S = B.buildSelect(EltTy, Cmp,
1959 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1960
1961 for (unsigned N : { 0, 2, 3 })
1962 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1963
1964 Res[L] = S->getOperand(0).getReg();
1965 }
1966 }
1967
1968 for (unsigned L = 0; L < NumLanes; ++L) {
1969 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1970 B.buildCopy(DstReg, Res[L]);
1971 MRI.setRegBank(DstReg, DstBank);
1972 }
1973
1974 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1975 MI.eraseFromParent();
1976
1977 return true;
1978}
1979
1980// Insert a cross regbank copy for a register if it already has a bank that
1981// differs from the one we want to set.
1982static Register constrainRegToBank(MachineRegisterInfo &MRI,
1983 MachineIRBuilder &B, Register &Reg,
1984 const RegisterBank &Bank) {
1985 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
1986 if (CurrBank && *CurrBank != Bank) {
1987 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
1988 MRI.setRegBank(Copy, Bank);
1989 return Copy;
1990 }
1991
1992 MRI.setRegBank(Reg, Bank);
1993 return Reg;
1994}
1995
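// Lower a dynamically indexed G_INSERT_VECTOR_ELT the same way as the extract
// case above: compare the index against each constant index and select between
// the inserted value and the original element.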
1996bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
1997 MachineInstr &MI, MachineRegisterInfo &MRI,
1998 const OperandsMapper &OpdMapper) const {
1999
2000 Register VecReg = MI.getOperand(1).getReg();
2001 Register Idx = MI.getOperand(3).getReg();
2002
2003 const RegisterBank &IdxBank =
2004 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2005
2006 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2007
2008 LLT VecTy = MRI.getType(VecReg);
2009 unsigned EltSize = VecTy.getScalarSizeInBits();
2010 unsigned NumElem = VecTy.getNumElements();
2011
2012 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2013 IsDivergentIdx))
2014 return false;
2015
2016 MachineIRBuilder B(MI);
2017 LLT S32 = LLT::scalar(32);
2018
2019 const RegisterBank &DstBank =
2020 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2021 const RegisterBank &SrcBank =
2022 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2023 const RegisterBank &InsBank =
2024 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2025
2026 const RegisterBank &CCBank =
2027 (DstBank == AMDGPU::SGPRRegBank &&
2028 SrcBank == AMDGPU::SGPRRegBank &&
2029 InsBank == AMDGPU::SGPRRegBank &&
2030 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2031 : AMDGPU::VCCRegBank;
2032 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2033
2034 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2035 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2036 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2037 }
2038
2039 LLT EltTy = VecTy.getScalarType();
2040 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2041 unsigned NumLanes = InsRegs.size();
2042 if (!NumLanes) {
2043 NumLanes = 1;
2044 InsRegs.push_back(MI.getOperand(2).getReg());
2045 } else {
2046 EltTy = MRI.getType(InsRegs[0]);
2047 }
2048
2049 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2050 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2051
2052 for (unsigned I = 0; I < NumElem; ++I) {
2053 auto IC = B.buildConstant(S32, I);
2054 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2055 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2056 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2057
2058 for (unsigned L = 0; L < NumLanes; ++L) {
2059 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2060 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2061 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2062
2063 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2064 MRI.setRegBank(Select, DstBank);
2065
2066 Ops[I * NumLanes + L] = Select;
2067 }
2068 }
2069
2070 LLT MergeTy = LLT::vector(Ops.size(), EltTy);
2071 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2072 B.buildBuildVector(MI.getOperand(0), Ops);
2073 } else {
2074 auto Vec = B.buildBuildVector(MergeTy, Ops);
2075 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2076 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2077 }
2078
2079 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2080 MI.eraseFromParent();
2081
2082 return true;
2083}
2084
2085void AMDGPURegisterBankInfo::applyMappingImpl(
2086 const OperandsMapper &OpdMapper) const {
2087 MachineInstr &MI = OpdMapper.getMI();
2088 unsigned Opc = MI.getOpcode();
2089 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2090 switch (Opc) {
2091 case AMDGPU::G_PHI: {
2092 Register DstReg = MI.getOperand(0).getReg();
2093 LLT DstTy = MRI.getType(DstReg);
2094 if (DstTy != LLT::scalar(1))
2095 break;
2096
2097 const LLT S32 = LLT::scalar(32);
2098 const RegisterBank *DstBank =
2099 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2100 if (DstBank == &AMDGPU::VCCRegBank) {
2101 applyDefaultMapping(OpdMapper);
2102 // The standard handling only considers the result register bank for
2103 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2104 // produce an invalid copy. We can only copy with some kind of compare to
2105 // get a vector boolean result. Insert a register bank copy that will be
2106 // correctly lowered to a compare.
2107 MachineIRBuilder B(*MI.getParent()->getParent());
2108
2109 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2110 Register SrcReg = MI.getOperand(I).getReg();
2111 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2112
2113 if (SrcBank != &AMDGPU::VCCRegBank) {
2114 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2115 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2116
2117 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2118 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2119 MI.getOperand(I).setReg(Copy.getReg(0));
2120 }
2121 }
2122
2123 return;
2124 }
2125
2126 // Phi handling is strange and only considers the bank of the destination.
2127 substituteSimpleCopyRegs(OpdMapper, 0);
2128
2129 // Promote SGPR/VGPR booleans to s32
2130 MachineFunction *MF = MI.getParent()->getParent();
2131 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2132 MachineIRBuilder B(MI, ApplyBank);
2133 LegalizerHelper Helper(*MF, ApplyBank, B);
2134
2135 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2136 llvm_unreachable("widen scalar should have succeeded");
2137
2138 return;
2139 }
2140 case AMDGPU::G_ICMP:
2141 case AMDGPU::G_UADDO:
2142 case AMDGPU::G_USUBO:
2143 case AMDGPU::G_UADDE:
2144 case AMDGPU::G_SADDE:
2145 case AMDGPU::G_USUBE:
2146 case AMDGPU::G_SSUBE: {
2147 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2148 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2149
2150 const RegisterBank *DstBank =
2151 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2152 if (DstBank != &AMDGPU::SGPRRegBank)
2153 break;
2154
2155 const bool HasCarryIn = MI.getNumOperands() == 5;
2156
2157 // If this is a scalar compare, promote the result to s32, as the selection
2158 // will end up using a copy to a 32-bit vreg.
2159 const LLT S32 = LLT::scalar(32);
2160 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2161 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2162 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2163 MachineIRBuilder B(MI);
2164
2165 if (HasCarryIn) {
2166 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2167 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2168 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2169 MI.getOperand(4).setReg(NewSrcReg);
2170 }
2171
2172 MachineBasicBlock *MBB = MI.getParent();
2173 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2174
2175 // If we had a constrained VCC result register, a copy was inserted to VCC
2176 // from SGPR.
2177 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2178 if (DefRegs.empty())
2179 DefRegs.push_back(DstReg);
2180 B.buildTrunc(DefRegs[0], NewDstReg);
2181 return;
2182 }
2183 case AMDGPU::G_SELECT: {
2184 Register DstReg = MI.getOperand(0).getReg();
2185 LLT DstTy = MRI.getType(DstReg);
2186
2187 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2188 if (CondRegs.empty())
2189 CondRegs.push_back(MI.getOperand(1).getReg());
2190 else {
2191 assert(CondRegs.size() == 1);
2192 }
2193
2194 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2195 if (CondBank == &AMDGPU::SGPRRegBank) {
2196 MachineIRBuilder B(MI);
2197 const LLT S32 = LLT::scalar(32);
2198 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2199 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2200
2201 MI.getOperand(1).setReg(NewCondReg);
2202 B.buildZExt(NewCondReg, CondRegs[0]);
2203 }
2204
2205 if (DstTy.getSizeInBits() != 64)
2206 break;
2207
2208 MachineIRBuilder B(MI);
2209 LLT HalfTy = getHalfSizedType(DstTy);
2210
2211 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2212 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2213 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2214
2215 // All inputs are SGPRs, nothing special to do.
2216 if (DefRegs.empty()) {
2217 assert(Src1Regs.empty() && Src2Regs.empty());
2218 break;
2219 }
2220
2221 if (Src1Regs.empty())
2222 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2223 else {
2224 setRegsToType(MRI, Src1Regs, HalfTy);
2225 }
2226
2227 if (Src2Regs.empty())
2228 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2229 else
2230 setRegsToType(MRI, Src2Regs, HalfTy);
2231
2232 setRegsToType(MRI, DefRegs, HalfTy);
2233
2234 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2235 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2236
2237 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2238 MI.eraseFromParent();
2239 return;
2240 }
2241 case AMDGPU::G_BRCOND: {
2242 Register CondReg = MI.getOperand(0).getReg();
2243 // FIXME: Should use legalizer helper, but should change bool ext type.
2244 const RegisterBank *CondBank =
2245 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2246
2247 if (CondBank == &AMDGPU::SGPRRegBank) {
2248 MachineIRBuilder B(MI);
2249 const LLT S32 = LLT::scalar(32);
2250 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2251 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2252
2253 MI.getOperand(0).setReg(NewCondReg);
2254 B.buildZExt(NewCondReg, CondReg);
2255 return;
2256 }
2257
2258 break;
2259 }
2260 case AMDGPU::G_AND:
2261 case AMDGPU::G_OR:
2262 case AMDGPU::G_XOR: {
2263 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2264 // there is a VGPR input.
2265 Register DstReg = MI.getOperand(0).getReg();
2266 LLT DstTy = MRI.getType(DstReg);
2267
2268 if (DstTy.getSizeInBits() == 1) {
2269 const RegisterBank *DstBank =
2270 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2271 if (DstBank == &AMDGPU::VCCRegBank)
2272 break;
2273
2274 MachineFunction *MF = MI.getParent()->getParent();
2275 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2276 MachineIRBuilder B(MI, ApplyBank);
2277 LegalizerHelper Helper(*MF, ApplyBank, B);
2278
2279 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2280 LegalizerHelper::Legalized)
2281 llvm_unreachable("widen scalar should have succeeded");
2282 return;
2283 }
2284
2285 if (DstTy.getSizeInBits() != 64)
2286 break;
2287
2288 LLT HalfTy = getHalfSizedType(DstTy);
2289 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2290 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2291 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2292
2293 // All inputs are SGPRs, nothing special to do.
2294 if (DefRegs.empty()) {
2295 assert(Src0Regs.empty() && Src1Regs.empty());
2296 break;
2297 }
2298
2299 assert(DefRegs.size() == 2);
2300 assert(Src0Regs.size() == Src1Regs.size() &&
2301 (Src0Regs.empty() || Src0Regs.size() == 2));
2302
2303 // Depending on where the source registers came from, the generic code may
2304 // have decided to split the inputs already or not. If not, we still need to
2305 // extract the values.
2306 MachineIRBuilder B(MI);
2307
2308 if (Src0Regs.empty())
2309 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2310 else
2311 setRegsToType(MRI, Src0Regs, HalfTy);
2312
2313 if (Src1Regs.empty())
2314 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2315 else
2316 setRegsToType(MRI, Src1Regs, HalfTy);
2317
2318 setRegsToType(MRI, DefRegs, HalfTy);
2319
2320 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2321 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2322
2323 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2324 MI.eraseFromParent();
2325 return;
2326 }
2327 case AMDGPU::G_ABS: {
2328 Register SrcReg = MI.getOperand(1).getReg();
2329 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2330
2331 // There is no VALU abs instruction so we need to replace it with a sub and
2332 // max combination.
2333 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2334 MachineFunction *MF = MI.getParent()->getParent();
2335 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2336 MachineIRBuilder B(MI, Apply);
2337 LegalizerHelper Helper(*MF, Apply, B);
2338
2339 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2340 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2341 return;
2342 }
2343 LLVM_FALLTHROUGH;
2344 }
2345 case AMDGPU::G_ADD:
2346 case AMDGPU::G_SUB:
2347 case AMDGPU::G_MUL:
2348 case AMDGPU::G_SHL:
2349 case AMDGPU::G_LSHR:
2350 case AMDGPU::G_ASHR:
2351 case AMDGPU::G_SMIN:
2352 case AMDGPU::G_SMAX:
2353 case AMDGPU::G_UMIN:
2354 case AMDGPU::G_UMAX: {
2355 Register DstReg = MI.getOperand(0).getReg();
2356 LLT DstTy = MRI.getType(DstReg);
2357
2358 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2359 // Packed 16-bit operations need to be scalarized and promoted.
2360 if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
2361 break;
2362
2363 const RegisterBank *DstBank =
2364 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2365 if (DstBank == &AMDGPU::VGPRRegBank)
2366 break;
2367
2368 const LLT S32 = LLT::scalar(32);
2369 MachineBasicBlock *MBB = MI.getParent();
2370 MachineFunction *MF = MBB->getParent();
2371 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2372 MachineIRBuilder B(MI, ApplySALU);
2373
2374 if (DstTy.isVector()) {
2375 Register WideSrc0Lo, WideSrc0Hi;
2376 Register WideSrc1Lo, WideSrc1Hi;
2377
2378 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2379 std::tie(WideSrc0Lo, WideSrc0Hi)
2380 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2381 std::tie(WideSrc1Lo, WideSrc1Hi)
2382 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2383 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2384 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2385 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2386 MI.eraseFromParent();
2387 } else {
2388 LegalizerHelper Helper(*MF, ApplySALU, B);
2389
2390 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2391 llvm_unreachable("widen scalar should have succeeded");
2392
2393 // FIXME: s16 shift amounts should be legal.
2394 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2395 Opc == AMDGPU::G_ASHR) {
2396 B.setInsertPt(*MBB, MI.getIterator());
2397 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2398 llvm_unreachable("widen scalar should have succeeded");
2399 }
2400 }
2401
2402 return;
2403 }
2404 case AMDGPU::G_SEXT_INREG: {
2405 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2406 if (SrcRegs.empty())
2407 break; // Nothing to repair
2408
2409 const LLT S32 = LLT::scalar(32);
2410 MachineIRBuilder B(MI);
2411 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2412 GISelObserverWrapper Observer(&O);
2413 B.setChangeObserver(Observer);
2414
2415 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2416 // we would need to further expand, and doesn't let us directly set the
2417 // result registers.
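// For example, a G_SEXT_INREG of a 64-bit value with Amt = 8, split into 32-bit
// halves, becomes lo' = sext_inreg(lo, 8) and hi' = ashr(lo', 31) below.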
2418 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2419
2420 int Amt = MI.getOperand(2).getImm();
2421 if (Amt <= 32) {
2422 if (Amt == 32) {
2423 // The low bits are unchanged.
2424 B.buildCopy(DstRegs[0], SrcRegs[0]);
2425 } else {
2426 // Extend in the low bits and propagate the sign bit to the high half.
2427 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2428 }
2429
2430 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2431 } else {
2432 // The low bits are unchanged, and extend in the high bits.
2433 B.buildCopy(DstRegs[0], SrcRegs[0]);
2434 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2435 }
2436
2437 Register DstReg = MI.getOperand(0).getReg();
2438 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2439 MI.eraseFromParent();
2440 return;
2441 }
2442 case AMDGPU::G_CTPOP:
2443 case AMDGPU::G_BITREVERSE:
2444 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2445 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2446 const RegisterBank *DstBank =
2447 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2448 if (DstBank == &AMDGPU::SGPRRegBank)
2449 break;
2450
2451 Register SrcReg = MI.getOperand(1).getReg();
2452 const LLT S32 = LLT::scalar(32);
2453 LLT Ty = MRI.getType(SrcReg);
2454 if (Ty == S32)
2455 break;
2456
2457 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2458 MachineIRBuilder B(MI, ApplyVALU);
2459
2460 MachineFunction &MF = B.getMF();
2461 LegalizerHelper Helper(MF, ApplyVALU, B);
2462
2463 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2464 llvm_unreachable("narrowScalar should have succeeded");
2465 return;
2466 }
2467 case AMDGPU::G_SEXT:
2468 case AMDGPU::G_ZEXT:
2469 case AMDGPU::G_ANYEXT: {
2470 Register SrcReg = MI.getOperand(1).getReg();
2471 LLT SrcTy = MRI.getType(SrcReg);
2472 const bool Signed = Opc == AMDGPU::G_SEXT;
2473
2474 assert(empty(OpdMapper.getVRegs(1)));
2475
2476 MachineIRBuilder B(MI);
2477 const RegisterBank *SrcBank =
2478 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2479
2480 Register DstReg = MI.getOperand(0).getReg();
2481 LLT DstTy = MRI.getType(DstReg);
2482 if (DstTy.isScalar() &&
2483 SrcBank != &AMDGPU::SGPRRegBank &&
2484 SrcBank != &AMDGPU::VCCRegBank &&
2485 // FIXME: Should handle any type that rounds to s64 when irregular
2486 // breakdowns are supported.
2487 DstTy.getSizeInBits() == 64 &&
2488 SrcTy.getSizeInBits() <= 32) {
2489 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2490
2491 // Extend to 32-bit, and then extend the low half.
2492 if (Signed) {
2493 // TODO: Should really be buildSExtOrCopy
2494 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2495 } else if (Opc == AMDGPU::G_ZEXT) {
2496 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2497 } else {
2498 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2499 }
2500
2501 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2502 MRI.setRegBank(DstReg, *SrcBank);
2503 MI.eraseFromParent();
2504 return;
2505 }
2506
2507 if (SrcTy != LLT::scalar(1))
2508 return;
2509
2510 // It is not legal to have a legalization artifact with a VCC source. Rather
2511 // than introducing a copy, insert the select we would have to select the
2512 // copy to.
2513 if (SrcBank == &AMDGPU::VCCRegBank) {
2514 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2515
2516 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2517
2518 unsigned DstSize = DstTy.getSizeInBits();
2519 // 64-bit select is SGPR only
2520 const bool UseSel64 = DstSize > 32 &&
2521 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2522
2523 // TODO: Should s16 select be legal?
2524 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2525 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2526 auto False = B.buildConstant(SelType, 0);
2527
2528 MRI.setRegBank(True.getReg(0), *DstBank);
2529 MRI.setRegBank(False.getReg(0), *DstBank);
2530 MRI.setRegBank(DstReg, *DstBank);
2531
2532 if (DstSize > 32) {
2533 B.buildSelect(DefRegs[0], SrcReg, True, False);
2534 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2535 } else if (DstSize < 32) {
2536 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2537 MRI.setRegBank(Sel.getReg(0), *DstBank);
2538 B.buildTrunc(DstReg, Sel);
2539 } else {
2540 B.buildSelect(DstReg, SrcReg, True, False);
2541 }
2542
2543 MI.eraseFromParent();
2544 return;
2545 }
2546
2547 break;
2548 }
2549 case AMDGPU::G_BUILD_VECTOR:
2550 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2551 Register DstReg = MI.getOperand(0).getReg();
2552 LLT DstTy = MRI.getType(DstReg);
2553 if (DstTy != LLT::vector(2, 16))
2554 break;
2555
2556 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2557 substituteSimpleCopyRegs(OpdMapper, 1);
2558 substituteSimpleCopyRegs(OpdMapper, 2);
2559
2560 const RegisterBank *DstBank =
2561 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2562 if (DstBank == &AMDGPU::SGPRRegBank)
2563 break; // Can use S_PACK_* instructions.
2564
2565 MachineIRBuilder B(MI);
2566
2567 Register Lo = MI.getOperand(1).getReg();
2568 Register Hi = MI.getOperand(2).getReg();
2569 const LLT S32 = LLT::scalar(32);
2570
2571 const RegisterBank *BankLo =
2572 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2573 const RegisterBank *BankHi =
2574 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2575
2576 Register ZextLo;
2577 Register ShiftHi;
2578
2579 if (Opc == AMDGPU::G_BUILD_VECTOR) {
2580 ZextLo = B.buildZExt(S32, Lo).getReg(0);
2581 MRI.setRegBank(ZextLo, *BankLo);
2582
2583 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2584 MRI.setRegBank(ZextHi, *BankHi);
2585
2586 auto ShiftAmt = B.buildConstant(S32, 16);
2587 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2588
2589 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2590 MRI.setRegBank(ShiftHi, *BankHi);
2591 } else {
2592 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2593 MRI.setRegBank(MaskLo, *BankLo);
2594
2595 auto ShiftAmt = B.buildConstant(S32, 16);
2596 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2597
2598 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2599 MRI.setRegBank(ShiftHi, *BankHi);
2600
2601 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2602 MRI.setRegBank(ZextLo, *BankLo);
2603 }
2604
2605 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2606 MRI.setRegBank(Or.getReg(0), *DstBank);
2607
2608 B.buildBitcast(DstReg, Or);
2609 MI.eraseFromParent();
2610 return;
2611 }
2612 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2613 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2614
2615 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2616
2617 Register DstReg = MI.getOperand(0).getReg();
2618 Register SrcReg = MI.getOperand(1).getReg();
2619
2620 const LLT S32 = LLT::scalar(32);
2621 LLT DstTy = MRI.getType(DstReg);
2622 LLT SrcTy = MRI.getType(SrcReg);
2623
2624 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2625 return;
2626
2627 MachineIRBuilder B(MI);
2628
2629 const ValueMapping &DstMapping
2630 = OpdMapper.getInstrMapping().getOperandMapping(0);
2631 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2632 const RegisterBank *SrcBank =
2633 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2634 const RegisterBank *IdxBank =
2635 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2636
2637 Register BaseIdxReg;
2638 unsigned ConstOffset;
2639 std::tie(BaseIdxReg, ConstOffset) =
2640 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2641
2642 // See if the index is an add of a constant which will be foldable by moving
2643 // the base register of the index later if this is going to be executed in a
2644 // waterfall loop. This is essentially to reassociate the add of a constant
2645 // with the readfirstlane.
2646 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2647 ConstOffset > 0 &&
2648 ConstOffset < SrcTy.getNumElements();
2649
2650 // Move the base register. We'll re-insert the add later.
2651 if (ShouldMoveIndexIntoLoop)
2652 MI.getOperand(2).setReg(BaseIdxReg);
2653
2654 // If this is a VGPR result only because the index was a VGPR result, the
2655 // actual indexing will be done on the SGPR source vector, which will
2656 // produce a scalar result. We need to copy to the VGPR result inside the
2657 // waterfall loop.
2658 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2659 SrcBank == &AMDGPU::SGPRRegBank;
2660 if (DstRegs.empty()) {
2661 applyDefaultMapping(OpdMapper);
2662
2663 executeInWaterfallLoop(MI, MRI, { 2 });
2664
2665 if (NeedCopyToVGPR) {
2666 // We don't want a phi for this temporary reg.
2667 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2668 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2669 MI.getOperand(0).setReg(TmpReg);
2670 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2671
2672 // Use a v_mov_b32 here to make the exec dependency explicit.
2673 buildVCopy(B, DstReg, TmpReg);
2674 }
2675
2676 // Re-insert the constant offset add inside the waterfall loop.
2677 if (ShouldMoveIndexIntoLoop)
2678 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2679
2680 return;
2681 }
2682
2683 assert(DstTy.getSizeInBits() == 64);
2684
2685 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2686
2687 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2688 auto One = B.buildConstant(S32, 1);
2689
2690 MachineBasicBlock::iterator MII = MI.getIterator();
2691
2692 // Split the vector index into 32-bit pieces. Prepare to move all of the
2693 // new instructions into a waterfall loop if necessary.
2694 //
2695 // Don't put the bitcast or constant in the loop.
2696 MachineInstrSpan Span(MII, &B.getMBB());
2697
2698 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2699 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2700 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2701
2702 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2703 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2704
2705 MRI.setRegBank(DstReg, *DstBank);
2706 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2707 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2708 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2709 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2710
2711 SmallSet<Register, 4> OpsToWaterfall;
2712 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2713 MI.eraseFromParent();
2714 return;
2715 }
2716
2717 // Remove the original instruction to avoid potentially confusing the
2718 // waterfall loop logic.
2719 B.setInstr(*Span.begin());
2720 MI.eraseFromParent();
2721 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2722 OpsToWaterfall, MRI);
2723
2724 if (NeedCopyToVGPR) {
2725 MachineBasicBlock *LoopBB = Extract1->getParent();
2726 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2727 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2728 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2729 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2730
2731 Extract0->getOperand(0).setReg(TmpReg0);
2732 Extract1->getOperand(0).setReg(TmpReg1);
2733
2734 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2735
2736 buildVCopy(B, DstRegs[0], TmpReg0);
2737 buildVCopy(B, DstRegs[1], TmpReg1);
2738 }
2739
2740 if (ShouldMoveIndexIntoLoop)
2741 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2742
2743 return;
2744 }
2745 case AMDGPU::G_INSERT_VECTOR_ELT: {
2746 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2747
2748 Register DstReg = MI.getOperand(0).getReg();
2749 LLT VecTy = MRI.getType(DstReg);
2750
2751 assert(OpdMapper.getVRegs(0).empty());
2752 assert(OpdMapper.getVRegs(3).empty());
2753
2754 if (substituteSimpleCopyRegs(OpdMapper, 1))
2755 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2756
2757 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2758 return;
2759
2760 const RegisterBank *IdxBank =
2761 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2762
2763 Register SrcReg = MI.getOperand(1).getReg();
2764 Register InsReg = MI.getOperand(2).getReg();
2765 LLT InsTy = MRI.getType(InsReg);
2766 (void)InsTy;
2767
2768 Register BaseIdxReg;
2769 unsigned ConstOffset;
2770 std::tie(BaseIdxReg, ConstOffset) =
2771 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2772
2773 // See if the index is an add of a constant which will be foldable by moving
2774 // the base register of the index later if this is going to be executed in a
2775 // waterfall loop. This is essentially to reassociate the add of a constant
2776 // with the readfirstlane.
2777 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2778 ConstOffset > 0 &&
2779 ConstOffset < VecTy.getNumElements();
2780
2781 // Move the base register. We'll re-insert the add later.
2782 if (ShouldMoveIndexIntoLoop)
2783 MI.getOperand(3).setReg(BaseIdxReg);
2784
2785
2786 if (InsRegs.empty()) {
2787 executeInWaterfallLoop(MI, MRI, { 3 });
2788
2789 // Re-insert the constant offset add inside the waterfall loop.
2790 if (ShouldMoveIndexIntoLoop) {
2791 MachineIRBuilder B(MI);
2792 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2793 }
2794
2795 return;
2796 }
2797
2798
2799 assert(InsTy.getSizeInBits() == 64);
2800
2801 const LLT S32 = LLT::scalar(32);
2802 LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
2803
2804 MachineIRBuilder B(MI);
2805 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2806 auto One = B.buildConstant(S32, 1);
2807
2808 // Split the vector index into 32-bit pieces. Prepare to move all of the
2809 // new instructions into a waterfall loop if necessary.
2810 //
2811 // Don't put the bitcast or constant in the loop.
2812 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2813
2814 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2815 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2816 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2817
2818 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2819 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2820
2821 const RegisterBank *DstBank =
2822 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2823 const RegisterBank *SrcBank =
2824 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2825 const RegisterBank *InsSrcBank =
2826 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2827
2828 MRI.setRegBank(InsReg, *InsSrcBank);
2829 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2830 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2831 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2832 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2833 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2834 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2835
2836
2837 SmallSet<Register, 4> OpsToWaterfall;
2838 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2839 B.setInsertPt(B.getMBB(), MI);
2840 B.buildBitcast(DstReg, InsHi);
2841 MI.eraseFromParent();
2842 return;
2843 }
2844
2845 B.setInstr(*Span.begin());
2846 MI.eraseFromParent();
2847
2848 // Figure out the point after the waterfall loop before mangling the control
2849 // flow.
2850 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2851 OpsToWaterfall, MRI);
2852
2853 // The insertion point is now right after the original instruction.
2854 //
2855 // Keep the bitcast to the original vector type out of the loop. Doing this
2856 // saved an extra phi we don't need inside the loop.
2857 B.buildBitcast(DstReg, InsHi);
2858
2859 // Re-insert the constant offset add inside the waterfall loop.
2860 if (ShouldMoveIndexIntoLoop)
2861 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2862
2863 return;
2864 }
2865 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2866 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2867 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2868 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2869 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2870 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2871 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2872 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2873 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2874 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2875 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2876 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2877 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2878 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2879 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2880 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2881 applyDefaultMapping(OpdMapper);
2882 executeInWaterfallLoop(MI, MRI, {1, 4});
2883 return;
2884 }
2885 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2886 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2887 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2888 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2889 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2890 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2891 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2892 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2893 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2894 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2895 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2896 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2897 applyDefaultMapping(OpdMapper);
2898 executeInWaterfallLoop(MI, MRI, {2, 5});
2899 return;
2900 }
2901 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2902 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2903 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2904 applyDefaultMapping(OpdMapper);
2905 executeInWaterfallLoop(MI, MRI, {2, 5});
2906 return;
2907 }
2908 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2909 applyDefaultMapping(OpdMapper);
2910 executeInWaterfallLoop(MI, MRI, {3, 6});
2911 return;
2912 }
2913 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2914 applyMappingSBufferLoad(OpdMapper);
2915 return;
2916 }
2917 case AMDGPU::G_INTRINSIC: {
2918 switch (MI.getIntrinsicID()) {
2919 case Intrinsic::amdgcn_readlane: {
2920 substituteSimpleCopyRegs(OpdMapper, 2);
2921
2922 assert(OpdMapper.getVRegs(0).empty());
2923 assert(OpdMapper.getVRegs(3).empty());
2924
2925 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2926 // waterfall loop, so assume it's a uniform value.
2927 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2928 return;
2929 }
2930 case Intrinsic::amdgcn_writelane: {
2931 assert(OpdMapper.getVRegs(0).empty());
2932 assert(OpdMapper.getVRegs(2).empty());
2933 assert(OpdMapper.getVRegs(3).empty());
2934
2935 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2936 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2937 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2938 return;
2939 }
2940 case Intrinsic::amdgcn_interp_p1:
2941 case Intrinsic::amdgcn_interp_p2:
2942 case Intrinsic::amdgcn_interp_mov:
2943 case Intrinsic::amdgcn_interp_p1_f16:
2944 case Intrinsic::amdgcn_interp_p2_f16: {
2945 applyDefaultMapping(OpdMapper);
2946
2947 // Readlane for m0 value, which is always the last operand.
2948 // FIXME: Should this be a waterfall loop instead?
2949 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2950 return;
2951 }
2952 case Intrinsic::amdgcn_permlane16:
2953 case Intrinsic::amdgcn_permlanex16: {
2954 // Doing a waterfall loop over these wouldn't make any sense.
2955 substituteSimpleCopyRegs(OpdMapper, 2);
2956 substituteSimpleCopyRegs(OpdMapper, 3);
2957 constrainOpWithReadfirstlane(MI, MRI, 4);
2958 constrainOpWithReadfirstlane(MI, MRI, 5);
2959 return;
2960 }
2961 case Intrinsic::amdgcn_sbfe:
2962 applyMappingBFEIntrinsic(OpdMapper, true);
2963 return;
2964 case Intrinsic::amdgcn_ubfe:
2965 applyMappingBFEIntrinsic(OpdMapper, false);
2966 return;
2967 case Intrinsic::amdgcn_ballot:
2968 // Use default handling and insert copy to vcc source.
2969 break;
2970 }
2971 break;
2972 }
2973 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2974 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2975 const AMDGPU::RsrcIntrinsic *RSrcIntrin
2976 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
2977 assert(RSrcIntrin && RSrcIntrin->IsImage);
2978 // Non-images can have complications from operands that allow both SGPR
2979 // and VGPR. For now it's too complicated to figure out the final opcode
2980 // to derive the register bank from the MCInstrDesc.
2981 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2982 return;
2983 }
2984 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
2985 unsigned N = MI.getNumExplicitOperands() - 2;
2986 executeInWaterfallLoop(MI, MRI, { N });
2987 return;
2988 }
2989 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2990 auto IntrID = MI.getIntrinsicID();
2991 switch (IntrID) {
2992 case Intrinsic::amdgcn_ds_ordered_add:
2993 case Intrinsic::amdgcn_ds_ordered_swap: {
2994 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2995 assert(OpdMapper.getVRegs(0).empty());
2996 substituteSimpleCopyRegs(OpdMapper, 3);
2997 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2998 return;
2999 }
3000 case Intrinsic::amdgcn_ds_gws_init:
3001 case Intrinsic::amdgcn_ds_gws_barrier:
3002 case Intrinsic::amdgcn_ds_gws_sema_br: {
3003 // Only the first lane executes, so readfirstlane is safe.
3004 substituteSimpleCopyRegs(OpdMapper, 1);
3005 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3006 return;
3007 }
3008 case Intrinsic::amdgcn_ds_gws_sema_v:
3009 case Intrinsic::amdgcn_ds_gws_sema_p:
3010 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3011 // Only the first lane executes, so readfirstlane is safe.
3012 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3013 return;
3014 }
3015 case Intrinsic::amdgcn_ds_append:
3016 case Intrinsic::amdgcn_ds_consume: {
3017 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3018 return;
3019 }
3020 case Intrinsic::amdgcn_s_sendmsg:
3021 case Intrinsic::amdgcn_s_sendmsghalt: {
3022 // FIXME: Should this use a waterfall loop?
3023 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3024 return;
3025 }
3026 case Intrinsic::amdgcn_s_setreg: {
3027 constrainOpWithReadfirstlane(MI, MRI, 2);
3028 return;
3029 }
3030 default: {
3031 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3032 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3033 // Non-images can have complications from operands that allow both SGPR
3034 // and VGPR. For now it's too complicated to figure out the final opcode
3035 // to derive the register bank from the MCInstrDesc.
3036 if (RSrcIntrin->IsImage) {
3037 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3038 return;
3039 }
3040 }
3041
3042 break;
3043 }
3044 }
3045 break;
3046 }
3047 case AMDGPU::G_LOAD:
3048 case AMDGPU::G_ZEXTLOAD:
3049 case AMDGPU::G_SEXTLOAD: {
3050 if (applyMappingLoad(MI, OpdMapper, MRI))
3051 return;
3052 break;
3053 }
3054 case AMDGPU::G_DYN_STACKALLOC:
3055 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3056 return;
3057 default:
3058 break;
3059 }
3060
3061 return applyDefaultMapping(OpdMapper);
3062}
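
For illustration, here is a minimal standalone sketch (plain C++ with hypothetical values, not the MachineIRBuilder API used above) of two arithmetic idioms from the lowering above: the zext/shl/or packing used for the <2 x s16> G_BUILD_VECTOR case, and the index doubling used to address the two 32-bit halves of a 64-bit vector element after the bitcast to a 32-bit element vector.

#include <cassert>
#include <cstdint>

// Pack two 16-bit halves into one 32-bit word: zero-extend the low half,
// zero-extend and shift the high half left by 16, then or them together.
static uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
  return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
}

int main() {
  assert(packHalves(0x1234, 0xabcd) == 0xabcd1234u);

  // A 64-bit element at index i occupies 32-bit elements 2*i and 2*i + 1,
  // matching IdxLo = i << 1 and IdxHi = IdxLo + 1 in the code above.
  unsigned OrigIdx = 3;
  unsigned IdxLo = OrigIdx << 1;
  unsigned IdxHi = IdxLo + 1;
  assert(IdxLo == 6 && IdxHi == 7);
  return 0;
}
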
3063
3064 // vgpr, sgpr -> vgpr
3065 // vgpr, agpr -> vgpr
3066 // agpr, agpr -> agpr
3067 // agpr, sgpr -> vgpr
3068static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3069 if (RB0 == AMDGPU::InvalidRegBankID)
3070 return RB1;
3071 if (RB1 == AMDGPU::InvalidRegBankID)
3072 return RB0;
3073
3074 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3075 return AMDGPU::SGPRRegBankID;
3076
3077 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3078 return AMDGPU::AGPRRegBankID;
3079
3080 return AMDGPU::VGPRRegBankID;
3081}
3082
3083static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3084 if (RB0 == AMDGPU::InvalidRegBankID)
3085 return RB1;
3086 if (RB1 == AMDGPU::InvalidRegBankID)
3087 return RB0;
3088
3089 // vcc, vcc -> vcc
3090 // vcc, sgpr -> vcc
3091 // vcc, vgpr -> vcc
3092 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3093 return AMDGPU::VCCRegBankID;
3094
3095 // vcc, vgpr -> vgpr
3096 return regBankUnion(RB0, RB1);
3097}
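
For illustration, a self-contained sketch of the bank-union lattice these two helpers implement; the enum below is a stand-in for illustration only, not the real AMDGPU::*RegBankID constants.

#include <cassert>

enum BankID { Invalid, SGPR, VGPR, AGPR, VCC };

// Any mix of SGPR/VGPR/AGPR that is not uniformly SGPR or uniformly AGPR
// collapses to VGPR, mirroring the comment table above regBankUnion.
static BankID bankUnion(BankID A, BankID B) {
  if (A == Invalid) return B;
  if (B == Invalid) return A;
  if (A == SGPR && B == SGPR) return SGPR;
  if (A == AGPR && B == AGPR) return AGPR;
  return VGPR;
}

// Booleans: any VCC input keeps the result in VCC; otherwise fall back to
// the plain bank union.
static BankID boolBankUnion(BankID A, BankID B) {
  if (A == Invalid) return B;
  if (B == Invalid) return A;
  if (A == VCC || B == VCC) return VCC;
  return bankUnion(A, B);
}

int main() {
  assert(bankUnion(SGPR, VGPR) == VGPR);
  assert(bankUnion(AGPR, SGPR) == VGPR);
  assert(bankUnion(AGPR, AGPR) == AGPR);
  assert(boolBankUnion(SGPR, VCC) == VCC);
  assert(boolBankUnion(SGPR, SGPR) == SGPR);
  return 0;
}
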
3098
3099unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3100 const MachineInstr &MI) const {
3101 unsigned RegBank = AMDGPU::InvalidRegBankID;
3102
3103 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3104 if (!MI.getOperand(i).isReg())
3105 continue;
3106 Register Reg = MI.getOperand(i).getReg();
3107 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3108 RegBank = regBankUnion(RegBank, Bank->getID());
3109 if (RegBank == AMDGPU::VGPRRegBankID)
3110 break;
3111 }
3112 }
3113
3114 return RegBank;
3115}
3116
3117bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3118 const MachineFunction &MF = *MI.getParent()->getParent();
3119 const MachineRegisterInfo &MRI = MF.getRegInfo();
3120 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3121 if (!MI.getOperand(i).isReg())
3122 continue;
3123 Register Reg = MI.getOperand(i).getReg();
3124 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3125 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3126 return false;
3127 }
3128 }
3129 return true;
3130}
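
For illustration, a minimal sketch of the same all-operands-in-SGPR check over a hypothetical list of operand banks, with Invalid standing in for operands that have no bank assigned yet (which do not force a VALU mapping).

#include <algorithm>
#include <cassert>
#include <vector>

enum BankID { Invalid, SGPR, VGPR };

// An instruction qualifies for an SALU mapping only if every register operand
// that already has a bank is in the SGPR bank.
static bool allOperandsSGPR(const std::vector<BankID> &Banks) {
  return std::all_of(Banks.begin(), Banks.end(),
                     [](BankID B) { return B == Invalid || B == SGPR; });
}

int main() {
  assert(allOperandsSGPR({SGPR, Invalid, SGPR}));
  assert(!allOperandsSGPR({SGPR, VGPR}));
  return 0;
}
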
3131
3132const RegisterBankInfo::InstructionMapping &
3133AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3134 const MachineFunction &MF = *MI.getParent()->getParent();
3135 const MachineRegisterInfo &MRI = MF.getRegInfo();
3136 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3137
3138 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3139 const MachineOperand &SrcOp = MI.getOperand(i);
3140 if (!SrcOp.isReg())
3141 continue;
3142
3143 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3144 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3145 }
3146 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3147 MI.getNumOperands());
3148}
3149
3150const RegisterBankInfo::InstructionMapping &
3151AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3152 const MachineFunction &MF = *MI.getParent()->getParent();
3153 const MachineRegisterInfo &MRI = MF.getRegInfo();
3154 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3155
3156 // Even though we technically could use SGPRs, this would require knowledge of
3157 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3158 //
3159 // TODO: Unary ops are trivially OK, so accept SGPRs?
3160 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3161 const MachineOperand &Src = MI.getOperand(i);
3162 if (!Src.isReg())
3163 continue;
3164
3165 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3166 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3167 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3168 }
3169
3170 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3171 MI.getNumOperands());
3172}
3173
3174const RegisterBankInfo::InstructionMapping &
3175AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3176 const MachineFunction &MF = *MI.getParent()->getParent();
3177 const MachineRegisterInfo &MRI = MF.getRegInfo();
3178 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3179
3180 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3181 const MachineOperand &Op = MI.getOperand(I);
3182 if (!Op.isReg())
3183 continue;
3184
3185 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3186 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3187 }
3188
3189 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3190 MI.getNumOperands());
3191}
3192
3193const RegisterBankInfo::InstructionMapping &
3194AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3195 const MachineInstr &MI,
3196 int RsrcIdx) const {
3197 // The reported argument index is relative to the IR intrinsic call arguments,
3198 // so we need to shift by the number of defs and the intrinsic ID.
3199 RsrcIdx += MI.getNumExplicitDefs() + 1;
3200
3201 const int NumOps = MI.getNumOperands();
3202 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3203
3204 // TODO: Should packed/unpacked D16 difference be reported here as part of
3205 // the value mapping?
3206 for (int I = 0; I != NumOps; ++I) {
3207 if (!MI.getOperand(I).isReg())
3208 continue;
3209
3210 Register OpReg = MI.getOperand(I).getReg();
3211 // We replace some dead address operands with $noreg
3212 if (!OpReg)
3213 continue;
3214
3215 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3216
3217 // FIXME: Probably need a new intrinsic register bank searchable table to
3218 // handle arbitrary intrinsics easily.
3219 //
3220 // If this has a sampler, it immediately follows rsrc.
3221 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3222
3223 if (MustBeSGPR) {
3224 // If this must be an SGPR, we must report whatever it is as legal.
3225 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3226 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3227 } else {
3228 // Some operands must be VGPR, and these are easy to copy to.
3229 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3230 }
3231 }
3232
3233 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3234}
3235
3236 /// Return the mapping for a pointer argument.
3237const RegisterBankInfo::ValueMapping *
3238AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3239 Register PtrReg) const {
3240 LLT PtrTy = MRI.getType(PtrReg);
3241 unsigned Size = PtrTy.getSizeInBits();
3242 if (Subtarget.useFlatForGlobal() ||
3243 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3244 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3245
3246 // If we're using MUBUF instructions for global memory, an SGPR base register
3247 // is possible. Otherwise this needs to be a VGPR.
3248 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3249 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3250}
3251
3252const RegisterBankInfo::InstructionMapping &
3253AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3254
3255 const MachineFunction &MF = *MI.getParent()->getParent();
3256 const MachineRegisterInfo &MRI = MF.getRegInfo();
3257 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3258 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3259 Register PtrReg = MI.getOperand(1).getReg();
3260 LLT PtrTy = MRI.getType(PtrReg);
3261 unsigned AS = PtrTy.getAddressSpace();
3262 unsigned PtrSize = PtrTy.getSizeInBits();
3263
3264 const ValueMapping *ValMapping;
3265 const ValueMapping *PtrMapping;
3266
3267 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3268
3269 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3270 if (isScalarLoadLegal(MI)) {
3271 // We have a uniform instruction so we want to use an SMRD load
3272 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3273 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3274 } else {
3275 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3276
3277 // If we're using MUBUF instructions for global memory, an SGPR base
3278 // register is possible. Otherwise this needs to be a VGPR.
3279 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3280 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3281
3282 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3283 }
3284 } else {
3285 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3286 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3287 }
3288
3289 OpdsMapping[0] = ValMapping;
3290 OpdsMapping[1] = PtrMapping;
3291 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3292 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3293 return Mapping;
3294
3295 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3296 // handle that during instruction selection?
3297}
3298
3299unsigned
3300AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3301 const MachineRegisterInfo &MRI,
3302 unsigned Default) const {
3303 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3304 return Bank ? Bank->getID() : Default;
3305}
3306
3307const RegisterBankInfo::ValueMapping *
3308AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3309 const MachineRegisterInfo &MRI,
3310 const TargetRegisterInfo &TRI) const {
3311 // Lie and claim anything is legal, even though this needs to be an SGPR.
3312 // applyMapping will have to deal with it as a waterfall loop.
3313 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3314 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3315 return AMDGPU::getValueMapping(Bank, Size);
3316}
3317
3318const RegisterBankInfo::ValueMapping *
3319AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3320 const MachineRegisterInfo &MRI,
3321 const TargetRegisterInfo &TRI) const {
3322 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3323 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3324}
3325
3326const RegisterBankInfo::ValueMapping *
3327AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3328 const MachineRegisterInfo &MRI,
3329 const TargetRegisterInfo &TRI) const {
3330 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3331 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3332}
3333
3334 ///
3335 /// This function must return a legal mapping, because
3336 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3337 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3338 /// VGPR to SGPR copy to be generated is illegal.
3339 ///
3340 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3341 // legal. These will be dealt with in applyMappingImpl.
3342 //
3343const RegisterBankInfo::InstructionMapping &
3344AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3345 const MachineFunction &MF = *MI.getParent()->getParent();
3346 const MachineRegisterInfo &MRI = MF.getRegInfo();
3347
3348 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3349 // The default logic bothers to analyze impossible alternative mappings. We
3350 // want the most straightforward mapping, so just directly handle this.
3351 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3352 *TRI);
3353 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3354 *TRI);
3355 assert(SrcBank && "src bank should have been assigned already");
3356 if (!DstBank)
3357 DstBank = SrcBank;
3358
3359 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3360 if (cannotCopy(*DstBank, *SrcBank, Size))
3361 return getInvalidInstructionMapping();
3362
3363 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3364 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3365 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3366 OpdsMapping[0] = &ValMap;
3367 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3368 OpdsMapping[1] = &ValMap;
3369
3370 return getInstructionMapping(
3371 1, /*Cost*/ 1,
3372 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3373 }
3374
3375 if (MI.isRegSequence()) {
3376 // If any input is a VGPR, the result must be a VGPR. The default handling
3377 // assumes any copy between banks is legal.
3378 unsigned BankID = AMDGPU::SGPRRegBankID;
3379
3380 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3381 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3382 // It doesn't make sense to use vcc or scc banks here, so just ignore
3383 // them.
3384 if (OpBank != AMDGPU::SGPRRegBankID) {
3385 BankID = AMDGPU::VGPRRegBankID;
3386 break;
3387 }
3388 }
3389 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3390
3391 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3392 return getInstructionMapping(
3393 1, /*Cost*/ 1,
3394 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3395 }
3396
3397 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3398 // properly.
3399 //
3400 // TODO: There are additional exec masking dependencies to analyze.
3401 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3402 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3403 Register DstReg = MI.getOperand(0).getReg();
3404
3405 // Sometimes the result may have already been assigned a bank.
3406 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3407 ResultBank = DstBank->getID();
3408
3409 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3410 Register Reg = MI.getOperand(I).getReg();
3411 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3412
3413 // FIXME: Assuming VGPR for any undetermined inputs.
3414 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3415 ResultBank = AMDGPU::VGPRRegBankID;
3416 break;
3417 }
3418
3419 // FIXME: Need to promote SGPR case to s32
3420 unsigned OpBank = Bank->getID();
3421 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3422 }
3423
3424 assert(ResultBank != AMDGPU::InvalidRegBankID);
3425
3426 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3427
3428 const ValueMapping &ValMap =
3429 getValueMapping(0, Size, getRegBank(ResultBank));
3430 return getInstructionMapping(
3431 1, /*Cost*/ 1,
3432 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3433 }
3434
3435 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3436 if (Mapping.isValid())
3437 return Mapping;
3438
3439 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3440
3441 switch (MI.getOpcode()) {
3442 default:
3443 return getInvalidInstructionMapping();
3444
3445 case AMDGPU::G_AND:
3446 case AMDGPU::G_OR:
3447 case AMDGPU::G_XOR: {
3448 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3449 if (Size == 1) {
3450 const RegisterBank *DstBank
3451 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3452
3453 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3454 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3455 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3456 if (DstBank) {
3457 TargetBankID = DstBank->getID();
3458 if (DstBank == &AMDGPU::VCCRegBank) {
3459 TargetBankID = AMDGPU::VCCRegBankID;
3460 BankLHS = AMDGPU::VCCRegBankID;
3461 BankRHS = AMDGPU::VCCRegBankID;
3462 } else {
3463 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3464 AMDGPU::SGPRRegBankID);
3465 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3466 AMDGPU::SGPRRegBankID);
3467 }
3468 } else {
3469 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3470 AMDGPU::VCCRegBankID);
3471 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3472 AMDGPU::VCCRegBankID);
3473
3474 // Both inputs should be true booleans to produce a boolean result.
3475 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3476 TargetBankID = AMDGPU::VGPRRegBankID;
3477 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3478 TargetBankID = AMDGPU::VCCRegBankID;
3479 BankLHS = AMDGPU::VCCRegBankID;
3480 BankRHS = AMDGPU::VCCRegBankID;
3481 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3482 TargetBankID = AMDGPU::SGPRRegBankID;
3483 }
3484 }
3485
3486 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3487 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3488 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3489 break;
3490 }
3491
3492 if (Size == 64) {
3493
3494 if (isSALUMapping(MI)) {
3495 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3496 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3497 } else {
3498 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3499 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3500 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3501
3502 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3503 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3504 }
3505
3506 break;
3507 }
3508
3509 LLVM_FALLTHROUGH;
3510 }
3511 case AMDGPU::G_PTR_ADD:
3512 case AMDGPU::G_PTRMASK:
3513 case AMDGPU::G_ADD:
3514 case AMDGPU::G_SUB:
3515 case AMDGPU::G_MUL:
3516 case AMDGPU::G_SHL:
3517 case AMDGPU::G_LSHR:
3518 case AMDGPU::G_ASHR:
3519 case AMDGPU::G_UADDO:
3520 case AMDGPU::G_USUBO:
3521 case AMDGPU::G_UADDE:
3522 case AMDGPU::G_SADDE:
3523 case AMDGPU::G_USUBE:
3524 case AMDGPU::G_SSUBE:
3525 case AMDGPU::G_SMIN:
3526 case AMDGPU::G_SMAX:
3527 case AMDGPU::G_UMIN:
3528 case AMDGPU::G_UMAX:
3529 case AMDGPU::G_ABS:
3530 case AMDGPU::G_SHUFFLE_VECTOR:
3531 if (isSALUMapping(MI))
3532 return getDefaultMappingSOP(MI);
3533 LLVM_FALLTHROUGH;
3534
3535 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3536 case AMDGPU::G_SSUBSAT:
3537 case AMDGPU::G_UADDSAT:
3538 case AMDGPU::G_USUBSAT:
3539 case AMDGPU::G_FADD:
3540 case AMDGPU::G_FSUB:
3541 case AMDGPU::G_FPTOSI:
3542 case AMDGPU::G_FPTOUI:
3543 case AMDGPU::G_FMUL:
3544 case AMDGPU::G_FMA:
3545 case AMDGPU::G_FMAD:
3546 case AMDGPU::G_FSQRT:
3547 case AMDGPU::G_FFLOOR:
3548 case AMDGPU::G_FCEIL:
3549 case AMDGPU::G_FRINT:
3550 case AMDGPU::G_SITOFP:
3551 case AMDGPU::G_UITOFP:
3552 case AMDGPU::G_FPTRUNC:
3553 case AMDGPU::G_FPEXT:
3554 case AMDGPU::G_FEXP2:
3555 case AMDGPU::G_FLOG2:
3556 case AMDGPU::G_FMINNUM:
3557 case AMDGPU::G_FMAXNUM:
3558 case AMDGPU::G_FMINNUM_IEEE:
3559 case AMDGPU::G_FMAXNUM_IEEE:
3560 case AMDGPU::G_FCANONICALIZE:
3561 case AMDGPU::G_INTRINSIC_TRUNC:
3562 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3563 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3564 case AMDGPU::G_AMDGPU_FFBH_U32:
3565 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3566 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3567 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3568 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3569 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3570 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3571 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3572 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3573 case AMDGPU::G_AMDGPU_SMED3:
3574 return getDefaultMappingVOP(MI);
3575 case AMDGPU::G_UMULH:
3576 case AMDGPU::G_SMULH: {
3577 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3578 return getDefaultMappingSOP(MI);
3579 return getDefaultMappingVOP(MI);
3580 }
3581 case AMDGPU::G_IMPLICIT_DEF: {
3582 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3583 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3584 break;
3585 }
3586 case AMDGPU::G_FCONSTANT:
3587 case AMDGPU::G_CONSTANT:
3588 case AMDGPU::G_GLOBAL_VALUE:
3589 case AMDGPU::G_BLOCK_ADDR:
3590 case AMDGPU::G_READCYCLECOUNTER: {
3591 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3592 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3593 break;
3594 }
3595 case AMDGPU::G_FRAME_INDEX: {
3596 // TODO: This should be the same as other constants, but eliminateFrameIndex
3597 // currently assumes VALU uses.
3598 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3599 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3600 break;
3601 }
3602 case AMDGPU::G_DYN_STACKALLOC: {
3603 // Result is always uniform, and a wave reduction is needed for the source.
3604 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3605 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3606 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3607 break;
3608 }
3609 case AMDGPU::G_INSERT: {
3610 unsigned BankID = getMappingType(MRI, MI);
3611 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3612 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3613 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3614 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3615 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3616 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3617 OpdsMapping[3] = nullptr;
3618 break;
3619 }
3620 case AMDGPU::G_EXTRACT: {
3621 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3622 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3623 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3624 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3625 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3626 OpdsMapping[2] = nullptr;
3627 break;
3628 }
3629 case AMDGPU::G_BUILD_VECTOR:
3630 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3631 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3632 if (DstTy == LLT::vector(2, 16)) {
3633 unsigned DstSize = DstTy.getSizeInBits();
3634 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3635 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3636 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3637 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3638
3639 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3640 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3641 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3642 break;
3643 }
3644
3645 LLVM_FALLTHROUGH;
3646 }
3647 case AMDGPU::G_MERGE_VALUES:
3648 case AMDGPU::G_CONCAT_VECTORS: {
3649 unsigned Bank = getMappingType(MRI, MI);
3650 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3651 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3652
3653 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3654 // Op1 and Dst should use the same register bank.
3655 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3656 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3657 break;
3658 }
3659 case AMDGPU::G_BITREVERSE:
3660 case AMDGPU::G_BITCAST:
3661 case AMDGPU::G_INTTOPTR:
3662 case AMDGPU::G_PTRTOINT:
3663 case AMDGPU::G_FABS:
3664 case AMDGPU::G_FNEG: {
3665 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3666 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3667 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3668 break;
3669 }
3670 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3671 case AMDGPU::G_CTTZ_ZERO_UNDEF:
3672 case AMDGPU::G_CTPOP: {
3673 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3674 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3675 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3676
3677 // This should really be getValueMappingSGPR64Only, but allowing the generic
3678 // code to handle the register split just makes using LegalizerHelper more
3679 // difficult.
3680 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3681 break;
3682 }
3683 case AMDGPU::G_TRUNC: {
3684 Register Dst = MI.getOperand(0).getReg();
3685 Register Src = MI.getOperand(1).getReg();
3686 unsigned Bank = getRegBankID(Src, MRI);
3687 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3688 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3689 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3690 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3691 break;
3692 }
3693 case AMDGPU::G_ZEXT:
3694 case AMDGPU::G_SEXT:
3695 case AMDGPU::G_ANYEXT:
3696 case AMDGPU::G_SEXT_INREG: {
3697 Register Dst = MI.getOperand(0).getReg();
3698 Register Src = MI.getOperand(1).getReg();
3699 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3700 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3701
3702 unsigned DstBank;
3703 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3704 assert(SrcBank);
3705 switch (SrcBank->getID()) {
3706 case AMDGPU::SGPRRegBankID:
3707 DstBank = AMDGPU::SGPRRegBankID;
3708 break;
3709 default:
3710 DstBank = AMDGPU::VGPRRegBankID;
3711 break;
3712 }
3713
3714 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3715 // 32-bits, and then to 64.
3716 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3717 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3718 SrcSize);
3719 break;
3720 }
3721 case AMDGPU::G_FCMP: {
3722 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3723 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3724 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3725 OpdsMapping[1] = nullptr; // Predicate Operand.
3726 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3727 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3728 break;
3729 }
3730 case AMDGPU::G_STORE: {
3731 assert(MI.getOperand(0).isReg());
3732 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3733
3734 // FIXME: We need to specify a different reg bank once scalar stores are
3735 // supported.
3736 const ValueMapping *ValMapping =
3737 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3738 OpdsMapping[0] = ValMapping;
3739 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3740 break;
3741 }
3742 case AMDGPU::G_ICMP: {
3743 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3744 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3745
3746 // See if the result register has already been constrained to vcc, which may
3747 // happen due to control flow intrinsic lowering.
3748 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3749 AMDGPU::SGPRRegBankID);
3750 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3751 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3752
3753 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3754 Op2Bank == AMDGPU::SGPRRegBankID &&
3755 Op3Bank == AMDGPU::SGPRRegBankID &&
3756 (Size == 32 || (Size == 64 &&
3757 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3758 Subtarget.hasScalarCompareEq64()));
3759
3760 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3761 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3762
3763 // TODO: Use 32-bit for scalar output size.
3764 // SCC results will need to be copied to a 32-bit SGPR virtual register.
3765 const unsigned ResultSize = 1;
3766
3767 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3768 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3769 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3770 break;
3771 }
3772 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3773 // VGPR index can be used for waterfall when indexing an SGPR vector.
3774 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3775 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3776 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3777 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3778 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3779 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3780
3781 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3782 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3783
3784 // The index can be in either bank if the source vector is VGPR.
3785 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3786 break;
3787 }
3788 case AMDGPU::G_INSERT_VECTOR_ELT: {
3789 unsigned OutputBankID = isSALUMapping(MI) ?
3790 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3791
3792 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3793 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3794 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3795 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3796 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3797
3798 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3799 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3800
3801 // This is a weird case, because we need to break down the mapping based on
3802 // the register bank of a different operand.
3803 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3804 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3805 InsertSize);
3806 } else {
3807 assert(InsertSize == 32 || InsertSize == 64);
3808 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3809 }
3810
3811 // The index can be in either bank if the source vector is VGPR.
3812 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3813 break;
3814 }
3815 case AMDGPU::G_UNMERGE_VALUES: {
3816 unsigned Bank = getMappingType(MRI, MI);
3817
3818 // Op1 and Dst should use the same register bank.
3819 // FIXME: Shouldn't this be the default? Why do we need to handle this?
3820 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3821 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3822 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3823 }
3824 break;
3825 }
3826 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3827 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3828 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3829 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3830 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3831 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3832 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3833 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3834 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3835 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3836 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3837 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3838 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3839 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3840 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3841 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3842 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3843
3844 // rsrc
3845 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3846
3847 // vindex
3848 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3849
3850 // voffset
3851 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3852
3853 // soffset
3854 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3855
3856 // Any remaining operands are immediates and were correctly null
3857 // initialized.
3858 break;
3859 }
3860 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3861 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3862 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3863 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3864 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3865 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3866 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3867 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3868 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3869 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3870 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3871 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3872 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3873 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3874 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3875 // vdata_out
3876 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3877
3878 // vdata_in
3879 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3880
3881 // rsrc
3882 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3883
3884 // vindex
3885 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3886
3887 // voffset
3888 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3889
3890 // soffset
3891 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3892
3893 // Any remaining operands are immediates and were correctly null
3894 // initialized.
3895 break;
3896 }
3897 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3898 // vdata_out
3899 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3900
3901 // vdata_in
3902 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3903
3904 // cmp
3905 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3906
3907 // rsrc
3908 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3909
3910 // vindex
3911 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3912
3913 // voffset
3914 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3915
3916 // soffset
3917 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3918
3919 // Any remaining operands are immediates and were correctly null
3920 // initialized.
3921 break;
3922 }
3923 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3924 // Lie and claim everything is legal, even though some need to be
3925 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3926 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3927 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3928
3929 // We need to convert this to a MUBUF if either the resource or offset is
3930 // a VGPR.
3931 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3932 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3933 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
3934
3935 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3936 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3937 break;
3938 }
3939 case AMDGPU::G_INTRINSIC: {
3940 switch (MI.getIntrinsicID()) {
3941 default:
3942 return getInvalidInstructionMapping();
3943 case Intrinsic::amdgcn_div_fmas:
3944 case Intrinsic::amdgcn_div_fixup:
3945 case Intrinsic::amdgcn_trig_preop:
3946 case Intrinsic::amdgcn_sin:
3947 case Intrinsic::amdgcn_cos:
3948 case Intrinsic::amdgcn_log_clamp:
3949 case Intrinsic::amdgcn_rcp:
3950 case Intrinsic::amdgcn_rcp_legacy:
3951 case Intrinsic::amdgcn_sqrt:
3952 case Intrinsic::amdgcn_rsq:
3953 case Intrinsic::amdgcn_rsq_legacy:
3954 case Intrinsic::amdgcn_rsq_clamp:
3955 case Intrinsic::amdgcn_fmul_legacy:
3956 case Intrinsic::amdgcn_fma_legacy:
3957 case Intrinsic::amdgcn_ldexp:
3958 case Intrinsic::amdgcn_frexp_mant:
3959 case Intrinsic::amdgcn_frexp_exp:
3960 case Intrinsic::amdgcn_fract:
3961 case Intrinsic::amdgcn_cvt_pkrtz:
3962 case Intrinsic::amdgcn_cvt_pknorm_i16:
3963 case Intrinsic::amdgcn_cvt_pknorm_u16:
3964 case Intrinsic::amdgcn_cvt_pk_i16:
3965 case Intrinsic::amdgcn_cvt_pk_u16:
3966 case Intrinsic::amdgcn_fmed3:
3967 case Intrinsic::amdgcn_cubeid:
3968 case Intrinsic::amdgcn_cubema:
3969 case Intrinsic::amdgcn_cubesc:
3970 case Intrinsic::amdgcn_cubetc:
3971 case Intrinsic::amdgcn_sffbh:
3972 case Intrinsic::amdgcn_fmad_ftz:
3973 case Intrinsic::amdgcn_mbcnt_lo:
3974 case Intrinsic::amdgcn_mbcnt_hi:
3975 case Intrinsic::amdgcn_mul_u24:
3976 case Intrinsic::amdgcn_mul_i24:
3977 case Intrinsic::amdgcn_lerp:
3978 case Intrinsic::amdgcn_sad_u8:
3979 case Intrinsic::amdgcn_msad_u8:
3980 case Intrinsic::amdgcn_sad_hi_u8:
3981 case Intrinsic::amdgcn_sad_u16:
3982 case Intrinsic::amdgcn_qsad_pk_u16_u8:
3983 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
3984 case Intrinsic::amdgcn_mqsad_u32_u8:
3985 case Intrinsic::amdgcn_cvt_pk_u8_f32:
3986 case Intrinsic::amdgcn_alignbit:
3987 case Intrinsic::amdgcn_alignbyte:
3988 case Intrinsic::amdgcn_perm:
3989 case Intrinsic::amdgcn_fdot2:
3990 case Intrinsic::amdgcn_sdot2:
3991 case Intrinsic::amdgcn_udot2:
3992 case Intrinsic::amdgcn_sdot4:
3993 case Intrinsic::amdgcn_udot4:
3994 case Intrinsic::amdgcn_sdot8:
3995 case Intrinsic::amdgcn_udot8:
3996 return getDefaultMappingVOP(MI);
3997 case Intrinsic::amdgcn_sbfe:
3998 case Intrinsic::amdgcn_ubfe:
3999 if (isSALUMapping(MI))
4000 return getDefaultMappingSOP(MI);
4001 return getDefaultMappingVOP(MI);
4002 case Intrinsic::amdgcn_ds_swizzle:
4003 case Intrinsic::amdgcn_ds_permute:
4004 case Intrinsic::amdgcn_ds_bpermute:
4005 case Intrinsic::amdgcn_update_dpp:
4006 case Intrinsic::amdgcn_mov_dpp8:
4007 case Intrinsic::amdgcn_mov_dpp:
4008 case Intrinsic::amdgcn_strict_wwm:
4009 case Intrinsic::amdgcn_wwm:
4010 case Intrinsic::amdgcn_strict_wqm:
4011 case Intrinsic::amdgcn_wqm:
4012 case Intrinsic::amdgcn_softwqm:
4013 case Intrinsic::amdgcn_set_inactive:
4014 return getDefaultMappingAllVGPR(MI);
4015 case Intrinsic::amdgcn_kernarg_segment_ptr:
4016 case Intrinsic::amdgcn_s_getpc:
4017 case Intrinsic::amdgcn_groupstaticsize:
4018 case Intrinsic::amdgcn_reloc_constant:
4019 case Intrinsic::returnaddress: {
4020 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4021 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4022 break;
4023 }
4024 case Intrinsic::amdgcn_wqm_vote: {
4025 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4026 OpdsMapping[0] = OpdsMapping[2]
4027 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4028 break;
4029 }
4030 case Intrinsic::amdgcn_ps_live: {
4031 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4032 break;
4033 }
4034 case Intrinsic::amdgcn_div_scale: {
4035 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4036 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4037 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4038 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4039
4040 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4041 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4042 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4043 break;
4044 }
4045 case Intrinsic::amdgcn_class: {
4046 Register Src0Reg = MI.getOperand(2).getReg();
4047 Register Src1Reg = MI.getOperand(3).getReg();
4048 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4049 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4050 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4051 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4052 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4053 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4054 break;
4055 }
4056 case Intrinsic::amdgcn_icmp:
4057 case Intrinsic::amdgcn_fcmp: {
4058 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4059 // This is not VCCRegBank because this is not used in boolean contexts.
4060 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4061 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4062 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4063 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4064 break;
4065 }
4066 case Intrinsic::amdgcn_readlane: {
4067 // This must be an SGPR, but accept a VGPR.
4068 Register IdxReg = MI.getOperand(3).getReg();
4069 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4070 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4071 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4072 LLVM_FALLTHROUGH;
4073 }
4074 case Intrinsic::amdgcn_readfirstlane: {
4075 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4076 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4077 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4078 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4079 break;
4080 }
4081 case Intrinsic::amdgcn_writelane: {
4082 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4083 Register SrcReg = MI.getOperand(2).getReg();
4084 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4085 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4086 Register IdxReg = MI.getOperand(3).getReg();
4087 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4088 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4089 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4090
4091 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4092 // to legalize.
4093 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4094 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4095 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4096 break;
4097 }
4098 case Intrinsic::amdgcn_if_break: {
4099 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4100 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4101 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4102 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4103 break;
4104 }
4105 case Intrinsic::amdgcn_permlane16:
4106 case Intrinsic::amdgcn_permlanex16: {
4107 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4108 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4109 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4110 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4111 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4112 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4113 break;
4114 }
4115 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4116 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4117 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4118 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4119 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4120 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4121 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4122 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4123 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4124 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4125 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4126 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4127 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4128 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4129 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4130 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4131 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4132 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4133 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4134 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4135 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4136 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4137 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4138 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4139 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4140 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4141 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4142 // Default for MAI intrinsics.
4143 // srcC can also be an immediate which can be folded later.
4144 // FIXME: Should we eventually add an alternative mapping with AGPR src
4145 // for srcA/srcB?
4146 //
4147 // vdst, srcA, srcB, srcC
4148 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4149 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4150 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4151 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4152 break;
4153 }
4154 case Intrinsic::amdgcn_interp_p1:
4155 case Intrinsic::amdgcn_interp_p2:
4156 case Intrinsic::amdgcn_interp_mov:
4157 case Intrinsic::amdgcn_interp_p1_f16:
4158 case Intrinsic::amdgcn_interp_p2_f16: {
4159 const int M0Idx = MI.getNumOperands() - 1;
4160 Register M0Reg = MI.getOperand(M0Idx).getReg();
4161 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4162 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4163
4164 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4165 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4166 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4167
4168 // Must be SGPR, but we must take whatever the original bank is and fix it
4169 // later.
4170 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4171 break;
4172 }
4173 case Intrinsic::amdgcn_ballot: {
4174 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4175 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4176 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4177 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4178 break;
4179 }
4180 }
4181 break;
4182 }
4183 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4184 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
4185 auto IntrID = MI.getIntrinsicID();
4186 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4187 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4188 // Non-images can have complications from operands that allow both SGPR
4189 // and VGPR. For now it's too complicated to figure out the final opcode
4190 // to derive the register bank from the MCInstrDesc.
4191 assert(RSrcIntrin->IsImage);
4192 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4193 }
4194 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4195 unsigned N = MI.getNumExplicitOperands() - 2;
4196 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4197 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4198 for (unsigned I = 2; I < N; ++I)
4199 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4200 break;
4201 }
4202 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4203 auto IntrID = MI.getIntrinsicID();
4204 switch (IntrID) {
4205 case Intrinsic::amdgcn_s_getreg:
4206 case Intrinsic::amdgcn_s_memtime:
4207 case Intrinsic::amdgcn_s_memrealtime:
4208 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4209 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4210 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4211 break;
4212 }
4213 case Intrinsic::amdgcn_global_atomic_fadd:
4214 case Intrinsic::amdgcn_global_atomic_csub:
4215 case Intrinsic::amdgcn_global_atomic_fmin:
4216 case Intrinsic::amdgcn_global_atomic_fmax:
4217 case Intrinsic::amdgcn_flat_atomic_fadd:
4218 case Intrinsic::amdgcn_flat_atomic_fmin:
4219 case Intrinsic::amdgcn_flat_atomic_fmax:
4220 return getDefaultMappingAllVGPR(MI);
4221 case Intrinsic::amdgcn_ds_ordered_add:
4222 case Intrinsic::amdgcn_ds_ordered_swap: {
4223 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4224 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4225 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4226 AMDGPU::SGPRRegBankID);
4227 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4228 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4229 break;
4230 }
4231 case Intrinsic::amdgcn_ds_append:
4232 case Intrinsic::amdgcn_ds_consume: {
4233 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4234 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4235 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4236 break;
4237 }
4238 case Intrinsic::amdgcn_exp_compr:
4239 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4240 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4241 break;
4242 case Intrinsic::amdgcn_exp:
4243 // FIXME: Could we support packed types here?
4244 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4245 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4246 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4247 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4248 break;
4249 case Intrinsic::amdgcn_s_sendmsg:
4250 case Intrinsic::amdgcn_s_sendmsghalt: {
4251 // This must be an SGPR, but accept a VGPR.
4252 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4253 AMDGPU::SGPRRegBankID);
4254 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4255 break;
4256 }
4257 case Intrinsic::amdgcn_s_setreg: {
4258 // This must be an SGPR, but accept a VGPR.
4259 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4260 AMDGPU::SGPRRegBankID);
4261 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4262 break;
4263 }
4264 case Intrinsic::amdgcn_end_cf: {
4265 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4266 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4267 break;
4268 }
4269 case Intrinsic::amdgcn_else: {
4270 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4271 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4272 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4273 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4274 break;
4275 }
4276 case Intrinsic::amdgcn_live_mask: {
4277 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4278 break;
4279 }
4280 case Intrinsic::amdgcn_wqm_demote:
4281 case Intrinsic::amdgcn_kill: {
4282 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4283 break;
4284 }
4285 case Intrinsic::amdgcn_raw_buffer_load:
4286 case Intrinsic::amdgcn_raw_tbuffer_load: {
4287 // FIXME: Should make intrinsic ID the last operand of the instruction,
4288 // then this would be the same as store
4289 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4290 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4291 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4292 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4293 break;
4294 }
4295 case Intrinsic::amdgcn_raw_buffer_store:
4296 case Intrinsic::amdgcn_raw_buffer_store_format:
4297 case Intrinsic::amdgcn_raw_tbuffer_store: {
4298 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4299 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4300 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4301 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4302 break;
4303 }
4304 case Intrinsic::amdgcn_struct_buffer_load:
4305 case Intrinsic::amdgcn_struct_tbuffer_load: {
4306 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4307 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4308 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4309 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4310 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4311 break;
4312 }
4313 case Intrinsic::amdgcn_struct_buffer_store:
4314 case Intrinsic::amdgcn_struct_tbuffer_store: {
4315 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4316 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4317 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4318 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4319 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4320 break;
4321 }
4322 case Intrinsic::amdgcn_init_exec_from_input: {
4323 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4324 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4325 break;
4326 }
4327 case Intrinsic::amdgcn_ds_gws_init:
4328 case Intrinsic::amdgcn_ds_gws_barrier:
4329 case Intrinsic::amdgcn_ds_gws_sema_br: {
4330 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4331
4332 // This must be an SGPR, but accept a VGPR.
4333 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4334 AMDGPU::SGPRRegBankID);
4335 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4336 break;
4337 }
4338 case Intrinsic::amdgcn_ds_gws_sema_v:
4339 case Intrinsic::amdgcn_ds_gws_sema_p:
4340 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4341 // This must be an SGPR, but accept a VGPR.
4342 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4343 AMDGPU::SGPRRegBankID);
4344 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4345 break;
4346 }
4347 default:
4348 return getInvalidInstructionMapping();
4349 }
4350 break;
4351 }
4352 case AMDGPU::G_SELECT: {
4353 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4354 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4355 AMDGPU::SGPRRegBankID);
4356 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4357 AMDGPU::SGPRRegBankID);
4358 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4359 Op3Bank == AMDGPU::SGPRRegBankID;
4360
4361 unsigned CondBankDefault = SGPRSrcs ?
4362 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4363 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4364 CondBankDefault);
4365 if (CondBank == AMDGPU::SGPRRegBankID)
4366 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4367 else if (CondBank == AMDGPU::VGPRRegBankID)
4368 CondBank = AMDGPU::VCCRegBankID;
4369
4370 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4371 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4372
4373 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4374
4375 // TODO: Should report 32-bit for scalar condition type.
4376 if (Size == 64) {
4377 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4378 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4379 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4380 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4381 } else {
4382 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4383 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4384 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4385 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4386 }
4387
4388 break;
4389 }
4390
4391 case AMDGPU::G_LOAD:
4392 case AMDGPU::G_ZEXTLOAD:
4393 case AMDGPU::G_SEXTLOAD:
4394 return getInstrMappingForLoad(MI);
4395
4396 case AMDGPU::G_ATOMICRMW_XCHG:
4397 case AMDGPU::G_ATOMICRMW_ADD:
4398 case AMDGPU::G_ATOMICRMW_SUB:
4399 case AMDGPU::G_ATOMICRMW_AND:
4400 case AMDGPU::G_ATOMICRMW_OR:
4401 case AMDGPU::G_ATOMICRMW_XOR:
4402 case AMDGPU::G_ATOMICRMW_MAX:
4403 case AMDGPU::G_ATOMICRMW_MIN:
4404 case AMDGPU::G_ATOMICRMW_UMAX:
4405 case AMDGPU::G_ATOMICRMW_UMIN:
4406 case AMDGPU::G_ATOMICRMW_FADD:
4407 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4408 case AMDGPU::G_AMDGPU_ATOMIC_INC:
4409 case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4410 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4411 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4412 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4413 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4414 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4415 break;
4416 }
4417 case AMDGPU::G_ATOMIC_CMPXCHG: {
4418 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4419 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4420 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4421 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4422 break;
4423 }
4424 case AMDGPU::G_BRCOND: {
4425 unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4426 AMDGPU::SGPRRegBankID);
4427 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4428 if (Bank != AMDGPU::SGPRRegBankID)
4429 Bank = AMDGPU::VCCRegBankID;
4430
4431 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4432 break;
4433 }
4434 }
4435
4436 return getInstructionMapping(/*ID*/1, /*Cost*/1,
4437 getOperandsMapping(OpdsMapping),
4438 MI.getNumOperands());
4439}

/build/llvm-toolchain-snapshot-13~++20210621111111+acefe0eaaf82/llvm/include/llvm/Support/LowLevelTypeImpl.h

1//== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// Implement a low-level type suitable for MachineInstr level instruction
10/// selection.
11///
12/// For a type attached to a MachineInstr, we only care about 2 details: total
13/// size and the number of vector lanes (if any). Accordingly, there are 4
14/// possible valid type-kinds:
15///
16/// * `sN` for scalars and aggregates
17/// * `<N x sM>` for vectors, which must have at least 2 elements.
18/// * `pN` for pointers, and `<N x pM>` for vectors of pointers
19///
20/// Other information required for correct selection is expected to be carried
21/// by the opcode, or non-type flags. For example the distinction between G_ADD
22/// and G_FADD for int/float or fast-math flags.
23///
24//===----------------------------------------------------------------------===//
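Aside (not part of the original header): the type-kinds listed in the comment above correspond one-to-one to the factory functions declared later in this file. A minimal, illustrative sketch of constructing one LLT of each kind; the concrete bit widths and the address space are arbitrary example choices:

#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;

void lltKindExamples() {
  LLT S32   = LLT::scalar(32);      // `s32`       : 32-bit scalar/aggregate
  LLT P0    = LLT::pointer(0, 64);  // `p0`        : pointer in address space 0, here 64 bits wide
  LLT V4S32 = LLT::vector(4, 32);   // `<4 x s32>` : vector of four 32-bit scalars
  LLT V2P0  = LLT::vector(2, P0);   // `<2 x p0>`  : vector of pointers
  (void)S32; (void)P0; (void)V4S32; (void)V2P0; // silence unused-variable warnings
}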
25
26#ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
27#define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
28
29#include "llvm/ADT/DenseMapInfo.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/MachineValueType.h"
32#include <cassert>
33
34namespace llvm {
35
36class DataLayout;
37class Type;
38class raw_ostream;
39
40class LLT {
41public:
42 /// Get a low-level scalar or aggregate "bag of bits".
43 static LLT scalar(unsigned SizeInBits) {
44 assert(SizeInBits > 0 && "invalid scalar size");
45 return LLT{/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0,
46 SizeInBits, /*AddressSpace=*/0};
47 }
48
49 /// Get a low-level pointer in the given address space.
50 static LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
51 assert(SizeInBits > 0 && "invalid pointer size");
52 return LLT{/*isPointer=*/true, /*isVector=*/false, /*NumElements=*/0,
53 SizeInBits, AddressSpace};
54 }
55
56 /// Get a low-level vector of some number of elements and element width.
57 /// \p NumElements must be at least 2.
58 static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) {
59 assert(NumElements > 1 && "invalid number of vector elements");
60 assert(ScalarSizeInBits > 0 && "invalid vector element size");
61 return LLT{/*isPointer=*/false, /*isVector=*/true, NumElements,
62 ScalarSizeInBits, /*AddressSpace=*/0};
63 }
64
65 /// Get a low-level vector of some number of elements and element type.
66 static LLT vector(uint16_t NumElements, LLT ScalarTy) {
67 assert(NumElements > 1 && "invalid number of vector elements");
68 assert(!ScalarTy.isVector() && "invalid vector element type");
69 return LLT{ScalarTy.isPointer(), /*isVector=*/true, NumElements,
70 ScalarTy.getSizeInBits(),
71 ScalarTy.isPointer() ? ScalarTy.getAddressSpace() : 0};
72 }
73
74 static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy) {
75 return NumElements == 1 ? ScalarTy : LLT::vector(NumElements, ScalarTy);
76 }
77
78 static LLT scalarOrVector(uint16_t NumElements, unsigned ScalarSize) {
79 return scalarOrVector(NumElements, LLT::scalar(ScalarSize));
80 }
81
82 explicit LLT(bool isPointer, bool isVector, uint16_t NumElements,
83 unsigned SizeInBits, unsigned AddressSpace) {
84 init(isPointer, isVector, NumElements, SizeInBits, AddressSpace);
85 }
86 explicit LLT() : IsPointer(false), IsVector(false), RawData(0) {}
87
88 explicit LLT(MVT VT);
89
90 bool isValid() const { return RawData != 0; }
91
92 bool isScalar() const { return isValid() && !IsPointer && !IsVector; }
93
94 bool isPointer() const { return isValid() && IsPointer && !IsVector; }
95
96 bool isVector() const { return isValid() && IsVector; }
2: Returning value, which participates in a condition later
97
98 /// Returns the number of elements in a vector LLT. Must only be called on
99 /// vector types.
100 uint16_t getNumElements() const {
101 assert(IsVector && "cannot get number of elements on scalar/aggregate");
102 if (!IsPointer)
103 return getFieldValue(VectorElementsFieldInfo);
104 else
105 return getFieldValue(PointerVectorElementsFieldInfo);
106 }
107
108 /// Returns the total size of the type. Must only be called on sized types.
109 unsigned getSizeInBits() const {
110 if (isPointer() || isScalar())
6: Taking false branch
111 return getScalarSizeInBits();
112 return getScalarSizeInBits() * getNumElements();
7: Returning zero
113 }
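The path notes above (steps 6 and 7) are the crux of the report: along the diagnosed path the product getScalarSizeInBits() * getNumElements() evaluates to zero, so getSizeInBits() returns 0. A zero result is only harmful if a caller later uses it as a divisor or otherwise assumes it is non-zero. A purely hypothetical caller-side sketch of the guard pattern (safeChunks is not a function from these sources):

// Hypothetical helper, for illustration only.
static unsigned safeChunks(llvm::LLT Ty, unsigned TotalBits) {
  unsigned EltBits = Ty.getSizeInBits(); // may be 0 along the diagnosed path
  if (EltBits == 0)                      // guard before using it as a divisor
    return 0;
  return TotalBits / EltBits;
}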
114
115 /// Returns the total size of the type in bytes, i.e. number of whole bytes
116 /// needed to represent the size in bits. Must only be called on sized types.
117 unsigned getSizeInBytes() const {
118 return (getSizeInBits() + 7) / 8;
119 }
120
121 LLT getScalarType() const {
122 return isVector() ? getElementType() : *this;
123 }
124
125 /// If this type is a vector, return a vector with the same number of elements
126 /// but the new element type. Otherwise, return the new element type.
127 LLT changeElementType(LLT NewEltTy) const {
128 return isVector() ? LLT::vector(getNumElements(), NewEltTy) : NewEltTy;
129 }
130
131 /// If this type is a vector, return a vector with the same number of elements
132 /// but the new element size. Otherwise, return the new element type. Invalid
133 /// for pointer types. For pointer types, use changeElementType.
134 LLT changeElementSize(unsigned NewEltSize) const {
135 assert(!getScalarType().isPointer() &&
136        "invalid to directly change element size for pointers");
137 return isVector() ? LLT::vector(getNumElements(), NewEltSize)
138 : LLT::scalar(NewEltSize);
139 }
140
141 /// Return a vector or scalar with the same element type and the new number of
142 /// elements.
143 LLT changeNumElements(unsigned NewNumElts) const {
144 return LLT::scalarOrVector(NewNumElts, getScalarType());
145 }
146
147 /// Return a type that is \p Factor times smaller. Reduces the number of
148 /// elements if this is a vector, or the bitwidth for scalar/pointers. Does
149 /// not attempt to handle cases that aren't evenly divisible.
150 LLT divide(int Factor) const {
151 assert(Factor != 1);
152 if (isVector()) {
153 assert(getNumElements() % Factor == 0);
154 return scalarOrVector(getNumElements() / Factor, getElementType());
155 }
156
157 assert(getSizeInBits() % Factor == 0);
158 return scalar(getSizeInBits() / Factor);
159 }
160
161 bool isByteSized() const { return (getSizeInBits() & 7) == 0; }
162
163 unsigned getScalarSizeInBits() const {
164 assert(RawData != 0 && "Invalid Type");
165 if (!IsVector) {
166 if (!IsPointer)
167 return getFieldValue(ScalarSizeFieldInfo);
168 else
169 return getFieldValue(PointerSizeFieldInfo);
170 } else {
171 if (!IsPointer)
172 return getFieldValue(VectorSizeFieldInfo);
173 else
174 return getFieldValue(PointerVectorSizeFieldInfo);
175 }
176 }
177
178 unsigned getAddressSpace() const {
179 assert(RawData != 0 && "Invalid Type");
180 assert(IsPointer && "cannot get address space of non-pointer type");
181 if (!IsVector)
182 return getFieldValue(PointerAddressSpaceFieldInfo);
183 else
184 return getFieldValue(PointerVectorAddressSpaceFieldInfo);
185 }
186
187 /// Returns the vector's element type. Only valid for vector types.
188 LLT getElementType() const {
189 assert(isVector() && "cannot get element type of scalar/aggregate");
190 if (IsPointer)
191 return pointer(getAddressSpace(), getScalarSizeInBits());
192 else
193 return scalar(getScalarSizeInBits());
194 }
195
196 void print(raw_ostream &OS) const;
197
198#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
199 LLVM_DUMP_METHOD void dump() const {
200 print(dbgs());
201 dbgs() << '\n';
202 }
203#endif
204
205 bool operator==(const LLT &RHS) const {
206 return IsPointer == RHS.IsPointer && IsVector == RHS.IsVector &&
207 RHS.RawData == RawData;
208 }
209
210 bool operator!=(const LLT &RHS) const { return !(*this == RHS); }
211
212 friend struct DenseMapInfo<LLT>;
213 friend class GISelInstProfileBuilder;
214
215private:
216 /// LLT is packed into 64 bits as follows:
217 /// isPointer : 1
218 /// isVector : 1
219 /// with 62 bits remaining for Kind-specific data, packed in bitfields
220 /// as described below. As there isn't a simple portable way to pack bits
221 /// into bitfields, here the different fields in the packed structure is
222 /// described in static const *Field variables. Each of these variables
223 /// is a 2-element array, with the first element describing the bitfield size
224 /// and the second element describing the bitfield offset.
225 typedef int BitFieldInfo[2];
226 ///
227 /// This is how the bitfields are packed per Kind:
228 /// * Invalid:
229 /// gets encoded as RawData == 0, as that is an invalid encoding, since for
230 /// valid encodings, SizeInBits/SizeOfElement must be larger than 0.
231 /// * Non-pointer scalar (isPointer == 0 && isVector == 0):
232 /// SizeInBits: 32;
233 static const constexpr BitFieldInfo ScalarSizeFieldInfo{32, 0};
234 /// * Pointer (isPointer == 1 && isVector == 0):
235 /// SizeInBits: 16;
236 /// AddressSpace: 24;
237 static const constexpr BitFieldInfo PointerSizeFieldInfo{16, 0};
238 static const constexpr BitFieldInfo PointerAddressSpaceFieldInfo{
239 24, PointerSizeFieldInfo[0] + PointerSizeFieldInfo[1]};
240 /// * Vector-of-non-pointer (isPointer == 0 && isVector == 1):
241 /// NumElements: 16;
242 /// SizeOfElement: 32;
243 static const constexpr BitFieldInfo VectorElementsFieldInfo{16, 0};
244 static const constexpr BitFieldInfo VectorSizeFieldInfo{
245 32, VectorElementsFieldInfo[0] + VectorElementsFieldInfo[1]};
246 /// * Vector-of-pointer (isPointer == 1 && isVector == 1):
247 /// NumElements: 16;
248 /// SizeOfElement: 16;
249 /// AddressSpace: 24;
250 static const constexpr BitFieldInfo PointerVectorElementsFieldInfo{16, 0};
251 static const constexpr BitFieldInfo PointerVectorSizeFieldInfo{
252 16,
253 PointerVectorElementsFieldInfo[1] + PointerVectorElementsFieldInfo[0]};
254 static const constexpr BitFieldInfo PointerVectorAddressSpaceFieldInfo{
255 24, PointerVectorSizeFieldInfo[1] + PointerVectorSizeFieldInfo[0]};
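To make the field layouts above concrete, a small worked example (illustrative, derived only from the size/offset pairs declared here): for a non-pointer vector `<4 x s32>`, init() stores NumElements = 4 via VectorElementsFieldInfo{16, 0} and SizeOfElement = 32 via VectorSizeFieldInfo{32, 16}:

// RawData = maskAndShift(4, {16, 0}) | maskAndShift(32, {32, 16})
//         = 0x0000000000000004    | 0x0000000000200000
//         = 0x0000000000200004
// IsPointer = 0 and IsVector = 1 are kept in their own one-bit fields outside RawData.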
256
257 uint64_t IsPointer : 1;
258 uint64_t IsVector : 1;
259 uint64_t RawData : 62;
260
261 static uint64_t getMask(const BitFieldInfo FieldInfo) {
262 const int FieldSizeInBits = FieldInfo[0];
263 return (((uint64_t)1) << FieldSizeInBits) - 1;
264 }
265 static uint64_t maskAndShift(uint64_t Val, uint64_t Mask, uint8_t Shift) {
266 assert(Val <= Mask && "Value too large for field");
267 return (Val & Mask) << Shift;
268 }
269 static uint64_t maskAndShift(uint64_t Val, const BitFieldInfo FieldInfo) {
270 return maskAndShift(Val, getMask(FieldInfo), FieldInfo[1]);
271 }
272 uint64_t getFieldValue(const BitFieldInfo FieldInfo) const {
273 return getMask(FieldInfo) & (RawData >> FieldInfo[1]);
274 }
275
276 void init(bool IsPointer, bool IsVector, uint16_t NumElements,
277 unsigned SizeInBits, unsigned AddressSpace) {
278 this->IsPointer = IsPointer;
279 this->IsVector = IsVector;
280 if (!IsVector) {
281 if (!IsPointer)
282 RawData = maskAndShift(SizeInBits, ScalarSizeFieldInfo);
283 else
284 RawData = maskAndShift(SizeInBits, PointerSizeFieldInfo) |
285 maskAndShift(AddressSpace, PointerAddressSpaceFieldInfo);
286 } else {
287 assert(NumElements > 1 && "invalid number of vector elements");
288 if (!IsPointer)
289 RawData = maskAndShift(NumElements, VectorElementsFieldInfo) |
290 maskAndShift(SizeInBits, VectorSizeFieldInfo);
291 else
292 RawData =
293 maskAndShift(NumElements, PointerVectorElementsFieldInfo) |
294 maskAndShift(SizeInBits, PointerVectorSizeFieldInfo) |
295 maskAndShift(AddressSpace, PointerVectorAddressSpaceFieldInfo);
296 }
297 }
298
299 uint64_t getUniqueRAWLLTData() const {
300 return ((uint64_t)RawData) << 2 | ((uint64_t)IsPointer) << 1 |
301 ((uint64_t)IsVector);
302 }
303};
304
305inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) {
306 Ty.print(OS);
307 return OS;
308}
309
310template<> struct DenseMapInfo<LLT> {
311 static inline LLT getEmptyKey() {
312 LLT Invalid;
313 Invalid.IsPointer = true;
314 return Invalid;
315 }
316 static inline LLT getTombstoneKey() {
317 LLT Invalid;
318 Invalid.IsVector = true;
319 return Invalid;
320 }
321 static inline unsigned getHashValue(const LLT &Ty) {
322 uint64_t Val = Ty.getUniqueRAWLLTData();
323 return DenseMapInfo<uint64_t>::getHashValue(Val);
324 }
325 static bool isEqual(const LLT &LHS, const LLT &RHS) {
326 return LHS == RHS;
327 }
328};
329
330}
331
332#endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H