Bug Summary

File: llvm/include/llvm/Support/MathExtras.h
Warning: line 252, column 30
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
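
For context, here is a minimal standalone sketch of why a shift count of 33 is undefined at the flagged line. The maskTrailingOnes/maskLeadingOnes bodies below are a paraphrase of what llvm/include/llvm/Support/MathExtras.h is assumed to contain around line 252, not a verbatim copy; note that the reproduction command below passes -D NDEBUG, so the range assert inside maskTrailingOnes is compiled out and the out-of-range shift is reachable.

#include <cassert>
#include <climits>
#include <cstdint>

// Paraphrased (assumed) shape of the helpers in llvm/include/llvm/Support/MathExtras.h.
template <typename T> T maskTrailingOnes(unsigned N) {
  const unsigned Bits = CHAR_BIT * sizeof(T);
  assert(N <= Bits && "Invalid bit index");  // compiled out under -D NDEBUG
  return N == 0 ? 0 : (T(-1) >> (Bits - N)); // the right shift flagged above
}

template <typename T> T maskLeadingOnes(unsigned N) {
  return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
}

int main() {
  // With N == 33 (one more than the 32-bit width), maskLeadingOnes<uint32_t>
  // forwards 32u - 33u == 0xffffffff to maskTrailingOnes; there Bits - N wraps
  // back to 33, so the shift count is 33 >= 32 and the shift is undefined.
  // The annotated source below (mostAlignedValueInRange) is where the analyzer
  // believes the argument 33 can be produced.
  return static_cast<int>(maskLeadingOnes<uint32_t>(33) != 0);
}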

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SILoadStoreOptimizer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with close by immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote constant offset to the immediate by
23// adjusting the base. It tries to use a base from the nearby instructions that
24// allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25// to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offsets, but are close enough to fit in the 8 bits, we can add to the base
56// pointer and use the new reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "AMDGPU.h"
61#include "GCNSubtarget.h"
62#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63#include "llvm/Analysis/AliasAnalysis.h"
64#include "llvm/CodeGen/MachineFunctionPass.h"
65#include "llvm/InitializePasses.h"
66
67using namespace llvm;
68
69#define DEBUG_TYPE "si-load-store-opt"
70
71namespace {
72enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 BUFFER_LOAD,
78 BUFFER_STORE,
79 MIMG,
80 TBUFFER_LOAD,
81 TBUFFER_STORE,
82};
83
84struct AddressRegs {
85 unsigned char NumVAddrs = 0;
86 bool SBase = false;
87 bool SRsrc = false;
88 bool SOffset = false;
89 bool VAddr = false;
90 bool Addr = false;
91 bool SSamp = false;
92};
93
94// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95const unsigned MaxAddressRegs = 12 + 1 + 1;
96
97class SILoadStoreOptimizer : public MachineFunctionPass {
98 struct CombineInfo {
99 MachineBasicBlock::iterator I;
100 unsigned EltSize;
101 unsigned Offset;
102 unsigned Width;
103 unsigned Format;
104 unsigned BaseOff;
105 unsigned DMask;
106 InstClassEnum InstClass;
107 unsigned CPol = 0;
108 bool UseST64;
109 int AddrIdx[MaxAddressRegs];
110 const MachineOperand *AddrReg[MaxAddressRegs];
111 unsigned NumAddresses;
112 unsigned Order;
113
114 bool hasSameBaseAddress(const MachineInstr &MI) {
115 for (unsigned i = 0; i < NumAddresses; i++) {
116 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
117
118 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
119 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
120 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
121 return false;
122 }
123 continue;
124 }
125
126 // Check same base pointer. Be careful of subregisters, which can occur
127 // with vectors of pointers.
128 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
129 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
130 return false;
131 }
132 }
133 return true;
134 }
135
136 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
137 for (unsigned i = 0; i < NumAddresses; ++i) {
138 const MachineOperand *AddrOp = AddrReg[i];
139 // Immediates are always OK.
140 if (AddrOp->isImm())
141 continue;
142
143 // Don't try to merge addresses that aren't either immediates or registers.
144 // TODO: Should be possible to merge FrameIndexes and maybe some other
145 // non-register
146 if (!AddrOp->isReg())
147 return false;
148
149 // TODO: We should be able to merge physical reg addresses.
150 if (AddrOp->getReg().isPhysical())
151 return false;
152
153 // If an address has only one use then there will be no other
154 // instructions with the same address, so we can't merge this one.
155 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
156 return false;
157 }
158 return true;
159 }
160
161 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
162 const GCNSubtarget &STM);
163 };
164
165 struct BaseRegisters {
166 Register LoReg;
167 Register HiReg;
168
169 unsigned LoSubReg = 0;
170 unsigned HiSubReg = 0;
171 };
172
173 struct MemAddress {
174 BaseRegisters Base;
175 int64_t Offset = 0;
176 };
177
178 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
179
180private:
181 const GCNSubtarget *STM = nullptr;
182 const SIInstrInfo *TII = nullptr;
183 const SIRegisterInfo *TRI = nullptr;
184 MachineRegisterInfo *MRI = nullptr;
185 AliasAnalysis *AA = nullptr;
186 bool OptimizeAgain;
187
188 static bool dmasksCanBeCombined(const CombineInfo &CI,
189 const SIInstrInfo &TII,
190 const CombineInfo &Paired);
191 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
192 CombineInfo &Paired, bool Modify = false);
193 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
194 const CombineInfo &Paired);
195 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
196 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
197 const CombineInfo &Paired);
198 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
199 const CombineInfo &Paired);
200 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
201
202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
203 SmallVectorImpl<MachineInstr *> &InstsToMove);
204
205 unsigned read2Opcode(unsigned EltSize) const;
206 unsigned read2ST64Opcode(unsigned EltSize) const;
207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
208 CombineInfo &Paired,
209 const SmallVectorImpl<MachineInstr *> &InstsToMove);
210
211 unsigned write2Opcode(unsigned EltSize) const;
212 unsigned write2ST64Opcode(unsigned EltSize) const;
213 MachineBasicBlock::iterator
214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
215 const SmallVectorImpl<MachineInstr *> &InstsToMove);
216 MachineBasicBlock::iterator
217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
218 const SmallVectorImpl<MachineInstr *> &InstsToMove);
219 MachineBasicBlock::iterator
220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
221 const SmallVectorImpl<MachineInstr *> &InstsToMove);
222 MachineBasicBlock::iterator
223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
224 const SmallVectorImpl<MachineInstr *> &InstsToMove);
225 MachineBasicBlock::iterator
226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
227 const SmallVectorImpl<MachineInstr *> &InstsToMove);
228 MachineBasicBlock::iterator
229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
230 const SmallVectorImpl<MachineInstr *> &InstsToMove);
231 MachineBasicBlock::iterator
232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
233 const SmallVectorImpl<MachineInstr *> &InstsToMove);
234
235 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
236 int32_t NewOffset) const;
237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
241 /// Promotes constant offset to the immediate by adjusting the base. It
242 /// tries to use a base from the nearby instructions that allows it to have
243 /// a 13-bit constant offset which gets promoted to the immediate.
244 bool promoteConstantOffsetToImm(MachineInstr &CI,
245 MemInfoMap &Visited,
246 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
247 void addInstToMergeableList(const CombineInfo &CI,
248 std::list<std::list<CombineInfo> > &MergeableInsts) const;
249
250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
253 std::list<std::list<CombineInfo>> &MergeableInsts) const;
254
255public:
256 static char ID;
257
258 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
260 }
261
262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
263 bool &OptimizeListAgain);
264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
265
266 bool runOnMachineFunction(MachineFunction &MF) override;
267
268 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
269
270 void getAnalysisUsage(AnalysisUsage &AU) const override {
271 AU.setPreservesCFG();
272 AU.addRequired<AAResultsWrapperPass>();
273
274 MachineFunctionPass::getAnalysisUsage(AU);
275 }
276
277 MachineFunctionProperties getRequiredProperties() const override {
278 return MachineFunctionProperties()
279 .set(MachineFunctionProperties::Property::IsSSA);
280 }
281};
282
283static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
284 const unsigned Opc = MI.getOpcode();
285
286 if (TII.isMUBUF(Opc)) {
287 // FIXME: Handle d16 correctly
288 return AMDGPU::getMUBUFElements(Opc);
289 }
290 if (TII.isMIMG(MI)) {
291 uint64_t DMaskImm =
292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
293 return countPopulation(DMaskImm);
294 }
295 if (TII.isMTBUF(Opc)) {
296 return AMDGPU::getMTBUFElements(Opc);
297 }
298
299 switch (Opc) {
300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
301 return 1;
302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
303 return 2;
304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
305 return 4;
306 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
307 return 8;
308 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
309 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
310 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
311 case AMDGPU::DS_WRITE_B32_gfx9:
312 return 1;
313 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
314 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
315 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
316 case AMDGPU::DS_WRITE_B64_gfx9:
317 return 2;
318 default:
319 return 0;
320 }
321}
322
323/// Maps instruction opcode to enum InstClassEnum.
324static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
325 switch (Opc) {
326 default:
327 if (TII.isMUBUF(Opc)) {
328 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
329 default:
330 return UNKNOWN;
331 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
332 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
333 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
334 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
335 return BUFFER_LOAD;
336 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
337 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
338 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
339 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
340 return BUFFER_STORE;
341 }
342 }
343 if (TII.isMIMG(Opc)) {
344 // Ignore instructions encoded without vaddr.
345 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
346 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
347 return UNKNOWN;
348 // Ignore BVH instructions
349 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
350 return UNKNOWN;
351 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
352 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
353 TII.isGather4(Opc))
354 return UNKNOWN;
355 return MIMG;
356 }
357 if (TII.isMTBUF(Opc)) {
358 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
359 default:
360 return UNKNOWN;
361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
363 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
364 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
365 return TBUFFER_LOAD;
366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
368 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
369 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
370 return TBUFFER_STORE;
371 }
372 }
373 return UNKNOWN;
374 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
375 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
376 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
377 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
378 return S_BUFFER_LOAD_IMM;
379 case AMDGPU::DS_READ_B32:
380 case AMDGPU::DS_READ_B32_gfx9:
381 case AMDGPU::DS_READ_B64:
382 case AMDGPU::DS_READ_B64_gfx9:
383 return DS_READ;
384 case AMDGPU::DS_WRITE_B32:
385 case AMDGPU::DS_WRITE_B32_gfx9:
386 case AMDGPU::DS_WRITE_B64:
387 case AMDGPU::DS_WRITE_B64_gfx9:
388 return DS_WRITE;
389 }
390}
391
392/// Determines instruction subclass from opcode. Only instructions
393/// of the same subclass can be merged together.
394static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
395 switch (Opc) {
396 default:
397 if (TII.isMUBUF(Opc))
398 return AMDGPU::getMUBUFBaseOpcode(Opc);
399 if (TII.isMIMG(Opc)) {
400 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
401 assert(Info);
402 return Info->BaseOpcode;
403 }
404 if (TII.isMTBUF(Opc))
405 return AMDGPU::getMTBUFBaseOpcode(Opc);
406 return -1;
407 case AMDGPU::DS_READ_B32:
408 case AMDGPU::DS_READ_B32_gfx9:
409 case AMDGPU::DS_READ_B64:
410 case AMDGPU::DS_READ_B64_gfx9:
411 case AMDGPU::DS_WRITE_B32:
412 case AMDGPU::DS_WRITE_B32_gfx9:
413 case AMDGPU::DS_WRITE_B64:
414 case AMDGPU::DS_WRITE_B64_gfx9:
415 return Opc;
416 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
417 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
418 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
419 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
420 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
421 }
422}
423
424static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
425 AddressRegs Result;
426
427 if (TII.isMUBUF(Opc)) {
428 if (AMDGPU::getMUBUFHasVAddr(Opc))
429 Result.VAddr = true;
430 if (AMDGPU::getMUBUFHasSrsrc(Opc))
431 Result.SRsrc = true;
432 if (AMDGPU::getMUBUFHasSoffset(Opc))
433 Result.SOffset = true;
434
435 return Result;
436 }
437
438 if (TII.isMIMG(Opc)) {
439 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
440 if (VAddr0Idx >= 0) {
441 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
442 Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
443 } else {
444 Result.VAddr = true;
445 }
446 Result.SRsrc = true;
447 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
448 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
449 Result.SSamp = true;
450
451 return Result;
452 }
453 if (TII.isMTBUF(Opc)) {
454 if (AMDGPU::getMTBUFHasVAddr(Opc))
455 Result.VAddr = true;
456 if (AMDGPU::getMTBUFHasSrsrc(Opc))
457 Result.SRsrc = true;
458 if (AMDGPU::getMTBUFHasSoffset(Opc))
459 Result.SOffset = true;
460
461 return Result;
462 }
463
464 switch (Opc) {
465 default:
466 return Result;
467 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
468 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
469 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
470 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
471 Result.SBase = true;
472 return Result;
473 case AMDGPU::DS_READ_B32:
474 case AMDGPU::DS_READ_B64:
475 case AMDGPU::DS_READ_B32_gfx9:
476 case AMDGPU::DS_READ_B64_gfx9:
477 case AMDGPU::DS_WRITE_B32:
478 case AMDGPU::DS_WRITE_B64:
479 case AMDGPU::DS_WRITE_B32_gfx9:
480 case AMDGPU::DS_WRITE_B64_gfx9:
481 Result.Addr = true;
482 return Result;
483 }
484}
485
486void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
487 const SIInstrInfo &TII,
488 const GCNSubtarget &STM) {
489 I = MI;
490 unsigned Opc = MI->getOpcode();
491 InstClass = getInstClass(Opc, TII);
492
493 if (InstClass == UNKNOWN)
494 return;
495
496 switch (InstClass) {
497 case DS_READ:
498 EltSize =
499 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
500 : 4;
501 break;
502 case DS_WRITE:
503 EltSize =
504 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
505 : 4;
506 break;
507 case S_BUFFER_LOAD_IMM:
508 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
509 break;
510 default:
511 EltSize = 4;
512 break;
513 }
514
515 if (InstClass == MIMG) {
516 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
517 // Offset is not considered for MIMG instructions.
518 Offset = 0;
519 } else {
520 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
521 Offset = I->getOperand(OffsetIdx).getImm();
522 }
523
524 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
525 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
526
527 Width = getOpcodeWidth(*I, TII);
528
529 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
530 Offset &= 0xffff;
531 } else if (InstClass != MIMG) {
532 CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
533 }
534
535 AddressRegs Regs = getRegs(Opc, TII);
536
537 NumAddresses = 0;
538 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
539 AddrIdx[NumAddresses++] =
540 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
541 if (Regs.Addr)
542 AddrIdx[NumAddresses++] =
543 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
544 if (Regs.SBase)
545 AddrIdx[NumAddresses++] =
546 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
547 if (Regs.SRsrc)
548 AddrIdx[NumAddresses++] =
549 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
550 if (Regs.SOffset)
551 AddrIdx[NumAddresses++] =
552 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
553 if (Regs.VAddr)
554 AddrIdx[NumAddresses++] =
555 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
556 if (Regs.SSamp)
557 AddrIdx[NumAddresses++] =
558 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
559 assert(NumAddresses <= MaxAddressRegs);
560
561 for (unsigned J = 0; J < NumAddresses; J++)
562 AddrReg[J] = &I->getOperand(AddrIdx[J]);
563}
564
565} // end anonymous namespace.
566
567INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
568 "SI Load Store Optimizer", false, false)
569INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
570INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
571 false, false)
572
573char SILoadStoreOptimizer::ID = 0;
574
575char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
576
577FunctionPass *llvm::createSILoadStoreOptimizerPass() {
578 return new SILoadStoreOptimizer();
579}
580
581static void moveInstsAfter(MachineBasicBlock::iterator I,
582 ArrayRef<MachineInstr *> InstsToMove) {
583 MachineBasicBlock *MBB = I->getParent();
584 ++I;
585 for (MachineInstr *MI : InstsToMove) {
586 MI->removeFromParent();
587 MBB->insert(I, MI);
588 }
589}
590
591static void addDefsUsesToList(const MachineInstr &MI,
592 DenseSet<Register> &RegDefs,
593 DenseSet<Register> &PhysRegUses) {
594 for (const MachineOperand &Op : MI.operands()) {
595 if (Op.isReg()) {
596 if (Op.isDef())
597 RegDefs.insert(Op.getReg());
598 else if (Op.readsReg() && Op.getReg().isPhysical())
599 PhysRegUses.insert(Op.getReg());
600 }
601 }
602}
603
604static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
605 MachineBasicBlock::iterator B,
606 AliasAnalysis *AA) {
607 // RAW or WAR - cannot reorder
608 // WAW - cannot reorder
609 // RAR - safe to reorder
610 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
611}
612
613// Add MI and its defs to the lists if MI reads one of the defs that are
614// already in the list. Returns true in that case.
615static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
616 DenseSet<Register> &PhysRegUses,
617 SmallVectorImpl<MachineInstr *> &Insts) {
618 for (MachineOperand &Use : MI.operands()) {
619 // If one of the defs is read, then there is a use of Def between I and the
620 // instruction that I will potentially be merged with. We will need to move
621 // this instruction after the merged instructions.
622 //
623 // Similarly, if there is a def which is read by an instruction that is to
624 // be moved for merging, then we need to move the def-instruction as well.
625 // This can only happen for physical registers such as M0; virtual
626 // registers are in SSA form.
627 if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
628 (Use.isDef() && RegDefs.count(Use.getReg())) ||
629 (Use.isDef() && Use.getReg().isPhysical() &&
630 PhysRegUses.count(Use.getReg())))) {
631 Insts.push_back(&MI);
632 addDefsUsesToList(MI, RegDefs, PhysRegUses);
633 return true;
634 }
635 }
636
637 return false;
638}
639
640static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
641 ArrayRef<MachineInstr *> InstsToMove,
642 AliasAnalysis *AA) {
643 assert(MemOp.mayLoadOrStore());
644
645 for (MachineInstr *InstToMove : InstsToMove) {
646 if (!InstToMove->mayLoadOrStore())
647 continue;
648 if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
649 return false;
650 }
651 return true;
652}
653
654// This function assumes that \p A and \p B are identical except for
655// size and offset, and that they reference adjacent memory.
656static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
657 const MachineMemOperand *A,
658 const MachineMemOperand *B) {
659 unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
660 unsigned Size = A->getSize() + B->getSize();
661 // This function adds the offset parameter to the existing offset for A,
662 // so we pass 0 here as the offset and then manually set it to the correct
663 // value after the call.
664 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
665 MMO->setOffset(MinOffset);
666 return MMO;
667}
668
669bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
670 const SIInstrInfo &TII,
671 const CombineInfo &Paired) {
672 assert(CI.InstClass == MIMG);
673
674 // Ignore instructions with tfe/lwe set.
675 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
676 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
677
678 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
679 return false;
680
681 // Check other optional immediate operands for equality.
682 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
683 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
684 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
685
686 for (auto op : OperandsToMatch) {
687 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
688 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
689 return false;
690 if (Idx != -1 &&
691 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
692 return false;
693 }
694
695 // Check DMask for overlaps.
696 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
697 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
698
699 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
700 if ((1u << AllowedBitsForMin) <= MinMask)
701 return false;
702
703 return true;
704}
705
706static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
707 unsigned ComponentCount,
708 const GCNSubtarget &STI) {
709 if (ComponentCount > 4)
710 return 0;
711
712 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
713 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
714 if (!OldFormatInfo)
715 return 0;
716
717 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
718 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
719 ComponentCount,
720 OldFormatInfo->NumFormat, STI);
721
722 if (!NewFormatInfo)
723 return 0;
724
725 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
726 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
727
728 return NewFormatInfo->Format;
729}
730
731// Return the value in the inclusive range [Lo,Hi] that is aligned to the
732// highest power of two. Note that the result is well defined for all inputs
733// including corner cases like:
734// - if Lo == Hi, return that value
735// - if Lo == 0, return 0 (even though the "- 1" below underflows)
736// - if Lo > Hi, return 0 (as if the range wrapped around)
737static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
738 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
1. Calling 'maskLeadingOnes<unsigned int>'
739}
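
A sketch of how the call above can reach maskLeadingOnes<unsigned int> with an argument of 33, under the analyzer's assumption (hypothetical values, consistent with the wrap-around corner case described in the comment above):

// If Lo > Hi with Hi == Lo - 1 (for example Lo = 1, Hi = 0), then
// (Lo - 1) ^ Hi == 0, countLeadingZeros(0u) == 32 for a 32-bit value, and the
// argument passed to maskLeadingOnes<uint32_t> is 32 + 1 == 33, one more than
// the bit width, which is what ultimately triggers the 33-bit shift reported
// in MathExtras.h (see the sketch after the Bug Summary).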
740
741bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
742 const GCNSubtarget &STI,
743 CombineInfo &Paired,
744 bool Modify) {
745 assert(CI.InstClass != MIMG);
746
747 // XXX - Would the same offset be OK? Is there any reason this would happen or
748 // be useful?
749 if (CI.Offset == Paired.Offset)
750 return false;
751
752 // This won't be valid if the offset isn't aligned.
753 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
754 return false;
755
756 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
757
758 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
759 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
760 if (!Info0)
761 return false;
762 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
763 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
764 if (!Info1)
765 return false;
766
767 if (Info0->BitsPerComp != Info1->BitsPerComp ||
768 Info0->NumFormat != Info1->NumFormat)
769 return false;
770
771 // TODO: Should be possible to support more formats, but if format loads
772 // are not dword-aligned, the merged load might not be valid.
773 if (Info0->BitsPerComp != 32)
774 return false;
775
776 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
777 return false;
778 }
779
780 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
781 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
782 CI.UseST64 = false;
783 CI.BaseOff = 0;
784
785 // Handle all non-DS instructions.
786 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
787 return (EltOffset0 + CI.Width == EltOffset1 ||
788 EltOffset1 + Paired.Width == EltOffset0) &&
789 CI.CPol == Paired.CPol &&
790 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
791 }
792
793 // If the offset in elements doesn't fit in 8-bits, we might be able to use
794 // the stride 64 versions.
795 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
796 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
797 if (Modify) {
798 CI.Offset = EltOffset0 / 64;
799 Paired.Offset = EltOffset1 / 64;
800 CI.UseST64 = true;
801 }
802 return true;
803 }
804
805 // Check if the new offsets fit in the reduced 8-bit range.
806 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
807 if (Modify) {
808 CI.Offset = EltOffset0;
809 Paired.Offset = EltOffset1;
810 }
811 return true;
812 }
813
814 // Try to shift base address to decrease offsets.
815 uint32_t Min = std::min(EltOffset0, EltOffset1);
816 uint32_t Max = std::max(EltOffset0, EltOffset1);
817
818 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
819 if (((Max - Min) & ~Mask) == 0) {
820 if (Modify) {
821 // From the range of values we could use for BaseOff, choose the one that
822 // is aligned to the highest power of two, to maximise the chance that
823 // the same offset can be reused for other load/store pairs.
824 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
825 // Copy the low bits of the offsets, so that when we adjust them by
826 // subtracting BaseOff they will be multiples of 64.
827 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
828 CI.BaseOff = BaseOff * CI.EltSize;
829 CI.Offset = (EltOffset0 - BaseOff) / 64;
830 Paired.Offset = (EltOffset1 - BaseOff) / 64;
831 CI.UseST64 = true;
832 }
833 return true;
834 }
835
836 if (isUInt<8>(Max - Min)) {
837 if (Modify) {
838 // From the range of values we could use for BaseOff, choose the one that
839 // is aligned to the highest power of two, to maximise the chance that
840 // the same offset can be reused for other load/store pairs.
841 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
842 CI.BaseOff = BaseOff * CI.EltSize;
843 CI.Offset = EltOffset0 - BaseOff;
844 Paired.Offset = EltOffset1 - BaseOff;
845 }
846 return true;
847 }
848
849 return false;
850}
851
852bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
853 const CombineInfo &CI,
854 const CombineInfo &Paired) {
855 const unsigned Width = (CI.Width + Paired.Width);
856 switch (CI.InstClass) {
857 default:
858 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
859 case S_BUFFER_LOAD_IMM:
860 switch (Width) {
861 default:
862 return false;
863 case 2:
864 case 4:
865 case 8:
866 return true;
867 }
868 }
869}
870
871const TargetRegisterClass *
872SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
873 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
874 return TRI->getRegClassForReg(*MRI, Dst->getReg());
875 }
876 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
877 return TRI->getRegClassForReg(*MRI, Src->getReg());
878 }
879 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
880 return TRI->getRegClassForReg(*MRI, Src->getReg());
881 }
882 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
883 return TRI->getRegClassForReg(*MRI, Dst->getReg());
884 }
885 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
886 return TRI->getRegClassForReg(*MRI, Src->getReg());
887 }
888 return nullptr;
889}
890
891/// This function assumes that CI comes before Paired in a basic block.
892bool SILoadStoreOptimizer::checkAndPrepareMerge(
893 CombineInfo &CI, CombineInfo &Paired,
894 SmallVectorImpl<MachineInstr *> &InstsToMove) {
895
896 // Check both offsets (or masks for MIMG) can be combined and fit in the
897 // reduced range.
898 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
899 return false;
900
901 if (CI.InstClass != MIMG &&
902 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
903 return false;
904
905 const unsigned Opc = CI.I->getOpcode();
906 const InstClassEnum InstClass = getInstClass(Opc, *TII);
907
908 if (InstClass == UNKNOWN) {
909 return false;
910 }
911 const unsigned InstSubclass = getInstSubclass(Opc, *TII);
912
913 // Do not merge VMEM buffer instructions with "swizzled" bit set.
914 int Swizzled =
915 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
916 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
917 return false;
918
919 DenseSet<Register> RegDefsToMove;
920 DenseSet<Register> PhysRegUsesToMove;
921 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
922
923 const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
924 bool IsAGPR = TRI->hasAGPRs(DataRC);
925
926 MachineBasicBlock::iterator E = std::next(Paired.I);
927 MachineBasicBlock::iterator MBBI = std::next(CI.I);
928 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
929 for (; MBBI != E; ++MBBI) {
930
931 if (MBBI == MBBE) {
932 // CombineInfo::Order is a hint on the instruction ordering within the
933 // basic block. This hint suggests that CI precedes Paired, which is
934 // true most of the time. However, moveInstsAfter() processing a
935 // previous list may have changed this order in a situation when it
936 // moves an instruction which exists in some other merge list.
937 // In this case it must be dependent.
938 return false;
939 }
940
941 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
942 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
943 // This is not a matching instruction, but we can keep looking as
944 // long as one of these conditions is met:
945 // 1. It is safe to move I down past MBBI.
946 // 2. It is safe to move MBBI down past the instruction that I will
947 // be merged into.
948
949 if (MBBI->hasUnmodeledSideEffects()) {
950 // We can't re-order this instruction with respect to other memory
951 // operations, so we fail both conditions mentioned above.
952 return false;
953 }
954
955 if (MBBI->mayLoadOrStore() &&
956 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
957 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
958 // We fail condition #1, but we may still be able to satisfy condition
959 // #2. Add this instruction to the move list and then we will check
960 // if condition #2 holds once we have selected the matching instruction.
961 InstsToMove.push_back(&*MBBI);
962 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
963 continue;
964 }
965
966 // When we match I with another DS instruction we will be moving I down
967 // to the location of the matched instruction; any uses of I will need to
968 // be moved down as well.
969 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
970 InstsToMove);
971 continue;
972 }
973
974 // Don't merge volatiles.
975 if (MBBI->hasOrderedMemoryRef())
976 return false;
977
978 int Swizzled =
979 AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
980 if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
981 return false;
982
983 // Handle a case like
984 // DS_WRITE_B32 addr, v, idx0
985 // w = DS_READ_B32 addr, idx0
986 // DS_WRITE_B32 addr, f(w), idx1
987 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
988 // merging of the two writes.
989 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
990 InstsToMove))
991 continue;
992
993 if (&*MBBI == &*Paired.I) {
994 if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
995 return false;
996 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
997 // operands. However we are reporting that ds_write2 shall have
998 // only VGPR data so that machine copy propagation does not
999 // create an illegal instruction with VGPR and AGPR sources.
1000 // Consequently, if we create such an instruction the verifier
1001 // will complain.
1002 if (IsAGPR && CI.InstClass == DS_WRITE)
1003 return false;
1004
1005 // We need to go through the list of instructions that we plan to
1006 // move and make sure they are all safe to move down past the merged
1007 // instruction.
1008 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
1009
1010 // Call offsetsCanBeCombined with modify = true so that the offsets are
1011 // correct for the new instruction. This should return true, because
1012 // this function should only be called on CombineInfo objects that
1013 // have already been confirmed to be mergeable.
1014 if (CI.InstClass != MIMG)
1015 offsetsCanBeCombined(CI, *STM, Paired, true);
1016 return true;
1017 }
1018 return false;
1019 }
1020
1021 // We've found a load/store that we couldn't merge for some reason.
1022 // We could potentially keep looking, but we'd need to make sure that
1023 // it was safe to move I and also all the instructions in InstsToMove
1024 // down past this instruction.
1025 // Check if we can move I across MBBI and if we can move all I's users.
1026 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
1027 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
1028 break;
1029 }
1030 return false;
1031}
1032
1033unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1034 if (STM->ldsRequiresM0Init())
1035 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1036 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1037}
1038
1039unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1040 if (STM->ldsRequiresM0Init())
1041 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1042
1043 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1044 : AMDGPU::DS_READ2ST64_B64_gfx9;
1045}
1046
1047MachineBasicBlock::iterator
1048SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1049 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1050 MachineBasicBlock *MBB = CI.I->getParent();
1051
1052 // Be careful, since the addresses could be subregisters themselves in weird
1053 // cases, like vectors of pointers.
1054 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1055
1056 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1057 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1058
1059 unsigned NewOffset0 = CI.Offset;
1060 unsigned NewOffset1 = Paired.Offset;
1061 unsigned Opc =
1062 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1063
1064 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1065 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1066
1067 if (NewOffset0 > NewOffset1) {
1068 // Canonicalize the merged instruction so the smaller offset comes first.
1069 std::swap(NewOffset0, NewOffset1);
1070 std::swap(SubRegIdx0, SubRegIdx1);
1071 }
1072
1073 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1074 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1075
1076 const MCInstrDesc &Read2Desc = TII->get(Opc);
1077
1078 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1079 Register DestReg = MRI->createVirtualRegister(SuperRC);
1080
1081 DebugLoc DL = CI.I->getDebugLoc();
1082
1083 Register BaseReg = AddrReg->getReg();
1084 unsigned BaseSubReg = AddrReg->getSubReg();
1085 unsigned BaseRegFlags = 0;
1086 if (CI.BaseOff) {
1087 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1088 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1089 .addImm(CI.BaseOff);
1090
1091 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1092 BaseRegFlags = RegState::Kill;
1093
1094 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1095 .addReg(ImmReg)
1096 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1097 .addImm(0); // clamp bit
1098 BaseSubReg = 0;
1099 }
1100
1101 MachineInstrBuilder Read2 =
1102 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
1103 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1104 .addImm(NewOffset0) // offset0
1105 .addImm(NewOffset1) // offset1
1106 .addImm(0) // gds
1107 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1108
1109 (void)Read2;
1110
1111 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1112
1113 // Copy to the old destination registers.
1114 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1115 .add(*Dest0) // Copy to same destination including flags and sub reg.
1116 .addReg(DestReg, 0, SubRegIdx0);
1117 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1118 .add(*Dest1)
1119 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1120
1121 moveInstsAfter(Copy1, InstsToMove);
1122
1123 CI.I->eraseFromParent();
1124 Paired.I->eraseFromParent();
1125
1126 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1127 return Read2;
1128}
1129
1130unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1131 if (STM->ldsRequiresM0Init())
1132 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1133 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1134 : AMDGPU::DS_WRITE2_B64_gfx9;
1135}
1136
1137unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1138 if (STM->ldsRequiresM0Init())
1139 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1140 : AMDGPU::DS_WRITE2ST64_B64;
1141
1142 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1143 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1144}
1145
1146MachineBasicBlock::iterator
1147SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
1148 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1149 MachineBasicBlock *MBB = CI.I->getParent();
1150
1151 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1152 // sure we preserve the subregister index and any register flags set on them.
1153 const MachineOperand *AddrReg =
1154 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1155 const MachineOperand *Data0 =
1156 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1157 const MachineOperand *Data1 =
1158 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1159
1160 unsigned NewOffset0 = CI.Offset;
1161 unsigned NewOffset1 = Paired.Offset;
1162 unsigned Opc =
1163 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1164
1165 if (NewOffset0 > NewOffset1) {
1166 // Canonicalize the merged instruction so the smaller offset comes first.
1167 std::swap(NewOffset0, NewOffset1);
1168 std::swap(Data0, Data1);
1169 }
1170
1171 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1172 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1173
1174 const MCInstrDesc &Write2Desc = TII->get(Opc);
1175 DebugLoc DL = CI.I->getDebugLoc();
1176
1177 Register BaseReg = AddrReg->getReg();
1178 unsigned BaseSubReg = AddrReg->getSubReg();
1179 unsigned BaseRegFlags = 0;
1180 if (CI.BaseOff) {
1181 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1182 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1183 .addImm(CI.BaseOff);
1184
1185 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1186 BaseRegFlags = RegState::Kill;
1187
1188 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1189 .addReg(ImmReg)
1190 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1191 .addImm(0); // clamp bit
1192 BaseSubReg = 0;
1193 }
1194
1195 MachineInstrBuilder Write2 =
1196 BuildMI(*MBB, Paired.I, DL, Write2Desc)
1197 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1198 .add(*Data0) // data0
1199 .add(*Data1) // data1
1200 .addImm(NewOffset0) // offset0
1201 .addImm(NewOffset1) // offset1
1202 .addImm(0) // gds
1203 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1204
1205 moveInstsAfter(Write2, InstsToMove);
1206
1207 CI.I->eraseFromParent();
1208 Paired.I->eraseFromParent();
1209
1210 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1211 return Write2;
1212}
1213
1214MachineBasicBlock::iterator
1215SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1216 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1217 MachineBasicBlock *MBB = CI.I->getParent();
1218 DebugLoc DL = CI.I->getDebugLoc();
1219 const unsigned Opcode = getNewOpcode(CI, Paired);
1220
1221 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1222
1223 Register DestReg = MRI->createVirtualRegister(SuperRC);
1224 unsigned MergedDMask = CI.DMask | Paired.DMask;
1225 unsigned DMaskIdx =
1226 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1227
1228 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1229 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1230 if (I == DMaskIdx)
1231 MIB.addImm(MergedDMask);
1232 else
1233 MIB.add((*CI.I).getOperand(I));
1234 }
1235
1236 // It shouldn't be possible to get this far if the two instructions
1237 // don't have a single memoperand, because MachineInstr::mayAlias()
1238 // will return true if this is the case.
1239 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1240
1241 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1242 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1243
1244 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1245
1246 unsigned SubRegIdx0, SubRegIdx1;
1247 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1248
1249 // Copy to the old destination registers.
1250 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1251 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1252 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1253
1254 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1255 .add(*Dest0) // Copy to same destination including flags and sub reg.
1256 .addReg(DestReg, 0, SubRegIdx0);
1257 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1258 .add(*Dest1)
1259 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1260
1261 moveInstsAfter(Copy1, InstsToMove);
1262
1263 CI.I->eraseFromParent();
1264 Paired.I->eraseFromParent();
1265 return New;
1266}
1267
1268MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1269 CombineInfo &CI, CombineInfo &Paired,
1270 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1271 MachineBasicBlock *MBB = CI.I->getParent();
1272 DebugLoc DL = CI.I->getDebugLoc();
1273 const unsigned Opcode = getNewOpcode(CI, Paired);
1274
1275 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1276
1277 Register DestReg = MRI->createVirtualRegister(SuperRC);
1278 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1279
1280 // It shouldn't be possible to get this far if the two instructions
1281 // don't have a single memoperand, because MachineInstr::mayAlias()
1282 // will return true if this is the case.
1283 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1284
1285 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1286 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1287
1288 MachineInstr *New =
1289 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
1290 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1291 .addImm(MergedOffset) // offset
1292 .addImm(CI.CPol) // cpol
1293 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1294
1295 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1296 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1297 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1298
1299 // Copy to the old destination registers.
1300 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1301 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1302 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1303
1304 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1305 .add(*Dest0) // Copy to same destination including flags and sub reg.
1306 .addReg(DestReg, 0, SubRegIdx0);
1307 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1308 .add(*Dest1)
1309 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1310
1311 moveInstsAfter(Copy1, InstsToMove);
1312
1313 CI.I->eraseFromParent();
1314 Paired.I->eraseFromParent();
1315 return New;
1316}
1317
1318MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1319 CombineInfo &CI, CombineInfo &Paired,
1320 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1321 MachineBasicBlock *MBB = CI.I->getParent();
1322 DebugLoc DL = CI.I->getDebugLoc();
1323
1324 const unsigned Opcode = getNewOpcode(CI, Paired);
1325
1326 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1327
1328 // Copy to the new source register.
1329 Register DestReg = MRI->createVirtualRegister(SuperRC);
1330 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1331
1332 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1333
1334 AddressRegs Regs = getRegs(Opcode, *TII);
1335
1336 if (Regs.VAddr)
1337 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1338
1339 // It shouldn't be possible to get this far if the two instructions
1340 // don't have a single memoperand, because MachineInstr::mayAlias()
1341 // will return true if this is the case.
1342 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1343
1344 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1345 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1346
1347 MachineInstr *New =
1348 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1349 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1350 .addImm(MergedOffset) // offset
1351 .addImm(CI.CPol) // cpol
1352 .addImm(0) // tfe
1353 .addImm(0) // swz
1354 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1355
1356 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1357 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1358 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1359
1360 // Copy to the old destination registers.
1361 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1362 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1363 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1364
1365 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1366 .add(*Dest0) // Copy to same destination including flags and sub reg.
1367 .addReg(DestReg, 0, SubRegIdx0);
1368 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1369 .add(*Dest1)
1370 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1371
1372 moveInstsAfter(Copy1, InstsToMove);
1373
1374 CI.I->eraseFromParent();
1375 Paired.I->eraseFromParent();
1376 return New;
1377}
1378
1379MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1380 CombineInfo &CI, CombineInfo &Paired,
1381 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1382 MachineBasicBlock *MBB = CI.I->getParent();
1383 DebugLoc DL = CI.I->getDebugLoc();
1384
1385 const unsigned Opcode = getNewOpcode(CI, Paired);
1386
1387 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1388
1389 // Copy to the new source register.
1390 Register DestReg = MRI->createVirtualRegister(SuperRC);
1391 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1392
1393 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1394
1395 AddressRegs Regs = getRegs(Opcode, *TII);
1396
1397 if (Regs.VAddr)
1398 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1399
1400 unsigned JoinedFormat =
1401 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1402
1403 // It shouldn't be possible to get this far if the two instructions
1404 // don't have a single memoperand, because MachineInstr::mayAlias()
1405 // will return true if this is the case.
1406 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1407
1408 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1409 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1410
1411 MachineInstr *New =
1412 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1413 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1414 .addImm(MergedOffset) // offset
1415 .addImm(JoinedFormat) // format
1416 .addImm(CI.CPol) // cpol
1417 .addImm(0) // tfe
1418 .addImm(0) // swz
1419 .addMemOperand(
1420 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1421
1422 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1423 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1424 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1425
1426 // Copy to the old destination registers.
1427 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1428 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1429 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1430
1431 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1432 .add(*Dest0) // Copy to same destination including flags and sub reg.
1433 .addReg(DestReg, 0, SubRegIdx0);
1434 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1435 .add(*Dest1)
1436 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1437
1438 moveInstsAfter(Copy1, InstsToMove);
1439
1440 CI.I->eraseFromParent();
1441 Paired.I->eraseFromParent();
1442 return New;
1443}
1444
1445MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1446 CombineInfo &CI, CombineInfo &Paired,
1447 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1448 MachineBasicBlock *MBB = CI.I->getParent();
1449 DebugLoc DL = CI.I->getDebugLoc();
1450
1451 const unsigned Opcode = getNewOpcode(CI, Paired);
1452
1453 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1454 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1455 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1456
1457 // Copy to the new source register.
1458 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1459 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1460
1461 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1462 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1463
1464 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1465 .add(*Src0)
1466 .addImm(SubRegIdx0)
1467 .add(*Src1)
1468 .addImm(SubRegIdx1);
1469
1470 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1471 .addReg(SrcReg, RegState::Kill);
1472
1473 AddressRegs Regs = getRegs(Opcode, *TII);
1474
1475 if (Regs.VAddr)
1476 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1477
1478 unsigned JoinedFormat =
1479 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1480
1481 // It shouldn't be possible to get this far if the two instructions
1482 // don't have a single memoperand, because MachineInstr::mayAlias()
1483 // will return true if this is the case.
1484 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand())(static_cast<void> (0));
1485
1486 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1487 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1488
1489 MachineInstr *New =
1490 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1491 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1492 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1493 .addImm(JoinedFormat) // format
1494 .addImm(CI.CPol) // cpol
1495 .addImm(0) // tfe
1496 .addImm(0) // swz
1497 .addMemOperand(
1498 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1499
1500 moveInstsAfter(MIB, InstsToMove);
1501
1502 CI.I->eraseFromParent();
1503 Paired.I->eraseFromParent();
1504 return New;
1505}
1506
1507unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1508 const CombineInfo &Paired) {
1509 const unsigned Width = CI.Width + Paired.Width;
1510
1511 switch (CI.InstClass) {
1512 default:
1513 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE)(static_cast<void> (0));
1514 // FIXME: Handle d16 correctly
1515 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1516 Width);
1517 case TBUFFER_LOAD:
1518 case TBUFFER_STORE:
1519 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1520 Width);
1521
1522 case UNKNOWN:
1523 llvm_unreachable("Unknown instruction class")__builtin_unreachable();
1524 case S_BUFFER_LOAD_IMM:
1525 switch (Width) {
1526 default:
1527 return 0;
1528 case 2:
1529 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1530 case 4:
1531 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1532 case 8:
1533 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1534 }
1535 case MIMG:
1536 assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1537 "No overlaps");
1538 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1539 }
1540}
1541
1542std::pair<unsigned, unsigned>
1543SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1544 const CombineInfo &Paired) {
1545
1546 assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero")(static_cast<void> (0));
1547
1548 bool ReverseOrder;
1549 if (CI.InstClass == MIMG) {
1550 assert(
1551 (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1552 "No overlaps");
1553 ReverseOrder = CI.DMask > Paired.DMask;
1554 } else
1555 ReverseOrder = CI.Offset > Paired.Offset;
1556
1557 unsigned Idx0;
1558 unsigned Idx1;
1559
1560 if (CI.Width + Paired.Width > 4) {
1561 assert(CI.Width == 4 && Paired.Width == 4)(static_cast<void> (0));
1562
1563 if (ReverseOrder) {
1564 Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
1565 Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
1566 } else {
1567 Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
1568 Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
1569 }
1570 } else {
1571 static const unsigned Idxs[4][4] = {
1572 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1573 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1574 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1575 {AMDGPU::sub3, 0, 0, 0},
1576 };
1577
1578 assert(CI.Width >= 1 && CI.Width <= 3)(static_cast<void> (0));
1579 assert(Paired.Width >= 1 && Paired.Width <= 3)(static_cast<void> (0));
1580
1581 if (ReverseOrder) {
1582 Idx1 = Idxs[0][Paired.Width - 1];
1583 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1584 } else {
1585 Idx0 = Idxs[0][CI.Width - 1];
1586 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1587 }
1588 }
1589
1590 return std::make_pair(Idx0, Idx1);
1591}
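A standalone sketch (not part of the pass) of how the Idxs table above combines two widths, with strings standing in for the AMDGPU subregister enums; pickSubRegs is an illustrative name:

#include <cassert>
#include <string>
#include <utility>

// Mirrors the non-reversed lookup above: the first operand starts at sub0,
// the second starts right after the first operand's last dword.
static std::pair<std::string, std::string> pickSubRegs(unsigned W0, unsigned W1) {
  static const char *const Idxs[4][4] = {
      {"sub0", "sub0_sub1", "sub0_sub1_sub2", "sub0_sub1_sub2_sub3"},
      {"sub1", "sub1_sub2", "sub1_sub2_sub3", ""},
      {"sub2", "sub2_sub3", "", ""},
      {"sub3", "", "", ""},
  };
  assert(W0 >= 1 && W0 <= 3 && W1 >= 1 && W1 <= 3);
  return {Idxs[0][W0 - 1], Idxs[W0][W1 - 1]};
}

int main() {
  // Merging a 2-dword access with a 2-dword access: Idxs[0][1] and Idxs[2][1].
  auto P = pickSubRegs(2, 2);
  assert(P.first == "sub0_sub1" && P.second == "sub2_sub3");
  return 0;
}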
1592
1593const TargetRegisterClass *
1594SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1595 const CombineInfo &Paired) {
1596 if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1597 switch (CI.Width + Paired.Width) {
1598 default:
1599 return nullptr;
1600 case 2:
1601 return &AMDGPU::SReg_64_XEXECRegClass;
1602 case 4:
1603 return &AMDGPU::SGPR_128RegClass;
1604 case 8:
1605 return &AMDGPU::SGPR_256RegClass;
1606 case 16:
1607 return &AMDGPU::SGPR_512RegClass;
1608 }
1609 }
1610
1611 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1612 return TRI->hasAGPRs(getDataRegClass(*CI.I))
1613 ? TRI->getAGPRClassForBitWidth(BitWidth)
1614 : TRI->getVGPRClassForBitWidth(BitWidth);
1615}
1616
1617MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1618 CombineInfo &CI, CombineInfo &Paired,
1619 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1620 MachineBasicBlock *MBB = CI.I->getParent();
1621 DebugLoc DL = CI.I->getDebugLoc();
1622
1623 const unsigned Opcode = getNewOpcode(CI, Paired);
1624
1625 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1626 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1627 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1628
1629 // Copy to the new source register.
1630 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1631 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1632
1633 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1634 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1635
1636 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1637 .add(*Src0)
1638 .addImm(SubRegIdx0)
1639 .add(*Src1)
1640 .addImm(SubRegIdx1);
1641
1642 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1643 .addReg(SrcReg, RegState::Kill);
1644
1645 AddressRegs Regs = getRegs(Opcode, *TII);
1646
1647 if (Regs.VAddr)
1648 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1649
1650
1651 // It shouldn't be possible to get this far if the two instructions
1652 // don't have a single memoperand, because MachineInstr::mayAlias()
1653 // will return true if this is the case.
1654 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand())(static_cast<void> (0));
1655
1656 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1657 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1658
1659 MachineInstr *New =
1660 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1661 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1662 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1663 .addImm(CI.CPol) // cpol
1664 .addImm(0) // tfe
1665 .addImm(0) // swz
1666 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1667
1668 moveInstsAfter(MIB, InstsToMove);
1669
1670 CI.I->eraseFromParent();
1671 Paired.I->eraseFromParent();
1672 return New;
1673}
1674
1675MachineOperand
1676SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1677 APInt V(32, Val, true);
1678 if (TII->isInlineConstant(V))
1679 return MachineOperand::CreateImm(Val);
1680
1681 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1682 MachineInstr *Mov =
1683 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1684 TII->get(AMDGPU::S_MOV_B32), Reg)
1685 .addImm(Val);
1686 (void)Mov;
1687 LLVM_DEBUG(dbgs() << " "; Mov->dump())do { } while (false);
1688 return MachineOperand::CreateReg(Reg, false);
1689}
1690
1691// Compute base address using Addr and return the final register.
1692Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1693 const MemAddress &Addr) const {
1694 MachineBasicBlock *MBB = MI.getParent();
1695 MachineBasicBlock::iterator MBBI = MI.getIterator();
1696 DebugLoc DL = MI.getDebugLoc();
1697
1698 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1699 Addr.Base.LoSubReg) &&
1700 "Expected 32-bit Base-Register-Low!!");
1701
1702 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1703 Addr.Base.HiSubReg) &&
1704 "Expected 32-bit Base-Register-Hi!!");
1705
1706 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n")do { } while (false);
1707 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1708 MachineOperand OffsetHi =
1709 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1710
1711 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1712 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1713 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1714
1715 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1716 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1717 MachineInstr *LoHalf =
1718 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1719 .addReg(CarryReg, RegState::Define)
1720 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1721 .add(OffsetLo)
1722 .addImm(0); // clamp bit
1723 (void)LoHalf;
1724 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();)do { } while (false);
1725
1726 MachineInstr *HiHalf =
1727 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1728 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1729 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1730 .add(OffsetHi)
1731 .addReg(CarryReg, RegState::Kill)
1732 .addImm(0); // clamp bit
1733 (void)HiHalf;
1734 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();)do { } while (false);
1735
1736 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1737 MachineInstr *FullBase =
1738 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1739 .addReg(DestSub0)
1740 .addImm(AMDGPU::sub0)
1741 .addReg(DestSub1)
1742 .addImm(AMDGPU::sub1);
1743 (void)FullBase;
1744 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";)do { } while (false);
1745
1746 return FullDestReg;
1747}
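A minimal sketch of the lo/hi add-with-carry that computeBase() emits as V_ADD_CO_U32_e64 / V_ADDC_U32_e64, modeled with plain integers instead of MIR; addBaseOffset is an illustrative name:

#include <cassert>
#include <cstdint>

static uint64_t addBaseOffset(uint32_t BaseLo, uint32_t BaseHi, uint64_t Offset) {
  uint32_t OffLo = static_cast<uint32_t>(Offset);
  uint32_t OffHi = static_cast<uint32_t>(Offset >> 32);
  uint32_t Lo = BaseLo + OffLo;            // V_ADD_CO_U32: low 32-bit add
  uint32_t Carry = Lo < BaseLo ? 1u : 0u;  // carry-out, i.e. CarryReg
  uint32_t Hi = BaseHi + OffHi + Carry;    // V_ADDC_U32: high add consumes it
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // REG_SEQUENCE of sub0/sub1
}

int main() {
  // 0x1FFFFFFFF + 1 carries into the high half.
  assert(addBaseOffset(0xFFFFFFFFu, 0x1u, 1) == 0x200000000ull);
  return 0;
}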
1748
1749// Update base and offset with the NewBase and NewOffset in MI.
1750void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1751 Register NewBase,
1752 int32_t NewOffset) const {
1753 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1754 Base->setReg(NewBase);
1755 Base->setIsKill(false);
1756 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1757}
1758
1759Optional<int32_t>
1760SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1761 if (Op.isImm())
1762 return Op.getImm();
1763
1764 if (!Op.isReg())
1765 return None;
1766
1767 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1768 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1769 !Def->getOperand(1).isImm())
1770 return None;
1771
1772 return Def->getOperand(1).getImm();
1773}
1774
1775// Analyze Base and extract:
1776// - 32-bit base registers and subregisters
1777// - a 64-bit constant offset
1778// Expecting base computation as:
1779// %OFFSET0:sgpr_32 = S_MOV_B32 8000
1780// %LO:vgpr_32, %c:sreg_64_xexec =
1781// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1782// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1783// %Base:vreg_64 =
1784// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1785void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1786 MemAddress &Addr) const {
1787 if (!Base.isReg())
1788 return;
1789
1790 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1791 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1792 || Def->getNumOperands() != 5)
1793 return;
1794
1795 MachineOperand BaseLo = Def->getOperand(1);
1796 MachineOperand BaseHi = Def->getOperand(3);
1797 if (!BaseLo.isReg() || !BaseHi.isReg())
1798 return;
1799
1800 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1801 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1802
1803 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1804 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1805 return;
1806
1807 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1808 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1809
1810 auto Offset0P = extractConstOffset(*Src0);
1811 if (Offset0P)
1812 BaseLo = *Src1;
1813 else {
1814 if (!(Offset0P = extractConstOffset(*Src1)))
1815 return;
1816 BaseLo = *Src0;
1817 }
1818
1819 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1820 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1821
1822 if (Src0->isImm())
1823 std::swap(Src0, Src1);
1824
1825 if (!Src1->isImm())
1826 return;
1827
1828 uint64_t Offset1 = Src1->getImm();
1829 BaseHi = *Src0;
1830
1831 Addr.Base.LoReg = BaseLo.getReg();
1832 Addr.Base.HiReg = BaseHi.getReg();
1833 Addr.Base.LoSubReg = BaseLo.getSubReg();
1834 Addr.Base.HiSubReg = BaseHi.getSubReg();
1835 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1836}
1837
1838bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1839 MachineInstr &MI,
1840 MemInfoMap &Visited,
1841 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1842
1843 if (!(MI.mayLoad() ^ MI.mayStore()))
1844 return false;
1845
1846 // TODO: Support flat and scratch.
1847 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1848 return false;
1849
1850 if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1851 return false;
1852
1853 if (AnchorList.count(&MI))
1854 return false;
1855
1856 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump())do { } while (false);
1857
1858 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1859 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";)do { } while (false);
1860 return false;
1861 }
1862
1863 // Step1: Find the base-registers and a 64bit constant offset.
1864 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1865 MemAddress MAddr;
1866 if (Visited.find(&MI) == Visited.end()) {
1867 processBaseWithConstOffset(Base, MAddr);
1868 Visited[&MI] = MAddr;
1869 } else
1870 MAddr = Visited[&MI];
1871
1872 if (MAddr.Offset == 0) {
1873 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1874 " constant offsets that can be promoted.\n";);
1875 return false;
1876 }
1877
1878 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1879 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1880
1881 // Step2: Traverse through MI's basic block and find an anchor (one that has
1882 // the same base registers) with the highest 13-bit distance from MI's offset.
1883 // E.g. (64bit loads)
1884 // bb:
1885 // addr1 = &a + 4096; load1 = load(addr1, 0)
1886 // addr2 = &a + 6144; load2 = load(addr2, 0)
1887 // addr3 = &a + 8192; load3 = load(addr3, 0)
1888 // addr4 = &a + 10240; load4 = load(addr4, 0)
1889 // addr5 = &a + 12288; load5 = load(addr5, 0)
1890 //
1891 // Starting from the first load, the optimization will try to find a new base
1892 // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1893 // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192
1894 // as the new base (anchor) because the maximum distance can presumably
1895 // accommodate more intermediate bases.
1896 //
1897 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1898 // (&a + 8192) for load1, load2, load4.
1899 // addr = &a + 8192
1900 // load1 = load(addr, -4096)
1901 // load2 = load(addr, -2048)
1902 // load3 = load(addr, 0)
1903 // load4 = load(addr, 2048)
1904 // addr5 = &a + 12288; load5 = load(addr5, 0)
1905 //
1906 MachineInstr *AnchorInst = nullptr;
1907 MemAddress AnchorAddr;
1908 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1909 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1910
1911 MachineBasicBlock *MBB = MI.getParent();
1912 MachineBasicBlock::iterator E = MBB->end();
1913 MachineBasicBlock::iterator MBBI = MI.getIterator();
1914 ++MBBI;
1915 const SITargetLowering *TLI =
1916 static_cast<const SITargetLowering *>(STM->getTargetLowering());
1917
1918 for ( ; MBBI != E; ++MBBI) {
1919 MachineInstr &MINext = *MBBI;
1920 // TODO: Support finding an anchor (with the same base) from store addresses
1921 // or any other load addresses where the opcodes are different.
1922 if (MINext.getOpcode() != MI.getOpcode() ||
1923 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1924 continue;
1925
1926 const MachineOperand &BaseNext =
1927 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1928 MemAddress MAddrNext;
1929 if (Visited.find(&MINext) == Visited.end()) {
1930 processBaseWithConstOffset(BaseNext, MAddrNext);
1931 Visited[&MINext] = MAddrNext;
1932 } else
1933 MAddrNext = Visited[&MINext];
1934
1935 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1936 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1937 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1938 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1939 continue;
1940
1941 InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1942
1943 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1944 TargetLoweringBase::AddrMode AM;
1945 AM.HasBaseReg = true;
1946 AM.BaseOffs = Dist;
1947 if (TLI->isLegalGlobalAddressingMode(AM) &&
1948 (uint32_t)std::abs(Dist) > MaxDist) {
1949 MaxDist = std::abs(Dist);
1950
1951 AnchorAddr = MAddrNext;
1952 AnchorInst = &MINext;
1953 }
1954 }
1955
1956 if (AnchorInst) {
1957 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1958 AnchorInst->dump());
1959 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1960 << AnchorAddr.Offset << "\n\n");
1961
1962 // Instead of moving up, just re-compute anchor-instruction's base address.
1963 Register Base = computeBase(MI, AnchorAddr);
1964
1965 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1966 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();)do { } while (false);
1967
1968 for (auto P : InstsWCommonBase) {
1969 TargetLoweringBase::AddrMode AM;
1970 AM.HasBaseReg = true;
1971 AM.BaseOffs = P.second - AnchorAddr.Offset;
1972
1973 if (TLI->isLegalGlobalAddressingMode(AM)) {
1974 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1975 dbgs() << ")"; P.first->dump());
1976 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1977 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump())do { } while (false);
1978 }
1979 }
1980 AnchorList.insert(AnchorInst);
1981 return true;
1982 }
1983
1984 return false;
1985}
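A rough sketch of the Step2 anchor selection above, run on the offsets from the comment's example; the 13-bit signed immediate range is an assumed stand-in for TLI->isLegalGlobalAddressingMode(), and pickAnchor/fitsImm are illustrative names:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <vector>

static bool fitsImm(int64_t Off) { return Off >= -4096 && Off <= 4095; }

static int64_t pickAnchor(int64_t MIOffset, const std::vector<int64_t> &Others) {
  int64_t Anchor = MIOffset;
  uint64_t MaxDist = 0;
  for (int64_t Off : Others) {
    int64_t Dist = MIOffset - Off;
    // Keep the candidate with the largest distance that still fits in the
    // immediate field, mirroring the MaxDist update in the loop above.
    if (fitsImm(Dist) && static_cast<uint64_t>(std::llabs(Dist)) > MaxDist) {
      MaxDist = static_cast<uint64_t>(std::llabs(Dist));
      Anchor = Off;
    }
  }
  return Anchor;
}

int main() {
  // MI at &a + 4096, candidates at &a + {6144, 8192, 10240, 12288}:
  // &a + 8192 wins with the largest still-encodable distance of 4096.
  assert(pickAnchor(4096, {6144, 8192, 10240, 12288}) == 8192);
  return 0;
}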
1986
1987void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1988 std::list<std::list<CombineInfo> > &MergeableInsts) const {
1989 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1990 if (AddrList.front().InstClass == CI.InstClass &&
1991 AddrList.front().hasSameBaseAddress(*CI.I)) {
1992 AddrList.emplace_back(CI);
1993 return;
1994 }
1995 }
1996
1997 // Base address not found, so add a new list.
1998 MergeableInsts.emplace_back(1, CI);
1999}
2000
2001std::pair<MachineBasicBlock::iterator, bool>
2002SILoadStoreOptimizer::collectMergeableInsts(
2003 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2004 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2005 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2006 bool Modified = false;
2007
2008 // Sort potential mergeable instructions into lists. One list per base address.
2009 unsigned Order = 0;
2010 MachineBasicBlock::iterator BlockI = Begin;
2011 for (; BlockI != End; ++BlockI) {
2012 MachineInstr &MI = *BlockI;
2013
2014 // We run this before checking if an address is mergeable, because it can produce
2015 // better code even if the instructions aren't mergeable.
2016 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2017 Modified = true;
2018
2019 // Don't combine if volatile. We also won't be able to merge across this, so
2020 // break the search. We can look after this barrier for separate merges.
2021 if (MI.hasOrderedMemoryRef()) {
2022 LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI)do { } while (false);
2023
2024 // Search will resume after this instruction in a separate merge list.
2025 ++BlockI;
2026 break;
2027 }
2028
2029 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2030 if (InstClass == UNKNOWN)
2031 continue;
2032
2033 CombineInfo CI;
2034 CI.setMI(MI, *TII, *STM);
2035 CI.Order = Order++;
2036
2037 if (!CI.hasMergeableAddress(*MRI))
2038 continue;
2039
2040 LLVM_DEBUG(dbgs() << "Mergeable: " << MI)do { } while (false);
2041
2042 addInstToMergeableList(CI, MergeableInsts);
2043 }
2044
2045 // At this point we have lists of Mergeable instructions.
2046 //
2047 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2048 // list try to find an instruction that can be merged with I. If an instruction
2049 // is found, it is stored in the Paired field. If no instructions are found, then
2050 // the CombineInfo object is deleted from the list.
2051
2052 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2053 E = MergeableInsts.end(); I != E;) {
2054
2055 std::list<CombineInfo> &MergeList = *I;
2056 if (MergeList.size() <= 1) {
2057 // This means we have found only one instruction with a given address
2058 // that can be merged, and we need at least 2 instructions to do a merge,
2059 // so this list can be discarded.
2060 I = MergeableInsts.erase(I);
2061 continue;
2062 }
2063
2064 // Sort the lists by offsets, this way mergeable instructions will be
2065 // adjacent to each other in the list, which will make it easier to find
2066 // matches.
2067 MergeList.sort(
2068 [](const CombineInfo &A, const CombineInfo &B) {
2069 return A.Offset < B.Offset;
2070 });
2071 ++I;
2072 }
2073
2074 return std::make_pair(BlockI, Modified);
2075}
2076
2077// Scan through looking for adjacent LDS operations with constant offsets from
2078// the same base register. We rely on the scheduler to do the hard work of
2079// clustering nearby loads, and assume these are all adjacent.
2080bool SILoadStoreOptimizer::optimizeBlock(
2081 std::list<std::list<CombineInfo> > &MergeableInsts) {
2082 bool Modified = false;
2083
2084 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2085 E = MergeableInsts.end(); I != E;) {
2086 std::list<CombineInfo> &MergeList = *I;
2087
2088 bool OptimizeListAgain = false;
2089 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2090 // We weren't able to make any changes, so delete the list so we don't
2091 // process the same instructions the next time we try to optimize this
2092 // block.
2093 I = MergeableInsts.erase(I);
2094 continue;
2095 }
2096
2097 Modified = true;
2098
2099 // We made changes, but also determined that there were no more optimization
2100 // opportunities, so we don't need to reprocess the list.
2101 if (!OptimizeListAgain) {
2102 I = MergeableInsts.erase(I);
2103 continue;
2104 }
2105 OptimizeAgain = true;
2106 }
2107 return Modified;
2108}
2109
2110bool
2111SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2112 std::list<CombineInfo> &MergeList,
2113 bool &OptimizeListAgain) {
2114 if (MergeList.empty())
2115 return false;
2116
2117 bool Modified = false;
2118
2119 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2120 Next = std::next(I)) {
2121
2122 auto First = I;
2123 auto Second = Next;
2124
2125 if ((*First).Order > (*Second).Order)
2126 std::swap(First, Second);
2127 CombineInfo &CI = *First;
2128 CombineInfo &Paired = *Second;
2129
2130 SmallVector<MachineInstr *, 8> InstsToMove;
2131 if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
2132 ++I;
2133 continue;
2134 }
2135
2136 Modified = true;
2137
2138 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I)do { } while (false);
2139
2140 switch (CI.InstClass) {
2141 default:
2142 llvm_unreachable("unknown InstClass")__builtin_unreachable();
2143 break;
2144 case DS_READ: {
2145 MachineBasicBlock::iterator NewMI =
2146 mergeRead2Pair(CI, Paired, InstsToMove);
2147 CI.setMI(NewMI, *TII, *STM);
2148 break;
2149 }
2150 case DS_WRITE: {
2151 MachineBasicBlock::iterator NewMI =
2152 mergeWrite2Pair(CI, Paired, InstsToMove);
2153 CI.setMI(NewMI, *TII, *STM);
2154 break;
2155 }
2156 case S_BUFFER_LOAD_IMM: {
2157 MachineBasicBlock::iterator NewMI =
2158 mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
2159 CI.setMI(NewMI, *TII, *STM);
2160 OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
2161 break;
2162 }
2163 case BUFFER_LOAD: {
2164 MachineBasicBlock::iterator NewMI =
2165 mergeBufferLoadPair(CI, Paired, InstsToMove);
2166 CI.setMI(NewMI, *TII, *STM);
2167 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2168 break;
2169 }
2170 case BUFFER_STORE: {
2171 MachineBasicBlock::iterator NewMI =
2172 mergeBufferStorePair(CI, Paired, InstsToMove);
2173 CI.setMI(NewMI, *TII, *STM);
2174 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2175 break;
2176 }
2177 case MIMG: {
2178 MachineBasicBlock::iterator NewMI =
2179 mergeImagePair(CI, Paired, InstsToMove);
2180 CI.setMI(NewMI, *TII, *STM);
2181 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2182 break;
2183 }
2184 case TBUFFER_LOAD: {
2185 MachineBasicBlock::iterator NewMI =
2186 mergeTBufferLoadPair(CI, Paired, InstsToMove);
2187 CI.setMI(NewMI, *TII, *STM);
2188 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2189 break;
2190 }
2191 case TBUFFER_STORE: {
2192 MachineBasicBlock::iterator NewMI =
2193 mergeTBufferStorePair(CI, Paired, InstsToMove);
2194 CI.setMI(NewMI, *TII, *STM);
2195 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2196 break;
2197 }
2198 }
2199 CI.Order = Paired.Order;
2200 if (I == Second)
2201 I = Next;
2202
2203 MergeList.erase(Second);
2204 }
2205
2206 return Modified;
2207}
2208
2209bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2210 if (skipFunction(MF.getFunction()))
2211 return false;
2212
2213 STM = &MF.getSubtarget<GCNSubtarget>();
2214 if (!STM->loadStoreOptEnabled())
2215 return false;
2216
2217 TII = STM->getInstrInfo();
2218 TRI = &TII->getRegisterInfo();
2219
2220 MRI = &MF.getRegInfo();
2221 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2222
2223 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n")do { } while (false);
2224
2225 bool Modified = false;
2226
2227 // Contains the list of instructions for which constant offsets are being
2228 // promoted to the IMM. This is tracked for an entire block at a time.
2229 SmallPtrSet<MachineInstr *, 4> AnchorList;
2230 MemInfoMap Visited;
2231
2232 for (MachineBasicBlock &MBB : MF) {
2233 MachineBasicBlock::iterator SectionEnd;
2234 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2235 I = SectionEnd) {
2236 bool CollectModified;
2237 std::list<std::list<CombineInfo>> MergeableInsts;
2238
2239 // First pass: Collect list of all instructions we know how to merge in a
2240 // subset of the block.
2241 std::tie(SectionEnd, CollectModified) =
2242 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2243
2244 Modified |= CollectModified;
2245
2246 do {
2247 OptimizeAgain = false;
2248 Modified |= optimizeBlock(MergeableInsts);
2249 } while (OptimizeAgain);
2250 }
2251
2252 Visited.clear();
2253 AnchorList.clear();
2254 }
2255
2256 return Modified;
2257}

/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
117 return 32;
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index")(static_cast<void> (0));
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));

2.1: 'N' is not equal to 0
3: '?' condition is false
4: The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);

2: Calling 'maskTrailingOnes<unsigned int>'
259}
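Reading the path above with T = unsigned int (Bits == 32): the shift count of 33 means Bits - N evaluated to 33, i.e. maskTrailingOnes received N == 0xFFFFFFFF, which is what CHAR_BIT * sizeof(T) - N produces when maskLeadingOnes is handed an N of 33 (or any value equal to 33 modulo 2^32). The assert(N <= Bits && ...) on line 251 expands to a no-op in this NDEBUG build, so nothing rejects the out-of-range count. A caller-side guard along these lines (an assumption for illustration, not the upstream fix) avoids the undefined shift:

#include <climits>
#include <cstdint>
#include <type_traits>

// Clamped variant: never forwards a wrapped count into the shift.
template <typename T> T maskLeadingOnesChecked(unsigned N) {
  static_assert(std::is_unsigned<T>::value, "Invalid type!");
  const unsigned Bits = CHAR_BIT * sizeof(T);
  if (N >= Bits)      // covers N == Bits and out-of-range callers like N == 33
    return T(-1);
  if (N == 0)
    return T(0);
  return static_cast<T>(T(-1) << (Bits - N));  // shift is now in [1, Bits - 1]
}

int main() {
  return maskLeadingOnesChecked<uint32_t>(33) == 0xFFFFFFFFu ? 0 : 1;
}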
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
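A quick standalone check (not from this header) that the XOR used above matches plain subtraction: for a non-zero 32-bit value the leading-zero count is at most 31, and any value in [0, 31] XORed with 31 equals 31 minus that value:

#include <cassert>

int main() {
  for (unsigned LZ = 0; LZ <= 31; ++LZ)
    assert((LZ ^ 31u) == 31u - LZ);  // both operands fit in five bits
  return 0;
}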
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N-bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
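A small check of the branch-free form above: shifting UINT64_MAX right by (64 - N) is defined for every N in [1, 64], including N == 64 where the naive (1 << N) - 1 would shift by the full width:

#include <cassert>
#include <cstdint>

int main() {
  assert((UINT64_MAX >> (64 - 8)) == 0xFFu);        // maxUIntN(8)
  assert((UINT64_MAX >> (64 - 64)) == UINT64_MAX);  // maxUIntN(64), shift by 0
  return 0;
}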
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range")(static_cast<void> (0));
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
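A worked example of the isolate-lowest-set-bit trick above: the lowest set bit of (A | B) is the largest power of two dividing both inputs:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 8, B = 12;                   // 0b1000 and 0b1100
  uint64_t Min = (A | B) & (1 + ~(A | B));  // lowest set bit of 0b1100
  assert(Min == 4);                         // MinAlign(8, 12) == 4
  return 0;
}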
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
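A worked example of the smear-then-increment trick above, traced for an input of 5:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 5;                            // 0b101
  A |= A >> 1;  A |= A >> 2;  A |= A >> 4;   // copy the top bit downward
  A |= A >> 8;  A |= A >> 16; A |= A >> 32;  // A is now 0b111 == 7
  assert(A + 1 == 8);                        // NextPowerOf2(5) == 8
  return 0;
}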
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.")(static_cast<void> (0));
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t less than or equal to \p Value that is
752/// \p Skew mod \p Align. \p Align must be non-zero.
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
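
// A minimal sketch (not part of MathExtras.h; the example function and values
// are illustrative): alignDown mirrors alignTo but truncates instead of
// rounding up, returning the largest value <= Value that is Skew mod Align.
#include "llvm/Support/MathExtras.h"
#include <cassert>

void alignDownExample() {
  assert(llvm::alignDown(17, 8) == 16);
  assert(llvm::alignDown(16, 8) == 16);     // already aligned
  assert(llvm::alignDown(17, 8, 3) == 11);  // 11 = 8*1 + 3 is the largest
                                            // value <= 17 that is 3 mod 8
}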
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
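
// A minimal sketch (not part of MathExtras.h; the checks are illustrative) of
// the shift trick: the value is shifted left so that bit B-1 lands in the sign
// bit, then shifted back with an arithmetic (signed) right shift, which
// replicates that bit through the upper positions. The template is constexpr,
// so the checks run at compile time.
#include "llvm/Support/MathExtras.h"

static_assert(llvm::SignExtend32<4>(0xF) == -1, "bit 3 set, extends to -1");
static_assert(llvm::SignExtend32<4>(0x7) == 7, "bit 3 clear, value unchanged");
static_assert(llvm::SignExtend32<8>(0x80) == -128, "low byte 0x80 is -128");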
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
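
// A minimal sketch (not part of MathExtras.h; the example function and values
// are illustrative) of the run-time variants, for the case where the field
// width is only known at run time, e.g. from decoded instruction metadata.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void signExtendRuntimeExample() {
  unsigned Width = 12;  // width known only at run time
  assert(llvm::SignExtend32(0xFFFu, Width) == -1);             // all 12 bits set
  assert(llvm::SignExtend64(UINT64_C(0x800), Width) == -2048); // only bit 11 set
}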
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
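
// A minimal sketch (not part of MathExtras.h; the example function, type and
// values are illustrative): unsigned overflow wraps, so a wrapped sum is
// smaller than either operand, which is exactly the check used above.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void saturatingAddExample() {
  bool Overflowed = false;
  // 200 + 100 wraps to 44 in uint8_t, so the result is clamped to 255.
  assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255);
  assert(Overflowed);
  // 200 + 55 == 255 fits exactly and is not treated as an overflow.
  assert(llvm::SaturatingAdd<uint8_t>(200, 55, &Overflowed) == 255);
  assert(!Overflowed);
}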
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
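
// A minimal sketch (not part of MathExtras.h; the example function, type and
// values are illustrative) of the three paths above: the product clearly fits
// (Log2Z < Log2Max), clearly overflows (Log2Z > Log2Max), or sits on the
// boundary, where the halved multiply plus SaturatingAdd decides.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void saturatingMultiplyExample() {
  bool Overflowed = false;
  // 10 * 10: Log2Z = 3 + 3 < Log2Max = 7, so the plain multiply is used.
  assert(llvm::SaturatingMultiply<uint8_t>(10, 10, &Overflowed) == 100);
  assert(!Overflowed);
  // 16 * 16 = 256: Log2Z = 4 + 4 > 7, so it clamps to 255 without multiplying.
  assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255);
  assert(Overflowed);
  // 15 * 17 = 255: Log2Z == Log2Max == 7; the boundary path returns exactly 255.
  assert(llvm::SaturatingMultiply<uint8_t>(15, 17, &Overflowed) == 255);
  assert(!Overflowed);
}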
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
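
// A minimal sketch (not part of MathExtras.h; the example function and values
// are illustrative): with the builtin unavailable, the fallback adds in the
// unsigned domain (well defined), converts back, and reports overflow when the
// sign of the result contradicts the signs of the operands. Note the declared
// return type is T, but the value returned is the overflow flag converted to T.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <limits>

void addOverflowExample() {
  int Result = 0;
  // INT_MAX + 1 wraps to INT_MIN and is reported as an overflow.
  assert(llvm::AddOverflow(std::numeric_limits<int>::max(), 1, Result));
  assert(Result == std::numeric_limits<int>::min());
  // A sum that fits is stored as-is and reported as no overflow.
  assert(!llvm::AddOverflow(40, 2, Result));
  assert(Result == 42);
}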
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned subtraction.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
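
// A minimal sketch (not part of MathExtras.h; the example function and values
// are illustrative): MulOverflow multiplies the absolute values in the
// unsigned domain, then detects overflow with a division against the largest
// allowed magnitude, which is one larger for a negative result than for a
// positive one.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <limits>

void mulOverflowExample() {
  int32_t Result = 0;
  // 100000 * 100000 = 10^10 does not fit in 32 bits.
  assert(llvm::MulOverflow<int32_t>(100000, 100000, Result));
  // -65536 * 32768 = INT32_MIN fits: the negative range is one larger.
  assert(!llvm::MulOverflow<int32_t>(-65536, 32768, Result));
  assert(Result == std::numeric_limits<int32_t>::min());
  // 65536 * 32768 = 2^31 does not fit as a positive value.
  assert(llvm::MulOverflow<int32_t>(65536, 32768, Result));
}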
959
960} // End llvm namespace
961
962#endif