/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Bug Summary

File:	lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Warning:	line 1022, column 9: Value stored to 'Modified' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SIInsertWaitcnts.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-9/lib/clang/9.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-9~svn359426/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-9~svn359426/build-llvm/include -I /build/llvm-toolchain-snapshot-9~svn359426/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/include/clang/9.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-9/lib/clang/9.0.0/include 
-internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-9~svn359426/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-9~svn359426=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2019-05-01-032957-29988-1 -x c++ /build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp -faddrsig

1	//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Insert wait instructions for memory reads and writes.
11	///
12	/// Memory reads and writes are issued asynchronously, so we need to insert
13	/// S_WAITCNT instructions when we want to access any of their results or
14	/// overwrite any register that's used asynchronously.
15	///
16	/// TODO: This pass currently keeps one timeline per hardware counter. A more
17	/// finely-grained approach that keeps one timeline per event type could
18	/// sometimes get away with generating weaker s_waitcnt instructions. For
19	/// example, when both SMEM and LDS are in flight and we need to wait for
20	/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21	/// but the pass will currently generate a conservative lgkmcnt(0) because
22	/// multiple event types are in flight.
23	//
24	//===----------------------------------------------------------------------===//
25
26	#include "AMDGPU.h"
27	#include "AMDGPUSubtarget.h"
28	#include "SIDefines.h"
29	#include "SIInstrInfo.h"
30	#include "SIMachineFunctionInfo.h"
31	#include "SIRegisterInfo.h"
32	#include "Utils/AMDGPUBaseInfo.h"
33	#include "llvm/ADT/DenseMap.h"
34	#include "llvm/ADT/DenseSet.h"
35	#include "llvm/ADT/PostOrderIterator.h"
36	#include "llvm/ADT/STLExtras.h"
37	#include "llvm/ADT/SmallVector.h"
38	#include "llvm/CodeGen/MachineBasicBlock.h"
39	#include "llvm/CodeGen/MachineFunction.h"
40	#include "llvm/CodeGen/MachineFunctionPass.h"
41	#include "llvm/CodeGen/MachineInstr.h"
42	#include "llvm/CodeGen/MachineInstrBuilder.h"
43	#include "llvm/CodeGen/MachineMemOperand.h"
44	#include "llvm/CodeGen/MachineOperand.h"
45	#include "llvm/CodeGen/MachineRegisterInfo.h"
46	#include "llvm/IR/DebugLoc.h"
47	#include "llvm/Pass.h"
48	#include "llvm/Support/Debug.h"
49	#include "llvm/Support/DebugCounter.h"
50	#include "llvm/Support/ErrorHandling.h"
51	#include "llvm/Support/raw_ostream.h"
52	#include <algorithm>
53	#include <cassert>
54	#include <cstdint>
55	#include <cstring>
56	#include <memory>
57	#include <utility>
58	#include <vector>
59
60	using namespace llvm;
61
62	#define DEBUG_TYPE"si-insert-waitcnts" "si-insert-waitcnts"
63
64	DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",static const unsigned ForceExpCounter = DebugCounter::registerCounter ("si-insert-waitcnts""-forceexp", "Force emit s_waitcnt expcnt(0) instrs" )
65	"Force emit s_waitcnt expcnt(0) instrs")static const unsigned ForceExpCounter = DebugCounter::registerCounter ("si-insert-waitcnts""-forceexp", "Force emit s_waitcnt expcnt(0) instrs" );
66	DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",static const unsigned ForceLgkmCounter = DebugCounter::registerCounter ("si-insert-waitcnts""-forcelgkm", "Force emit s_waitcnt lgkmcnt(0) instrs" )
67	"Force emit s_waitcnt lgkmcnt(0) instrs")static const unsigned ForceLgkmCounter = DebugCounter::registerCounter ("si-insert-waitcnts""-forcelgkm", "Force emit s_waitcnt lgkmcnt(0) instrs" );
68	DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",static const unsigned ForceVMCounter = DebugCounter::registerCounter ("si-insert-waitcnts""-forcevm", "Force emit s_waitcnt vmcnt(0) instrs" )
69	"Force emit s_waitcnt vmcnt(0) instrs")static const unsigned ForceVMCounter = DebugCounter::registerCounter ("si-insert-waitcnts""-forcevm", "Force emit s_waitcnt vmcnt(0) instrs" );
70
71	static cl::opt<bool> ForceEmitZeroFlag(
72	"amdgpu-waitcnt-forcezero",
73	cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
74	cl::init(false), cl::Hidden);
75
76	namespace {
77
78	template <typename EnumT>
79	class enum_iterator
80	: public iterator_facade_base<enum_iterator<EnumT>,
81	std::forward_iterator_tag, const EnumT> {
82	EnumT Value;
83	public:
84	enum_iterator() = default;
85	enum_iterator(EnumT Value) : Value(Value) {}
86
87	enum_iterator &operator++() {
88	Value = static_cast<EnumT>(Value + 1);
89	return *this;
90	}
91
92	bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
93
94	EnumT operator*() const { return Value; }
95	};
96
97	// Class of object that encapsulates latest instruction counter score
98	// associated with the operand. Used for determining whether
99	// s_waitcnt instruction needs to be emitted.
100
101	#define CNT_MASK(t)(1u << (t)) (1u << (t))
102
103	enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
104
105	iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
106	return make_range(enum_iterator<InstCounterType>(VM_CNT),
107	enum_iterator<InstCounterType>(NUM_INST_CNTS));
108	}
109
110	using RegInterval = std::pair<signed, signed>;
111
112	struct {
113	uint32_t VmcntMax;
114	uint32_t ExpcntMax;
115	uint32_t LgkmcntMax;
116	int32_t NumVGPRsMax;
117	int32_t NumSGPRsMax;
118	} HardwareLimits;
119
120	struct {
121	unsigned VGPR0;
122	unsigned VGPRL;
123	unsigned SGPR0;
124	unsigned SGPRL;
125	} RegisterEncoding;
126
127	enum WaitEventType {
128	VMEM_ACCESS, // vector-memory read & write
129	LDS_ACCESS, // lds read & write
130	GDS_ACCESS, // gds read & write
131	SQ_MESSAGE, // send message
132	SMEM_ACCESS, // scalar-memory read & write
133	EXP_GPR_LOCK, // export holding on its data src
134	GDS_GPR_LOCK, // GDS holding on its data and addr src
135	EXP_POS_ACCESS, // write to export position
136	EXP_PARAM_ACCESS, // write to export parameter
137	VMW_GPR_LOCK, // vector-memory write holding on its data src
138	NUM_WAIT_EVENTS,
139	};
140
141	static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
142	(1 << VMEM_ACCESS),
143	(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
144	(1 << SQ_MESSAGE),
145	(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
146	(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
147	};
148
149	// The mapping is:
150	// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
151	// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
152	// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
153	// We reserve a fixed number of VGPR slots in the scoring tables for
154	// special tokens like SCMEM_LDS (needed for buffer load to LDS).
155	enum RegisterMapping {
156	SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
157	SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
158	NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
159	EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
160	NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
161	};
162
163	void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
164	switch (T) {
165	case VM_CNT:
166	Wait.VmCnt = std::min(Wait.VmCnt, Count);
167	break;
168	case EXP_CNT:
169	Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
170	break;
171	case LGKM_CNT:
172	Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
173	break;
174	default:
175	llvm_unreachable("bad InstCounterType")::llvm::llvm_unreachable_internal("bad InstCounterType", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 175);
176	}
177	}
178
179	// This object maintains the current score brackets of each wait counter, and
180	// a per-register scoreboard for each wait counter.
181	//
182	// We also maintain the latest score for every event type that can change the
183	// waitcnt in order to know if there are multiple types of events within
184	// the brackets. When multiple types of event happen in the bracket,
185	// wait count may get decreased out of order, therefore we need to put in
186	// "s_waitcnt 0" before use.
187	class WaitcntBrackets {
188	public:
189	WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
190	for (auto T : inst_counter_types())
191	memset(VgprScores[T], 0, sizeof(VgprScores[T]));
192	}
193
194	static uint32_t getWaitCountMax(InstCounterType T) {
195	switch (T) {
196	case VM_CNT:
197	return HardwareLimits.VmcntMax;
198	case LGKM_CNT:
199	return HardwareLimits.LgkmcntMax;
200	case EXP_CNT:
201	return HardwareLimits.ExpcntMax;
202	default:
203	break;
204	}
205	return 0;
206	}
207
208	uint32_t getScoreLB(InstCounterType T) const {
209	assert(T < NUM_INST_CNTS)((T < NUM_INST_CNTS) ? static_cast<void> (0) : __assert_fail ("T < NUM_INST_CNTS", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 209, __PRETTY_FUNCTION__));
210	if (T >= NUM_INST_CNTS)
211	return 0;
212	return ScoreLBs[T];
213	}
214
215	uint32_t getScoreUB(InstCounterType T) const {
216	assert(T < NUM_INST_CNTS)((T < NUM_INST_CNTS) ? static_cast<void> (0) : __assert_fail ("T < NUM_INST_CNTS", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 216, __PRETTY_FUNCTION__));
217	if (T >= NUM_INST_CNTS)
218	return 0;
219	return ScoreUBs[T];
220	}
221
222	// Mapping from event to counter.
223	InstCounterType eventCounter(WaitEventType E) {
224	if (E == VMEM_ACCESS)
225	return VM_CNT;
226	if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
227	return LGKM_CNT;
228	assert(WaitEventMaskForInst[EXP_CNT] & (1 << E))((WaitEventMaskForInst[EXP_CNT] & (1 << E)) ? static_cast <void> (0) : __assert_fail ("WaitEventMaskForInst[EXP_CNT] & (1 << E)" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 228, __PRETTY_FUNCTION__));
229	return EXP_CNT;
230	}
231
232	uint32_t getRegScore(int GprNo, InstCounterType T) {
233	if (GprNo < NUM_ALL_VGPRS) {
234	return VgprScores[T][GprNo];
235	}
236	assert(T == LGKM_CNT)((T == LGKM_CNT) ? static_cast<void> (0) : __assert_fail ("T == LGKM_CNT", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 236, __PRETTY_FUNCTION__));
237	return SgprScores[GprNo - NUM_ALL_VGPRS];
238	}
239
240	void clear() {
241	memset(ScoreLBs, 0, sizeof(ScoreLBs));
242	memset(ScoreUBs, 0, sizeof(ScoreUBs));
243	PendingEvents = 0;
244	memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
245	for (auto T : inst_counter_types())
246	memset(VgprScores[T], 0, sizeof(VgprScores[T]));
247	memset(SgprScores, 0, sizeof(SgprScores));
248	}
249
250	bool merge(const WaitcntBrackets &Other);
251
252	RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
253	const MachineRegisterInfo *MRI,
254	const SIRegisterInfo *TRI, unsigned OpNo,
255	bool Def) const;
256
257	int32_t getMaxVGPR() const { return VgprUB; }
258	int32_t getMaxSGPR() const { return SgprUB; }
259
260	bool counterOutOfOrder(InstCounterType T) const;
261	bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
262	bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
263	void determineWait(InstCounterType T, uint32_t ScoreToWait,
264	AMDGPU::Waitcnt &Wait) const;
265	void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
266	void applyWaitcnt(InstCounterType T, unsigned Count);
267	void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
268	const MachineRegisterInfo *MRI, WaitEventType E,
269	MachineInstr &MI);
270
271	bool hasPending() const { return PendingEvents != 0; }
272	bool hasPendingEvent(WaitEventType E) const {
273	return PendingEvents & (1 << E);
274	}
275
276	bool hasPendingFlat() const {
277	return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
278	LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
279	(LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
280	LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
281	}
282
283	void setPendingFlat() {
284	LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
285	LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
286	}
287
288	void print(raw_ostream &);
289	void dump() { print(dbgs()); }
290
291	private:
292	struct MergeInfo {
293	uint32_t OldLB;
294	uint32_t OtherLB;
295	uint32_t MyShift;
296	uint32_t OtherShift;
297	};
298	static bool mergeScore(const MergeInfo &M, uint32_t &Score,
299	uint32_t OtherScore);
300
301	void setScoreLB(InstCounterType T, uint32_t Val) {
302	assert(T < NUM_INST_CNTS)((T < NUM_INST_CNTS) ? static_cast<void> (0) : __assert_fail ("T < NUM_INST_CNTS", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 302, __PRETTY_FUNCTION__));
303	if (T >= NUM_INST_CNTS)
304	return;
305	ScoreLBs[T] = Val;
306	}
307
308	void setScoreUB(InstCounterType T, uint32_t Val) {
309	assert(T < NUM_INST_CNTS)((T < NUM_INST_CNTS) ? static_cast<void> (0) : __assert_fail ("T < NUM_INST_CNTS", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 309, __PRETTY_FUNCTION__));
310	if (T >= NUM_INST_CNTS)
311	return;
312	ScoreUBs[T] = Val;
313	if (T == EXP_CNT) {
314	uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
315	if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
316	ScoreLBs[T] = UB;
317	}
318	}
319
320	void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
321	if (GprNo < NUM_ALL_VGPRS) {
322	if (GprNo > VgprUB) {
323	VgprUB = GprNo;
324	}
325	VgprScores[T][GprNo] = Val;
326	} else {
327	assert(T == LGKM_CNT)((T == LGKM_CNT) ? static_cast<void> (0) : __assert_fail ("T == LGKM_CNT", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 327, __PRETTY_FUNCTION__));
328	if (GprNo - NUM_ALL_VGPRS > SgprUB) {
329	SgprUB = GprNo - NUM_ALL_VGPRS;
330	}
331	SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
332	}
333	}
334
335	void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
336	const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
337	unsigned OpNo, uint32_t Val);
338
339	const GCNSubtarget *ST = nullptr;
340	uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
341	uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
342	uint32_t PendingEvents = 0;
343	bool MixedPendingEvents[NUM_INST_CNTS] = {false};
344	// Remember the last flat memory operation.
345	uint32_t LastFlat[NUM_INST_CNTS] = {0};
346	// wait_cnt scores for every vgpr.
347	// Keep track of the VgprUB and SgprUB to make merge at join efficient.
348	int32_t VgprUB = 0;
349	int32_t SgprUB = 0;
350	uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
351	// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
352	uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
353	};
354
355	class SIInsertWaitcnts : public MachineFunctionPass {
356	private:
357	const GCNSubtarget *ST = nullptr;
358	const SIInstrInfo *TII = nullptr;
359	const SIRegisterInfo *TRI = nullptr;
360	const MachineRegisterInfo *MRI = nullptr;
361	AMDGPU::IsaVersion IV;
362
363	DenseSet<MachineInstr *> TrackedWaitcntSet;
364	DenseSet<MachineInstr *> VCCZBugHandledSet;
365
366	struct BlockInfo {
367	MachineBasicBlock *MBB;
368	std::unique_ptr<WaitcntBrackets> Incoming;
369	bool Dirty = true;
370
371	explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
372	};
373
374	std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
375	DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
376
377	// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
378	// because of amdgpu-waitcnt-forcezero flag
379	bool ForceEmitZeroWaitcnts;
380	bool ForceEmitWaitcnt[NUM_INST_CNTS];
381
382	public:
383	static char ID;
384
385	SIInsertWaitcnts() : MachineFunctionPass(ID) {
386	(void)ForceExpCounter;
387	(void)ForceLgkmCounter;
388	(void)ForceVMCounter;
389	}
390
391	bool runOnMachineFunction(MachineFunction &MF) override;
392
393	StringRef getPassName() const override {
394	return "SI insert wait instructions";
395	}
396
397	void getAnalysisUsage(AnalysisUsage &AU) const override {
398	AU.setPreservesCFG();
399	MachineFunctionPass::getAnalysisUsage(AU);
400	}
401
402	bool isForceEmitWaitcnt() const {
403	for (auto T : inst_counter_types())
404	if (ForceEmitWaitcnt[T])
405	return true;
406	return false;
407	}
408
409	void setForceEmitWaitcnt() {
410	// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
411	// For debug builds, get the debug counter info and adjust if need be
412	#ifndef NDEBUG
413	if (DebugCounter::isCounterSet(ForceExpCounter) &&
414	DebugCounter::shouldExecute(ForceExpCounter)) {
415	ForceEmitWaitcnt[EXP_CNT] = true;
416	} else {
417	ForceEmitWaitcnt[EXP_CNT] = false;
418	}
419
420	if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
421	DebugCounter::shouldExecute(ForceLgkmCounter)) {
422	ForceEmitWaitcnt[LGKM_CNT] = true;
423	} else {
424	ForceEmitWaitcnt[LGKM_CNT] = false;
425	}
426
427	if (DebugCounter::isCounterSet(ForceVMCounter) &&
428	DebugCounter::shouldExecute(ForceVMCounter)) {
429	ForceEmitWaitcnt[VM_CNT] = true;
430	} else {
431	ForceEmitWaitcnt[VM_CNT] = false;
432	}
433	#endif // NDEBUG
434	}
435
436	bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
437	bool generateWaitcntInstBefore(MachineInstr &MI,
438	WaitcntBrackets &ScoreBrackets,
439	MachineInstr *OldWaitcntInstr);
440	void updateEventWaitcntAfter(MachineInstr &Inst,
441	WaitcntBrackets *ScoreBrackets);
442	bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
443	WaitcntBrackets &ScoreBrackets);
444	};
445
446	} // end anonymous namespace
447
448	RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
449	const SIInstrInfo *TII,
450	const MachineRegisterInfo *MRI,
451	const SIRegisterInfo *TRI,
452	unsigned OpNo, bool Def) const {
453	const MachineOperand &Op = MI->getOperand(OpNo);
454	if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
455	(Def && !Op.isDef()))
456	return {-1, -1};
457
458	// A use via a PW operand does not need a waitcnt.
459	// A partial write is not a WAW.
460	assert(!Op.getSubReg() \|\| !Op.isUndef())((!Op.getSubReg() \|\| !Op.isUndef()) ? static_cast<void> (0) : __assert_fail ("!Op.getSubReg() \|\| !Op.isUndef()", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 460, __PRETTY_FUNCTION__));
461
462	RegInterval Result;
463	const MachineRegisterInfo &MRIA = *MRI;
464
465	unsigned Reg = TRI->getEncodingValue(Op.getReg());
466
467	if (TRI->isVGPR(MRIA, Op.getReg())) {
468	assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL)((Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding .VGPRL) ? static_cast<void> (0) : __assert_fail ("Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 468, __PRETTY_FUNCTION__));
469	Result.first = Reg - RegisterEncoding.VGPR0;
470	assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS)((Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS ) ? static_cast<void> (0) : __assert_fail ("Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 470, __PRETTY_FUNCTION__));
471	} else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
472	assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS)((Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS ) ? static_cast<void> (0) : __assert_fail ("Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 472, __PRETTY_FUNCTION__));
473	Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
474	assert(Result.first >= NUM_ALL_VGPRS &&((Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS) ? static_cast<void> ( 0) : __assert_fail ("Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 475, __PRETTY_FUNCTION__))
475	Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS)((Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS) ? static_cast<void> ( 0) : __assert_fail ("Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 475, __PRETTY_FUNCTION__));
476	}
477	// TODO: Handle TTMP
478	// else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
479	else
480	return {-1, -1};
481
482	const MachineInstr &MIA = *MI;
483	const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
484	unsigned Size = TRI->getRegSizeInBits(*RC);
485	Result.second = Result.first + (Size / 32);
486
487	return Result;
488	}
489
490	void WaitcntBrackets::setExpScore(const MachineInstr *MI,
491	const SIInstrInfo *TII,
492	const SIRegisterInfo *TRI,
493	const MachineRegisterInfo *MRI, unsigned OpNo,
494	uint32_t Val) {
495	RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
496	LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { const MachineOperand &Opnd = MI ->getOperand(OpNo); ((TRI->isVGPR(MRI, Opnd.getReg())) ? static_cast<void> (0) : __assert_fail ("TRI->isVGPR(MRI, Opnd.getReg())" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 498, __PRETTY_FUNCTION__)); }; } } while (false)
497	const MachineOperand &Opnd = MI->getOperand(OpNo);do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { const MachineOperand &Opnd = MI ->getOperand(OpNo); ((TRI->isVGPR(MRI, Opnd.getReg())) ? static_cast<void> (0) : __assert_fail ("TRI->isVGPR(MRI, Opnd.getReg())" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 498, __PRETTY_FUNCTION__)); }; } } while (false)
498	assert(TRI->isVGPR(MRI, Opnd.getReg()));do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { const MachineOperand &Opnd = MI ->getOperand(OpNo); ((TRI->isVGPR(MRI, Opnd.getReg())) ? static_cast<void> (0) : __assert_fail ("TRI->isVGPR(*MRI, Opnd.getReg())" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 498, __PRETTY_FUNCTION__)); }; } } while (false)
499	})do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { const MachineOperand &Opnd = MI ->getOperand(OpNo); ((TRI->isVGPR(MRI, Opnd.getReg())) ? static_cast<void> (0) : __assert_fail ("TRI->isVGPR(MRI, Opnd.getReg())" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 498, __PRETTY_FUNCTION__)); }; } } while (false);
500	for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
501	setRegScore(RegNo, EXP_CNT, Val);
502	}
503	}
504
505	void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
506	const SIRegisterInfo *TRI,
507	const MachineRegisterInfo *MRI,
508	WaitEventType E, MachineInstr &Inst) {
509	const MachineRegisterInfo &MRIA = *MRI;
510	InstCounterType T = eventCounter(E);
511	uint32_t CurrScore = getScoreUB(T) + 1;
512	if (CurrScore == 0)
513	report_fatal_error("InsertWaitcnt score wraparound");
514	// PendingEvents and ScoreUB need to be updated regardless of whether this
515	// event changes the score of a register. Examples include vm_cnt for a
516	// buffer store or lgkm_cnt for a send-message.
517	if (!hasPendingEvent(E)) {
518	if (PendingEvents & WaitEventMaskForInst[T])
519	MixedPendingEvents[T] = true;
520	PendingEvents |= 1 << E;
521	}
522	setScoreUB(T, CurrScore);
523
524	if (T == EXP_CNT) {
525	// Put score on the source vgprs. If this is a store, just use those
526	// specific register(s).
527	if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
528	// All GDS operations must protect their address register (same as
529	// export.)
530	if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
531	Inst.getOpcode() != AMDGPU::DS_CONSUME) {
532	setExpScore(
533	&Inst, TII, TRI, MRI,
534	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
535	CurrScore);
536	}
537	if (Inst.mayStore()) {
538	if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
539	AMDGPU::OpName::data0) != -1) {
540	setExpScore(
541	&Inst, TII, TRI, MRI,
542	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
543	CurrScore);
544	}
545	if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
546	AMDGPU::OpName::data1) != -1) {
547	setExpScore(&Inst, TII, TRI, MRI,
548	AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
549	AMDGPU::OpName::data1),
550	CurrScore);
551	}
552	} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
553	Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
554	Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
555	Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
556	Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
557	Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
558	Inst.getOpcode() != AMDGPU::DS_APPEND &&
559	Inst.getOpcode() != AMDGPU::DS_CONSUME &&
560	Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
561	for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
562	const MachineOperand &Op = Inst.getOperand(I);
563	if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
564	setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
565	}
566	}
567	}
568	} else if (TII->isFLAT(Inst)) {
569	if (Inst.mayStore()) {
570	setExpScore(
571	&Inst, TII, TRI, MRI,
572	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
573	CurrScore);
574	} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
575	setExpScore(
576	&Inst, TII, TRI, MRI,
577	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
578	CurrScore);
579	}
580	} else if (TII->isMIMG(Inst)) {
581	if (Inst.mayStore()) {
582	setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
583	} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
584	setExpScore(
585	&Inst, TII, TRI, MRI,
586	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
587	CurrScore);
588	}
589	} else if (TII->isMTBUF(Inst)) {
590	if (Inst.mayStore()) {
591	setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
592	}
593	} else if (TII->isMUBUF(Inst)) {
594	if (Inst.mayStore()) {
595	setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
596	} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
597	setExpScore(
598	&Inst, TII, TRI, MRI,
599	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
600	CurrScore);
601	}
602	} else {
603	if (TII->isEXP(Inst)) {
604	// For export the destination registers are really temps that
605	// can be used as the actual source after export patching, so
606	// we need to treat them like sources and set the EXP_CNT
607	// score.
608	for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
609	MachineOperand &DefMO = Inst.getOperand(I);
610	if (DefMO.isReg() && DefMO.isDef() &&
611	TRI->isVGPR(MRIA, DefMO.getReg())) {
612	setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
613	CurrScore);
614	}
615	}
616	}
617	for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
618	MachineOperand &MO = Inst.getOperand(I);
619	if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
620	setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
621	}
622	}
623	}
624	#if 0 // TODO: check if this is handled by MUBUF code above.
625	} else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
626	Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
627	Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
628	MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
629	unsigned OpNo;//TODO: find the OpNo for this operand;
630	RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
631	for (signed RegNo = Interval.first; RegNo < Interval.second;
632	++RegNo) {
633	setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
634	}
635	#endif
636	} else {
637	// Match the score to the destination registers.
638	for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
639	RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
640	if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
641	continue;
642	for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
643	setRegScore(RegNo, T, CurrScore);
644	}
645	}
646	if (TII->isDS(Inst) && Inst.mayStore()) {
647	setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
648	}
649	}
650	}
651
652	void WaitcntBrackets::print(raw_ostream &OS) {
653	OS << '\n';
654	for (auto T : inst_counter_types()) {
655	uint32_t LB = getScoreLB(T);
656	uint32_t UB = getScoreUB(T);
657
658	switch (T) {
659	case VM_CNT:
660	OS << " VM_CNT(" << UB - LB << "): ";
661	break;
662	case LGKM_CNT:
663	OS << " LGKM_CNT(" << UB - LB << "): ";
664	break;
665	case EXP_CNT:
666	OS << " EXP_CNT(" << UB - LB << "): ";
667	break;
668	default:
669	OS << " UNKNOWN(" << UB - LB << "): ";
670	break;
671	}
672
673	if (LB < UB) {
674	// Print vgpr scores.
675	for (int J = 0; J <= getMaxVGPR(); J++) {
676	uint32_t RegScore = getRegScore(J, T);
677	if (RegScore <= LB)
678	continue;
679	uint32_t RelScore = RegScore - LB - 1;
680	if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
681	OS << RelScore << ":v" << J << " ";
682	} else {
683	OS << RelScore << ":ds ";
684	}
685	}
686	// Also need to print sgpr scores for lgkm_cnt.
687	if (T == LGKM_CNT) {
688	for (int J = 0; J <= getMaxSGPR(); J++) {
689	uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
690	if (RegScore <= LB)
691	continue;
692	uint32_t RelScore = RegScore - LB - 1;
693	OS << RelScore << ":s" << J << " ";
694	}
695	}
696	}
697	OS << '\n';
698	}
699	OS << '\n';
700	}
701
702	/// Simplify the waitcnt, in the sense of removing redundant counts, and return
703	/// whether a waitcnt instruction is needed at all.
704	bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
705	return simplifyWaitcnt(VM_CNT, Wait.VmCnt) \|
706	simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) \|
707	simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
708	}
709
710	bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
711	unsigned &Count) const {
712	const uint32_t LB = getScoreLB(T);
713	const uint32_t UB = getScoreUB(T);
714	if (Count < UB && UB - Count > LB)
715	return true;
716
717	Count = ~0u;
718	return false;
719	}
720
721	void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
722	AMDGPU::Waitcnt &Wait) const {
723	// If the score of src_operand falls within the bracket, we need an
724	// s_waitcnt instruction.
725	const uint32_t LB = getScoreLB(T);
726	const uint32_t UB = getScoreUB(T);
727	if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
728	if ((T == VM_CNT \|\| T == LGKM_CNT) &&
729	hasPendingFlat() &&
730	!ST->hasFlatLgkmVMemCountInOrder()) {
731	// If there is a pending FLAT operation, and this is a VMem or LGKM
732	// waitcnt and the target can report early completion, then we need
733	// to force a waitcnt 0.
734	addWait(Wait, T, 0);
735	} else if (counterOutOfOrder(T)) {
736	// Counter can get decremented out-of-order when there
737	// are multiple types event in the bracket. Also emit an s_wait counter
738	// with a conservative value of 0 for the counter.
739	addWait(Wait, T, 0);
740	} else {
741	addWait(Wait, T, UB - ScoreToWait);
742	}
743	}
744	}
745
746	void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
747	applyWaitcnt(VM_CNT, Wait.VmCnt);
748	applyWaitcnt(EXP_CNT, Wait.ExpCnt);
749	applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
750	}
751
752	void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
753	const uint32_t UB = getScoreUB(T);
754	if (Count >= UB)
755	return;
756	if (Count != 0) {
757	if (counterOutOfOrder(T))
758	return;
759	setScoreLB(T, std::max(getScoreLB(T), UB - Count));
760	} else {
761	setScoreLB(T, UB);
762	MixedPendingEvents[T] = false;
763	PendingEvents &= ~WaitEventMaskForInst[T];
764	}
765	}
766
767	// Where there are multiple types of event in the bracket of a counter,
768	// the decrement may go out of order.
769	bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
770	// Scalar memory read always can go out of order.
771	if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
772	return true;
773	return MixedPendingEvents[T];
774	}
775
776	INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,static void *initializeSIInsertWaitcntsPassOnce(PassRegistry & Registry) {
777	false)static void *initializeSIInsertWaitcntsPassOnce(PassRegistry & Registry) {
778	INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,PassInfo PI = new PassInfo( "SI Insert Waitcnts", "si-insert-waitcnts" , &SIInsertWaitcnts::ID, PassInfo::NormalCtor_t(callDefaultCtor <SIInsertWaitcnts>), false, false); Registry.registerPass (PI, true); return PI; } static llvm::once_flag InitializeSIInsertWaitcntsPassFlag ; void llvm::initializeSIInsertWaitcntsPass(PassRegistry & Registry) { llvm::call_once(InitializeSIInsertWaitcntsPassFlag , initializeSIInsertWaitcntsPassOnce, std::ref(Registry)); }
779	false)PassInfo PI = new PassInfo( "SI Insert Waitcnts", "si-insert-waitcnts" , &SIInsertWaitcnts::ID, PassInfo::NormalCtor_t(callDefaultCtor <SIInsertWaitcnts>), false, false); Registry.registerPass (PI, true); return PI; } static llvm::once_flag InitializeSIInsertWaitcntsPassFlag ; void llvm::initializeSIInsertWaitcntsPass(PassRegistry & Registry) { llvm::call_once(InitializeSIInsertWaitcntsPassFlag , initializeSIInsertWaitcntsPassOnce, std::ref(Registry)); }
780
781	char SIInsertWaitcnts::ID = 0;
782
783	char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
784
785	FunctionPass *llvm::createSIInsertWaitcntsPass() {
786	return new SIInsertWaitcnts();
787	}
788
789	static bool readsVCCZ(const MachineInstr &MI) {
790	unsigned Opc = MI.getOpcode();
791	return (Opc == AMDGPU::S_CBRANCH_VCCNZ \|\| Opc == AMDGPU::S_CBRANCH_VCCZ) &&
792	!MI.getOperand(1).isUndef();
793	}
794
795	/// Generate s_waitcnt instruction to be placed before cur_Inst.
796	/// Instructions of a given type are returned in order,
797	/// but instructions of different types can complete out of order.
798	/// We rely on this in-order completion
799	/// and simply assign a score to the memory access instructions.
800	/// We keep track of the active "score bracket" to determine
801	/// if an access of a memory read requires an s_waitcnt
802	/// and if so what the value of each counter is.
803	/// The "score bracket" is bound by the lower bound and upper bound
804	/// scores (_score_LB and _score_ub respectively).
805	bool SIInsertWaitcnts::generateWaitcntInstBefore(
806	MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
807	MachineInstr *OldWaitcntInstr) {
808	setForceEmitWaitcnt();
809	bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
810
811	if (MI.isDebugInstr())
812	return false;
813
814	AMDGPU::Waitcnt Wait;
815
816	// See if this instruction has a forced S_WAITCNT VM.
817	// TODO: Handle other cases of NeedsWaitcntVmBefore()
818	if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 \|\|
819	MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC \|\|
820	MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
821	Wait.VmCnt = 0;
822	}
823
824	// All waits must be resolved at call return.
825	// NOTE: this could be improved with knowledge of all call sites or
826	// with knowledge of the called routines.
827	if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG \|\|
828	MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
829	Wait = AMDGPU::Waitcnt::allZero(IV);
830	}
831	// Resolve vm waits before gs-done.
832	else if ((MI.getOpcode() == AMDGPU::S_SENDMSG \|\|
833	MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
834	((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
835	AMDGPU::SendMsg::ID_GS_DONE)) {
836	Wait.VmCnt = 0;
837	}
838	#if 0 // TODO: the following blocks of logic when we have fence.
839	else if (MI.getOpcode() == SC_FENCE) {
840	const unsigned int group_size =
841	context->shader_info->GetMaxThreadGroupSize();
842	// group_size == 0 means thread group size is unknown at compile time
843	const bool group_is_multi_wave =
844	(group_size == 0 \|\| group_size > target_info->GetWaveFrontSize());
845	const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
846
847	for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
848	SCRegType src_type = Inst->GetSrcType(i);
849	switch (src_type) {
850	case SCMEM_LDS:
851	if (group_is_multi_wave \|\|
852	context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
853	EmitWaitcnt \|= ScoreBrackets->updateByWait(LGKM_CNT,
854	ScoreBrackets->getScoreUB(LGKM_CNT));
855	// LDS may have to wait for VM_CNT after buffer load to LDS
856	if (target_info->HasBufferLoadToLDS()) {
857	EmitWaitcnt \|= ScoreBrackets->updateByWait(VM_CNT,
858	ScoreBrackets->getScoreUB(VM_CNT));
859	}
860	}
861	break;
862
863	case SCMEM_GDS:
864	if (group_is_multi_wave \|\| fence_is_global) {
865	EmitWaitcnt \|= ScoreBrackets->updateByWait(EXP_CNT,
866	ScoreBrackets->getScoreUB(EXP_CNT));
867	EmitWaitcnt \|= ScoreBrackets->updateByWait(LGKM_CNT,
868	ScoreBrackets->getScoreUB(LGKM_CNT));
869	}
870	break;
871
872	case SCMEM_UAV:
873	case SCMEM_TFBUF:
874	case SCMEM_RING:
875	case SCMEM_SCATTER:
876	if (group_is_multi_wave \|\| fence_is_global) {
877	EmitWaitcnt \|= ScoreBrackets->updateByWait(EXP_CNT,
878	ScoreBrackets->getScoreUB(EXP_CNT));
879	EmitWaitcnt \|= ScoreBrackets->updateByWait(VM_CNT,
880	ScoreBrackets->getScoreUB(VM_CNT));
881	}
882	break;
883
884	case SCMEM_SCRATCH:
885	default:
886	break;
887	}
888	}
889	}
890	#endif
891
892	// Export & GDS instructions do not read the EXEC mask until after the export
893	// is granted (which can occur well after the instruction is issued).
894	// The shader program must flush all EXP operations on the export-count
895	// before overwriting the EXEC mask.
896	else {
897	if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
898	// Export and GDS are tracked individually, either may trigger a waitcnt
899	// for EXEC.
900	if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) \|\|
901	ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) \|\|
902	ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) \|\|
903	ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
904	Wait.ExpCnt = 0;
905	}
906	}
907
908	#if 0 // TODO: the following code to handle CALL.
909	// The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
910	// However, there is a problem with EXP_CNT, because the call cannot
911	// easily tell if a register is used in the function, and if it did, then
912	// the referring instruction would have to have an S_WAITCNT, which is
913	// dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
914	// before the call.
915	if (MI.getOpcode() == SC_CALL) {
916	if (ScoreBrackets->getScoreUB(EXP_CNT) >
917	ScoreBrackets->getScoreLB(EXP_CNT)) {
918	ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
919	EmitWaitcnt \|= CNT_MASK(EXP_CNT)(1u << (EXP_CNT));
920	}
921	}
922	#endif
923
924	// FIXME: Should not be relying on memoperands.
925	// Look at the source operands of every instruction to see if
926	// any of them results from a previous memory operation that affects
927	// its current usage. If so, an s_waitcnt instruction needs to be
928	// emitted.
929	// If the source operand was defined by a load, add the s_waitcnt
930	// instruction.
931	for (const MachineMemOperand *Memop : MI.memoperands()) {
932	unsigned AS = Memop->getAddrSpace();
933	if (AS != AMDGPUAS::LOCAL_ADDRESS)
934	continue;
935	unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
936	// VM_CNT is only relevant to vgpr or LDS.
937	ScoreBrackets.determineWait(
938	VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
939	}
940
941	for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
942	const MachineOperand &Op = MI.getOperand(I);
943	const MachineRegisterInfo &MRIA = *MRI;
944	RegInterval Interval =
945	ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
946	for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
947	if (TRI->isVGPR(MRIA, Op.getReg())) {
948	// VM_CNT is only relevant to vgpr or LDS.
949	ScoreBrackets.determineWait(
950	VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
951	}
952	ScoreBrackets.determineWait(
953	LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
954	}
955	}
956	// End of for loop that looks at all source operands to decide vm_wait_cnt
957	// and lgk_wait_cnt.
958
959	// Two cases are handled for destination operands:
960	// 1) If the destination operand was defined by a load, add the s_waitcnt
961	// instruction to guarantee the right WAW order.
962	// 2) If a destination operand that was used by a recent export/store ins,
963	// add s_waitcnt on exp_cnt to guarantee the WAR order.
964	if (MI.mayStore()) {
965	// FIXME: Should not be relying on memoperands.
966	for (const MachineMemOperand *Memop : MI.memoperands()) {
967	unsigned AS = Memop->getAddrSpace();
968	if (AS != AMDGPUAS::LOCAL_ADDRESS)
969	continue;
970	unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
971	ScoreBrackets.determineWait(
972	VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
973	ScoreBrackets.determineWait(
974	EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
975	}
976	}
977	for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
978	MachineOperand &Def = MI.getOperand(I);
979	const MachineRegisterInfo &MRIA = *MRI;
980	RegInterval Interval =
981	ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
982	for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
983	if (TRI->isVGPR(MRIA, Def.getReg())) {
984	ScoreBrackets.determineWait(
985	VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
986	ScoreBrackets.determineWait(
987	EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
988	}
989	ScoreBrackets.determineWait(
990	LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
991	}
992	} // End of for loop that looks at all dest operands.
993	}
994
995	// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
996	// occurs before the instruction. Doing it here prevents any additional
997	// S_WAITCNTs from being emitted if the instruction was marked as
998	// requiring a WAITCNT beforehand.
999	if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1000	!ST->hasAutoWaitcntBeforeBarrier()) {
1001	Wait = AMDGPU::Waitcnt::allZero(IV);
1002	}
1003
1004	// TODO: Remove this work-around, enable the assert for Bug 457939
1005	// after fixing the scheduler. Also, the Shader Compiler code is
1006	// independent of target.
1007	if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
1008	if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1009	ScoreBrackets.getScoreUB(LGKM_CNT) &&
1010	ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1011	Wait.LgkmCnt = 0;
1012	}
1013	}
1014
1015	// Early-out if no wait is indicated.
1016	if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1017	bool Modified = false;
1018	if (OldWaitcntInstr) {
1019	if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1020	TrackedWaitcntSet.erase(OldWaitcntInstr);
1021	OldWaitcntInstr->eraseFromParent();
1022	Modified = true;
	Value stored to 'Modified' is never read
1023	} else {
1024	int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1025	ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1026	}
1027	Modified = true;
1028	}
1029	return Modified;
1030	}
1031
1032	if (ForceEmitZeroWaitcnts)
1033	Wait = AMDGPU::Waitcnt::allZero(IV);
1034
1035	if (ForceEmitWaitcnt[VM_CNT])
1036	Wait.VmCnt = 0;
1037	if (ForceEmitWaitcnt[EXP_CNT])
1038	Wait.ExpCnt = 0;
1039	if (ForceEmitWaitcnt[LGKM_CNT])
1040	Wait.LgkmCnt = 0;
1041
1042	ScoreBrackets.applyWaitcnt(Wait);
1043
1044	AMDGPU::Waitcnt OldWait;
1045	if (OldWaitcntInstr) {
1046	OldWait =
1047	AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
1048	}
1049	if (OldWait.dominates(Wait))
1050	return false;
1051
1052	if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
1053	Wait = Wait.combined(OldWait);
1054
1055	unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1056	if (OldWaitcntInstr) {
1057	OldWaitcntInstr->getOperand(0).setImm(Enc);
1058
1059	LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { dbgs() << "updateWaitcntInBlock\n" << "Old Instr: " << MI << '\n' << "New Instr: " << *OldWaitcntInstr << '\n'; } } while (false)
1060	<< "Old Instr: " << MI << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { dbgs() << "updateWaitcntInBlock\n" << "Old Instr: " << MI << '\n' << "New Instr: " << *OldWaitcntInstr << '\n'; } } while (false)
1061	<< "New Instr: " << OldWaitcntInstr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { dbgs() << "updateWaitcntInBlock\n" << "Old Instr: " << MI << '\n' << "New Instr: " << OldWaitcntInstr << '\n'; } } while (false);
1062	} else {
1063	auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1064	MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1065	.addImm(Enc);
1066	TrackedWaitcntSet.insert(SWaitInst);
1067
1068	LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { dbgs() << "insertWaitcntInBlock\n" << "Old Instr: " << MI << '\n' << "New Instr: " << *SWaitInst << '\n'; } } while (false)
1069	<< "Old Instr: " << MI << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { dbgs() << "insertWaitcntInBlock\n" << "Old Instr: " << MI << '\n' << "New Instr: " << *SWaitInst << '\n'; } } while (false)
1070	<< "New Instr: " << SWaitInst << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { dbgs() << "insertWaitcntInBlock\n" << "Old Instr: " << MI << '\n' << "New Instr: " << SWaitInst << '\n'; } } while (false);
1071	}
1072
1073	return true;
1074	}
1075
1076	// This is a flat memory operation. Check to see if it has memory
1077	// tokens for both LDS and Memory, and if so mark it as a flat.
1078	bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1079	if (MI.memoperands_empty())
1080	return true;
1081
1082	for (const MachineMemOperand *Memop : MI.memoperands()) {
1083	unsigned AS = Memop->getAddrSpace();
1084	if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::FLAT_ADDRESS)
1085	return true;
1086	}
1087
1088	return false;
1089	}
1090
1091	void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1092	WaitcntBrackets *ScoreBrackets) {
1093	// Now look at the instruction opcode. If it is a memory access
1094	// instruction, update the upper-bound of the appropriate counter's
1095	// bracket and the destination operand scores.
1096	// TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1097	if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1098	if (TII->isAlwaysGDS(Inst.getOpcode()) \|\|
1099	TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1100	ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1101	ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1102	} else {
1103	ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1104	}
1105	} else if (TII->isFLAT(Inst)) {
1106	assert(Inst.mayLoad() \|\| Inst.mayStore())((Inst.mayLoad() \|\| Inst.mayStore()) ? static_cast<void> (0) : __assert_fail ("Inst.mayLoad() \|\| Inst.mayStore()", "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 1106, __PRETTY_FUNCTION__));
1107
1108	if (TII->usesVM_CNT(Inst))
1109	ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1110
1111	if (TII->usesLGKM_CNT(Inst)) {
1112	ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1113
1114	// This is a flat memory operation, so note it - it will require
1115	// that both the VM and LGKM be flushed to zero if it is pending when
1116	// a VM or LGKM dependency occurs.
1117	if (mayAccessLDSThroughFlat(Inst))
1118	ScoreBrackets->setPendingFlat();
1119	}
1120	} else if (SIInstrInfo::isVMEM(Inst) &&
1121	// TODO: get a better carve out.
1122	Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1123	Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1124	Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1125	ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1126	if (ST->vmemWriteNeedsExpWaitcnt() &&
1127	(Inst.mayStore() \|\| AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1128	ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1129	}
1130	} else if (TII->isSMRD(Inst)) {
1131	ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1132	} else {
1133	switch (Inst.getOpcode()) {
1134	case AMDGPU::S_SENDMSG:
1135	case AMDGPU::S_SENDMSGHALT:
1136	ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1137	break;
1138	case AMDGPU::EXP:
1139	case AMDGPU::EXP_DONE: {
1140	int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1141	if (Imm >= 32 && Imm <= 63)
1142	ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1143	else if (Imm >= 12 && Imm <= 15)
1144	ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1145	else
1146	ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1147	break;
1148	}
1149	case AMDGPU::S_MEMTIME:
1150	case AMDGPU::S_MEMREALTIME:
1151	ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1152	break;
1153	default:
1154	break;
1155	}
1156	}
1157	}
1158
1159	bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
1160	uint32_t OtherScore) {
1161	uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1162	uint32_t OtherShifted =
1163	OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1164	Score = std::max(MyShifted, OtherShifted);
1165	return OtherShifted > MyShifted;
1166	}
1167
1168	/// Merge the pending events and associater score brackets of \p Other into
1169	/// this brackets status.
1170	///
1171	/// Returns whether the merge resulted in a change that requires tighter waits
1172	/// (i.e. the merged brackets strictly dominate the original brackets).
1173	bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1174	bool StrictDom = false;
1175
1176	for (auto T : inst_counter_types()) {
1177	// Merge event flags for this counter
1178	const bool OldOutOfOrder = counterOutOfOrder(T);
1179	const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
1180	const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1181	if (OtherEvents & ~OldEvents)
1182	StrictDom = true;
1183	if (Other.MixedPendingEvents[T] \|\|
1184	(OldEvents && OtherEvents && OldEvents != OtherEvents))
1185	MixedPendingEvents[T] = true;
1186	PendingEvents \|= OtherEvents;
1187
1188	// Merge scores for this counter
1189	const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
1190	const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1191	MergeInfo M;
1192	M.OldLB = ScoreLBs[T];
1193	M.OtherLB = Other.ScoreLBs[T];
1194	M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
1195	M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
1196
1197	const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
1198	if (NewUB < ScoreUBs[T])
1199	report_fatal_error("waitcnt score overflow");
1200	ScoreUBs[T] = NewUB;
1201	ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
1202
1203	StrictDom \|= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1204
1205	bool RegStrictDom = false;
1206	for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
1207	J++) {
1208	RegStrictDom \|= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1209	}
1210
1211	if (T == LGKM_CNT) {
1212	for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
1213	J != E; J++) {
1214	RegStrictDom \|= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1215	}
1216	}
1217
1218	if (RegStrictDom && !OldOutOfOrder)
1219	StrictDom = true;
1220	}
1221
1222	VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
1223	SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
1224
1225	return StrictDom;
1226	}
1227
1228	// Generate s_waitcnt instructions where needed.
1229	bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1230	MachineBasicBlock &Block,
1231	WaitcntBrackets &ScoreBrackets) {
1232	bool Modified = false;
1233
1234	LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { dbgs() << "* Block" << Block.getNumber() << " *"; ScoreBrackets.dump(); }; } } while (false)
1235	dbgs() << "* Block" << Block.getNumber() << " ";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { dbgs() << " Block" << Block.getNumber() << " *"; ScoreBrackets.dump(); }; } } while (false)
1236	ScoreBrackets.dump();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { dbgs() << "* Block" << Block.getNumber() << " *"; ScoreBrackets.dump(); }; } } while (false)
1237	})do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { dbgs() << "* Block" << Block.getNumber() << " *"; ScoreBrackets.dump(); }; } } while (false);
1238
1239	// Walk over the instructions.
1240	MachineInstr *OldWaitcntInstr = nullptr;
1241
1242	for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1243	Iter != E;) {
1244	MachineInstr &Inst = *Iter;
1245
1246	// Remove any previously existing waitcnts.
1247	if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1248	if (OldWaitcntInstr) {
1249	if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1250	TrackedWaitcntSet.erase(OldWaitcntInstr);
1251	OldWaitcntInstr->eraseFromParent();
1252	OldWaitcntInstr = nullptr;
1253	} else if (!TrackedWaitcntSet.count(&Inst)) {
1254	// Two successive s_waitcnt's, both of which are pre-existing and
1255	// are therefore preserved.
1256	int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1257	ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1258	} else {
1259	++Iter;
1260	Inst.eraseFromParent();
1261	Modified = true;
1262	continue;
1263	}
1264	}
1265
1266	OldWaitcntInstr = &Inst;
1267	++Iter;
1268	continue;
1269	}
1270
1271	bool VCCZBugWorkAround = false;
1272	if (readsVCCZ(Inst) &&
1273	(!VCCZBugHandledSet.count(&Inst))) {
1274	if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1275	ScoreBrackets.getScoreUB(LGKM_CNT) &&
1276	ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1277	if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1278	VCCZBugWorkAround = true;
1279	}
1280	}
1281
1282	// Generate an s_waitcnt instruction to be placed before
1283	// cur_Inst, if needed.
1284	Modified \|= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1285	OldWaitcntInstr = nullptr;
1286
1287	updateEventWaitcntAfter(Inst, &ScoreBrackets);
1288
1289	#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1290	// If this instruction generates a S_SETVSKIP because it is an
1291	// indexed resource, and we are on Tahiti, then it will also force
1292	// an S_WAITCNT vmcnt(0)
1293	if (RequireCheckResourceType(Inst, context)) {
1294	// Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1295	ScoreBrackets->setScoreLB(VM_CNT,
1296	ScoreBrackets->getScoreUB(VM_CNT));
1297	}
1298	#endif
1299
1300	LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { Inst.print(dbgs()); ScoreBrackets .dump(); }; } } while (false)
1301	Inst.print(dbgs());do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { Inst.print(dbgs()); ScoreBrackets .dump(); }; } } while (false)
1302	ScoreBrackets.dump();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { Inst.print(dbgs()); ScoreBrackets .dump(); }; } } while (false)
1303	})do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("si-insert-waitcnts")) { { Inst.print(dbgs()); ScoreBrackets .dump(); }; } } while (false);
1304
1305	// Check to see if this is a GWS instruction. If so, and if this is CI or
1306	// VI, then the generated code sequence will include an S_WAITCNT 0.
1307	// TODO: Are these the only GWS instructions?
1308	if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT \|\|
1309	Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V \|\|
1310	Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR \|\|
1311	Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P \|\|
1312	Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1313	// TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1314	ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
1315	}
1316
1317	// TODO: Remove this work-around after fixing the scheduler and enable the
1318	// assert above.
1319	if (VCCZBugWorkAround) {
1320	// Restore the vccz bit. Any time a value is written to vcc, the vcc
1321	// bit is updated, so we can restore the bit by reading the value of
1322	// vcc and then writing it back to the register.
1323	BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1324	AMDGPU::VCC)
1325	.addReg(AMDGPU::VCC);
1326	VCCZBugHandledSet.insert(&Inst);
1327	Modified = true;
1328	}
1329
1330	++Iter;
1331	}
1332
1333	return Modified;
1334	}
1335
1336	bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1337	ST = &MF.getSubtarget<GCNSubtarget>();
1338	TII = ST->getInstrInfo();
1339	TRI = &TII->getRegisterInfo();
1340	MRI = &MF.getRegInfo();
1341	IV = AMDGPU::getIsaVersion(ST->getCPU());
1342	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1343
1344	ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1345	for (auto T : inst_counter_types())
1346	ForceEmitWaitcnt[T] = false;
1347
1348	HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1349	HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1350	HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1351
1352	HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1353	HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1354	assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS)((HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS) ? static_cast <void> (0) : __assert_fail ("HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 1354, __PRETTY_FUNCTION__));
1355	assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS)((HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS) ? static_cast <void> (0) : __assert_fail ("HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS" , "/build/llvm-toolchain-snapshot-9~svn359426/lib/Target/AMDGPU/SIInsertWaitcnts.cpp" , 1355, __PRETTY_FUNCTION__));
1356
1357	RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1358	RegisterEncoding.VGPRL =
1359	RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1360	RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1361	RegisterEncoding.SGPRL =
1362	RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1363
1364	TrackedWaitcntSet.clear();
1365	VCCZBugHandledSet.clear();
1366	RpotIdxMap.clear();
1367	BlockInfos.clear();
1368
1369	// Keep iterating over the blocks in reverse post order, inserting and
1370	// updating s_waitcnt where needed, until a fix point is reached.
1371	for (MachineBasicBlock *MBB :
1372	ReversePostOrderTraversal<MachineFunction *>(&MF)) {
1373	RpotIdxMap[MBB] = BlockInfos.size();
1374	BlockInfos.emplace_back(MBB);
1375	}
1376
1377	std::unique_ptr<WaitcntBrackets> Brackets;
1378	bool Modified = false;
1379	bool Repeat;
1380	do {
1381	Repeat = false;
1382
1383	for (BlockInfo &BI : BlockInfos) {
1384	if (!BI.Dirty)
1385	continue;
1386
1387	unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
1388
1389	if (BI.Incoming) {
1390	if (!Brackets)
1391	Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
1392	else
1393	Brackets = BI.Incoming;
1394	} else {
1395	if (!Brackets)
1396	Brackets = llvm::make_unique<WaitcntBrackets>(ST);
1397	else
1398	Brackets->clear();
1399	}
1400
1401	Modified \|= insertWaitcntInBlock(MF, BI.MBB, Brackets);
1402	BI.Dirty = false;
1403
1404	if (Brackets->hasPending()) {
1405	BlockInfo *MoveBracketsToSucc = nullptr;
1406	for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1407	unsigned SuccIdx = RpotIdxMap[Succ];
1408	BlockInfo &SuccBI = BlockInfos[SuccIdx];
1409	if (!SuccBI.Incoming) {
1410	SuccBI.Dirty = true;
1411	if (SuccIdx <= Idx)
1412	Repeat = true;
1413	if (!MoveBracketsToSucc) {
1414	MoveBracketsToSucc = &SuccBI;
1415	} else {
1416	SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
1417	}
1418	} else if (SuccBI.Incoming->merge(*Brackets)) {
1419	SuccBI.Dirty = true;
1420	if (SuccIdx <= Idx)
1421	Repeat = true;
1422	}
1423	}
1424	if (MoveBracketsToSucc)
1425	MoveBracketsToSucc->Incoming = std::move(Brackets);
1426	}
1427	}
1428	} while (Repeat);
1429
1430	SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1431
1432	bool HaveScalarStores = false;
1433
1434	for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1435	++BI) {
1436	MachineBasicBlock &MBB = *BI;
1437
1438	for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1439	++I) {
1440	if (!HaveScalarStores && TII->isScalarStore(*I))
1441	HaveScalarStores = true;
1442
1443	if (I->getOpcode() == AMDGPU::S_ENDPGM \|\|
1444	I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1445	EndPgmBlocks.push_back(&MBB);
1446	}
1447	}
1448
1449	if (HaveScalarStores) {
1450	// If scalar writes are used, the cache must be flushed or else the next
1451	// wave to reuse the same scratch memory can be clobbered.
1452	//
1453	// Insert s_dcache_wb at wave termination points if there were any scalar
1454	// stores, and only if the cache hasn't already been flushed. This could be
1455	// improved by looking across blocks for flushes in postdominating blocks
1456	// from the stores but an explicitly requested flush is probably very rare.
1457	for (MachineBasicBlock *MBB : EndPgmBlocks) {
1458	bool SeenDCacheWB = false;
1459
1460	for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1461	++I) {
1462	if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1463	SeenDCacheWB = true;
1464	else if (TII->isScalarStore(*I))
1465	SeenDCacheWB = false;
1466
1467	// FIXME: It would be better to insert this before a waitcnt if any.
1468	if ((I->getOpcode() == AMDGPU::S_ENDPGM \|\|
1469	I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1470	!SeenDCacheWB) {
1471	Modified = true;
1472	BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1473	}
1474	}
1475	}
1476	}
1477
1478	if (!MFI->isEntryFunction()) {
1479	// Wait for any outstanding memory operations that the input registers may
1480	// depend on. We can't track them and it's better to the wait after the
1481	// costly call sequence.
1482
1483	// TODO: Could insert earlier and schedule more liberally with operations
1484	// that only use caller preserved registers.
1485	MachineBasicBlock &EntryBB = MF.front();
1486	BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1487	.addImm(0);
1488
1489	Modified = true;
1490	}
1491
1492	return Modified;
1493	}