LLVM 23.0.0git
AMDGPUSwLowerLDS.cpp
Go to the documentation of this file.
1//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers the local data store, LDS, uses in kernel and non-kernel
10// functions in module to use dynamically allocated global memory.
11// Packed LDS Layout is emulated in the global memory.
12// The lowered memory instructions from LDS to global memory are then
13// instrumented for address sanitizer, to catch addressing errors.
14// This pass only works when address sanitizer has been enabled and has
15// instrumented the IR. It identifies that IR has been instrumented using
16// "nosanitize_address" module flag.
17//
18// Replacement of Kernel LDS accesses:
19// For a kernel, LDS access can be static or dynamic which are direct
20// (accessed within kernel) and indirect (accessed through non-kernels).
21// All these LDS accesses corresponding to kernel will be packed together,
22// where all static LDS accesses will be allocated first and then dynamic
23// LDS follows. The total size with alignment is calculated. A new LDS global
24// will be created for the kernel called "SW LDS" and it will have the
25// attribute "amdgpu-lds-size" attached with value of the size calculated.
26// All the LDS accesses in the module will be replaced by GEP with offset
27// into the "Sw LDS".
28// A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
29// the dynamic LDS. This will be marked used by kernel and will have
30// MD_absolute_symbol metadata set to total static LDS size, since dynamic
31// LDS allocation starts after all static LDS allocation.
32//
33// A device global memory equal to the total LDS size will be allocated.
34// At the prologue of the kernel, a single work-item from the
35// work-group, does a "malloc" and stores the pointer of the
36// allocation in "SW LDS".
37//
38// To store the offsets corresponding to all LDS accesses, another global
39// variable is created which will be called "SW LDS metadata" in this pass.
40// - SW LDS Global:
41// It is LDS global of ptr type with name
42// "llvm.amdgcn.sw.lds.<kernel-name>".
43// - Metadata Global:
44// It is of struct type, with n members. n equals the number of LDS
45// globals accessed by the kernel(direct and indirect). Each member of
46// struct is another struct of type {i32, i32, i32}. First member
47// corresponds to offset, second member corresponds to size of LDS global
48// being replaced and third represents the total aligned size. It will
49// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
50// an initializer with static LDS related offsets and sizes initialized.
51// But for dynamic LDS related entries, offsets will be initialized to
52// previous static LDS allocation end offset. Sizes for them will be zero
53// initially. These dynamic LDS offset and size values will be updated
54// within the kernel, since kernel can read the dynamic LDS size
55// allocation done at runtime with query to "hidden_dynamic_lds_size"
56// hidden kernel argument.
57//
58// At the epilogue of kernel, allocated memory would be made free by the same
59// single work-item.
60//
61// Replacement of non-kernel LDS accesses:
62// Multiple kernels can access the same non-kernel function.
63// All the kernels accessing LDS through non-kernels are sorted and
64// assigned a kernel-id. All the LDS globals accessed by non-kernels
65// are sorted. This information is used to build two tables:
66// - Base table:
67// Base table will have single row, with elements of the row
68// placed as per kernel ID. Each element in the row corresponds
69// to ptr of "SW LDS" variable created for that kernel.
70// - Offset table:
71// Offset table will have multiple rows and columns.
72// Rows are assumed to be from 0 to (n-1). n is total number
73// of kernels accessing the LDS through non-kernels.
74// Each row will have m elements. m is the total number of
75// unique LDS globals accessed by all non-kernels.
76// Each element in the row corresponds to the ptr of
77// the replacement of LDS global done by that particular kernel.
78// A LDS variable in non-kernel will be replaced based on the information
79// from base and offset tables. Based on kernel-id query, ptr of "SW
80// LDS" for that corresponding kernel is obtained from base table.
81// The Offset into the base "SW LDS" is obtained from
82// corresponding element in offset table. With this information, replacement
83// value is obtained.
84//===----------------------------------------------------------------------===//
85
86#include "AMDGPU.h"
88#include "AMDGPUMemoryUtils.h"
89#include "AMDGPUTargetMachine.h"
90#include "llvm/ADT/DenseMap.h"
91#include "llvm/ADT/DenseSet.h"
92#include "llvm/ADT/SetVector.h"
94#include "llvm/ADT/StringRef.h"
98#include "llvm/IR/Constants.h"
99#include "llvm/IR/DIBuilder.h"
100#include "llvm/IR/DebugInfo.h"
102#include "llvm/IR/IRBuilder.h"
103#include "llvm/IR/Instructions.h"
104#include "llvm/IR/IntrinsicsAMDGPU.h"
105#include "llvm/IR/MDBuilder.h"
107#include "llvm/Pass.h"
111
112#include <algorithm>
113
114#define DEBUG_TYPE "amdgpu-sw-lower-lds"
115#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
116
117using namespace llvm;
118using namespace AMDGPU;
119
120namespace {
121
123 AsanInstrumentLDS("amdgpu-asan-instrument-lds",
124 cl::desc("Run asan instrumentation on LDS instructions "
125 "lowered to global memory"),
126 cl::init(true), cl::Hidden);
127
128using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
129
130struct LDSAccessTypeInfo {
131 SetVector<GlobalVariable *> StaticLDSGlobals;
132 SetVector<GlobalVariable *> DynamicLDSGlobals;
133};
134
135// Struct to hold all the Metadata required for a kernel
136// to replace a LDS global uses with corresponding offset
137// in to device global memory.
138struct KernelLDSParameters {
139 GlobalVariable *SwLDS = nullptr;
140 GlobalVariable *SwDynLDS = nullptr;
141 GlobalVariable *SwLDSMetadata = nullptr;
142 LDSAccessTypeInfo DirectAccess;
143 LDSAccessTypeInfo IndirectAccess;
145 LDSToReplacementIndicesMap;
146 uint32_t MallocSize = 0;
147 uint32_t LDSSize = 0;
148 SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
149};
150
151// Struct to store information for creation of offset table
152// for all the non-kernel LDS accesses.
153struct NonKernelLDSParameters {
154 GlobalVariable *LDSBaseTable = nullptr;
155 GlobalVariable *LDSOffsetTable = nullptr;
156 SetVector<Function *> OrderedKernels;
157 SetVector<GlobalVariable *> OrdereLDSGlobals;
158};
159
160struct AsanInstrumentInfo {
161 int Scale = 0;
162 uint32_t Offset = 0;
163 SetVector<Instruction *> Instructions;
164};
165
166struct FunctionsAndLDSAccess {
167 DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
168 SetVector<Function *> KernelsWithIndirectLDSAccess;
169 SetVector<Function *> NonKernelsWithLDSArgument;
170 SetVector<GlobalVariable *> AllNonKernelLDSAccess;
171 FunctionVariableMap NonKernelToLDSAccessMap;
172};
173
174class AMDGPUSwLowerLDS {
175public:
176 AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
177 DomTreeCallback Callback)
178 : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
179 bool run();
180 void getUsesOfLDSByNonKernels();
181 void getNonKernelsWithLDSArguments(const CallGraph &CG);
183 getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
185 getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
186 void buildSwLDSGlobal(Function *Func);
187 void buildSwDynLDSGlobal(Function *Func);
188 void populateSwMetadataGlobal(Function *Func);
189 void populateSwLDSAttributeAndMetadata(Function *Func);
190 void populateLDSToReplacementIndicesMap(Function *Func);
191 void getLDSMemoryInstructions(Function *Func,
192 SetVector<Instruction *> &LDSInstructions);
193 void replaceKernelLDSAccesses(Function *Func);
194 Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
195 void translateLDSMemoryOperationsToGlobalMemory(
196 Function *Func, Value *LoadMallocPtr,
197 SetVector<Instruction *> &LDSInstructions);
198 void poisonRedzones(Function *Func, Value *MallocPtr);
199 void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
200 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
201 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
202 Constant *
203 getAddressesOfVariablesInKernel(Function *Func,
204 SetVector<GlobalVariable *> &Variables);
205 void lowerNonKernelLDSAccesses(Function *Func,
206 SetVector<GlobalVariable *> &LDSGlobals,
207 NonKernelLDSParameters &NKLDSParams);
208 void
209 updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
210 Value *HiddenDynLDSSize,
211 SetVector<GlobalVariable *> &DynamicLDSGlobals);
212 void initAsanInfo();
213
214private:
215 Module &M;
216 const AMDGPUTargetMachine &AMDGPUTM;
217 IRBuilder<> IRB;
218 DomTreeCallback DTCallback;
219 FunctionsAndLDSAccess FuncLDSAccessInfo;
220 AsanInstrumentInfo AsanInfo;
221};
222
223template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
224 // Sort the vector of globals or Functions based on their name.
225 // Returns a SetVector of globals/Functions.
226 sort(V, [](const auto *L, const auto *R) {
227 return L->getName() < R->getName();
228 });
229 return {SetVector<T>(llvm::from_range, V)};
230}
231
232SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
233 SetVector<GlobalVariable *> &Variables) {
234 // Sort all the non-kernel LDS accesses based on their name.
235 return sortByName(
236 std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
237}
238
239SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
240 SetVector<Function *> &Kernels) {
241 // Sort the non-kernels accessing LDS based on their name.
242 // Also assign a kernel ID metadata based on the sorted order.
243 LLVMContext &Ctx = M.getContext();
244 if (Kernels.size() > UINT32_MAX) {
245 report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
246 }
247 SetVector<Function *> OrderedKernels =
248 sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
249 for (size_t i = 0; i < Kernels.size(); i++) {
250 Metadata *AttrMDArgs[1] = {
252 };
253 Function *Func = OrderedKernels[i];
254 Func->setMetadata("llvm.amdgcn.lds.kernel.id",
255 MDNode::get(Ctx, AttrMDArgs));
256 }
257 return OrderedKernels;
258}
259
260void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
261 // Among the kernels accessing LDS, get list of
262 // Non-kernels to which a call is made and a ptr
263 // to addrspace(3) is passed as argument.
264 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
265 Function *Func = K.first;
266 const CallGraphNode *CGN = CG[Func];
267 if (!CGN)
268 continue;
269 for (auto &I : *CGN) {
270 CallGraphNode *CallerCGN = I.second;
271 Function *CalledFunc = CallerCGN->getFunction();
272 if (!CalledFunc || CalledFunc->isDeclaration())
273 continue;
274 if (AMDGPU::isKernel(*CalledFunc))
275 continue;
276 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
277 AI != E; ++AI) {
278 Type *ArgTy = (*AI).getType();
279 if (!ArgTy->isPointerTy())
280 continue;
282 continue;
283 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
284 // Also add the Calling function to KernelsWithIndirectLDSAccess list
285 // so that base table of LDS is generated.
286 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
287 }
288 }
289 }
290}
291
292void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
293 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
295 continue;
296
297 for (User *V : GV->users()) {
298 if (auto *I = dyn_cast<Instruction>(V)) {
299 Function *F = I->getFunction();
300 if (!isKernel(*F) && !F->isDeclaration())
301 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
302 }
303 }
304 }
305}
306
307static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
308 uint32_t Address) {
309 // Write the specified address into metadata where it can be retrieved by
310 // the assembler. Format is a half open range, [Address Address+1)
311 LLVMContext &Ctx = M.getContext();
312 auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
313 MDBuilder MDB(Ctx);
314 MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
315 ConstantInt::get(IntTy, Address + 1));
316 GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
317}
318
319static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
320 bool IsDynLDS) {
321 if (Offset != 0) {
322 std::string Buffer;
323 raw_string_ostream SS{Buffer};
324 SS << Offset;
325 if (IsDynLDS)
326 SS << "," << Offset;
327 Func->addFnAttr("amdgpu-lds-size", Buffer);
328 }
329}
330
331static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
332 BasicBlock *Entry = &Func->getEntryBlock();
333 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
334
335 Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(),
336 Intrinsic::donothing, {});
337
338 Value *UseInstance[1] = {
339 Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
340
341 Builder.CreateCall(Decl, {},
342 {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
343}
344
345void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
346 // Create new LDS global required for each kernel to store
347 // device global memory pointer.
348 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
349 // Create new global pointer variable
350 LDSParams.SwLDS = new GlobalVariable(
351 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
352 PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
355 MD.NoAddress = true;
356 LDSParams.SwLDS->setSanitizerMetadata(MD);
357}
358
359void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
360 // Create new Dyn LDS global if kernel accesses dyn LDS.
361 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
362 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
363 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
364 return;
365 // Create new global pointer variable
366 auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
367 LDSParams.SwDynLDS = new GlobalVariable(
368 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
369 "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
371 markUsedByKernel(Func, LDSParams.SwDynLDS);
373 MD.NoAddress = true;
374 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
375}
376
377void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
378 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
379 bool IsDynLDSUsed = LDSParams.SwDynLDS;
380 uint32_t Offset = LDSParams.LDSSize;
381 recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
382 addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
383 if (LDSParams.SwDynLDS)
384 recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
385}
386
387void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
388 // Create new metadata global for every kernel and initialize the
389 // start offsets and sizes corresponding to each LDS accesses.
390 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
391 auto &Ctx = M.getContext();
392 auto &DL = M.getDataLayout();
393 std::vector<Type *> Items;
394 Type *Int32Ty = IRB.getInt32Ty();
395 std::vector<Constant *> Initializers;
396 Align MaxAlignment(1);
397 auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
398 Align GVAlign = AMDGPU::getAlign(DL, GV);
399 MaxAlignment = std::max(MaxAlignment, GVAlign);
400 };
401
402 for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
403 UpdateMaxAlignment(GV);
404
405 for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
406 UpdateMaxAlignment(GV);
407
408 for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
409 UpdateMaxAlignment(GV);
410
411 for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
412 UpdateMaxAlignment(GV);
413
414 //{StartOffset, AlignedSizeInBytes}
415 SmallString<128> MDItemStr;
416 raw_svector_ostream MDItemOS(MDItemStr);
417 MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";
418
419 StructType *LDSItemTy =
420 StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str());
421 uint32_t &MallocSize = LDSParams.MallocSize;
422 SetVector<GlobalVariable *> UniqueLDSGlobals;
423 int AsanScale = AsanInfo.Scale;
424 auto buildInitializerForSwLDSMD =
425 [&](SetVector<GlobalVariable *> &LDSGlobals) {
426 for (auto &GV : LDSGlobals) {
427 if (is_contained(UniqueLDSGlobals, GV))
428 continue;
429 UniqueLDSGlobals.insert(GV);
430
431 Type *Ty = GV->getValueType();
432 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
433 Items.push_back(LDSItemTy);
434 Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
435 Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
436 // Get redzone size corresponding a size.
437 const uint64_t RightRedzoneSize =
438 AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes);
439 // Update MallocSize with current size and redzone size.
440 MallocSize += SizeInBytes;
441 if (!AMDGPU::isDynamicLDS(*GV))
442 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
443 RightRedzoneSize);
444 MallocSize += RightRedzoneSize;
445 // Align current size plus redzone.
446 uint64_t AlignedSize =
447 alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
448 Constant *AlignedSizeInBytesConst =
449 ConstantInt::get(Int32Ty, AlignedSize);
450 // Align MallocSize
451 MallocSize = alignTo(MallocSize, MaxAlignment);
452 Constant *InitItem =
453 ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst,
454 AlignedSizeInBytesConst});
455 Initializers.push_back(InitItem);
456 }
457 };
458 SetVector<GlobalVariable *> SwLDSVector;
459 SwLDSVector.insert(LDSParams.SwLDS);
460 buildInitializerForSwLDSMD(SwLDSVector);
461 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
462 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
463 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
464 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
465
466 // Update the LDS size used by the kernel.
467 Type *Ty = LDSParams.SwLDS->getValueType();
468 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
469 uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
470 LDSParams.LDSSize = AlignedSize;
471 SmallString<128> MDTypeStr;
472 raw_svector_ostream MDTypeOS(MDTypeStr);
473 MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
474 StructType *MetadataStructType =
475 StructType::create(Ctx, Items, MDTypeOS.str());
476 SmallString<128> MDStr;
477 raw_svector_ostream MDOS(MDStr);
478 MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
479 LDSParams.SwLDSMetadata = new GlobalVariable(
480 M, MetadataStructType, false, GlobalValue::InternalLinkage,
481 PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
483 Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
484 LDSParams.SwLDSMetadata->setInitializer(data);
485 assert(LDSParams.SwLDS);
486 // Set the alignment to MaxAlignment for SwLDS.
487 LDSParams.SwLDS->setAlignment(MaxAlignment);
488 if (LDSParams.SwDynLDS)
489 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
491 MD.NoAddress = true;
492 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
493}
494
495void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
496 // Fill the corresponding LDS replacement indices for each LDS access
497 // related to this kernel.
498 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
499 SetVector<GlobalVariable *> UniqueLDSGlobals;
500 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
501 uint32_t &Idx) {
502 for (auto &GV : LDSGlobals) {
503 if (is_contained(UniqueLDSGlobals, GV))
504 continue;
505 UniqueLDSGlobals.insert(GV);
506 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
507 ++Idx;
508 }
509 };
510 uint32_t Idx = 0;
511 SetVector<GlobalVariable *> SwLDSVector;
512 SwLDSVector.insert(LDSParams.SwLDS);
513 PopulateIndices(SwLDSVector, Idx);
514 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
515 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
516 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
517 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
518}
519
520static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
521 Value *Replacement) {
522 // Replace all uses of LDS global in this Function with a Replacement.
523 auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
524 auto *V = U.getUser();
525 if (auto *Inst = dyn_cast<Instruction>(V)) {
526 auto *Func1 = Inst->getFunction();
527 if (Func == Func1)
528 return true;
529 }
530 return false;
531 };
532 GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
533}
534
535void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
536 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
537 GlobalVariable *SwLDS = LDSParams.SwLDS;
538 assert(SwLDS);
539 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
540 assert(SwLDSMetadata);
541 StructType *SwLDSMetadataStructType =
542 cast<StructType>(SwLDSMetadata->getValueType());
543 Type *Int32Ty = IRB.getInt32Ty();
544 auto &IndirectAccess = LDSParams.IndirectAccess;
545 auto &DirectAccess = LDSParams.DirectAccess;
546 // Replace all uses of LDS global in this Function with a Replacement.
547 SetVector<GlobalVariable *> UniqueLDSGlobals;
548 auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
549 for (auto &GV : LDSGlobals) {
550 // Do not generate instructions if LDS access is in non-kernel
551 // i.e indirect-access.
552 if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
553 IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
554 (!DirectAccess.StaticLDSGlobals.contains(GV) &&
555 !DirectAccess.DynamicLDSGlobals.contains(GV)))
556 continue;
557 if (is_contained(UniqueLDSGlobals, GV))
558 continue;
559 UniqueLDSGlobals.insert(GV);
560 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
561 assert(Indices.size() == 3);
562 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
563 ConstantInt::get(Int32Ty, Indices[1]),
564 ConstantInt::get(Int32Ty, Indices[2])};
566 SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
568 Value *BasePlusOffset =
569 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
570 LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
571 false));
572 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
573 }
574 };
575 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
576 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
577 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
578 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
579}
580
581void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
582 Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
583 SetVector<GlobalVariable *> &DynamicLDSGlobals) {
584 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
585 Type *Int32Ty = IRB.getInt32Ty();
586
587 GlobalVariable *SwLDS = LDSParams.SwLDS;
588 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
589 assert(SwLDS && SwLDSMetadata);
590 StructType *MetadataStructType =
591 cast<StructType>(SwLDSMetadata->getValueType());
592 unsigned MaxAlignment = SwLDS->getAlignment();
593 Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
594 Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);
595
596 for (GlobalVariable *DynGV : DynamicLDSGlobals) {
597 auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
598 // Update the Offset metadata.
599 Constant *Index0 = ConstantInt::get(Int32Ty, 0);
600 Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);
601
602 Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
603 auto *GEPForOffset = IRB.CreateInBoundsGEP(
604 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});
605
606 IRB.CreateStore(*CurrMallocSize, GEPForOffset);
607 // Update the size and Aligned Size metadata.
608 Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
609 auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
610 {Index0, Index1, Index2Size});
611
612 Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
613 IRB.CreateStore(CurrDynLDSSize, GEPForSize);
614 Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
615 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
616 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});
617
618 Value *AlignedDynLDSSize =
619 IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
620 AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
621 AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
622 IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);
623
624 // Update the Current Malloc Size
625 *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
626 }
627}
628
629static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
630 DISubprogram *SP) {
631 assert(InsertBefore);
632 if (InsertBefore->getDebugLoc())
633 return InsertBefore->getDebugLoc();
634 if (SP)
635 return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
636 return DebugLoc();
637}
638
639void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
640 Function *Func, SetVector<Instruction *> &LDSInstructions) {
641 for (BasicBlock &BB : *Func) {
642 for (Instruction &Inst : BB) {
643 if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
644 if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
645 LDSInstructions.insert(&Inst);
646 } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
647 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
648 LDSInstructions.insert(&Inst);
649 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
650 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
651 LDSInstructions.insert(&Inst);
652 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
653 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
654 LDSInstructions.insert(&Inst);
655 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) {
656 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
657 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
658 LDSInstructions.insert(&Inst);
659 } else if (AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(&Inst)) {
660 if (MI->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
661 LDSInstructions.insert(&Inst);
662 } else if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
663 if (MTI->getSourceAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
664 LDSInstructions.insert(&Inst);
665 }
666 } else
667 continue;
668 }
669 }
670}
671
672Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
673 Value *LDSPtr) {
674 assert(LDSPtr && "Invalid LDS pointer operand");
675 Type *LDSPtrType = LDSPtr->getType();
676 LLVMContext &Ctx = M.getContext();
677 const DataLayout &DL = M.getDataLayout();
678 Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
679 if (auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) {
680 // Handle vector of pointers
681 ElementCount NumElements = VecPtrTy->getElementCount();
682 IntTy = VectorType::get(IntTy, NumElements);
683 }
684 Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy);
685 return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex});
686}
687
688void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
689 Function *Func, Value *LoadMallocPtr,
690 SetVector<Instruction *> &LDSInstructions) {
691 LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
692 << Func->getName());
693 for (Instruction *Inst : LDSInstructions) {
694 IRB.SetInsertPoint(Inst);
695 if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
696 Value *LIOperand = LI->getPointerOperand();
697 Value *Replacement =
698 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand);
699 LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
700 LI->getAlign(), LI->isVolatile());
701 NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
702 AsanInfo.Instructions.insert(NewLI);
703 LI->replaceAllUsesWith(NewLI);
704 LI->eraseFromParent();
705 } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
706 Value *SIOperand = SI->getPointerOperand();
707 Value *Replacement =
708 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand);
709 StoreInst *NewSI = IRB.CreateAlignedStore(
710 SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
711 NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
712 AsanInfo.Instructions.insert(NewSI);
713 SI->replaceAllUsesWith(NewSI);
714 SI->eraseFromParent();
715 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
716 Value *RMWPtrOperand = RMW->getPointerOperand();
717 Value *RMWValOperand = RMW->getValOperand();
718 Value *Replacement =
719 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand);
720 AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
721 RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
722 RMW->getOrdering(), RMW->getSyncScopeID());
723 NewRMW->setVolatile(RMW->isVolatile());
724 AsanInfo.Instructions.insert(NewRMW);
725 RMW->replaceAllUsesWith(NewRMW);
726 RMW->eraseFromParent();
727 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) {
728 Value *XCHGPtrOperand = XCHG->getPointerOperand();
729 Value *Replacement =
730 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand);
732 Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
733 XCHG->getAlign(), XCHG->getSuccessOrdering(),
734 XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
735 NewXCHG->setVolatile(XCHG->isVolatile());
736 AsanInfo.Instructions.insert(NewXCHG);
737 XCHG->replaceAllUsesWith(NewXCHG);
738 XCHG->eraseFromParent();
739 } else if (AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
740 Value *NewDest = MI->getRawDest();
741 if (MI->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
742 NewDest = getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, NewDest);
743 CallInst *NewMI = nullptr;
745 if (MI->isAtomic()) {
747 NewDest, MSI->getValue(), MSI->getLength(),
748 MSI->getDestAlign().valueOrOne(), MSI->getElementSizeInBytes());
749 } else {
750 NewMI = IRB.CreateMemSet(NewDest, MSI->getValue(), MSI->getLength(),
751 MSI->getDestAlign(),
752 cast<MemSetInst>(MI)->isVolatile());
753 }
755 Value *NewSrc = MTI->getRawSource();
756 if (MTI->getSourceAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
757 NewSrc = getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, NewSrc);
758 if (MI->isAtomic()) {
759 if (MI->getIntrinsicID() ==
760 Intrinsic::memmove_element_unordered_atomic) {
762 NewDest, MTI->getDestAlign().valueOrOne(), NewSrc,
763 MTI->getSourceAlign().valueOrOne(), MTI->getLength(),
764 MTI->getElementSizeInBytes());
765 } else {
767 NewDest, MTI->getDestAlign().valueOrOne(), NewSrc,
768 MTI->getSourceAlign().valueOrOne(), MTI->getLength(),
769 MTI->getElementSizeInBytes());
770 }
771 } else {
772 NewMI = IRB.CreateMemTransferInst(
773 MI->getIntrinsicID(), NewDest, MTI->getDestAlign(), NewSrc,
774 MTI->getSourceAlign(), MTI->getLength(),
775 cast<MemTransferInst>(MI)->isVolatile());
776 }
777 } else
778 reportFatalUsageError("Unimplemented LDS lowering memory intrinsic");
779 AsanInfo.Instructions.insert(NewMI);
780 MI->replaceAllUsesWith(NewMI);
781 MI->eraseFromParent();
782 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) {
783 Value *AIOperand = ASC->getPointerOperand();
784 Value *Replacement =
785 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand);
786 Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType());
787 // Note: No need to add the instruction to AsanInfo instructions to be
788 // instrumented list. FLAT_ADDRESS ptr would have been already
789 // instrumented by asan pass prior to this pass.
790 ASC->replaceAllUsesWith(NewAI);
791 ASC->eraseFromParent();
792 } else
793 report_fatal_error("Unimplemented LDS lowering instruction");
794 }
795}
796
797void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
798 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
799 Type *Int64Ty = IRB.getInt64Ty();
800 Type *VoidTy = IRB.getVoidTy();
801 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
802 "__asan_poison_region",
803 FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false));
804
805 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
806 size_t VecSize = RedzonesVec.size();
807 for (unsigned i = 0; i < VecSize; i++) {
808 auto &RedzonePair = RedzonesVec[i];
809 uint64_t RedzoneOffset = RedzonePair.first;
810 uint64_t RedzoneSize = RedzonePair.second;
811 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
812 IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
813 Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
814 IRB.CreateCall(AsanPoisonRegion,
815 {RedzoneAddress, IRB.getInt64(RedzoneSize)});
816 }
817}
818
/// Lower the LDS accesses of kernel \p Func.
///
/// Two blocks are inserted ahead of the original entry block: "WId", which
/// tests whether the OR of the x, y and z workitem ids is zero, and "Malloc",
/// which only that work item enters in order to call __asan_malloc_impl, store
/// the result to the kernel's SwLDS global and poison the redzones. The
/// original entry block then starts with a phi recording which path was taken,
/// followed by an amdgcn_s_barrier. Every block ending in a return is
/// redirected to "CondFree", which after another barrier lets the allocating
/// work item call __asan_free_impl in "Free" before control reaches the single
/// return in "End".
///
/// \param Func Kernel to rewrite; its KernelToLDSParametersMap entry must
///             already have SwLDS and SwLDSMetadata set (asserted below).
/// \param DTU  Receives edge insertions for the newly created blocks.
819void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
820 DomTreeUpdater &DTU) {
821 LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
822 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
823 auto &Ctx = M.getContext();
824 auto *PrevEntryBlock = &Func->getEntryBlock();
// Collect the LDS memory instructions before any new instruction is created.
825 SetVector<Instruction *> LDSInstructions;
826 getLDSMemoryInstructions(Func, LDSInstructions);
827 const DataLayout &DL = M.getDataLayout();
828
829 // Create malloc block.
830 auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);
831
832 // Create WIdBlock block which has instructions related to selection of
833 // {0,0,0} index work item in the work group.
834 auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
835
836 // Move constant-size allocas from the original entry block to the new entry
837 // block (WIdBlock) so they remain static allocas. Splice the leading cluster
838 // in bulk, then move any stragglers that are interleaved with other
839 // instructions.
840 auto SplitIt = PrevEntryBlock->getFirstNonPHIOrDbgOrAlloca();
841 WIdBlock->splice(WIdBlock->end(), PrevEntryBlock, PrevEntryBlock->begin(),
842 SplitIt);
843 for (Instruction &I : make_early_inc_range(*PrevEntryBlock))
844 if (auto *AI = dyn_cast<AllocaInst>(&I))
845 if (isa<ConstantInt>(AI->getArraySize()))
846 AI->moveBefore(*WIdBlock, WIdBlock->end());
847
848 IRB.SetInsertPoint(WIdBlock, WIdBlock->end());
849 DebugLoc FirstDL =
850 getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
851 IRB.SetCurrentDebugLocation(FirstDL);
// OR the three workitem ids together; the result is zero only when all three
// ids are zero.
852 Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
853 Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {});
854 Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {});
855 Value *XYOr = IRB.CreateOr(WIdx, WIdy);
856 Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
857 Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
858
859 // All work items will branch to PrevEntryBlock except {0,0,0} index
860 // work item which will branch to malloc block.
861 IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
862
863 // Malloc block
864 IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
865
866 // If Dynamic LDS globals are accessed by the kernel,
867 // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
868 // Update the corresponding metadata global entries for this dyn lds global.
869 GlobalVariable *SwLDS = LDSParams.SwLDS;
870 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
871 assert(SwLDS && SwLDSMetadata);
872 StructType *MetadataStructType =
873 cast<StructType>(SwLDSMetadata->getValueType());
874 uint32_t MallocSize = 0;
875 Value *CurrMallocSize;
876 Type *Int32Ty = IRB.getInt32Ty();
877 Type *Int64Ty = IRB.getInt64Ty();
878
// Helper that appends the given globals to UniqueLDSGlobals, skipping any
// that are already present.
879 SetVector<GlobalVariable *> UniqueLDSGlobals;
880 auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
881 for (auto &GV : LDSGlobals) {
882 if (is_contained(UniqueLDSGlobals, GV))
883 continue;
884 UniqueLDSGlobals.insert(GV);
885 }
886 };
887
888 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
889 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
// NOTE(review): NumStaticLDS is 1 + the number of unique static globals, so
// it is never zero and the else branch of the test below is not reached.
890 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
891 UniqueLDSGlobals.clear();
892
893 if (NumStaticLDS) {
// Load fields 0 and 2 of metadata member (NumStaticLDS - 1) and add them to
// obtain the size of the static part of the allocation.
894 auto *GEPForEndStaticLDSOffset =
895 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
896 {ConstantInt::get(Int32Ty, 0),
897 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
898 ConstantInt::get(Int32Ty, 0)});
899
900 auto *GEPForEndStaticLDSSize =
901 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
902 {ConstantInt::get(Int32Ty, 0),
903 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
904 ConstantInt::get(Int32Ty, 2)});
905
906 Value *EndStaticLDSOffset =
907 IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
908 Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
909 CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
910 } else
911 CurrMallocSize = IRB.getInt32(MallocSize);
912
913 if (LDSParams.SwDynLDS) {
// NOTE(review): embedded lines 914-915 are absent from this listing; the
// string below is the tail of a statement that starts on them.
916 "Dynamic LDS size query is only supported for CO V5 and later.");
917 // Get size from hidden dyn_lds_size argument of kernel
// NOTE(review): embedded line 918 is absent from this listing; presumably it
// declares the ImplicitArg value used below - confirm against the real file.
919 IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {});
920 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
921 ImplicitArg->getType(), ImplicitArg,
922 {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
923 UniqueLDSGlobals.clear();
924 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
925 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
926 updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
927 UniqueLDSGlobals);
928 }
929
// The size has been computed as i32; the allocator is declared with i64
// parameters, so widen it.
930 CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);
931
932 // Create a call to malloc function which does device global memory allocation
933 // with size equals to all LDS global accesses size in this kernel.
934 Value *ReturnAddress = IRB.CreateIntrinsic(
935 Intrinsic::returnaddress, IRB.getPtrTy(DL.getProgramAddressSpace()),
936 {IRB.getInt32(0)});
937 FunctionCallee MallocFunc = M.getOrInsertFunction(
938 StringRef("__asan_malloc_impl"),
939 FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false));
940 Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
941 Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});
942
943 Value *MallocPtr =
// NOTE(review): embedded line 944 (the initializer of MallocPtr) is absent
// from this listing.
945
946 // Create store of malloc to new global
947 IRB.CreateStore(MallocPtr, SwLDS);
948
949 // Create calls to __asan_poison_region to poison redzones.
950 poisonRedzones(Func, MallocPtr);
951
952 // Create branch to PrevEntryBlock
953 IRB.CreateBr(PrevEntryBlock);
954
955 // Create wave-group barrier at the starting of Previous entry block
956 Type *Int1Ty = IRB.getInt1Ty();
957 IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
// The phi is true only when control arrived through MallocBlock; it later
// selects which work item frees the allocation.
958 auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
959 XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
960 XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
961
962 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
963
964 // Load malloc pointer from Sw LDS.
965 Value *LoadMallocPtr =
// NOTE(review): embedded line 966 (the initializer of LoadMallocPtr) is absent
// from this listing.
967
968 // Replace All uses of LDS globals with new LDS pointers.
969 replaceKernelLDSAccesses(Func);
970
971 // Replace Memory Operations on LDS with corresponding
972 // global memory pointers.
973 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
974 LDSInstructions);
975
976 auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
977 auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
978 auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
// Redirect every block that ends in a return to CondFreeBlock. The three
// blocks created just above are still empty here and are skipped.
979 for (BasicBlock &BB : *Func) {
980 if (!BB.empty()) {
981 if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
982 RI->eraseFromParent();
983 IRB.SetInsertPoint(&BB, BB.end());
984 IRB.CreateBr(CondFreeBlock);
985 }
986 }
987 }
988
989 // Cond Free Block
990 IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
991 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
992 IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
993
994 // Free Block
995 IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
996
997 // Free the previously allocated device global memory.
998 FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
999 StringRef("__asan_free_impl"),
1000 FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
1001 Value *ReturnAddr = IRB.CreateIntrinsic(
1002 Intrinsic::returnaddress, IRB.getPtrTy(DL.getProgramAddressSpace()),
1003 IRB.getInt32(0));
1004 Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
1005 Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
1006 IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
1007
1008 IRB.CreateBr(EndBlock);
1009
1010 // End Block
1011 IRB.SetInsertPoint(EndBlock, EndBlock->begin());
1012 IRB.CreateRetVoid();
1013 // Update the DomTree with corresponding links to basic blocks.
// NOTE(review): the WIdBlock->PrevEntryBlock and CondFreeBlock->EndBlock
// edges, and the edges from the former return blocks to CondFreeBlock, are
// created above but not listed here - confirm whether that is intended.
1014 DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
1015 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
1016 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
1017 {DominatorTree::Insert, FreeBlock, EndBlock}});
1018}
1019
/// Build the constant array holding, for kernel \p Func, one element per
/// entry of \p Variables (in the same order).
///
/// For a variable present in the kernel's LDSToReplacementIndicesMap the
/// element is a constant GEP into the kernel's SwLDSMetadata global using the
/// three indices stored in the map. Variables not present in the map get the
/// element pushed on a line that is missing from this listing (see note).
///
/// \returns a ConstantArray of type KernelOffsetsType.
1020Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
1021 Function *Func, SetVector<GlobalVariable *> &Variables) {
1022 Type *Int32Ty = IRB.getInt32Ty();
1023 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1024
1025 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
1026 assert(SwLDSMetadata);
1027 auto *SwLDSMetadataStructType =
1028 cast<StructType>(SwLDSMetadata->getValueType());
1029 ArrayType *KernelOffsetsType =
// NOTE(review): embedded line 1030 (the initializer of KernelOffsetsType) is
// absent from this listing.
1031
1032 SmallVector<Constant *> Elements;
1033 for (auto *GV : Variables) {
1034 auto It = LDSParams.LDSToReplacementIndicesMap.find(GV);
1035 if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
1036 Elements.push_back(
// NOTE(review): embedded line 1037 (the placeholder element pushed for a
// variable this kernel does not replace) is absent from this listing.
1038 continue;
1039 }
1040 auto &Indices = It->second;
1041 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
1042 ConstantInt::get(Int32Ty, Indices[1]),
1043 ConstantInt::get(Int32Ty, Indices[2])};
1044 Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType,
1045 SwLDSMetadata, GEPIdx, true);
1046 Elements.push_back(GEP);
1047 }
1048 return ConstantArray::get(KernelOffsetsType, Elements);
1049}
1050
/// Create the "llvm.amdgcn.sw.lds.base.table" global and store it in
/// \p NKLDSParams.LDSBaseTable.
///
/// The table is a constant, internal-linkage array with one LOCAL_ADDRESS
/// pointer per entry of NKLDSParams.OrderedKernels; element i is the SwLDS
/// global of kernel i. Nothing is created when OrderedKernels is empty.
1051void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
1052 NonKernelLDSParameters &NKLDSParams) {
1053 // Base table will have single row, with elements of the row
1054 // placed as per kernel ID. Each element in the row corresponds
1055 // to address of "SW LDS" global of the kernel.
1056 auto &Kernels = NKLDSParams.OrderedKernels;
1057 if (Kernels.empty())
1058 return;
1059 const size_t NumberKernels = Kernels.size();
1060 ArrayType *AllKernelsOffsetsType =
1061 ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels);
1062 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
1063 for (size_t i = 0; i < NumberKernels; i++) {
1064 Function *Func = Kernels[i];
1065 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1066 OverallConstantExprElts[i] = LDSParams.SwLDS;
1067 }
1068 Constant *init =
1069 ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
1070 NKLDSParams.LDSBaseTable = new GlobalVariable(
1071 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
1072 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
// NOTE(review): embedded lines 1073-1074 (the remaining constructor arguments
// and the declaration of MD) are absent from this listing.
1075 MD.NoAddress = true;
1076 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1077}
1078
/// Create the "llvm.amdgcn.sw.lds.offset.table" global and store it in
/// \p NKLDSParams.LDSOffsetTable.
///
/// The table is a constant, internal-linkage two-dimensional array: one row
/// per entry of NKLDSParams.OrderedKernels, one GLOBAL_ADDRESS pointer per
/// entry of NKLDSParams.OrdereLDSGlobals. Each row is produced by
/// getAddressesOfVariablesInKernel. Nothing is created when either list is
/// empty.
1079void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1080 NonKernelLDSParameters &NKLDSParams) {
1081 // Offset table will have multiple rows and columns.
1082 // Rows are assumed to be from 0 to (n-1). n is total number
1083 // of kernels accessing the LDS through non-kernels.
1084 // Each row will have m elements. m is the total number of
1085 // unique LDS globals accessed by non-kernels.
1086 // Each element in the row correspond to the address of
1087 // the replacement of LDS global done by that particular kernel.
1088 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1089 auto &Kernels = NKLDSParams.OrderedKernels;
1090 if (Variables.empty() || Kernels.empty())
1091 return;
1092 const size_t NumberVariables = Variables.size();
1093 const size_t NumberKernels = Kernels.size();
1094
1095 ArrayType *KernelOffsetsType =
1096 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables);
1097
1098 ArrayType *AllKernelsOffsetsType =
1099 ArrayType::get(KernelOffsetsType, NumberKernels);
1100 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1101 for (size_t i = 0; i < NumberKernels; i++) {
1102 Function *Func = Kernels[i];
1103 overallConstantExprElts[i] =
1104 getAddressesOfVariablesInKernel(Func, Variables);
1105 }
1106 Constant *Init =
1107 ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
1108 NKLDSParams.LDSOffsetTable = new GlobalVariable(
1109 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1110 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
// NOTE(review): embedded lines 1111-1112 (the remaining constructor arguments
// and the declaration of MD) are absent from this listing.
1113 MD.NoAddress = true;
1114 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1115}
1116
1117void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1118 Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
1119 NonKernelLDSParameters &NKLDSParams) {
1120 // Replace LDS access in non-kernel with replacement queried from
1121 // Base table and offset from offset table.
1122 LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
1123 << Func->getName());
1124 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1125 IRB.SetInsertPoint(InsertAt);
1126
1127 // Get LDS memory instructions.
1128 SetVector<Instruction *> LDSInstructions;
1129 getLDSMemoryInstructions(Func, LDSInstructions);
1130
1131 auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {});
1132 GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
1133 GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
1134 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1135 Value *BaseGEP = IRB.CreateInBoundsGEP(
1136 LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
1137 Value *BaseLoad =
1138 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP);
1139 Value *LoadMallocPtr =
1140 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad);
1141
1142 for (GlobalVariable *GV : LDSGlobals) {
1143 const auto *GVIt = llvm::find(OrdereLDSGlobals, GV);
1144 assert(GVIt != OrdereLDSGlobals.end());
1145 uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);
1146
1147 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1148 LDSOffsetTable->getValueType(), LDSOffsetTable,
1149 {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
1150 Value *OffsetLoad =
1151 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP);
1152 Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
1153 Value *BasePlusOffset =
1154 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
1155 LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
1156 << GV->getName());
1157 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
1158 }
1159 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1160 LDSInstructions);
1161}
1162
1163static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1164 // Sort Static, dynamic LDS globals which are either
1165 // direct or indirect access on basis of name.
1166 auto &DirectAccess = LDSParams.DirectAccess;
1167 auto &IndirectAccess = LDSParams.IndirectAccess;
1168 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1169 std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1170 DirectAccess.StaticLDSGlobals.end()));
1171 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1172 std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1173 DirectAccess.DynamicLDSGlobals.end()));
1174 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1175 std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1176 IndirectAccess.StaticLDSGlobals.end()));
1177 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1178 std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1179 IndirectAccess.DynamicLDSGlobals.end()));
1180}
1181
/// Query the AddressSanitizer shadow-mapping parameters for the target triple
/// and cache the resulting scale and offset in AsanInfo. The pointer width
/// passed to the query is that of AMDGPUAS::GLOBAL_ADDRESS.
1182void AMDGPUSwLowerLDS::initAsanInfo() {
1183 // Get Shadow mapping scale and offset.
1184 unsigned LongSize =
1185 M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS);
// NOTE(review): embedded line 1186 is absent from this listing; presumably it
// declares the Offset variable used below.
1187 int Scale;
// OrShadowOffset is filled in by the query but not used afterwards.
1188 bool OrShadowOffset;
1189 llvm::getAddressSanitizerParams(AMDGPUTM.getTargetTriple(), LongSize, false,
1190 &Offset, &Scale, &OrShadowOffset);
1191 AsanInfo.Scale = Scale;
1192 AsanInfo.Offset = Offset;
1193}
1194
1195static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) {
1196 for (auto &K : LDSAccesses) {
1197 Function *F = K.first;
1198 if (!F)
1199 continue;
1200 if (F->hasFnAttribute(Attribute::SanitizeAddress))
1201 return true;
1202 }
1203 return false;
1204}
1205
/// Run the lowering over the module.
///
/// Steps, in order: collect the direct and indirect LDS uses per kernel; stop
/// if no function in either map has the SanitizeAddress attribute; classify
/// each kernel's LDS globals as static/dynamic and direct/indirect; lower each
/// kernel that has any LDS global; build the base and offset tables and lower
/// the non-kernel functions; clean up globals; and, when AsanInstrumentLDS is
/// set, instrument the memory operations recorded in AsanInfo.Instructions.
///
/// \returns true if the module was changed.
1206bool AMDGPUSwLowerLDS::run() {
1207 bool Changed = false;
1208
1209 CallGraph CG = CallGraph(M);
1210
// NOTE(review): embedded line 1211 is absent from this listing.
1212
1213 // Get all the direct and indirect access of LDS for all the kernels.
1214 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
1215
1216 // Flag to decide whether to lower all the LDS accesses
1217 // based on sanitize_address attribute.
1218 bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.direct_access) ||
1219 hasFnWithSanitizeAddressAttr(LDSUsesInfo.indirect_access);
1220
1221 if (!LowerAllLDS)
1222 return Changed;
1223
1224 // Utility to group LDS access into direct, indirect, static and dynamic.
1225 auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
1226 bool DirectAccess) {
1227 for (auto &K : LDSAccesses) {
1228 Function *F = K.first;
1229 if (!F || K.second.empty())
1230 continue;
1231
1232 assert(isKernel(*F));
1233
1234 // Only inserts if key isn't already in the map.
1235 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1236 {F, KernelLDSParameters()});
1237
1238 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
1239 if (!DirectAccess)
1240 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F);
1241 for (GlobalVariable *GV : K.second) {
1242 if (!DirectAccess) {
1243 if (AMDGPU::isDynamicLDS(*GV))
1244 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
1245 else
1246 LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
// Indirectly accessed globals are additionally recorded module-wide.
1247 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
1248 } else {
1249 if (AMDGPU::isDynamicLDS(*GV))
1250 LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
1251 else
1252 LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
1253 }
1254 }
1255 }
1256 };
1257
1258 PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
1259 PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);
1260
1261 // Get address sanitizer scale.
1262 initAsanInfo();
1263
1264 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1265 Function *Func = K.first;
1266 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1267 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1268 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1269 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1270 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
// NOTE(review): this assignment overwrites a `true` set by an earlier
// iteration of this loop - confirm that this is intended.
1271 Changed = false;
1272 } else {
// NOTE(review): embedded line 1273 is absent from this listing; the three
// lines below are the arguments of a call that starts on it.
1274 CG, Func,
1275 {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
1276 "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"});
1277 if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
1278 !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
1279 removeFnAttrFromReachable(CG, Func, {"amdgpu-no-lds-kernel-id"});
1280 reorderStaticDynamicIndirectLDSSet(LDSParams);
1281 buildSwLDSGlobal(Func);
1282 buildSwDynLDSGlobal(Func);
1283 populateSwMetadataGlobal(Func);
1284 populateSwLDSAttributeAndMetadata(Func);
1285 populateLDSToReplacementIndicesMap(Func);
1286 DomTreeUpdater DTU(DTCallback(*Func),
1287 DomTreeUpdater::UpdateStrategy::Lazy);
1288 lowerKernelLDSAccesses(Func, DTU);
1289 Changed = true;
1290 }
1291 }
1292
1293 // Get the Uses of LDS from non-kernels.
1294 getUsesOfLDSByNonKernels();
1295
1296 // Get non-kernels with LDS ptr as argument and called by kernels.
1297 getNonKernelsWithLDSArguments(CG);
1298
1299 // Lower LDS accesses in non-kernels.
1300 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1301 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1302 NonKernelLDSParameters NKLDSParams;
1303 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1304 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1305 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1306 FuncLDSAccessInfo.AllNonKernelLDSAccess);
1307 buildNonKernelLDSBaseTable(NKLDSParams);
1308 buildNonKernelLDSOffsetTable(NKLDSParams);
1309 for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1310 Function *Func = K.first;
1311 DenseSet<GlobalVariable *> &LDSGlobals = K.second;
// The DenseSet is copied into a vector and passed through sortByName before
// being handed to the lowering.
1312 SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
1313 std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
1314 lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
1315 }
// Functions that only receive LDS pointers as arguments and were not already
// handled by the loop above.
1316 for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1317 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1318 if (K.contains(Func))
1319 continue;
// NOTE(review): embedded line 1320 is absent from this listing; presumably it
// declares the Vec passed below.
1321 lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
1322 }
1323 Changed = true;
1324 }
1325
1326 if (!Changed)
1327 return Changed;
1328
1329 for (auto &GV : make_early_inc_range(M.globals())) {
// NOTE(review): embedded lines 1330 and 1332 are absent from this listing;
// the two closing braces below imply that 1330 opens a nested block.
1331 // probably want to remove from used lists
1333 if (GV.use_empty())
1334 GV.eraseFromParent();
1335 }
1336 }
1337
1338 if (AsanInstrumentLDS) {
// First gather the interesting memory operands of every recorded
// instruction, then instrument each operand's address.
1339 SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
1340 for (Instruction *Inst : AsanInfo.Instructions) {
1341 SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
1342 getInterestingMemoryOperands(M, Inst, InterestingOperands);
1343 llvm::append_range(OperandsToInstrument, InterestingOperands);
1344 }
1345 for (auto &Operand : OperandsToInstrument) {
1346 Value *Addr = Operand.getPtr();
1347 instrumentAddress(M, IRB, Operand.getInsn(), Operand.getInsn(), Addr,
1348 Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
1349 Operand.IsWrite, nullptr, false, false, AsanInfo.Scale,
1350 AsanInfo.Offset);
1351 Changed = true;
1352 }
1353 }
1354
1355 return Changed;
1356}
1357
/// Legacy pass manager wrapper (a ModulePass) around AMDGPUSwLowerLDS.
1358class AMDGPUSwLowerLDSLegacy : public ModulePass {
1359public:
// Target machine handed to the implementation; may be null at construction,
// in which case runOnModule obtains it from TargetPassConfig.
1360 const AMDGPUTargetMachine *AMDGPUTM;
1361 static char ID;
1362 AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM)
1363 : ModulePass(ID), AMDGPUTM(TM) {}
1364 bool runOnModule(Module &M) override;
1365 void getAnalysisUsage(AnalysisUsage &AU) const override {
// NOTE(review): embedded line 1366 (the body of getAnalysisUsage) is absent
// from this listing.
1367 }
1368};
1369} // namespace
1370
// Storage for the legacy pass ID, plus the llvm:: reference that aliases it.
1371char AMDGPUSwLowerLDSLegacy::ID = 0;
1372char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
1373
// Register the legacy pass under the command-line name "amdgpu-sw-lower-lds".
1374INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1375 "AMDGPU Software lowering of LDS", false, false)
// NOTE(review): embedded line 1376 is absent from this listing.
1377INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1378 "AMDGPU Software lowering of LDS", false, false)
1379
1380bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1381 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1382 // instrumented the IR. Return early if the flag is not present.
1383 if (!M.getModuleFlag("nosanitize_address"))
1384 return false;
1385 DominatorTreeWrapperPass *const DTW =
1386 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1387 auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1388 return DTW ? &DTW->getDomTree() : nullptr;
1389 };
1390 if (!AMDGPUTM) {
1391 auto &TPC = getAnalysis<TargetPassConfig>();
1392 AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>();
1393 }
1394 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1395 bool IsChanged = SwLowerLDSImpl.run();
1396 return IsChanged;
1397}
1398
// Factory returning a newly allocated AMDGPUSwLowerLDSLegacy built from TM.
// NOTE(review): embedded line 1400 (the function name and parameter list) is
// absent from this listing.
1399ModulePass *
1401 return new AMDGPUSwLowerLDSLegacy(TM);
1402}
1403
// New pass manager entry point: runs AMDGPUSwLowerLDS over the module with a
// dominator-tree callback backed by the function analysis manager, and
// reports all analyses preserved when nothing changed.
// NOTE(review): embedded lines 1404-1405 (the function signature) are absent
// from this listing.
1406 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1407 // instrumented the IR. Return early if the flag is not present.
1408 if (!M.getModuleFlag("nosanitize_address"))
1409 return PreservedAnalyses::all();
1410 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1411 auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
1412 return &FAM.getResult<DominatorTreeAnalysis>(F);
1413 };
1414 AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback);
1415 bool IsChanged = SwLowerLDSImpl.run();
1416 if (!IsChanged)
1417 return PreservedAnalyses::all();
1418
// NOTE(review): embedded lines 1419-1420 (the construction of PA) are absent
// from this listing.
1421 return PA;
1422}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Hexagon Common GEP
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file implements a set that has insertion order iteration characteristics.
static Split data
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:119
Target-Independent Code Generator Pass Configuration Options pass.
static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore, DISubprogram *SP)
This class represents a conversion between pointers from one address space to another.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents any memset intrinsic.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
An instruction that atomically checks whether a specified value is in a memory location,...
void setVolatile(bool V)
Specify whether this is a volatile cmpxchg.
an instruction that atomically reads a memory location, combines it with another value,...
void setVolatile(bool V)
Specify whether this is a volatile RMW or not.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A node in the call graph for a module.
Definition CallGraph.h:162
Function * getFunction() const
Returns the function that this call graph node represents.
Definition CallGraph.h:193
The basic data container for the call graph of a Module of IR.
Definition CallGraph.h:72
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * getGetElementPtr(Type *Ty, Constant *C, ArrayRef< Constant * > IdxList, GEPNoWrapFlags NW=GEPNoWrapFlags::none(), std::optional< ConstantRange > InRange=std::nullopt, Type *OnlyIfReducedTy=nullptr)
Getelementptr form.
Definition Constants.h:1464
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Subprogram description. Uses SubclassData1.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:124
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:306
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
arg_iterator arg_end()
Definition Function.h:877
arg_iterator arg_begin()
Definition Function.h:868
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set a particular kind of metadata attachment.
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:337
LLVM_ABI void setSanitizerMetadata(SanitizerMetadata Meta)
Definition Globals.cpp:260
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ ExternalLinkage
Externally visible function.
Definition GlobalValue.h:53
Type * getValueType() const
uint64_t getAlignment() const
FIXME: Remove this function once transition to Align is over.
LLVM_ABI void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
Definition Globals.cpp:538
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition IRBuilder.h:504
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition IRBuilder.h:1979
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:571
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1945
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1238
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2247
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:247
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition IRBuilder.h:591
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2028
LLVM_ABI CallInst * CreateElementUnorderedAtomicMemMove(Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, uint32_t ElementSize, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert an element unordered-atomic memmove between the specified pointers.
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1495
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
UncondBrInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition IRBuilder.h:1232
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2549
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2384
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1928
CallInst * CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val, uint64_t Size, Align Alignment, uint32_t ElementSize, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert an element unordered-atomic memset of the region of memory starting at the given po...
Definition IRBuilder.h:681
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, MaybeAlign Align, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memset to the specified pointer and the specified value.
Definition IRBuilder.h:660
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2130
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition IRBuilder.h:1209
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1941
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2242
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2563
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:629
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Type * getVoidTy()
Fetch the type representing void.
Definition IRBuilder.h:624
LLVM_ABI CallInst * CreateElementUnorderedAtomicMemCpy(Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, uint32_t ElementSize, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert an element unordered-atomic memcpy between the specified pointers.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition IRBuilder.h:1964
LLVM_ABI CallInst * CreateMemTransferInst(Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:576
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2257
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System, bool Elementwise=false)
Definition IRBuilder.h:1992
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1069
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
Root of the metadata hierarchy.
Definition Metadata.h:64
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition Pass.h:255
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
A container for an operand bundle being viewed as a set of values rather than a set of uses.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
Return a value (possibly void), from a function.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Class to represent struct types.
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:685
const Triple & getTargetTriple() const
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
bool use_empty() const
Definition Value.h:346
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:561
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
An efficient, type-erasing, non-owning reference to a callable.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to a SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
Changed
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
void getInterestingMemoryOperands(Module &M, Instruction *I, SmallVectorImpl< InterestingMemoryOperand > &Interesting)
Get all the memory operands from the instruction that needs to be instrumented.
bool isDynamicLDS(const GlobalVariable &GV)
unsigned getAMDHSACodeObjectVersion(const Module &M)
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, ArrayRef< StringRef > FnAttrs)
Strip FnAttr attribute from any functions where we may have introduced its use.
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)
DenseMap< Function *, DenseSet< GlobalVariable * > > FunctionVariableMap
bool isLDSVariableToLower(const GlobalVariable &GV)
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M)
Align getAlign(const DataLayout &DL, const GlobalVariable *GV)
void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, Align Alignment, TypeSize TypeStoreSize, bool IsWrite, Value *SizeArgument, bool UseCalls, bool Recover, int AsanScale, int AsanOffset)
Instrument the memory operand Addr.
uint64_t getRedzoneSizeForGlobal(int AsanScale, uint64_t SizeInBytes)
Given SizeInBytes of the Value to be instrumented, returns the redzone size corresponding to it.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
ModulePass * createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
constexpr from_range_t from_range
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
char & AMDGPUSwLowerLDSLegacyPassID
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
LLVM_ABI void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, bool IsKasan, uint64_t *ShadowBase, int *MappingScale, bool *OrShadowOffset)
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
const AMDGPUTargetMachine & TM
Definition AMDGPU.h:333
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
FunctionVariableMap direct_access
FunctionVariableMap indirect_access
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39