LLVM 20.0.0git
AMDGPUSwLowerLDS.cpp
Go to the documentation of this file.
1//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers the local data store, LDS, uses in kernel and non-kernel
10// functions in module to use dynamically allocated global memory.
11// Packed LDS Layout is emulated in the global memory.
12// The lowered memory instructions from LDS to global memory are then
13// instrumented for address sanitizer, to catch addressing errors.
14// This pass only work when address sanitizer has been enabled and has
15// instrumented the IR. It identifies that IR has been instrumented using
16// "nosanitize_address" module flag.
17//
18// Replacement of Kernel LDS accesses:
19// For a kernel, LDS access can be static or dynamic which are direct
20// (accessed within kernel) and indirect (accessed through non-kernels).
21// All these LDS accesses corresponding to kernel will be packed together,
22// where all static LDS accesses will be allocated first and then dynamic
23// LDS follows. The total size with alignment is calculated. A new LDS global
24// will be created for the kernel called "SW LDS" and it will have the
25// attribute "amdgpu-lds-size" attached with value of the size calculated.
26// All the LDS accesses in the module will be replaced by GEP with offset
27// into the "Sw LDS".
28// A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
29// the dynamic LDS. This will be marked used by kernel and will have
30// MD_absolute_symbol metadata set to total static LDS size, since dynamic
31// LDS allocation starts after all static LDS allocation.
32//
33// A device global memory equal to the total LDS size will be allocated.
34// At the prologue of the kernel, a single work-item from the
35// work-group, does a "malloc" and stores the pointer of the
36// allocation in "SW LDS".
37//
38// To store the offsets corresponding to all LDS accesses, another global
39// variable is created which will be called "SW LDS metadata" in this pass.
40// - SW LDS Global:
41// It is LDS global of ptr type with name
42// "llvm.amdgcn.sw.lds.<kernel-name>".
43// - Metadata Global:
44// It is of struct type, with n members. n equals the number of LDS
45// globals accessed by the kernel(direct and indirect). Each member of
46// struct is another struct of type {i32, i32, i32}. First member
47// corresponds to offset, second member corresponds to size of LDS global
48// being replaced and third represents the total aligned size. It will
49// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
50// an initializer with static LDS related offsets and sizes initialized.
51// But for dynamic LDS related entries, offsets will be initialized to
52// previous static LDS allocation end offset. Sizes for them will be zero
53// initially. These dynamic LDS offset and size values will be updated
54// within the kernel, since kernel can read the dynamic LDS size
55// allocation done at runtime with query to "hidden_dynamic_lds_size"
56// hidden kernel argument.
57//
58// At the epilogue of kernel, allocated memory would be made free by the same
59// single work-item.
60//
61// Replacement of non-kernel LDS accesses:
62// Multiple kernels can access the same non-kernel function.
63// All the kernels accessing LDS through non-kernels are sorted and
64// assigned a kernel-id. All the LDS globals accessed by non-kernels
65// are sorted. This information is used to build two tables:
66// - Base table:
67// Base table will have single row, with elements of the row
68// placed as per kernel ID. Each element in the row corresponds
69// to ptr of "SW LDS" variable created for that kernel.
70// - Offset table:
71// Offset table will have multiple rows and columns.
72// Rows are assumed to be from 0 to (n-1). n is total number
73// of kernels accessing the LDS through non-kernels.
74// Each row will have m elements. m is the total number of
75// unique LDS globals accessed by all non-kernels.
76// Each element in the row correspond to the ptr of
77// the replacement of LDS global done by that particular kernel.
78// A LDS variable in non-kernel will be replaced based on the information
79// from base and offset tables. Based on kernel-id query, ptr of "SW
80// LDS" for that corresponding kernel is obtained from base table.
81// The Offset into the base "SW LDS" is obtained from
82// corresponding element in offset table. With this information, replacement
83// value is obtained.
84//===----------------------------------------------------------------------===//
85
86#include "AMDGPU.h"
88#include "AMDGPUMemoryUtils.h"
89#include "AMDGPUTargetMachine.h"
90#include "llvm/ADT/DenseMap.h"
91#include "llvm/ADT/DenseSet.h"
92#include "llvm/ADT/SetVector.h"
94#include "llvm/ADT/StringRef.h"
98#include "llvm/IR/Constants.h"
99#include "llvm/IR/DIBuilder.h"
100#include "llvm/IR/DebugInfo.h"
102#include "llvm/IR/IRBuilder.h"
103#include "llvm/IR/Instructions.h"
104#include "llvm/IR/IntrinsicsAMDGPU.h"
105#include "llvm/IR/MDBuilder.h"
108#include "llvm/Pass.h"
112
113#include <algorithm>
114
115#define DEBUG_TYPE "amdgpu-sw-lower-lds"
116#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
117
118using namespace llvm;
119using namespace AMDGPU;
120
121namespace {
122
124 AsanInstrumentLDS("amdgpu-asan-instrument-lds",
125 cl::desc("Run asan instrumentation on LDS instructions "
126 "lowered to global memory"),
127 cl::init(true), cl::Hidden);
128
129using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
130
// Partition of the LDS globals reachable from a kernel into those whose size
// is known at compile time and those that are dynamically sized.
struct LDSAccessTypeInfo {
  SetVector<GlobalVariable *> StaticLDSGlobals;  // Fixed-size LDS globals.
  SetVector<GlobalVariable *> DynamicLDSGlobals; // Runtime-sized LDS globals.
};
135
136// Struct to hold all the Metadata required for a kernel
137// to replace a LDS global uses with corresponding offset
138// in to device global memory.
139struct KernelLDSParameters {
140 GlobalVariable *SwLDS = nullptr;
141 GlobalVariable *SwDynLDS = nullptr;
142 GlobalVariable *SwLDSMetadata = nullptr;
143 LDSAccessTypeInfo DirectAccess;
144 LDSAccessTypeInfo IndirectAccess;
146 LDSToReplacementIndicesMap;
147 uint32_t MallocSize = 0;
148 uint32_t LDSSize = 0;
149 SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
150};
151
152// Struct to store information for creation of offset table
153// for all the non-kernel LDS accesses.
// Struct to store information for creation of offset table
// for all the non-kernel LDS accesses.
struct NonKernelLDSParameters {
  GlobalVariable *LDSBaseTable = nullptr;   // One SW LDS ptr per kernel ID.
  GlobalVariable *LDSOffsetTable = nullptr; // Per-kernel, per-global offsets.
  SetVector<Function *> OrderedKernels;     // Kernels, name-sorted.
  // NOTE: identifier keeps its historical "Ordere" spelling (used elsewhere).
  SetVector<GlobalVariable *> OrdereLDSGlobals; // LDS globals, name-sorted.
};
160
// State shared with the address-sanitizer instrumentation step.
// Scale/Offset are presumably the asan shadow-mapping parameters filled in by
// initAsanInfo() — confirm against that definition (not visible here).
struct AsanInstrumentInfo {
  int Scale = 0;
  uint32_t Offset = 0;
  // Lowered global-memory instructions that still need asan instrumentation.
  SetVector<Instruction *> Instructions;
};
166
// Aggregated per-module view of which functions touch LDS and how.
struct FunctionsAndLDSAccess {
  // Per-kernel lowering state (SW LDS globals, metadata, sizes).
  DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
  // Kernels that reach LDS through calls into non-kernel functions.
  SetVector<Function *> KernelsWithIndirectLDSAccess;
  // Non-kernels that receive an LDS pointer as a call argument.
  SetVector<Function *> NonKernelsWithLDSArgument;
  // Every LDS global referenced from any non-kernel function.
  SetVector<GlobalVariable *> AllNonKernelLDSAccess;
  // For each non-kernel function, the set of LDS globals it uses.
  FunctionVariableMap NonKernelToLDSAccessMap;
};
174
175class AMDGPUSwLowerLDS {
176public:
177 AMDGPUSwLowerLDS(Module &Mod, const AMDGPUTargetMachine &TM,
178 DomTreeCallback Callback)
179 : M(Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
180 bool run();
181 void getUsesOfLDSByNonKernels();
182 void getNonKernelsWithLDSArguments(const CallGraph &CG);
184 getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
186 getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
187 void buildSwLDSGlobal(Function *Func);
188 void buildSwDynLDSGlobal(Function *Func);
189 void populateSwMetadataGlobal(Function *Func);
190 void populateSwLDSAttributeAndMetadata(Function *Func);
191 void populateLDSToReplacementIndicesMap(Function *Func);
192 void getLDSMemoryInstructions(Function *Func,
193 SetVector<Instruction *> &LDSInstructions);
194 void replaceKernelLDSAccesses(Function *Func);
195 Value *getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr,
196 Value *LDSPtr);
197 void translateLDSMemoryOperationsToGlobalMemory(
198 Function *Func, Value *LoadMallocPtr,
199 SetVector<Instruction *> &LDSInstructions);
200 void poisonRedzones(Function *Func, Value *MallocPtr);
201 void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
202 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
203 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
204 Constant *
205 getAddressesOfVariablesInKernel(Function *Func,
206 SetVector<GlobalVariable *> &Variables);
207 void lowerNonKernelLDSAccesses(Function *Func,
208 SetVector<GlobalVariable *> &LDSGlobals,
209 NonKernelLDSParameters &NKLDSParams);
210 void
211 updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
212 Value *HiddenDynLDSSize,
213 SetVector<GlobalVariable *> &DynamicLDSGlobals);
214 void initAsanInfo();
215
216private:
217 Module &M;
218 const AMDGPUTargetMachine &AMDGPUTM;
219 IRBuilder<> IRB;
220 DomTreeCallback DTCallback;
221 FunctionsAndLDSAccess FuncLDSAccessInfo;
222 AsanInstrumentInfo AsanInfo;
223};
224
225template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
226 // Sort the vector of globals or Functions based on their name.
227 // Returns a SetVector of globals/Functions.
228 sort(V, [](const auto *L, const auto *R) {
229 return L->getName() < R->getName();
230 });
231 return {SetVector<T>(V.begin(), V.end())};
232}
233
234SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
235 SetVector<GlobalVariable *> &Variables) {
236 // Sort all the non-kernel LDS accesses based on their name.
237 return sortByName(
238 std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
239}
240
241SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
242 SetVector<Function *> &Kernels) {
243 // Sort the non-kernels accessing LDS based on their name.
244 // Also assign a kernel ID metadata based on the sorted order.
245 LLVMContext &Ctx = M.getContext();
246 if (Kernels.size() > UINT32_MAX) {
247 report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
248 }
249 SetVector<Function *> OrderedKernels =
250 sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
251 for (size_t i = 0; i < Kernels.size(); i++) {
252 Metadata *AttrMDArgs[1] = {
253 ConstantAsMetadata::get(IRB.getInt32(i)),
254 };
255 Function *Func = OrderedKernels[i];
256 Func->setMetadata("llvm.amdgcn.lds.kernel.id",
257 MDNode::get(Ctx, AttrMDArgs));
258 }
259 return OrderedKernels;
260}
261
262void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
263 // Among the kernels accessing LDS, get list of
264 // Non-kernels to which a call is made and a ptr
265 // to addrspace(3) is passed as argument.
266 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
267 Function *Func = K.first;
268 const CallGraphNode *CGN = CG[Func];
269 if (!CGN)
270 continue;
271 for (auto &I : *CGN) {
272 CallGraphNode *CallerCGN = I.second;
273 Function *CalledFunc = CallerCGN->getFunction();
274 if (!CalledFunc || CalledFunc->isDeclaration())
275 continue;
276 if (AMDGPU::isKernelLDS(CalledFunc))
277 continue;
278 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
279 AI != E; ++AI) {
280 Type *ArgTy = (*AI).getType();
281 if (!ArgTy->isPointerTy())
282 continue;
284 continue;
285 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
286 // Also add the Calling function to KernelsWithIndirectLDSAccess list
287 // so that base table of LDS is generated.
288 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
289 }
290 }
291 }
292}
293
294void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
295 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
297 continue;
298
299 for (User *V : GV->users()) {
300 if (auto *I = dyn_cast<Instruction>(V)) {
301 Function *F = I->getFunction();
302 if (!isKernelLDS(F) && F->hasFnAttribute(Attribute::SanitizeAddress) &&
303 !F->isDeclaration())
304 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
305 }
306 }
307 }
308}
309
310static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
312 // Write the specified address into metadata where it can be retrieved by
313 // the assembler. Format is a half open range, [Address Address+1)
314 LLVMContext &Ctx = M.getContext();
315 auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
316 MDBuilder MDB(Ctx);
317 MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
318 ConstantInt::get(IntTy, Address + 1));
319 GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
320}
321
322static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
323 bool IsDynLDS) {
324 if (Offset != 0) {
325 std::string Buffer;
326 raw_string_ostream SS{Buffer};
327 SS << Offset;
328 if (IsDynLDS)
329 SS << "," << Offset;
330 Func->addFnAttr("amdgpu-lds-size", Buffer);
331 }
332}
333
334static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
335 BasicBlock *Entry = &Func->getEntryBlock();
336 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
337
338 Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(),
339 Intrinsic::donothing, {});
340
341 Value *UseInstance[1] = {
342 Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
343
344 Builder.CreateCall(Decl, {},
345 {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
346}
347
348void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
349 // Create new LDS global required for each kernel to store
350 // device global memory pointer.
351 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
352 // Create new global pointer variable
353 LDSParams.SwLDS = new GlobalVariable(
354 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
355 PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
358 MD.NoAddress = true;
359 LDSParams.SwLDS->setSanitizerMetadata(MD);
360}
361
362void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
363 // Create new Dyn LDS global if kernel accesses dyn LDS.
364 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
365 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
366 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
367 return;
368 // Create new global pointer variable
369 auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
370 LDSParams.SwDynLDS = new GlobalVariable(
371 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
372 "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
374 markUsedByKernel(Func, LDSParams.SwDynLDS);
376 MD.NoAddress = true;
377 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
378}
379
380void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
381 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
382 bool IsDynLDSUsed = LDSParams.SwDynLDS ? true : false;
383 uint32_t Offset = LDSParams.LDSSize;
384 recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
385 addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
386 if (LDSParams.SwDynLDS)
387 recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
388}
389
390void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
391 // Create new metadata global for every kernel and initialize the
392 // start offsets and sizes corresponding to each LDS accesses.
393 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
394 auto &Ctx = M.getContext();
395 auto &DL = M.getDataLayout();
396 std::vector<Type *> Items;
397 Type *Int32Ty = IRB.getInt32Ty();
398 std::vector<Constant *> Initializers;
399 Align MaxAlignment(1);
400 auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
401 Align GVAlign = AMDGPU::getAlign(DL, GV);
402 MaxAlignment = std::max(MaxAlignment, GVAlign);
403 };
404
405 for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
406 UpdateMaxAlignment(GV);
407
408 for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
409 UpdateMaxAlignment(GV);
410
411 for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
412 UpdateMaxAlignment(GV);
413
414 for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
415 UpdateMaxAlignment(GV);
416
417 //{StartOffset, AlignedSizeInBytes}
418 SmallString<128> MDItemStr;
419 raw_svector_ostream MDItemOS(MDItemStr);
420 MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";
421
422 StructType *LDSItemTy =
423 StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str());
424 uint32_t &MallocSize = LDSParams.MallocSize;
425 SetVector<GlobalVariable *> UniqueLDSGlobals;
426 int AsanScale = AsanInfo.Scale;
427 auto buildInitializerForSwLDSMD =
428 [&](SetVector<GlobalVariable *> &LDSGlobals) {
429 for (auto &GV : LDSGlobals) {
430 if (is_contained(UniqueLDSGlobals, GV))
431 continue;
432 UniqueLDSGlobals.insert(GV);
433
434 Type *Ty = GV->getValueType();
435 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
436 Items.push_back(LDSItemTy);
437 Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
438 Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
439 // Get redzone size corresponding a size.
440 const uint64_t RightRedzoneSize =
441 AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes);
442 // Update MallocSize with current size and redzone size.
443 MallocSize += SizeInBytes;
444 if (!AMDGPU::isDynamicLDS(*GV))
445 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
446 RightRedzoneSize);
447 MallocSize += RightRedzoneSize;
448 // Align current size plus redzone.
449 uint64_t AlignedSize =
450 alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
451 Constant *AlignedSizeInBytesConst =
452 ConstantInt::get(Int32Ty, AlignedSize);
453 // Align MallocSize
454 MallocSize = alignTo(MallocSize, MaxAlignment);
455 Constant *InitItem =
456 ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst,
457 AlignedSizeInBytesConst});
458 Initializers.push_back(InitItem);
459 }
460 };
461 SetVector<GlobalVariable *> SwLDSVector;
462 SwLDSVector.insert(LDSParams.SwLDS);
463 buildInitializerForSwLDSMD(SwLDSVector);
464 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
465 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
466 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
467 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
468
469 // Update the LDS size used by the kernel.
470 Type *Ty = LDSParams.SwLDS->getValueType();
471 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
472 uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
473 LDSParams.LDSSize = AlignedSize;
474 SmallString<128> MDTypeStr;
475 raw_svector_ostream MDTypeOS(MDTypeStr);
476 MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
477 StructType *MetadataStructType =
478 StructType::create(Ctx, Items, MDTypeOS.str());
479 SmallString<128> MDStr;
480 raw_svector_ostream MDOS(MDStr);
481 MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
482 LDSParams.SwLDSMetadata = new GlobalVariable(
483 M, MetadataStructType, false, GlobalValue::InternalLinkage,
484 PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
486 Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
487 LDSParams.SwLDSMetadata->setInitializer(data);
488 assert(LDSParams.SwLDS);
489 // Set the alignment to MaxAlignment for SwLDS.
490 LDSParams.SwLDS->setAlignment(MaxAlignment);
491 if (LDSParams.SwDynLDS)
492 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
494 MD.NoAddress = true;
495 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
496}
497
498void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
499 // Fill the corresponding LDS replacement indices for each LDS access
500 // related to this kernel.
501 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
502 SetVector<GlobalVariable *> UniqueLDSGlobals;
503 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
504 uint32_t &Idx) {
505 for (auto &GV : LDSGlobals) {
506 if (is_contained(UniqueLDSGlobals, GV))
507 continue;
508 UniqueLDSGlobals.insert(GV);
509 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
510 ++Idx;
511 }
512 };
513 uint32_t Idx = 0;
514 SetVector<GlobalVariable *> SwLDSVector;
515 SwLDSVector.insert(LDSParams.SwLDS);
516 PopulateIndices(SwLDSVector, Idx);
517 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
518 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
519 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
520 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
521}
522
523static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
524 Value *Replacement) {
525 // Replace all uses of LDS global in this Function with a Replacement.
526 auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
527 auto *V = U.getUser();
528 if (auto *Inst = dyn_cast<Instruction>(V)) {
529 auto *Func1 = Inst->getParent()->getParent();
530 if (Func == Func1)
531 return true;
532 }
533 return false;
534 };
535 GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
536}
537
538void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
539 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
540 GlobalVariable *SwLDS = LDSParams.SwLDS;
541 assert(SwLDS);
542 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
543 assert(SwLDSMetadata);
544 StructType *SwLDSMetadataStructType =
545 cast<StructType>(SwLDSMetadata->getValueType());
546 Type *Int32Ty = IRB.getInt32Ty();
547 auto &IndirectAccess = LDSParams.IndirectAccess;
548 auto &DirectAccess = LDSParams.DirectAccess;
549 // Replace all uses of LDS global in this Function with a Replacement.
550 SetVector<GlobalVariable *> UniqueLDSGlobals;
551 auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
552 for (auto &GV : LDSGlobals) {
553 // Do not generate instructions if LDS access is in non-kernel
554 // i.e indirect-access.
555 if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
556 IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
557 (!DirectAccess.StaticLDSGlobals.contains(GV) &&
558 !DirectAccess.DynamicLDSGlobals.contains(GV)))
559 continue;
560 if (is_contained(UniqueLDSGlobals, GV))
561 continue;
562 UniqueLDSGlobals.insert(GV);
563 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
564 assert(Indices.size() == 3);
565 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
566 ConstantInt::get(Int32Ty, Indices[1]),
567 ConstantInt::get(Int32Ty, Indices[2])};
569 SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
570 Value *Offset = IRB.CreateLoad(Int32Ty, GEP);
571 Value *BasePlusOffset =
572 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
573 LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
574 false));
575 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
576 }
577 };
578 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
579 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
580 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
581 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
582}
583
void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
    Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
    SetVector<GlobalVariable *> &DynamicLDSGlobals) {
  // For each dynamic LDS global, patch its metadata entry at run time:
  //   offset       := current malloc size,
  //   size         := dynamic LDS size loaded from the hidden kernel arg,
  //   aligned size := size rounded up to the SW LDS max alignment,
  // then advance *CurrMallocSize by the aligned size. The IR is emitted
  // through IRB sequentially, so statement order here is load-bearing.
  auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
  Type *Int32Ty = IRB.getInt32Ty();

  GlobalVariable *SwLDS = LDSParams.SwLDS;
  GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
  assert(SwLDS && SwLDSMetadata);
  StructType *MetadataStructType =
      cast<StructType>(SwLDSMetadata->getValueType());
  unsigned MaxAlignment = SwLDS->getAlignment();
  Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
  Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);

  for (GlobalVariable *DynGV : DynamicLDSGlobals) {
    auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
    // Update the Offset metadata.
    Constant *Index0 = ConstantInt::get(Int32Ty, 0);
    Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);

    Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
    auto *GEPForOffset = IRB.CreateInBoundsGEP(
        MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});

    IRB.CreateStore(*CurrMallocSize, GEPForOffset);
    // Update the size and Aligned Size metadata.
    Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
    auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
                                             {Index0, Index1, Index2Size});

    // The actual dynamic LDS size is only known at run time, through the
    // hidden kernel argument.
    Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
    IRB.CreateStore(CurrDynLDSSize, GEPForSize);
    Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
    auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
        MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});

    // alignTo(size, MaxAlignment) computed in IR:
    // ((size + align - 1) / align) * align.
    Value *AlignedDynLDSSize =
        IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
    AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
    AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
    IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);

    // Update the Current Malloc Size
    *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
  }
}
631
632static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
633 DISubprogram *SP) {
634 assert(InsertBefore);
635 if (InsertBefore->getDebugLoc())
636 return InsertBefore->getDebugLoc();
637 if (SP)
638 return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
639 return DebugLoc();
640}
641
642void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
643 Function *Func, SetVector<Instruction *> &LDSInstructions) {
644 for (BasicBlock &BB : *Func) {
645 for (Instruction &Inst : BB) {
646 if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
647 if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
648 LDSInstructions.insert(&Inst);
649 } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
650 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
651 LDSInstructions.insert(&Inst);
652 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
653 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
654 LDSInstructions.insert(&Inst);
655 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
656 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
657 LDSInstructions.insert(&Inst);
658 } else
659 continue;
660 }
661 }
662}
663
664Value *
665AMDGPUSwLowerLDS::getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr,
666 Value *LDSPtr) {
667 assert(LDSPtr && "Invalid LDS pointer operand");
668 Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, IRB.getInt32Ty());
669 Value *GEP =
670 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {PtrToInt});
671 return GEP;
672}
673
void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
    Function *Func, Value *LoadMallocPtr,
    SetVector<Instruction *> &LDSInstructions) {
  // Rewrite each collected LDS load/store/atomicrmw/cmpxchg to an equivalent
  // instruction on the translated global-memory address, carrying over
  // alignment, volatility, atomic ordering and sync scope. The new
  // instructions are recorded in AsanInfo.Instructions for later
  // instrumentation, and the originals are erased.
  LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
                    << Func->getName());
  for (Instruction *Inst : LDSInstructions) {
    IRB.SetInsertPoint(Inst);
    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
      Value *LIOperand = LI->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, LIOperand);
      LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
                                              LI->getAlign(), LI->isVolatile());
      // Preserve atomic ordering/scope of the original load.
      NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
      AsanInfo.Instructions.insert(NewLI);
      LI->replaceAllUsesWith(NewLI);
      LI->eraseFromParent();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
      Value *SIOperand = SI->getPointerOperand();
      Value *Replacement =
          getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, SIOperand);
      StoreInst *NewSI = IRB.CreateAlignedStore(
          SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
      // Preserve atomic ordering/scope of the original store.
      NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
      AsanInfo.Instructions.insert(NewSI);
      SI->replaceAllUsesWith(NewSI);
      SI->eraseFromParent();
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
      Value *RMWPtrOperand = RMW->getPointerOperand();
      Value *RMWValOperand = RMW->getValOperand();
      Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer(
          LoadMallocPtr, RMWPtrOperand);
      AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
          RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
          RMW->getOrdering(), RMW->getSyncScopeID());
      NewRMW->setVolatile(RMW->isVolatile());
      AsanInfo.Instructions.insert(NewRMW);
      RMW->replaceAllUsesWith(NewRMW);
      RMW->eraseFromParent();
    } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) {
      Value *XCHGPtrOperand = XCHG->getPointerOperand();
      Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer(
          LoadMallocPtr, XCHGPtrOperand);
      AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg(
          Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
          XCHG->getAlign(), XCHG->getSuccessOrdering(),
          XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
      NewXCHG->setVolatile(XCHG->isVolatile());
      AsanInfo.Instructions.insert(NewXCHG);
      XCHG->replaceAllUsesWith(NewXCHG);
      XCHG->eraseFromParent();
    } else
      // getLDSMemoryInstructions only collects the four kinds handled above,
      // so reaching here means a new instruction kind was added there but
      // not here.
      report_fatal_error("Unimplemented LDS lowering instruction");
  }
}
729
730void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
731 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
732 Type *Int64Ty = IRB.getInt64Ty();
733 Type *VoidTy = IRB.getVoidTy();
734 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
735 "__asan_poison_region",
736 FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false));
737
738 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
739 size_t VecSize = RedzonesVec.size();
740 for (unsigned i = 0; i < VecSize; i++) {
741 auto &RedzonePair = RedzonesVec[i];
742 uint64_t RedzoneOffset = RedzonePair.first;
743 uint64_t RedzoneSize = RedzonePair.second;
744 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
745 IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
746 Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
747 IRB.CreateCall(AsanPoisonRegion,
748 {RedzoneAddress, IRB.getInt64(RedzoneSize)});
749 }
750}
751
// Rewrites every LDS access in kernel Func to use a device-global buffer
// ("SW LDS") that is malloc'ed on kernel entry and freed on kernel exit:
//   * WId block: only the {0,0,0} work item ((x|y|z) == 0) takes the edge to
//     the Malloc block; all other work items fall through to the original
//     entry block.
//   * Malloc block: computes the total size (packed static LDS, optionally
//     grown by the hidden dynamic-LDS size kernel argument), calls
//     __asan_malloc_impl, stores the returned pointer into the SwLDS global
//     and poisons the inter-variable redzones.
//   * Original entry block: an s_barrier makes the stored pointer visible to
//     the whole work group before any rewritten access runs.
//   * CondFree/Free/End blocks: every ReturnInst is rerouted through a
//     barrier; the {0,0,0} work item (tracked by the xyzCond PHI) frees the
//     buffer via __asan_free_impl.
// Finally the DomTreeUpdater is informed of the newly inserted CFG edges.
//
// NOTE(review): this listing is a doxygen text export. The integer fused to
// the front of each line is the upstream source line number, not code, and a
// few upstream lines are elided (flagged inline below). Reconcile with
// upstream AMDGPUSwLowerLDS.cpp before treating this as compilable source.
752void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
753 DomTreeUpdater &DTU) {
754 LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
755 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
756 auto &Ctx = M.getContext();
757 auto *PrevEntryBlock = &Func->getEntryBlock();
// Collect LDS memory instructions up front; they are retargeted to the
// malloc'ed buffer after the control-flow skeleton is in place.
758 SetVector<Instruction *> LDSInstructions;
759 getLDSMemoryInstructions(Func, LDSInstructions);
760
761 // Create malloc block.
762 auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);
763
764 // Create WIdBlock block which has instructions related to selection of
765 // {0,0,0} index work item in the work group.
766 auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
767 IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
768 DebugLoc FirstDL =
769 getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
770 IRB.SetCurrentDebugLocation(FirstDL);
771 Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}, {});
772 Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}, {});
773 Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}, {});
// (x | y | z) == 0 holds exactly for the {0,0,0} work item.
774 Value *XYOr = IRB.CreateOr(WIdx, WIdy);
775 Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
776 Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
777
778 // All work items will branch to PrevEntryBlock except {0,0,0} index
779 // work item which will branch to malloc block.
780 IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
781
782 // Malloc block
783 IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
784
785 // If Dynamic LDS globals are accessed by the kernel,
786 // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
787 // Update the corresponding metadata global entries for this dyn lds global.
788 GlobalVariable *SwLDS = LDSParams.SwLDS;
789 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
790 assert(SwLDS && SwLDSMetadata);
791 StructType *MetadataStructType =
792 cast<StructType>(SwLDSMetadata->getValueType());
793 uint32_t MallocSize = 0;
794 Value *CurrMallocSize;
795 Type *Int32Ty = IRB.getInt32Ty();
796 Type *Int64Ty = IRB.getInt64Ty();
797
// De-duplicate globals across the direct and indirect access sets so each
// unique static LDS global is counted once.
798 SetVector<GlobalVariable *> UniqueLDSGlobals;
799 auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
800 for (auto &GV : LDSGlobals) {
801 if (is_contained(UniqueLDSGlobals, GV))
802 continue;
803 UniqueLDSGlobals.insert(GV);
804 }
805 };
806
807 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
808 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
// One metadata row per unique static LDS global plus one extra row; the
// "+ 1" presumably accounts for the SW LDS base entry itself -- confirm
// against populateSwMetadataGlobal.
809 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
810 UniqueLDSGlobals.clear();
811
// Always true here (NumStaticLDS >= 1); the guard is kept for safety.
// Field 0 of the last metadata row is read as its start offset and field 2
// as its size; their sum is the end of the packed static region.
// NOTE(review): field meaning inferred from the GEP indices used below --
// confirm in populateSwMetadataGlobal.
812 if (NumStaticLDS) {
813 auto *GEPForEndStaticLDSOffset =
814 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
815 {ConstantInt::get(Int32Ty, 0),
816 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
817 ConstantInt::get(Int32Ty, 0)});
818
819 auto *GEPForEndStaticLDSSize =
820 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
821 {ConstantInt::get(Int32Ty, 0),
822 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
823 ConstantInt::get(Int32Ty, 2)});
824
825 Value *EndStaticLDSOffset =
826 IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
827 Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
828 CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
829 } else
830 CurrMallocSize = IRB.getInt32(MallocSize);
831
832 if (LDSParams.SwDynLDS) {
// NOTE(review): upstream lines 833-834 are elided by this export -- an
// assert whose message string follows on the next line, presumably checking
// that the code-object version is >= V5; confirm against upstream.
835 "Dynamic LDS size query is only supported for CO V5 and later.");
836 // Get size from hidden dyn_lds_size argument of kernel
837 Value *ImplicitArg =
838 IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
839 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
840 ImplicitArg->getType(), ImplicitArg,
841 {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
842 UniqueLDSGlobals.clear();
843 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
844 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
845 updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
846 UniqueLDSGlobals);
847 }
848
849 CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);
850
851 // Create a call to malloc function which does device global memory allocation
852 // with size equal to all LDS global accesses size in this kernel.
853 Value *ReturnAddress =
854 IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, {IRB.getInt32(0)});
855 FunctionCallee MallocFunc = M.getOrInsertFunction(
856 StringRef("__asan_malloc_impl"),
857 FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false));
858 Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
859 Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});
860
861 Value *MallocPtr =
862 IRB.CreateIntToPtr(MallocCall, IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS));
863
864 // Create store of malloc to new global
865 IRB.CreateStore(MallocPtr, SwLDS);
866
867 // Create calls to __asan_poison_region to poison redzones.
868 poisonRedzones(Func, MallocPtr);
869
870 // Create branch to PrevEntryBlock
871 IRB.CreateBr(PrevEntryBlock);
872
873 // Create wave-group barrier at the starting of Previous entry block
874 Type *Int1Ty = IRB.getInt1Ty();
875 IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
// PHI records which predecessor the work item arrived from: true only for
// the {0,0,0} work item that went through the Malloc block. It is reused
// later to decide who performs the free.
876 auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
877 XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
878 XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
879
880 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
881
882 // Load malloc pointer from Sw LDS.
883 Value *LoadMallocPtr =
884 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), SwLDS);
885
886 // Replace All uses of LDS globals with new LDS pointers.
887 replaceKernelLDSAccesses(Func);
888
889 // Replace Memory Operations on LDS with corresponding
890 // global memory pointers.
891 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
892 LDSInstructions);
893
// Reroute every existing return through CondFree so the buffer is freed
// exactly once per work group on any exit path.
894 auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
895 auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
896 auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
897 for (BasicBlock &BB : *Func) {
898 if (!BB.empty()) {
899 if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
900 RI->eraseFromParent();
901 IRB.SetInsertPoint(&BB, BB.end());
902 IRB.CreateBr(CondFreeBlock);
903 }
904 }
905 }
906
907 // Cond Free Block
908 IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
909 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
910 IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
911
912 // Free Block
913 IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
914
915 // Free the previously allocated device global memory.
916 FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
917 StringRef("__asan_free_impl"),
918 FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
919 Value *ReturnAddr =
920 IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0));
921 Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
922 Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
923 IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
924
925 IRB.CreateBr(EndBlock);
926
927 // End Block
928 IRB.SetInsertPoint(EndBlock, EndBlock->begin());
929 IRB.CreateRetVoid();
930 // Update the DomTree with corresponding links to basic blocks.
931 DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
932 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
933 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
934 {DominatorTree::Insert, FreeBlock, EndBlock}});
935}
936
// Builds the constant initializer for one row of the non-kernel offset
// table: for each LDS variable in Variables, the address of its metadata
// slot (which at run time holds the variable's byte offset inside this
// kernel's SW LDS layout).
// NOTE(review): doxygen export -- upstream line 949 (the declaration of the
// `Elements` container, presumably SmallVector<Constant *>) and line 953
// (the placeholder constant pushed for a variable this kernel does not
// replace) are elided here; reconcile with upstream before compiling.
937Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
938 Function *Func, SetVector<GlobalVariable *> &Variables) {
939 Type *Int32Ty = IRB.getInt32Ty();
940 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
941
942 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
943 assert(SwLDSMetadata);
944 auto *SwLDSMetadataStructType =
945 cast<StructType>(SwLDSMetadata->getValueType());
// One global-address-space pointer per variable in the row.
946 ArrayType *KernelOffsetsType =
947 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), Variables.size());
948
950 for (auto *GV : Variables) {
// This kernel has no replacement entry for GV: upstream pushes a
// placeholder constant here (line 953, elided by the export).
951 if (!LDSParams.LDSToReplacementIndicesMap.contains(GV)) {
952 Elements.push_back(
954 continue;
955 }
// The three indices select the metadata global, GV's row, and the field
// within that row, per standard GEP semantics.
956 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
957 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
958 ConstantInt::get(Int32Ty, Indices[1]),
959 ConstantInt::get(Int32Ty, Indices[2])};
// Constant in-bounds GEP (trailing `true`) into the metadata global.
960 Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType,
961 SwLDSMetadata, GEPIdx, true);
962 Elements.push_back(GEP);
963 }
964 return ConstantArray::get(KernelOffsetsType, Elements);
965}
966
// Emits the constant "llvm.amdgcn.sw.lds.base.table" global: a single row
// indexed by kernel ID whose entries are the addresses of each kernel's
// SW LDS global. Non-kernel functions index it with
// llvm.amdgcn.lds.kernel.id at run time.
// NOTE(review): doxygen export -- upstream lines 995-996 are elided between
// the GlobalVariable constructor arguments and the `MD.NoAddress` line:
// presumably the trailing address-space argument (LOCAL_ADDRESS) and the
// declaration of `MD` (a GlobalValue::SanitizerMetadata); confirm upstream.
967void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
968 NonKernelLDSParameters &NKLDSParams) {
969 // Base table will have single row, with elements of the row
970 // placed as per kernel ID. Each element in the row corresponds
971 // to address of "SW LDS" global of the kernel.
972 auto &Kernels = NKLDSParams.OrderedKernels;
973 if (Kernels.empty())
974 return;
975 Type *Int32Ty = IRB.getInt32Ty();
976 const size_t NumberKernels = Kernels.size();
977 ArrayType *AllKernelsOffsetsType =
978 ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels);
979 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
980 for (size_t i = 0; i < NumberKernels; i++) {
981 Function *Func = Kernels[i];
982 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
983 GlobalVariable *SwLDS = LDSParams.SwLDS;
984 assert(SwLDS);
// Constant in-bounds GEP at index 0 -- i.e. the address of the SW LDS
// global itself.
985 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
986 Constant *GEP =
987 ConstantExpr::getGetElementPtr(SwLDS->getType(), SwLDS, GEPIdx, true);
988 OverallConstantExprElts[i] = GEP;
989 }
990 Constant *init =
991 ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
992 NKLDSParams.LDSBaseTable = new GlobalVariable(
993 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
994 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
// Mark the table itself exempt from ASan address checking.
997 MD.NoAddress = true;
998 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
999}
1000
// Emits the constant "llvm.amdgcn.sw.lds.offset.table" global: one row per
// indirectly-accessing kernel, one column per unique LDS global used by
// non-kernels; each cell holds the address of that variable's metadata slot
// for that kernel (built by getAddressesOfVariablesInKernel).
// NOTE(review): doxygen export -- upstream lines 1033-1034 are elided
// between the GlobalVariable constructor arguments and the `MD.NoAddress`
// line: presumably the trailing address-space argument (GLOBAL_ADDRESS) and
// the declaration of `MD` (a GlobalValue::SanitizerMetadata); confirm.
1001void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1002 NonKernelLDSParameters &NKLDSParams) {
1003 // Offset table will have multiple rows and columns.
1004 // Rows are assumed to be from 0 to (n-1). n is total number
1005 // of kernels accessing the LDS through non-kernels.
1006 // Each row will have m elements. m is the total number of
1007 // unique LDS globals accessed by non-kernels.
1008 // Each element in the row corresponds to the address of
1009 // the replacement of LDS global done by that particular kernel.
1010 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1011 auto &Kernels = NKLDSParams.OrderedKernels;
1012 if (Variables.empty() || Kernels.empty())
1013 return;
1014 const size_t NumberVariables = Variables.size();
1015 const size_t NumberKernels = Kernels.size();
1016
1017 ArrayType *KernelOffsetsType =
1018 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables);
1019
1020 ArrayType *AllKernelsOffsetsType =
1021 ArrayType::get(KernelOffsetsType, NumberKernels);
1022 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1023 for (size_t i = 0; i < NumberKernels; i++) {
1024 Function *Func = Kernels[i];
1025 overallConstantExprElts[i] =
1026 getAddressesOfVariablesInKernel(Func, Variables);
1027 }
1028 Constant *Init =
1029 ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
1030 NKLDSParams.LDSOffsetTable = new GlobalVariable(
1031 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1032 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
// Mark the table itself exempt from ASan address checking.
1035 MD.NoAddress = true;
1036 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1037}
1038
1039void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1040 Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
1041 NonKernelLDSParameters &NKLDSParams) {
1042 // Replace LDS access in non-kernel with replacement queried from
1043 // Base table and offset from offset table.
1044 LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
1045 << Func->getName());
1046 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1047 IRB.SetInsertPoint(InsertAt);
1048
1049 // Get LDS memory instructions.
1050 SetVector<Instruction *> LDSInstructions;
1051 getLDSMemoryInstructions(Func, LDSInstructions);
1052
1053 auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {});
1054 GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
1055 GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
1056 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1057 Value *BaseGEP = IRB.CreateInBoundsGEP(
1058 LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
1059 Value *BaseLoad =
1060 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP);
1061 Value *LoadMallocPtr =
1062 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad);
1063
1064 for (GlobalVariable *GV : LDSGlobals) {
1065 const auto *GVIt =
1066 std::find(OrdereLDSGlobals.begin(), OrdereLDSGlobals.end(), GV);
1067 assert(GVIt != OrdereLDSGlobals.end());
1068 uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);
1069
1070 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1071 LDSOffsetTable->getValueType(), LDSOffsetTable,
1072 {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
1073 Value *OffsetLoad =
1074 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP);
1075 Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
1076 Value *BasePlusOffset =
1077 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
1078 LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
1079 << GV->getName());
1080 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
1081 }
1082 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1083 LDSInstructions);
1084}
1085
1086static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1087 // Sort Static, dynamic LDS globals which are either
1088 // direct or indirect access on basis of name.
1089 auto &DirectAccess = LDSParams.DirectAccess;
1090 auto &IndirectAccess = LDSParams.IndirectAccess;
1091 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1092 std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1093 DirectAccess.StaticLDSGlobals.end()));
1094 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1095 std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1096 DirectAccess.DynamicLDSGlobals.end()));
1097 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1098 std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1099 IndirectAccess.StaticLDSGlobals.end()));
1100 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1101 std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1102 IndirectAccess.DynamicLDSGlobals.end()));
1103}
1104
// Caches the AddressSanitizer shadow-mapping parameters (scale and offset)
// for the target triple into AsanInfo; they are consumed later when
// instrumenting the lowered memory operations. OrShadowOffset is queried by
// the helper but not stored here.
// NOTE(review): doxygen export -- upstream line 1109 is elided: the
// declaration of `Offset` (taken by address below, presumably uint64_t);
// confirm against upstream.
1105void AMDGPUSwLowerLDS::initAsanInfo() {
1106 // Get Shadow mapping scale and offset.
1107 unsigned LongSize =
1108 M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS);
1110 int Scale;
1111 bool OrShadowOffset;
1112 llvm::getAddressSanitizerParams(Triple(AMDGPUTM.getTargetTriple()), LongSize,
1113 false, &Offset, &Scale, &OrShadowOffset);
1114 AsanInfo.Scale = Scale;
1115 AsanInfo.Offset = Offset;
1116}
1117
// Pass driver: gathers per-kernel LDS usage (direct + transitive via the
// call graph), lowers kernel LDS accesses to malloc'ed global memory, builds
// the base/offset tables consumed by non-kernel functions and lowers their
// accesses, erases now-dead LDS globals, and finally ASan-instruments the
// memory operations this pass itself introduced.
// NOTE(review): doxygen export -- upstream lines 1123, 1136, 1179, 1222,
// 1232 and 1234 are elided in this listing (flagged inline); reconcile with
// upstream AMDGPUSwLowerLDS.cpp before treating this as compilable source.
1118bool AMDGPUSwLowerLDS::run() {
1119 bool Changed = false;
1120
1121 CallGraph CG = CallGraph(M);
1122
// NOTE(review): upstream line 1123 is elided here -- likely the call that
// folds constant-expression uses of LDS into instructions
// (eliminateConstantExprUsesOfLDSFromAllInstructions); confirm.
1124
1125 // Get all the direct and indirect access of LDS for all the kernels.
1126 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
1127
1128 // Utility to group LDS access into direct, indirect, static and dynamic.
1129 auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
1130 bool DirectAccess) {
1131 for (auto &K : LDSAccesses) {
1132 Function *F = K.first;
1133 if (!F || K.second.empty())
1134 continue;
1135
// NOTE(review): upstream line 1136 is elided -- presumably a comment
// preceding the sanitize-address check below; confirm.
// Only kernels compiled with ASan instrumentation are processed.
1137 if (!F->hasFnAttribute(Attribute::SanitizeAddress))
1138 continue;
1139
1140 // Only inserts if key isn't already in the map.
1141 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1142 {F, KernelLDSParameters()});
1143
1144 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
1145 if (!DirectAccess)
1146 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F);
1147 for (GlobalVariable *GV : K.second) {
1148 if (!DirectAccess) {
1149 if (AMDGPU::isDynamicLDS(*GV))
1150 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
1151 else
1152 LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
1153 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
1154 } else {
1155 if (AMDGPU::isDynamicLDS(*GV))
1156 LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
1157 else
1158 LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
1159 }
1160 }
1161 }
1162 };
1163
1164 PopulateKernelStaticDynamicLDS(LDSUsesInfo.direct_access, true);
1165 PopulateKernelStaticDynamicLDS(LDSUsesInfo.indirect_access, false);
1166
1167 // Get address sanitizer scale.
1168 initAsanInfo();
1169
1170 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1171 Function *Func = K.first;
1172 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1173 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1174 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1175 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1176 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
// NOTE(review): this resets Changed to false even when an earlier kernel in
// this loop was already lowered (Changed = true below). A plain `continue`
// or no-op looks intended -- verify against upstream.
1177 Changed = false;
1178 } else {
// NOTE(review): upstream line 1179 is elided -- evidently the head of a
// removeFnAttrFromReachable(CG, Func, ...) call whose attribute-string list
// follows; confirm.
1180 {"amdgpu-no-workitem-id-x",
1181 "amdgpu-no-workitem-id-y",
1182 "amdgpu-no-workitem-id-z"});
1183 reorderStaticDynamicIndirectLDSSet(LDSParams);
1184 buildSwLDSGlobal(Func);
1185 buildSwDynLDSGlobal(Func);
1186 populateSwMetadataGlobal(Func);
1187 populateSwLDSAttributeAndMetadata(Func);
1188 populateLDSToReplacementIndicesMap(Func);
1189 DomTreeUpdater DTU(DTCallback(*Func),
1190 DomTreeUpdater::UpdateStrategy::Lazy);
1191 lowerKernelLDSAccesses(Func, DTU);
1192 Changed = true;
1193 }
1194 }
1195
1196 // Get the Uses of LDS from non-kernels.
1197 getUsesOfLDSByNonKernels();
1198
1199 // Get non-kernels with LDS ptr as argument and called by kernels.
1200 getNonKernelsWithLDSArguments(CG);
1201
1202 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1203 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1204 NonKernelLDSParameters NKLDSParams;
1205 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1206 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1207 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1208 FuncLDSAccessInfo.AllNonKernelLDSAccess);
1209 buildNonKernelLDSBaseTable(NKLDSParams);
1210 buildNonKernelLDSOffsetTable(NKLDSParams);
1211 for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1212 Function *Func = K.first;
1213 DenseSet<GlobalVariable *> &LDSGlobals = K.second;
// Canonical name-sorted order keeps the lowering deterministic.
1214 SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
1215 std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
1216 lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
1217 }
1218 for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1219 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1220 if (K.find(Func) != K.end())
1221 continue;
// NOTE(review): upstream line 1222 is elided -- the declaration of `Vec`
// passed below (presumably an empty SetVector<GlobalVariable *>); confirm.
1223 lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
1224 }
1225 Changed = true;
1226 }
1227
1228 if (!Changed)
1229 return Changed;
1230
// Erase LDS globals that are no longer referenced after the rewrite.
1231 for (auto &GV : make_early_inc_range(M.globals())) {
// NOTE(review): upstream lines 1232 and 1234 are elided -- presumably the
// isLDSVariableToLower(GV) guard opening this scope and a
// GV.removeDeadConstantUsers() call; confirm.
1233 // probably want to remove from used lists
1235 if (GV.use_empty())
1236 GV.eraseFromParent();
1237 }
1238 }
1239
// ASan-instrument the memory operations introduced by this pass, using the
// shadow scale/offset cached by initAsanInfo().
1240 if (AsanInstrumentLDS) {
1241 SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
1242 for (Instruction *Inst : AsanInfo.Instructions) {
1243 SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
1244 getInterestingMemoryOperands(M, Inst, InterestingOperands);
1245 for (auto &Operand : InterestingOperands) {
1246 OperandsToInstrument.push_back(Operand);
1247 }
1248 }
1249 for (auto &Operand : OperandsToInstrument) {
1250 Value *Addr = Operand.getPtr();
1251 instrumentAddress(M, IRB, Operand.getInsn(), Operand.getInsn(), Addr,
1252 Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
1253 Operand.IsWrite, nullptr, false, false, AsanInfo.Scale,
1254 AsanInfo.Offset);
1255 Changed = true;
1256 }
1257 }
1258
1259 return Changed;
1260}
1261
// Legacy pass-manager wrapper around AMDGPUSwLowerLDS. Holds the (possibly
// null) target machine handed in at construction; runOnModule falls back to
// TargetPassConfig when it is null.
// NOTE(review): doxygen export -- upstream line 1268 (the constructor body,
// presumably the initialize...Pass(*PassRegistry::getPassRegistry())
// registration call) and line 1272 (the body of getAnalysisUsage, presumably
// an AU.addPreserved<DominatorTreeWrapperPass>() call) are elided; confirm
// against upstream.
1262class AMDGPUSwLowerLDSLegacy : public ModulePass {
1263public:
1264 const AMDGPUTargetMachine *AMDGPUTM;
1265 static char ID;
1266 AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM)
1267 : ModulePass(ID), AMDGPUTM(TM) {
1269 }
1270 bool runOnModule(Module &M) override;
1271 void getAnalysisUsage(AnalysisUsage &AU) const override {
1273 }
1274};
1275} // namespace
1276
// Legacy pass registration. The static ID's address uniquely identifies the
// pass; the extern handle lets other code reference it by ID.
1277char AMDGPUSwLowerLDSLegacy::ID = 0;
1278char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
1279
// NOTE(review): doxygen export -- upstream line 1282 (an
// INITIALIZE_PASS_DEPENDENCY(...) between the BEGIN and END macros, most
// likely on TargetPassConfig) is elided; confirm against upstream.
1280INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1281 "AMDGPU Software lowering of LDS", false, false)
1283INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1284 "AMDGPU Software lowering of LDS", false, false)
1285
1286bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1287 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1288 // instrumented the IR. Return early if the flag is not present.
1289 if (!M.getModuleFlag("nosanitize_address"))
1290 return false;
1291 DominatorTreeWrapperPass *const DTW =
1292 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1293 auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1294 return DTW ? &DTW->getDomTree() : nullptr;
1295 };
1296 if (!AMDGPUTM) {
1297 auto &TPC = getAnalysis<TargetPassConfig>();
1298 AMDGPUTM = &TPC.getTM<AMDGPUTargetMachine>();
1299 }
1300 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1301 bool IsChanged = SwLowerLDSImpl.run();
1302 return IsChanged;
1303}
1304
// Factory for the legacy pass; TM may be null (it is then resolved from
// TargetPassConfig inside runOnModule).
// NOTE(review): doxygen export -- upstream line 1306 (the function name and
// parameter list, createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine
// *TM)) is elided between the return type and the body; confirm upstream.
1305ModulePass *
1307 return new AMDGPUSwLowerLDSLegacy(TM);
1308}
1309
// New pass-manager entry point (AMDGPUSwLowerLDSPass::run). Mirrors the
// legacy runOnModule: no-op unless ASan has stamped the module; otherwise
// runs the lowering with a callback that fetches per-function dominator
// trees from the inner FunctionAnalysisManager.
// NOTE(review): doxygen export -- upstream lines 1310-1311 (the function
// signature), 1318 (the lambda's return statement, presumably
// `return &FAM.getResult<DominatorTreeAnalysis>(F);`) and 1325-1326 (the
// construction of `PA`, presumably preserving DominatorTreeAnalysis) are
// elided; confirm against upstream.
1312 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1313 // instrumented the IR. Return early if the flag is not present.
1314 if (!M.getModuleFlag("nosanitize_address"))
1315 return PreservedAnalyses::all();
1316 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1317 auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
1319 };
1320 AMDGPUSwLowerLDS SwLowerLDSImpl(M, TM, DTCallback);
1321 bool IsChanged = SwLowerLDSImpl.run();
1322 if (!IsChanged)
1323 return PreservedAnalyses::all();
1324
1327 return PA;
1328}
amdgpu sw lower lds
amdgpu sw lower AMDGPU Software lowering of LDS
#define COV5_HIDDEN_DYN_LDS_SIZE_ARG
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
basic Basic Alias true
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
uint64_t Addr
Hexagon Common GEP
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
pre isel intrinsic lowering
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
This file contains some functions that are useful when dealing with strings.
Target-Independent Code Generator Pass Configuration Options pass.
static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore, DISubprogram *SP)
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
Represent the analysis usage information of a pass.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
void setVolatile(bool V)
Specify whether this is a volatile cmpxchg.
Definition: Instructions.h:559
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
void setVolatile(bool V)
Specify whether this is a volatile RMW or not.
Definition: Instructions.h:841
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
A node in the call graph for a module.
Definition: CallGraph.h:165
Function * getFunction() const
Returns the function that this call graph node represents.
Definition: CallGraph.h:196
The basic data container for the call graph of a Module of IR.
Definition: CallGraph.h:71
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getGetElementPtr(Type *Ty, Constant *C, ArrayRef< Constant * > IdxList, GEPNoWrapFlags NW=GEPNoWrapFlags::none(), std::optional< ConstantRange > InRange=std::nullopt, Type *OnlyIfReducedTy=nullptr)
Getelementptr form.
Definition: Constants.h:1267
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Definition: Constants.cpp:739
Subprogram description.
A debug info location.
Definition: DebugLoc.h:33
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
DominatorTree & getDomTree()
Definition: Dominators.h:325
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
arg_iterator arg_end()
Definition: Function.h:877
arg_iterator arg_begin()
Definition: Function.h:868
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
void setMetadata(unsigned KindID, MDNode *Node)
Set a particular kind of metadata attachment.
Definition: Metadata.cpp:1531
uint64_t getAlignment() const
FIXME: Remove this function once transition to Align is over.
Definition: GlobalObject.h:70
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:296
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ ExternalLinkage
Externally visible function.
Definition: GlobalValue.h:52
Type * getValueType() const
Definition: GlobalValue.h:296
void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
Definition: Globals.cpp:488
Value * CreateConstInBoundsGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1912
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2444
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:567
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:471
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95
Metadata node.
Definition: Metadata.h:1069
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1543
LLVMContext & getContext() const
Definition: Metadata.h:1233
void push_back(MachineInstr *MI)
Root of the metadata hierarchy.
Definition: Metadata.h:62
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:251
virtual bool runOnModule(Module &M)=0
runOnModule - Virtual method overriden by subclasses to process the module being operated on.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
A container for an operand bundle being viewed as a set of values rather than a set of uses.
Definition: InstrTypes.h:1073
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
Return a value (possibly void), from a function.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Target-Independent Code Generator Pass Configuration Options.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
iterator_range< user_iterator > users()
Definition: Value.h:421
void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
Definition: AsmWriter.cpp:5144
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
bool use_empty() const
Definition: Value.h:344
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
StringRef str() const
Return a StringRef for the vector contents.
Definition: raw_ostream.h:720
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
void getInterestingMemoryOperands(Module &M, Instruction *I, SmallVectorImpl< InterestingMemoryOperand > &Interesting)
Get all the memory operands from the instruction that needs to be instrumented.
bool isDynamicLDS(const GlobalVariable &GV)
unsigned getAMDHSACodeObjectVersion(const Module &M)
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, ArrayRef< StringRef > FnAttrs)
Strip FnAttr attribute from any functions where we may have introduced its use.
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)
bool isLDSVariableToLower(const GlobalVariable &GV)
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M)
Align getAlign(const DataLayout &DL, const GlobalVariable *GV)
bool isKernelLDS(const Function *F)
void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, Align Alignment, TypeSize TypeStoreSize, bool IsWrite, Value *SizeArgument, bool UseCalls, bool Recover, int AsanScale, int AsanOffset)
Instrument the memory operand Addr.
uint64_t getRedzoneSizeForGlobal(int AsanScale, uint64_t SizeInBytes)
Given SizeInBytes of the Value to be instrumented, returns the redzone size corresponding to it.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
ModulePass * createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
char & AMDGPUSwLowerLDSLegacyPassID
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, bool IsKasan, uint64_t *ShadowBase, int *MappingScale, bool *OrShadowOffset)
const AMDGPUTargetMachine & TM
Definition: AMDGPU.h:287
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
FunctionVariableMap direct_access
FunctionVariableMap indirect_access
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39