LLVM 23.0.0git
NVPTXLowerArgs.cpp
Go to the documentation of this file.
1//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9//
10// Arguments to kernel and device functions are passed via param space,
11// which imposes certain restrictions:
12// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
13//
14// Kernel parameters are read-only and accessible only via ld.param
15// instruction, directly or via a pointer.
16//
17// Device function parameters are directly accessible via
18// ld.param/st.param, but taking the address of one returns a pointer
19// to a copy created in local space which *can't* be used with
20// ld.param/st.param.
21//
22// Copying a byval struct into local memory in IR allows us to enforce
23// the param space restrictions, gives the rest of IR a pointer w/o
24// param space restrictions, and gives us an opportunity to eliminate
25// the copy.
26//
27// Pointer arguments to kernel functions need more work to be lowered:
28//
29// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
30// global address space. This allows later optimizations to emit
31// ld.global.*/st.global.* for accessing these pointer arguments. For
32// example,
33//
34// define void @foo(float* %input) {
35// %v = load float, float* %input, align 4
36// ...
37// }
38//
39// becomes
40//
41// define void @foo(float* %input) {
42// %input2 = addrspacecast float* %input to float addrspace(1)*
43// %input3 = addrspacecast float addrspace(1)* %input2 to float*
44// %v = load float, float* %input3, align 4
45// ...
46// }
47//
48// Later, NVPTXInferAddressSpaces will optimize it to
49//
50// define void @foo(float* %input) {
51// %input2 = addrspacecast float* %input to float addrspace(1)*
52// %v = load float, float addrspace(1)* %input2, align 4
53// ...
54// }
55//
56// 2. Convert byval kernel parameters to pointers in the param address space
57// (so that NVPTX emits ld/st.param). Convert pointers *within* a byval
58// kernel parameter to pointers in the global address space. This allows
59// NVPTX to emit ld/st.global.
60//
61// struct S {
62// int *x;
63// int *y;
64// };
65// __global__ void foo(S s) {
66// int *b = s.y;
67// // use b
68// }
69//
70// "b" points to the global address space. In the IR level,
71//
72// define void @foo(ptr byval %input) {
73// %b_ptr = getelementptr {ptr, ptr}, ptr %input, i64 0, i32 1
74// %b = load ptr, ptr %b_ptr
75// ; use %b
76// }
77//
78// becomes
79//
80// define void @foo({i32*, i32*}* byval %input) {
81//      %b_param = addrspacecast ptr %input to ptr addrspace(101)
82// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
83// %b = load ptr, ptr addrspace(101) %b_ptr
84// %b_global = addrspacecast ptr %b to ptr addrspace(1)
85//      ; use %b_global
86// }
87//
88// Create a local copy of kernel byval parameters used in a way that *might* mutate
89// the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters
90// are undefined behaviour, and don't require local copies.
91//
92// define void @foo(ptr byval(%struct.s) align 4 %input) {
93// store i32 42, ptr %input
94// ret void
95// }
96//
97// becomes
98//
99// define void @foo(ptr byval(%struct.s) align 4 %input) #1 {
100// %input1 = alloca %struct.s, align 4
101// %input2 = addrspacecast ptr %input to ptr addrspace(101)
102// %input3 = load %struct.s, ptr addrspace(101) %input2, align 4
103// store %struct.s %input3, ptr %input1, align 4
104// store i32 42, ptr %input1, align 4
105// ret void
106// }
107//
108// If %input were passed to a device function, or written to memory,
109// conservatively assume that %input gets mutated, and create a local copy.
110//
111// Convert param pointers to grid_constant byval kernel parameters that are
112// passed into calls (device functions, intrinsics, inline asm), or otherwise
113// "escape" (into stores/ptrtoints) to the generic address space, using the
114// `nvvm.ptr.param.to.gen` intrinsic, so that NVPTX emits cvta.param
115// (available for sm70+)
116//
117// define void @foo(ptr byval(%struct.s) %input) {
118// ; %input is a grid_constant
119// %call = call i32 @escape(ptr %input)
120// ret void
121// }
122//
123// becomes
124//
125// define void @foo(ptr byval(%struct.s) %input) {
126// %input1 = addrspacecast ptr %input to ptr addrspace(101)
127// ; the following intrinsic converts pointer to generic. We don't use an addrspacecast
128// ; to prevent generic -> param -> generic from getting cancelled out
129// %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
130// %call = call i32 @escape(ptr %input1.gen)
131// ret void
132// }
133//
134// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
135// cancel the addrspacecast pair this pass emits.
136//===----------------------------------------------------------------------===//
137
139#include "NVPTX.h"
140#include "NVPTXTargetMachine.h"
141#include "NVPTXUtilities.h"
142#include "llvm/ADT/STLExtras.h"
147#include "llvm/IR/Attributes.h"
148#include "llvm/IR/Function.h"
149#include "llvm/IR/IRBuilder.h"
150#include "llvm/IR/InstIterator.h"
151#include "llvm/IR/Instructions.h"
153#include "llvm/IR/IntrinsicsNVPTX.h"
154#include "llvm/IR/Type.h"
156#include "llvm/Pass.h"
157#include "llvm/Support/Debug.h"
160#include <numeric>
161#include <queue>
162
163#define DEBUG_TYPE "nvptx-lower-args"
164
165using namespace llvm;
166
167namespace {
168class NVPTXLowerArgsLegacyPass : public FunctionPass {
169 bool runOnFunction(Function &F) override;
170
171public:
172 static char ID; // Pass identification, replacement for typeid
173 NVPTXLowerArgsLegacyPass() : FunctionPass(ID) {}
174 StringRef getPassName() const override {
175 return "Lower pointer arguments of CUDA kernels";
176 }
177 void getAnalysisUsage(AnalysisUsage &AU) const override {
179 }
180};
181} // namespace
182
183char NVPTXLowerArgsLegacyPass::ID = 1;
184
185INITIALIZE_PASS_BEGIN(NVPTXLowerArgsLegacyPass, "nvptx-lower-args",
186 "Lower arguments (NVPTX)", false, false)
188INITIALIZE_PASS_END(NVPTXLowerArgsLegacyPass, "nvptx-lower-args",
189 "Lower arguments (NVPTX)", false, false)
190
191// =============================================================================
192// If the function had a byval struct ptr arg, say foo(ptr byval(%struct.x) %d),
193// and we can't guarantee that the only accesses are loads,
194// then add the following instructions to the first basic block:
195//
196// %temp = alloca %struct.x, align 8
197// %tempd = addrspacecast ptr %d to ptr addrspace(101)
198// %tv = load %struct.x, ptr addrspace(101) %tempd
199// store %struct.x %tv, ptr %temp, align 8
200//
201// The above code allocates some space in the stack and copies the incoming
202// struct from param space to local space.
203// Then replace all occurrences of %d by %temp.
204//
205// In case we know that all users are GEPs or Loads, replace them with the same
206// ones in parameter AS, so we can access them using ld.param.
207// =============================================================================
208
209/// Recursively convert the users of a param to the param address space.
210static void convertToParamAS(ArrayRef<Use *> OldUses, Value *Param) {
211 struct IP {
212 Use *OldUse;
213 Value *NewParam;
214 };
215
216 const auto CloneInstInParamAS = [](const IP &I) -> Value * {
217 auto *OldInst = cast<Instruction>(I.OldUse->getUser());
218 if (auto *LI = dyn_cast<LoadInst>(OldInst)) {
219 LI->setOperand(0, I.NewParam);
220 return LI;
221 }
222 if (auto *GEP = dyn_cast<GetElementPtrInst>(OldInst)) {
223 SmallVector<Value *, 4> Indices(GEP->indices());
224 auto *NewGEP = GetElementPtrInst::Create(
225 GEP->getSourceElementType(), I.NewParam, Indices, GEP->getName(),
226 GEP->getIterator());
227 NewGEP->setNoWrapFlags(GEP->getNoWrapFlags());
228 return NewGEP;
229 }
230 if (auto *BC = dyn_cast<BitCastInst>(OldInst)) {
231 auto *NewBCType = PointerType::get(BC->getContext(), ADDRESS_SPACE_PARAM);
232 return BitCastInst::Create(BC->getOpcode(), I.NewParam, NewBCType,
233 BC->getName(), BC->getIterator());
234 }
235 if (auto *ASC = dyn_cast<AddrSpaceCastInst>(OldInst)) {
236 assert(ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM);
237 (void)ASC;
238 // Just pass through the argument, the old ASC is no longer needed.
239 return I.NewParam;
240 }
241 if (auto *MI = dyn_cast<MemTransferInst>(OldInst)) {
242 if (MI->getRawSource() == I.OldUse->get()) {
243 // convert to memcpy/memmove from param space.
244 IRBuilder<> Builder(OldInst);
245 Intrinsic::ID ID = MI->getIntrinsicID();
246
247 CallInst *B = Builder.CreateMemTransferInst(
248 ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
249 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
250 for (unsigned I : {0, 1})
251 if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
252 B->addDereferenceableParamAttr(I, Bytes);
253 return B;
254 }
255 }
256
257 llvm_unreachable("Unsupported instruction");
258 };
259
260 auto ItemsToConvert =
261 map_to_vector(OldUses, [=](Use *U) -> IP { return {U, Param}; });
262 SmallVector<Instruction *> InstructionsToDelete;
263
264 while (!ItemsToConvert.empty()) {
265 IP I = ItemsToConvert.pop_back_val();
266 Value *NewInst = CloneInstInParamAS(I);
267 Instruction *OldInst = cast<Instruction>(I.OldUse->getUser());
268
269 if (NewInst && NewInst != OldInst) {
270 // We've created a new instruction. Queue users of the old instruction to
271 // be converted and the instruction itself to be deleted. We can't delete
272 // the old instruction yet, because it's still in use by a load somewhere.
273 for (Use &U : OldInst->uses())
274 ItemsToConvert.push_back({&U, NewInst});
275
276 InstructionsToDelete.push_back(OldInst);
277 }
278 }
279
280 // Now we know that all argument loads are using addresses in parameter space
281 // and we can finally remove the old instructions in generic AS. Instructions
282 // scheduled for removal should be processed in reverse order so the ones
283 // closest to the load are deleted first. Otherwise they may still be in use.
284 // E.g if we have Value = Load(BitCast(GEP(arg))), InstructionsToDelete will
285 // have {GEP,BitCast}. GEP can't be deleted first, because it's still used by
286 // the BitCast.
287 for (Instruction *I : llvm::reverse(InstructionsToDelete))
288 I->eraseFromParent();
289}
290
292 Function *F = Arg->getParent();
293 Type *ByValType = Arg->getParamByValType();
294 const DataLayout &DL = F->getDataLayout();
295
296 const Align OptimizedAlign =
297 TLI->getFunctionParamOptimizedAlign(F, ByValType, DL);
298 const Align CurrentAlign = Arg->getParamAlign().valueOrOne();
299
300 if (CurrentAlign >= OptimizedAlign)
301 return CurrentAlign;
302
303 LLVM_DEBUG(dbgs() << "Try to use alignment " << OptimizedAlign.value()
304 << " instead of " << CurrentAlign.value() << " for " << *Arg
305 << '\n');
306
307 Arg->removeAttr(Attribute::Alignment);
308 Arg->addAttr(Attribute::getWithAlignment(F->getContext(), OptimizedAlign));
309
310 return OptimizedAlign;
311}
312
313// Adjust alignment of arguments passed byval in .param address space. We can
314// increase alignment of such arguments in a way that ensures that we can
315// effectively vectorize their loads. We should also traverse all loads from
316// byval pointer and adjust their alignment, if those were using known offset.
317// Such alignment changes must be conformed with parameter store and load in
318// NVPTXTargetLowering::LowerCall.
319static void propagateAlignmentToLoads(Value *Val, Align NewAlign,
320 const DataLayout &DL) {
321 struct Load {
322 LoadInst *Inst;
324 };
325
326 struct LoadContext {
327 Value *InitialVal;
329 };
330
331 SmallVector<Load> Loads;
332 std::queue<LoadContext> Worklist;
333 Worklist.push({Val, 0});
334
335 while (!Worklist.empty()) {
336 LoadContext Ctx = Worklist.front();
337 Worklist.pop();
338
339 for (User *CurUser : Ctx.InitialVal->users()) {
340 if (auto *I = dyn_cast<LoadInst>(CurUser))
341 Loads.push_back({I, Ctx.Offset});
342 else if (isa<BitCastInst>(CurUser) || isa<AddrSpaceCastInst>(CurUser))
343 Worklist.push({cast<Instruction>(CurUser), Ctx.Offset});
344 else if (auto *I = dyn_cast<GetElementPtrInst>(CurUser)) {
345 APInt OffsetAccumulated =
346 APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM));
347
348 if (!I->accumulateConstantOffset(DL, OffsetAccumulated))
349 continue;
350
351 uint64_t OffsetLimit = -1;
352 uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit);
353 assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX");
354
355 Worklist.push({I, Ctx.Offset + Offset});
356 }
357 }
358 }
359
360 for (Load &CurLoad : Loads) {
361 Align NewLoadAlign = commonAlignment(NewAlign, CurLoad.Offset);
362 Align CurLoadAlign = CurLoad.Inst->getAlign();
363 CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign));
364 }
365}
366
367// Create a call to the nvvm_internal_addrspace_wrap intrinsic and set the
368// alignment of the return value based on the alignment of the argument.
370 Argument &Arg) {
371 CallInst *ArgInParam =
372 IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
374 &Arg, {}, Arg.getName() + ".param");
375
376 if (MaybeAlign ParamAlign = Arg.getParamAlign())
377 ArgInParam->addRetAttr(
378 Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));
379
380 Arg.addAttr(Attribute::get(Arg.getContext(), "nvvm.grid_constant"));
381 Arg.addAttr(Attribute::ReadOnly);
382
383 return ArgInParam;
384}
385
386namespace {
387struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
388 using Base = PtrUseVisitor<ArgUseChecker>;
389 // Set of phi/select instructions using the Arg
390 SmallPtrSet<Instruction *, 4> Conditionals;
391
392 ArgUseChecker(const DataLayout &DL) : PtrUseVisitor(DL) {}
393
394 PtrInfo visitArgPtr(Argument &A) {
395 assert(A.getType()->isPointerTy());
396 IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
397 IsOffsetKnown = false;
398 Offset = APInt(IntIdxTy->getBitWidth(), 0);
399 PI.reset();
400
401 LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
402 // Enqueue the uses of this pointer.
403 enqueueUsers(A);
404
405 // Visit all the uses off the worklist until it is empty.
406 // Note that unlike PtrUseVisitor we intentionally do not track offsets.
407 // We're only interested in how we use the pointer.
408 while (!(Worklist.empty() || PI.isAborted())) {
409 UseToVisit ToVisit = Worklist.pop_back_val();
410 U = ToVisit.UseAndIsOffsetKnown.getPointer();
411 Instruction *I = cast<Instruction>(U->getUser());
412 LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
413 Base::visit(I);
414 }
415 if (PI.isEscaped())
416 LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
417 << "\n");
418 else if (PI.isAborted())
419 LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
420 << "\n");
421 LLVM_DEBUG(dbgs() << "Traversed " << Conditionals.size()
422 << " conditionals\n");
423 return PI;
424 }
425
426 void visitStoreInst(StoreInst &SI) {
427 // Storing the pointer escapes it.
428 if (U->get() == SI.getValueOperand())
429 return PI.setEscapedAndAborted(&SI);
430
431 PI.setAborted(&SI);
432 }
433
434 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
435 // ASC to param space are no-ops and do not need a copy
437 return PI.setEscapedAndAborted(&ASC);
439 }
440
441 void visitPtrToIntInst(PtrToIntInst &I) { Base::visitPtrToIntInst(I); }
442
443 void visitPHINodeOrSelectInst(Instruction &I) {
445 enqueueUsers(I);
446 Conditionals.insert(&I);
447 }
448 // PHI and select just pass through the pointers.
449 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
450 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
451
452 // memcpy/memmove are OK when the pointer is source. We can convert them to
453 // AS-specific memcpy.
454 void visitMemTransferInst(MemTransferInst &II) {
455 if (*U == II.getRawDest())
456 PI.setAborted(&II);
457 }
458
459 void visitMemSetInst(MemSetInst &II) { PI.setAborted(&II); }
460}; // struct ArgUseChecker
461
462void copyByValParam(Function &F, Argument &Arg) {
463 LLVM_DEBUG(dbgs() << "Creating a local copy of " << Arg << "\n");
464 Type *ByValType = Arg.getParamByValType();
465 const DataLayout &DL = F.getDataLayout();
466 IRBuilder<> IRB(&F.getEntryBlock().front());
467 AllocaInst *AllocA = IRB.CreateAlloca(ByValType, nullptr, Arg.getName());
468 // Set the alignment to alignment of the byval parameter. This is because,
469 // later load/stores assume that alignment, and we are going to replace
470 // the use of the byval parameter with this alloca instruction.
471 AllocA->setAlignment(
472 Arg.getParamAlign().value_or(DL.getPrefTypeAlign(ByValType)));
473 Arg.replaceAllUsesWith(AllocA);
474
475 Value *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, Arg);
476
477 // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
478 // addrspacecast preserves alignment. Since params are constant, this load
479 // is definitely not volatile.
480 const auto ArgSize = *AllocA->getAllocationSize(DL);
481 IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParamAS, AllocA->getAlign(),
482 ArgSize);
483}
484} // namespace
485
486static bool argIsProcessed(Argument *Arg) {
487 if (Arg->use_empty())
488 return true;
489
490 // If the argument is already wrapped, it was processed by this pass before.
491 if (Arg->hasOneUse())
492 if (const auto *II = dyn_cast<IntrinsicInst>(*Arg->user_begin()))
493 if (II->getIntrinsicID() == Intrinsic::nvvm_internal_addrspace_wrap)
494 return true;
495
496 return false;
497}
498
499static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
500 Function *F = Arg->getParent();
502 const NVPTXSubtarget *ST = TM.getSubtargetImpl(*F);
503
504 const DataLayout &DL = F->getDataLayout();
505 IRBuilder<> IRB(&F->getEntryBlock().front());
506
507 if (argIsProcessed(Arg))
508 return;
509
510 const Align NewArgAlign = setByValParamAlign(Arg, ST->getTargetLowering());
511
512 // (1) First check the easy case, if were able to trace through all the uses
513 // and we can convert them all to param AS, then we'll do this.
514 ArgUseChecker AUC(DL);
515 ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
516 const bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
517 if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {
518 // Convert all loads and intermediate operations to use parameter AS and
519 // skip creation of a local copy of the argument.
521 Value *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg);
522 for (Use *U : UsesToUpdate)
523 convertToParamAS(U, ArgInParamAS);
524
525 propagateAlignmentToLoads(ArgInParamAS, NewArgAlign, DL);
526 return;
527 }
528
529 // (2) If the argument is grid constant, we get to use the pointer directly.
530 if (ST->hasCvtaParam() && (ArgUseIsReadOnly || isParamGridConstant(*Arg))) {
531 LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
532
533 // Cast argument to param address space. Because the backend will emit the
534 // argument already in the param address space, we need to use the noop
535 // intrinsic, this had the added benefit of preventing other optimizations
536 // from folding away this pair of addrspacecasts.
537 Instruction *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg);
538
539 // Cast param address to generic address space.
540 Value *GenericArg = IRB.CreateAddrSpaceCast(
541 ArgInParamAS, IRB.getPtrTy(ADDRESS_SPACE_GENERIC),
542 Arg->getName() + ".gen");
543
544 Arg->replaceAllUsesWith(GenericArg);
545
546 // Do not replace Arg in the cast to param space
547 ArgInParamAS->setOperand(0, Arg);
548 return;
549 }
550
551 // (3) Otherwise we have to create a copy of the argument in local memory.
552 copyByValParam(*F, *Arg);
553}
554
555static void markPointerAsAS(Value *Ptr, const unsigned AS) {
557 return;
558
559 // Deciding where to emit the addrspacecast pair.
560 BasicBlock::iterator InsertPt;
561 if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
562 // Insert at the functon entry if Ptr is an argument.
563 InsertPt = Arg->getParent()->getEntryBlock().begin();
564 } else {
565 // Insert right after Ptr if Ptr is an instruction.
566 InsertPt = ++cast<Instruction>(Ptr)->getIterator();
567 assert(InsertPt != InsertPt->getParent()->end() &&
568 "We don't call this function with Ptr being a terminator.");
569 }
570
571 Instruction *PtrInGlobal = new AddrSpaceCastInst(
572 Ptr, PointerType::get(Ptr->getContext(), AS), Ptr->getName(), InsertPt);
573 Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
574 Ptr->getName(), InsertPt);
575 // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
576 Ptr->replaceAllUsesWith(PtrInGeneric);
577 PtrInGlobal->setOperand(0, Ptr);
578}
579
583
584static void handleIntToPtr(Value &V) {
585 if (!all_of(V.users(), [](User *U) { return isa<IntToPtrInst>(U); }))
586 return;
587
588 SmallVector<User *, 16> UsersToUpdate(V.users());
589 for (User *U : UsersToUpdate)
591}
592
593// =============================================================================
594// Main function for this pass.
595// =============================================================================
597 // Copying of byval aggregates + SROA may result in pointers being loaded as
598 // integers, followed by intotoptr. We may want to mark those as global, too,
599 // but only if the loaded integer is used exclusively for conversion to a
600 // pointer with inttoptr.
601 if (TM.getDrvInterface() == NVPTX::CUDA) {
602 // Mark pointers in byval structs as global.
603 for (auto &I : instructions(F)) {
604 auto *LI = dyn_cast<LoadInst>(&I);
605 if (!LI)
606 continue;
607
608 if (LI->getType()->isPointerTy() || LI->getType()->isIntegerTy()) {
609 Value *UO = getUnderlyingObject(LI->getPointerOperand());
610 if (Argument *Arg = dyn_cast<Argument>(UO)) {
611 if (Arg->hasByValAttr()) {
612 // LI is a load from a pointer within a byval kernel parameter.
613 if (LI->getType()->isPointerTy())
615 else
616 handleIntToPtr(*LI);
617 }
618 }
619 }
620 }
621
622 for (Argument &Arg : F.args())
623 if (Arg.getType()->isIntegerTy())
624 handleIntToPtr(Arg);
625 }
626
627 LLVM_DEBUG(dbgs() << "Lowering kernel args of " << F.getName() << "\n");
628 for (Argument &Arg : F.args())
629 if (Arg.hasByValAttr())
630 handleByValParam(TM, &Arg);
631
632 return true;
633}
634
635// Device functions only need to copy byval args into local memory.
637 LLVM_DEBUG(dbgs() << "Lowering function args of " << F.getName() << "\n");
638
640 const DataLayout &DL = F.getDataLayout();
641
642 for (Argument &Arg : F.args())
643 if (Arg.hasByValAttr()) {
644 const Align NewArgAlign = setByValParamAlign(&Arg, TLI);
645 propagateAlignmentToLoads(&Arg, NewArgAlign, DL);
646 }
647
648 return true;
649}
650
655
656bool NVPTXLowerArgsLegacyPass::runOnFunction(Function &F) {
657 auto &TM = getAnalysis<TargetPassConfig>().getTM<NVPTXTargetMachine>();
658 return processFunction(F, TM);
659}
661 return new NVPTXLowerArgsLegacyPass();
662}
663
665 LLVM_DEBUG(dbgs() << "Creating a copy of byval args of " << F.getName()
666 << "\n");
667 bool Changed = false;
668 if (isKernelFunction(F)) {
669 for (Argument &Arg : F.args())
670 if (Arg.hasByValAttr() && !isParamGridConstant(Arg)) {
671 copyByValParam(F, Arg);
672 Changed = true;
673 }
674 }
675 return Changed;
676}
677
683
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
Hexagon Common GEP
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
NVPTX address space definition.
static bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F)
nvptx lower Lower static false void convertToParamAS(ArrayRef< Use * > OldUses, Value *Param)
Recursively convert the users of a param to the param address space.
static CallInst * createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB, Argument &Arg)
static bool copyFunctionByValArgs(Function &F)
static void markPointerAsAS(Value *Ptr, const unsigned AS)
static bool argIsProcessed(Argument *Arg)
static bool processFunction(Function &F, NVPTXTargetMachine &TM)
static bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F)
static Align setByValParamAlign(Argument *Arg, const NVPTXTargetLowering *TLI)
static void propagateAlignmentToLoads(Value *Val, Align NewAlign, const DataLayout &DL)
static void handleIntToPtr(Value &V)
static void markPointerAsGlobal(Value *Ptr)
static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg)
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file provides a collection of visitors which walk the (instruction) uses of a pointer.
This file contains some templates that are useful if you are working with the STL at all.
This file defines less commonly used SmallVector utilities.
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
This class represents a conversion between pointers from one address space to another.
unsigned getDestAddressSpace() const
Returns the address space of the result.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI void addAttr(Attribute::AttrKind Kind)
Definition Function.cpp:320
LLVM_ABI bool hasByValAttr() const
Return true if this argument has the byval attribute.
Definition Function.cpp:128
LLVM_ABI void removeAttr(Attribute::AttrKind Kind)
Remove attributes from an argument.
Definition Function.cpp:328
const Function * getParent() const
Definition Argument.h:44
LLVM_ABI Type * getParamByValType() const
If this is a byval argument, return its type.
Definition Function.cpp:224
LLVM_ABI MaybeAlign getParamAlign() const
If this is a byval or inalloca argument, return its alignment.
Definition Function.cpp:215
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const BasicBlock & getEntryBlock() const
Definition Function.h:809
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:604
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2181
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2787
void visit(Iterator Start, Iterator End)
Definition InstVisitor.h:87
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
An instruction for reading from memory.
const NVPTXTargetLowering * getTargetLowering() const override
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
NVPTX::DrvInterface getDrvInterface() const
const NVPTXSubtarget * getSubtargetImpl(const Function &) const override
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
A base class for visitors over the uses of a pointer value.
void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC)
void visitPtrToIntInst(PtrToIntInst &I)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
FunctionPass * createNVPTXLowerArgsPass()
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
bool isParamGridConstant(const Argument &Arg)
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)