doxygen/AMDGPULowerKernelAttributes_8cpp_source.html

//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file This pass attempts to make use of reqd_work_group_size metadata

/// to eliminate loads from the dispatch packet and to constant fold OpenCL

/// get_local_size-like functions.

//

//===----------------------------------------------------------------------===//


#include "AMDGPU.h"

#include "Utils/AMDGPUBaseInfo.h"

#include "llvm/Analysis/ConstantFolding.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/Passes.h"

#include "llvm/IR/Constants.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/InstIterator.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/MDBuilder.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/Pass.h"


#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"


using namespace llvm;


namespace {


// Field offsets in hsa_kernel_dispatch_packet_t.

enum DispatchPackedOffsets {

  WORKGROUP_SIZE_X = 4,

  WORKGROUP_SIZE_Y = 6,

  WORKGROUP_SIZE_Z = 8,


  GRID_SIZE_X = 12,

  GRID_SIZE_Y = 16,

  GRID_SIZE_Z = 20

};


// Field offsets to implicit kernel argument pointer.

enum ImplicitArgOffsets {

  HIDDEN_BLOCK_COUNT_X = 0,

  HIDDEN_BLOCK_COUNT_Y = 4,

  HIDDEN_BLOCK_COUNT_Z = 8,


  HIDDEN_GROUP_SIZE_X = 12,

  HIDDEN_GROUP_SIZE_Y = 14,

  HIDDEN_GROUP_SIZE_Z = 16,


  HIDDEN_REMAINDER_X = 18,

  HIDDEN_REMAINDER_Y = 20,

  HIDDEN_REMAINDER_Z = 22,

};


class AMDGPULowerKernelAttributes : public ModulePass {

public:

  static char ID;


  AMDGPULowerKernelAttributes() : ModulePass(ID) {}


  bool runOnModule(Module &M) override;


  StringRef getPassName() const override {

    return "AMDGPU Kernel Attributes";

  }


  void getAnalysisUsage(AnalysisUsage &AU) const override {

    AU.setPreservesAll();

 }

};


Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {

  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr

                                 : Intrinsic::amdgcn_dispatch_ptr;

  return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);

}


} // end anonymous namespace


static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,

                                            uint32_t MaxNumGroups) {

  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())

    return;


  if (!Load->getType()->isIntegerTy(32))

    return;


  // TODO: If there is existing range metadata, preserve it if it is stricter.

  MDBuilder MDB(Load->getContext());

  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));

  Load->setMetadata(LLVMContext::MD_range, Range);

}


static bool processUse(CallInst *CI, bool IsV5OrAbove) {

  Function *F = CI->getParent()->getParent();


  auto *MD = F->getMetadata("reqd_work_group_size");

  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;


  const bool HasUniformWorkGroupSize =

    F->getFnAttribute("uniform-work-group-size").getValueAsBool();


  SmallVector<unsigned> MaxNumWorkgroups =

      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);


  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&

      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))

    return false;


  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};

  Value *GroupSizes[3]  = {nullptr, nullptr, nullptr};

  Value *Remainders[3]  = {nullptr, nullptr, nullptr};

  Value *GridSizes[3]   = {nullptr, nullptr, nullptr};


  const DataLayout &DL = F->getDataLayout();


  // We expect to see several GEP users, casted to the appropriate type and

  // loaded.

  for (User *U : CI->users()) {

    if (!U->hasOneUse())

      continue;


    int64_t Offset = 0;

    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?

    auto *BCI = dyn_cast<BitCastInst>(U);

    if (!Load && !BCI) {

      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)

        continue;

      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?

      BCI = dyn_cast<BitCastInst>(*U->user_begin());

    }


    if (BCI) {

      if (!BCI->hasOneUse())

        continue;

      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?

    }


    if (!Load || !Load->isSimple())

      continue;


    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());


    // TODO: Handle merged loads.

    if (IsV5OrAbove) { // Base is ImplicitArgPtr.

      switch (Offset) {

      case HIDDEN_BLOCK_COUNT_X:

        if (LoadSize == 4) {

          BlockCounts[0] = Load;

          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);

        }

        break;

      case HIDDEN_BLOCK_COUNT_Y:

        if (LoadSize == 4) {

          BlockCounts[1] = Load;

          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);

        }

        break;

      case HIDDEN_BLOCK_COUNT_Z:

        if (LoadSize == 4) {

          BlockCounts[2] = Load;

          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);

        }

        break;

      case HIDDEN_GROUP_SIZE_X:

        if (LoadSize == 2)

          GroupSizes[0] = Load;

        break;

      case HIDDEN_GROUP_SIZE_Y:

        if (LoadSize == 2)

          GroupSizes[1] = Load;

        break;

      case HIDDEN_GROUP_SIZE_Z:

        if (LoadSize == 2)

          GroupSizes[2] = Load;

        break;

      case HIDDEN_REMAINDER_X:

        if (LoadSize == 2)

          Remainders[0] = Load;

        break;

      case HIDDEN_REMAINDER_Y:

        if (LoadSize == 2)

          Remainders[1] = Load;

        break;

      case HIDDEN_REMAINDER_Z:

        if (LoadSize == 2)

          Remainders[2] = Load;

        break;

      default:

        break;

      }

    } else { // Base is DispatchPtr.

      switch (Offset) {

      case WORKGROUP_SIZE_X:

        if (LoadSize == 2)

          GroupSizes[0] = Load;

        break;

      case WORKGROUP_SIZE_Y:

        if (LoadSize == 2)

          GroupSizes[1] = Load;

        break;

      case WORKGROUP_SIZE_Z:

        if (LoadSize == 2)

          GroupSizes[2] = Load;

        break;

      case GRID_SIZE_X:

        if (LoadSize == 4)

          GridSizes[0] = Load;

        break;

      case GRID_SIZE_Y:

        if (LoadSize == 4)

          GridSizes[1] = Load;

        break;

      case GRID_SIZE_Z:

        if (LoadSize == 4)

          GridSizes[2] = Load;

        break;

      default:

        break;

      }

    }

  }


  bool MadeChange = false;

  if (IsV5OrAbove && HasUniformWorkGroupSize) {

    // Under v5  __ockl_get_local_size returns the value computed by the expression:

    //

    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder

    //

    // For functions with the attribute uniform-work-group-size=true. we can evaluate

    // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned

    // for __ockl_get_local_size.

    for (int I = 0; I < 3; ++I) {

      Value *BlockCount = BlockCounts[I];

      if (!BlockCount)

        continue;


      using namespace llvm::PatternMatch;

      auto GroupIDIntrin =

          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()

                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()

                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());


      for (User *ICmp : BlockCount->users()) {

        if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,

                                       m_Specific(BlockCount)))) {

          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));

          MadeChange = true;

        }

      }

    }


    // All remainders should be 0 with uniform work group size.

    for (Value *Remainder : Remainders) {

      if (!Remainder)

        continue;

      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));

      MadeChange = true;

    }

  } else if (HasUniformWorkGroupSize) { // Pre-V5.

    // Pattern match the code used to handle partial workgroup dispatches in the

    // library implementation of get_local_size, so the entire function can be

    // constant folded with a known group size.

    //

    // uint r = grid_size - group_id * group_size;

    // get_local_size = (r < group_size) ? r : group_size;

    //

    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),

    // the grid_size is required to be a multiple of group_size). In this case:

    //

    // grid_size - (group_id * group_size) < group_size

    // ->

    // grid_size < group_size + (group_id * group_size)

    //

    // (grid_size / group_size) < 1 + group_id

    //

    // grid_size / group_size is at least 1, so we can conclude the select

    // condition is false (except for group_id == 0, where the select result is

    // the same).

    for (int I = 0; I < 3; ++I) {

      Value *GroupSize = GroupSizes[I];

      Value *GridSize = GridSizes[I];

      if (!GroupSize || !GridSize)

        continue;


      using namespace llvm::PatternMatch;

      auto GroupIDIntrin =

          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()

                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()

                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());


      for (User *U : GroupSize->users()) {

        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);

        if (!ZextGroupSize)

          continue;


        for (User *UMin : ZextGroupSize->users()) {

          if (match(UMin,

                    m_UMin(m_Sub(m_Specific(GridSize),

                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),

                           m_Specific(ZextGroupSize)))) {

            if (HasReqdWorkGroupSize) {

              ConstantInt *KnownSize

                = mdconst::extract<ConstantInt>(MD->getOperand(I));

              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(

                  KnownSize, UMin->getType(), false, DL));

            } else {

              UMin->replaceAllUsesWith(ZextGroupSize);

            }


            MadeChange = true;

          }

        }

      }

    }

  }


  // If reqd_work_group_size is set, we can replace work group size with it.

  if (!HasReqdWorkGroupSize)

    return MadeChange;


  for (int I = 0; I < 3; I++) {

    Value *GroupSize = GroupSizes[I];

    if (!GroupSize)

      continue;


    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));

    GroupSize->replaceAllUsesWith(

        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));

    MadeChange = true;

  }


  return MadeChange;

}


// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get

// TargetPassConfig for subtarget.

bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {

  bool MadeChange = false;

  bool IsV5OrAbove =

      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;

  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);


  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.

    return false;


  SmallPtrSet<Instruction *, 4> HandledUses;

  for (auto *U : BasePtr->users()) {

    CallInst *CI = cast<CallInst>(U);

    if (HandledUses.insert(CI).second) {

      if (processUse(CI, IsV5OrAbove))

        MadeChange = true;

    }

  }


  return MadeChange;

}


INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,

                      "AMDGPU Kernel Attributes", false, false)

INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,

                    "AMDGPU Kernel Attributes", false, false)


char AMDGPULowerKernelAttributes::ID = 0;


ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {

  return new AMDGPULowerKernelAttributes();

}


PreservedAnalyses

AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {

  bool IsV5OrAbove =

      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;

  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);


  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.

    return PreservedAnalyses::all();


  for (Instruction &I : instructions(F)) {

    if (CallInst *CI = dyn_cast<CallInst>(&I)) {

      if (CI->getCalledFunction() == BasePtr)

        processUse(CI, IsV5OrAbove);

    }

  }


  return PreservedAnalyses::all();

}

AMDGPUBaseInfo.h

Attributes
AMDGPU Kernel Attributes
Definition: AMDGPULowerKernelAttributes.cpp:370

DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPULowerKernelAttributes.cpp:29

annotateGridSizeLoadWithRangeMD
static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, uint32_t MaxNumGroups)
Definition: AMDGPULowerKernelAttributes.cpp:86

processUse
static bool processUse(CallInst *CI, bool IsV5OrAbove)
Definition: AMDGPULowerKernelAttributes.cpp:100

AMDGPU.h

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: ARMSLSHardening.cpp:73

instructions
Expand Atomic instructions
Definition: AtomicExpandPass.cpp:172

Passes.h

ConstantFolding.h

Constants.h
This file contains the declarations for the subclasses of Constant, which represent the different fla...

X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")

Function.h

InstIterator.h

Instructions.h

F
#define F(x, y, z)
Definition: MD5.cpp:55

I
#define I(x, y, z)
Definition: MD5.cpp:58

MDBuilder.h

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52

Pass.h

PatternMatch.h

ValueTracking.h

llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:78

llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253

llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47

llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130

llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1479

llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:83

llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866

llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63

llvm::Function
Definition: Function.h:63

llvm::Instruction
Definition: Instruction.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:176

llvm::MDBuilder
Definition: MDBuilder.h:36

llvm::MDBuilder::createRange
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition: MDBuilder.cpp:95

llvm::MDNode
Metadata node.
Definition: Metadata.h:1073

llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:251

llvm::ModulePass::runOnModule
virtual bool runOnModule(Module &M)=0
runOnModule - Virtual method overriden by subclasses to process the module being operated on.

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65

llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98

llvm::Pass::getPassName
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51

llvm::User
Definition: User.h:44

llvm::Value
LLVM Value Representation.
Definition: Value.h:74

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255

llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534

llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421

llvm::ilist_detail::node_parent_access::getParent
const ParentTy * getParent() const
Definition: ilist_node.h:32

uint32_t

unsigned

false
Definition: StackSlotColoring.cpp:193

llvm::AMDGPU::getAMDHSACodeObjectVersion
unsigned getAMDHSACodeObjectVersion(const Module &M)
Definition: AMDGPUBaseInfo.cpp:172

llvm::AMDGPU::AMDHSA_COV5
@ AMDHSA_COV5
Definition: AMDGPUBaseInfo.h:56

llvm::AMDGPU::getIntegerVecAttribute
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size, unsigned DefaultVal)
Definition: AMDGPUBaseInfo.cpp:1367

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24

llvm::Intrinsic::getDeclarationIfExists
Function * getDeclarationIfExists(Module *M, ID id, ArrayRef< Type * > Tys, FunctionType *FT=nullptr)
This version supports overloaded intrinsics.
Definition: Intrinsics.cpp:746

llvm::PatternMatch
Definition: PatternMatch.h:47

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49

llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885

llvm::PatternMatch::m_Mul
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1168

llvm::PatternMatch::m_SpecificICmp
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1690

llvm::PatternMatch::m_Sub
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1114

llvm::PatternMatch::m_UMin
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
Definition: PatternMatch.h:2360

llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::Offset
@ Offset
Definition: DWP.cpp:480

llvm::GetPointerBaseWithConstantOffset
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
Definition: ValueTracking.h:639

llvm::createAMDGPULowerKernelAttributesPass
ModulePass * createAMDGPULowerKernelAttributesPass()
Definition: AMDGPULowerKernelAttributes.cpp:374

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753

llvm::RecurKind::UMin
@ UMin
Unsigned integer min implemented in terms of select(cmp()).

llvm::ConstantFoldIntegerCast
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
Definition: ConstantFolding.cpp:1549

llvm::AMDGPULowerKernelAttributesPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: AMDGPULowerKernelAttributes.cpp:379