LLVM  10.0.0svn
NVPTXTargetTransformInfo.cpp
Go to the documentation of this file.
1 //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 #include "NVPTXUtilities.h"
11 #include "llvm/Analysis/LoopInfo.h"
15 #include "llvm/CodeGen/CostTable.h"
17 #include "llvm/Support/Debug.h"
18 using namespace llvm;
19 
20 #define DEBUG_TYPE "NVPTXtti"
21 
22 // Whether the given intrinsic reads threadIdx.x/y/z.
23 static bool readsThreadIndex(const IntrinsicInst *II) {
24  switch (II->getIntrinsicID()) {
25  default: return false;
26  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
27  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
28  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
29  return true;
30  }
31 }
32 
33 static bool readsLaneId(const IntrinsicInst *II) {
34  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
35 }
36 
37 // Whether the given intrinsic is an atomic instruction in PTX.
38 static bool isNVVMAtomic(const IntrinsicInst *II) {
39  switch (II->getIntrinsicID()) {
40  default: return false;
41  case Intrinsic::nvvm_atomic_load_inc_32:
42  case Intrinsic::nvvm_atomic_load_dec_32:
43 
44  case Intrinsic::nvvm_atomic_add_gen_f_cta:
45  case Intrinsic::nvvm_atomic_add_gen_f_sys:
46  case Intrinsic::nvvm_atomic_add_gen_i_cta:
47  case Intrinsic::nvvm_atomic_add_gen_i_sys:
48  case Intrinsic::nvvm_atomic_and_gen_i_cta:
49  case Intrinsic::nvvm_atomic_and_gen_i_sys:
50  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
51  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
52  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
53  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
54  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
55  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
56  case Intrinsic::nvvm_atomic_max_gen_i_cta:
57  case Intrinsic::nvvm_atomic_max_gen_i_sys:
58  case Intrinsic::nvvm_atomic_min_gen_i_cta:
59  case Intrinsic::nvvm_atomic_min_gen_i_sys:
60  case Intrinsic::nvvm_atomic_or_gen_i_cta:
61  case Intrinsic::nvvm_atomic_or_gen_i_sys:
62  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
63  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
64  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
65  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
66  return true;
67  }
68 }
69 
71  // Without inter-procedural analysis, we conservatively assume that arguments
72  // to __device__ functions are divergent.
73  if (const Argument *Arg = dyn_cast<Argument>(V))
74  return !isKernelFunction(*Arg->getParent());
75 
76  if (const Instruction *I = dyn_cast<Instruction>(V)) {
77  // Without pointer analysis, we conservatively assume values loaded from
78  // generic or local address space are divergent.
79  if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
80  unsigned AS = LI->getPointerAddressSpace();
81  return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
82  }
83  // Atomic instructions may cause divergence. Atomic instructions are
84  // executed sequentially across all threads in a warp. Therefore, an earlier
85  // executed thread may see different memory inputs than a later executed
86  // thread. For example, suppose *a = 0 initially.
87  //
88  // atom.global.add.s32 d, [a], 1
89  //
90  // returns 0 for the first thread that enters the critical region, and 1 for
91  // the second thread.
92  if (I->isAtomic())
93  return true;
94  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
95  // Instructions that read threadIdx are obviously divergent.
96  if (readsThreadIndex(II) || readsLaneId(II))
97  return true;
98  // Handle the NVPTX atomic instrinsics that cannot be represented as an
99  // atomic IR instruction.
100  if (isNVVMAtomic(II))
101  return true;
102  }
103  // Conservatively consider the return value of function calls as divergent.
104  // We could analyze callees with bodies more precisely using
105  // inter-procedural analysis.
106  if (isa<CallInst>(I))
107  return true;
108  }
109 
110  return false;
111 }
112 
114  unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
115  TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
117  // Legalize the type.
118  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
119 
120  int ISD = TLI->InstructionOpcodeToISD(Opcode);
121 
122  switch (ISD) {
123  default:
124  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
125  Opd1PropInfo, Opd2PropInfo);
126  case ISD::ADD:
127  case ISD::MUL:
128  case ISD::XOR:
129  case ISD::OR:
130  case ISD::AND:
131  // The machine code (SASS) simulates an i64 with two i32. Therefore, we
132  // estimate that arithmetic operations on i64 are twice as expensive as
133  // those on types that can fit into one machine register.
134  if (LT.second.SimpleTy == MVT::i64)
135  return 2 * LT.first;
136  // Delegate other cases to the basic TTI.
137  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
138  Opd1PropInfo, Opd2PropInfo);
139  }
140 }
141 
145 
146  // Enable partial unrolling and runtime unrolling, but reduce the
147  // threshold. This partially unrolls small loops which are often
148  // unrolled by the PTX to SASS compiler and unrolling earlier can be
149  // beneficial.
150  UP.Partial = UP.Runtime = true;
151  UP.PartialThreshold = UP.Threshold / 4;
152 }
static bool readsThreadIndex(const IntrinsicInst *II)
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >())
Definition: BasicTTIImpl.h:621
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
This class represents lattice values for constants.
Definition: AllocatorList.h:23
Cost tables and simple lookup functions.
The main scalar evolution driver.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
static bool readsLaneId(const IntrinsicInst *II)
An instruction for reading from memory.
Definition: Instructions.h:169
bool isKernelFunction(const Function &F)
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:200
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:32
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: BasicTTIImpl.h:439
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value *> Args=ArrayRef< const Value *>())
This file a TargetTransformInfo::Concept conforming object specific to the NVPTX target machine...
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
bool isSourceOfDivergence(const Value *V)
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
OperandValueProperties
Additional properties of an operand&#39;s values.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:50
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:426
unsigned Threshold
The cost threshold for the unrolled loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:509
Parameters that control the generic loop unrolling transformation.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isNVVMAtomic(const IntrinsicInst *II)
LLVM Value Representation.
Definition: Value.h:74
OperandValueKind
Additional information about an operand&#39;s possible values.
This pass exposes codegen information to IR-level passes.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:43
This file describes how to lower LLVM code to machine code.