LLVM  4.0.0
NVPTXTargetTransformInfo.cpp
Go to the documentation of this file.
1 //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
11 #include "NVPTXUtilities.h"
12 #include "llvm/Analysis/LoopInfo.h"
16 #include "llvm/Support/Debug.h"
17 #include "llvm/Target/CostTable.h"
19 using namespace llvm;
20 
21 #define DEBUG_TYPE "NVPTXtti"
22 
23 // Whether the given intrinsic reads threadIdx.x/y/z.
24 static bool readsThreadIndex(const IntrinsicInst *II) {
25  switch (II->getIntrinsicID()) {
26  default: return false;
27  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
28  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
29  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
30  return true;
31  }
32 }
33 
34 static bool readsLaneId(const IntrinsicInst *II) {
35  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
36 }
37 
38 // Whether the given intrinsic is an atomic instruction in PTX.
39 static bool isNVVMAtomic(const IntrinsicInst *II) {
40  switch (II->getIntrinsicID()) {
41  default: return false;
42  case Intrinsic::nvvm_atomic_load_add_f32:
43  case Intrinsic::nvvm_atomic_load_inc_32:
44  case Intrinsic::nvvm_atomic_load_dec_32:
45 
46  case Intrinsic::nvvm_atomic_add_gen_f_cta:
47  case Intrinsic::nvvm_atomic_add_gen_f_sys:
48  case Intrinsic::nvvm_atomic_add_gen_i_cta:
49  case Intrinsic::nvvm_atomic_add_gen_i_sys:
50  case Intrinsic::nvvm_atomic_and_gen_i_cta:
51  case Intrinsic::nvvm_atomic_and_gen_i_sys:
52  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
53  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
54  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
55  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
56  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
57  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
58  case Intrinsic::nvvm_atomic_max_gen_i_cta:
59  case Intrinsic::nvvm_atomic_max_gen_i_sys:
60  case Intrinsic::nvvm_atomic_min_gen_i_cta:
61  case Intrinsic::nvvm_atomic_min_gen_i_sys:
62  case Intrinsic::nvvm_atomic_or_gen_i_cta:
63  case Intrinsic::nvvm_atomic_or_gen_i_sys:
64  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
65  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
66  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
67  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
68  return true;
69  }
70 }
71 
73  // Without inter-procedural analysis, we conservatively assume that arguments
74  // to __device__ functions are divergent.
75  if (const Argument *Arg = dyn_cast<Argument>(V))
76  return !isKernelFunction(*Arg->getParent());
77 
78  if (const Instruction *I = dyn_cast<Instruction>(V)) {
79  // Without pointer analysis, we conservatively assume values loaded from
80  // generic or local address space are divergent.
81  if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
82  unsigned AS = LI->getPointerAddressSpace();
83  return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
84  }
85  // Atomic instructions may cause divergence. Atomic instructions are
86  // executed sequentially across all threads in a warp. Therefore, an earlier
87  // executed thread may see different memory inputs than a later executed
88  // thread. For example, suppose *a = 0 initially.
89  //
90  // atom.global.add.s32 d, [a], 1
91  //
92  // returns 0 for the first thread that enters the critical region, and 1 for
93  // the second thread.
94  if (I->isAtomic())
95  return true;
96  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
97  // Instructions that read threadIdx are obviously divergent.
98  if (readsThreadIndex(II) || readsLaneId(II))
99  return true;
100  // Handle the NVPTX atomic instrinsics that cannot be represented as an
101  // atomic IR instruction.
102  if (isNVVMAtomic(II))
103  return true;
104  }
105  // Conservatively consider the return value of function calls as divergent.
106  // We could analyze callees with bodies more precisely using
107  // inter-procedural analysis.
108  if (isa<CallInst>(I))
109  return true;
110  }
111 
112  return false;
113 }
114 
116  unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
117  TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
119  // Legalize the type.
120  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
121 
122  int ISD = TLI->InstructionOpcodeToISD(Opcode);
123 
124  switch (ISD) {
125  default:
126  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
127  Opd1PropInfo, Opd2PropInfo);
128  case ISD::ADD:
129  case ISD::MUL:
130  case ISD::XOR:
131  case ISD::OR:
132  case ISD::AND:
133  // The machine code (SASS) simulates an i64 with two i32. Therefore, we
134  // estimate that arithmetic operations on i64 are twice as expensive as
135  // those on types that can fit into one machine register.
136  if (LT.second.SimpleTy == MVT::i64)
137  return 2 * LT.first;
138  // Delegate other cases to the basic TTI.
139  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
140  Opd1PropInfo, Opd2PropInfo);
141  }
142 }
143 
147 
148  // Enable partial unrolling and runtime unrolling, but reduce the
149  // threshold. This partially unrolls small loops which are often
150  // unrolled by the PTX to SASS compiler and unrolling earlier can be
151  // beneficial.
152  UP.Partial = UP.Runtime = true;
153  UP.PartialThreshold = UP.Threshold / 4;
154 }
MachineLoop * L
static bool readsThreadIndex(const IntrinsicInst *II)
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
LLVM Argument representation.
Definition: Argument.h:34
Cost tables and simple lookup functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:51
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
static bool readsLaneId(const IntrinsicInst *II)
An instruction for reading from memory.
Definition: Instructions.h:164
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isKernelFunction(const Function &F)
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >())
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:200
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
This file a TargetTransformInfo::Concept conforming object specific to the NVPTX target machine...
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
bool isSourceOfDivergence(const Value *V)
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
OperandValueProperties
Additional properties of an operand's values.
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >())
Definition: BasicTTIImpl.h:306
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:333
unsigned Threshold
The cost threshold for the unrolled loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:368
Parameters that control the generic loop unrolling transformation.
#define I(x, y, z)
Definition: MD5.cpp:54
static bool isNVVMAtomic(const IntrinsicInst *II)
LLVM Value Representation.
Definition: Value.h:71
OperandValueKind
Additional information about an operand's possible values.
This pass exposes codegen information to IR-level passes.
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP)
Definition: BasicTTIImpl.h:231
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44
This file describes how to lower LLVM code to machine code.