Bug Summary

File:llvm/lib/Transforms/IPO/OpenMPOpt.cpp
Warning:line 2767, column 48
Called C++ object pointer is null

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name OpenMPOpt.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/build-llvm -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Transforms/IPO -I /build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/llvm/lib/Transforms/IPO -I include -I /build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem 
/usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-01-08-143526-16334-1 -x c++ /build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/llvm/lib/Transforms/IPO/OpenMPOpt.cpp

/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/llvm/lib/Transforms/IPO/OpenMPOpt.cpp

1//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// OpenMP specific optimizations:
10//
11// - Deduplication of runtime calls, e.g., omp_get_thread_num.
12// - Replacing globalized device memory with stack memory.
13// - Replacing globalized device memory with shared memory.
14// - Parallel region merging.
15// - Transforming generic-mode device kernels to SPMD mode.
16// - Specializing the state machine for generic-mode device kernels.
17//
18//===----------------------------------------------------------------------===//
19
20#include "llvm/Transforms/IPO/OpenMPOpt.h"
21
22#include "llvm/ADT/EnumeratedArray.h"
23#include "llvm/ADT/PostOrderIterator.h"
24#include "llvm/ADT/SetVector.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringRef.h"
27#include "llvm/Analysis/CallGraph.h"
28#include "llvm/Analysis/CallGraphSCCPass.h"
29#include "llvm/Analysis/OptimizationRemarkEmitter.h"
30#include "llvm/Analysis/ValueTracking.h"
31#include "llvm/Frontend/OpenMP/OMPConstants.h"
32#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
33#include "llvm/IR/Assumptions.h"
34#include "llvm/IR/DiagnosticInfo.h"
35#include "llvm/IR/GlobalValue.h"
36#include "llvm/IR/Instruction.h"
37#include "llvm/IR/IntrinsicInst.h"
38#include "llvm/IR/IntrinsicsAMDGPU.h"
39#include "llvm/IR/IntrinsicsNVPTX.h"
40#include "llvm/InitializePasses.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Transforms/IPO.h"
43#include "llvm/Transforms/IPO/Attributor.h"
44#include "llvm/Transforms/Utils/BasicBlockUtils.h"
45#include "llvm/Transforms/Utils/CallGraphUpdater.h"
46#include "llvm/Transforms/Utils/CodeExtractor.h"
47
48#include <algorithm>
49
50using namespace llvm;
51using namespace omp;
52
// Debug category of this pass; it is also the group name used by the
// statistics and the debug TAG defined in this file.
#define DEBUG_TYPE "openmp-opt"
54
55static cl::opt<bool> DisableOpenMPOptimizations(
56 "openmp-opt-disable", cl::ZeroOrMore,
57 cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
58 cl::init(false));
59
60static cl::opt<bool> EnableParallelRegionMerging(
61 "openmp-opt-enable-merging", cl::ZeroOrMore,
62 cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
63 cl::init(false));
64
65static cl::opt<bool>
66 DisableInternalization("openmp-opt-disable-internalization", cl::ZeroOrMore,
67 cl::desc("Disable function internalization."),
68 cl::Hidden, cl::init(false));
69
70static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
71 cl::Hidden);
72static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
73 cl::init(false), cl::Hidden);
74
75static cl::opt<bool> HideMemoryTransferLatency(
76 "openmp-hide-memory-transfer-latency",
77 cl::desc("[WIP] Tries to hide the latency of host to device memory"
78 " transfers"),
79 cl::Hidden, cl::init(false));
80
81static cl::opt<bool> DisableOpenMPOptDeglobalization(
82 "openmp-opt-disable-deglobalization", cl::ZeroOrMore,
83 cl::desc("Disable OpenMP optimizations involving deglobalization."),
84 cl::Hidden, cl::init(false));
85
86static cl::opt<bool> DisableOpenMPOptSPMDization(
87 "openmp-opt-disable-spmdization", cl::ZeroOrMore,
88 cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
89 cl::Hidden, cl::init(false));
90
91static cl::opt<bool> DisableOpenMPOptFolding(
92 "openmp-opt-disable-folding", cl::ZeroOrMore,
93 cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
94 cl::init(false));
95
96static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
97 "openmp-opt-disable-state-machine-rewrite", cl::ZeroOrMore,
98 cl::desc("Disable OpenMP optimizations that replace the state machine."),
99 cl::Hidden, cl::init(false));
100
101static cl::opt<bool> PrintModuleAfterOptimizations(
102 "openmp-opt-print-module", cl::ZeroOrMore,
103 cl::desc("Print the current module after OpenMP optimizations."),
104 cl::Hidden, cl::init(false));
105
106static cl::opt<bool> AlwaysInlineDeviceFunctions(
107 "openmp-opt-inline-device", cl::ZeroOrMore,
108 cl::desc("Inline all applicible functions on the device."), cl::Hidden,
109 cl::init(false));
110
111static cl::opt<bool>
112 EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::ZeroOrMore,
113 cl::desc("Enables more verbose remarks."), cl::Hidden,
114 cl::init(false));
115
116static cl::opt<unsigned>
117 SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
118 cl::desc("Maximal number of attributor iterations."),
119 cl::init(256));
120
121STATISTIC(NumOpenMPRuntimeCallsDeduplicated,static llvm::Statistic NumOpenMPRuntimeCallsDeduplicated = {"openmp-opt"
, "NumOpenMPRuntimeCallsDeduplicated", "Number of OpenMP runtime calls deduplicated"
}
122 "Number of OpenMP runtime calls deduplicated")static llvm::Statistic NumOpenMPRuntimeCallsDeduplicated = {"openmp-opt"
, "NumOpenMPRuntimeCallsDeduplicated", "Number of OpenMP runtime calls deduplicated"
}
;
123STATISTIC(NumOpenMPParallelRegionsDeleted,static llvm::Statistic NumOpenMPParallelRegionsDeleted = {"openmp-opt"
, "NumOpenMPParallelRegionsDeleted", "Number of OpenMP parallel regions deleted"
}
124 "Number of OpenMP parallel regions deleted")static llvm::Statistic NumOpenMPParallelRegionsDeleted = {"openmp-opt"
, "NumOpenMPParallelRegionsDeleted", "Number of OpenMP parallel regions deleted"
}
;
125STATISTIC(NumOpenMPRuntimeFunctionsIdentified,static llvm::Statistic NumOpenMPRuntimeFunctionsIdentified = {
"openmp-opt", "NumOpenMPRuntimeFunctionsIdentified", "Number of OpenMP runtime functions identified"
}
126 "Number of OpenMP runtime functions identified")static llvm::Statistic NumOpenMPRuntimeFunctionsIdentified = {
"openmp-opt", "NumOpenMPRuntimeFunctionsIdentified", "Number of OpenMP runtime functions identified"
}
;
127STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,static llvm::Statistic NumOpenMPRuntimeFunctionUsesIdentified
= {"openmp-opt", "NumOpenMPRuntimeFunctionUsesIdentified", "Number of OpenMP runtime function uses identified"
}
128 "Number of OpenMP runtime function uses identified")static llvm::Statistic NumOpenMPRuntimeFunctionUsesIdentified
= {"openmp-opt", "NumOpenMPRuntimeFunctionUsesIdentified", "Number of OpenMP runtime function uses identified"
}
;
129STATISTIC(NumOpenMPTargetRegionKernels,static llvm::Statistic NumOpenMPTargetRegionKernels = {"openmp-opt"
, "NumOpenMPTargetRegionKernels", "Number of OpenMP target region entry points (=kernels) identified"
}
130 "Number of OpenMP target region entry points (=kernels) identified")static llvm::Statistic NumOpenMPTargetRegionKernels = {"openmp-opt"
, "NumOpenMPTargetRegionKernels", "Number of OpenMP target region entry points (=kernels) identified"
}
;
131STATISTIC(NumOpenMPTargetRegionKernelsSPMD,static llvm::Statistic NumOpenMPTargetRegionKernelsSPMD = {"openmp-opt"
, "NumOpenMPTargetRegionKernelsSPMD", "Number of OpenMP target region entry points (=kernels) executed in "
"SPMD-mode instead of generic-mode"}
132 "Number of OpenMP target region entry points (=kernels) executed in "static llvm::Statistic NumOpenMPTargetRegionKernelsSPMD = {"openmp-opt"
, "NumOpenMPTargetRegionKernelsSPMD", "Number of OpenMP target region entry points (=kernels) executed in "
"SPMD-mode instead of generic-mode"}
133 "SPMD-mode instead of generic-mode")static llvm::Statistic NumOpenMPTargetRegionKernelsSPMD = {"openmp-opt"
, "NumOpenMPTargetRegionKernelsSPMD", "Number of OpenMP target region entry points (=kernels) executed in "
"SPMD-mode instead of generic-mode"}
;
134STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,static llvm::Statistic NumOpenMPTargetRegionKernelsWithoutStateMachine
= {"openmp-opt", "NumOpenMPTargetRegionKernelsWithoutStateMachine"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machines"}
135 "Number of OpenMP target region entry points (=kernels) executed in "static llvm::Statistic NumOpenMPTargetRegionKernelsWithoutStateMachine
= {"openmp-opt", "NumOpenMPTargetRegionKernelsWithoutStateMachine"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machines"}
136 "generic-mode without a state machines")static llvm::Statistic NumOpenMPTargetRegionKernelsWithoutStateMachine
= {"openmp-opt", "NumOpenMPTargetRegionKernelsWithoutStateMachine"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machines"}
;
137STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines with fallback"}
138 "Number of OpenMP target region entry points (=kernels) executed in "static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines with fallback"}
139 "generic-mode with customized state machines with fallback")static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines with fallback"}
;
140STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines without fallback"
}
141 "Number of OpenMP target region entry points (=kernels) executed in "static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines without fallback"
}
142 "generic-mode with customized state machines without fallback")static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines without fallback"
}
;
143STATISTIC(static llvm::Statistic NumOpenMPParallelRegionsReplacedInGPUStateMachine
= {"openmp-opt", "NumOpenMPParallelRegionsReplacedInGPUStateMachine"
, "Number of OpenMP parallel regions replaced with ID in GPU state machines"
}
144 NumOpenMPParallelRegionsReplacedInGPUStateMachine,static llvm::Statistic NumOpenMPParallelRegionsReplacedInGPUStateMachine
= {"openmp-opt", "NumOpenMPParallelRegionsReplacedInGPUStateMachine"
, "Number of OpenMP parallel regions replaced with ID in GPU state machines"
}
145 "Number of OpenMP parallel regions replaced with ID in GPU state machines")static llvm::Statistic NumOpenMPParallelRegionsReplacedInGPUStateMachine
= {"openmp-opt", "NumOpenMPParallelRegionsReplacedInGPUStateMachine"
, "Number of OpenMP parallel regions replaced with ID in GPU state machines"
}
;
146STATISTIC(NumOpenMPParallelRegionsMerged,static llvm::Statistic NumOpenMPParallelRegionsMerged = {"openmp-opt"
, "NumOpenMPParallelRegionsMerged", "Number of OpenMP parallel regions merged"
}
147 "Number of OpenMP parallel regions merged")static llvm::Statistic NumOpenMPParallelRegionsMerged = {"openmp-opt"
, "NumOpenMPParallelRegionsMerged", "Number of OpenMP parallel regions merged"
}
;
148STATISTIC(NumBytesMovedToSharedMemory,static llvm::Statistic NumBytesMovedToSharedMemory = {"openmp-opt"
, "NumBytesMovedToSharedMemory", "Amount of memory pushed to shared memory"
}
149 "Amount of memory pushed to shared memory")static llvm::Statistic NumBytesMovedToSharedMemory = {"openmp-opt"
, "NumBytesMovedToSharedMemory", "Amount of memory pushed to shared memory"
}
;
150
151#if !defined(NDEBUG)
152static constexpr auto TAG = "[" DEBUG_TYPE"openmp-opt" "]";
153#endif
154
155namespace {
156
157struct AAHeapToShared;
158
159struct AAICVTracker;
160
161/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
162/// Attributor runs.
163struct OMPInformationCache : public InformationCache {
164 OMPInformationCache(Module &M, AnalysisGetter &AG,
165 BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
166 SmallPtrSetImpl<Kernel> &Kernels)
167 : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
168 Kernels(Kernels) {
169
170 OMPBuilder.initialize();
171 initializeRuntimeFunctions();
172 initializeInternalControlVars();
173 }
174
175 /// Generic information that describes an internal control variable.
176 struct InternalControlVarInfo {
177 /// The kind, as described by InternalControlVar enum.
178 InternalControlVar Kind;
179
180 /// The name of the ICV.
181 StringRef Name;
182
183 /// Environment variable associated with this ICV.
184 StringRef EnvVarName;
185
186 /// Initial value kind.
187 ICVInitValue InitKind;
188
189 /// Initial value.
190 ConstantInt *InitValue;
191
192 /// Setter RTL function associated with this ICV.
193 RuntimeFunction Setter;
194
195 /// Getter RTL function associated with this ICV.
196 RuntimeFunction Getter;
197
198 /// RTL Function corresponding to the override clause of this ICV
199 RuntimeFunction Clause;
200 };
201
202 /// Generic information that describes a runtime function
203 struct RuntimeFunctionInfo {
204
205 /// The kind, as described by the RuntimeFunction enum.
206 RuntimeFunction Kind;
207
208 /// The name of the function.
209 StringRef Name;
210
211 /// Flag to indicate a variadic function.
212 bool IsVarArg;
213
214 /// The return type of the function.
215 Type *ReturnType;
216
217 /// The argument types of the function.
218 SmallVector<Type *, 8> ArgumentTypes;
219
220 /// The declaration if available.
221 Function *Declaration = nullptr;
222
223 /// Uses of this runtime function per function containing the use.
224 using UseVector = SmallVector<Use *, 16>;
225
226 /// Clear UsesMap for runtime function.
227 void clearUsesMap() { UsesMap.clear(); }
228
229 /// Boolean conversion that is true if the runtime function was found.
230 operator bool() const { return Declaration; }
231
232 /// Return the vector of uses in function \p F.
233 UseVector &getOrCreateUseVector(Function *F) {
234 std::shared_ptr<UseVector> &UV = UsesMap[F];
235 if (!UV)
236 UV = std::make_shared<UseVector>();
237 return *UV;
238 }
239
240 /// Return the vector of uses in function \p F or `nullptr` if there are
241 /// none.
242 const UseVector *getUseVector(Function &F) const {
243 auto I = UsesMap.find(&F);
244 if (I != UsesMap.end())
245 return I->second.get();
246 return nullptr;
247 }
248
249 /// Return how many functions contain uses of this runtime function.
250 size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
251
252 /// Return the number of arguments (or the minimal number for variadic
253 /// functions).
254 size_t getNumArgs() const { return ArgumentTypes.size(); }
255
256 /// Run the callback \p CB on each use and forget the use if the result is
257 /// true. The callback will be fed the function in which the use was
258 /// encountered as second argument.
259 void foreachUse(SmallVectorImpl<Function *> &SCC,
260 function_ref<bool(Use &, Function &)> CB) {
261 for (Function *F : SCC)
262 foreachUse(CB, F);
263 }
264
265 /// Run the callback \p CB on each use within the function \p F and forget
266 /// the use if the result is true.
267 void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
268 SmallVector<unsigned, 8> ToBeDeleted;
269 ToBeDeleted.clear();
270
271 unsigned Idx = 0;
272 UseVector &UV = getOrCreateUseVector(F);
273
274 for (Use *U : UV) {
275 if (CB(*U, *F))
276 ToBeDeleted.push_back(Idx);
277 ++Idx;
278 }
279
280 // Remove the to-be-deleted indices in reverse order as prior
281 // modifications will not modify the smaller indices.
282 while (!ToBeDeleted.empty()) {
283 unsigned Idx = ToBeDeleted.pop_back_val();
284 UV[Idx] = UV.back();
285 UV.pop_back();
286 }
287 }
288
289 private:
290 /// Map from functions to all uses of this runtime function contained in
291 /// them.
292 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
293
294 public:
295 /// Iterators for the uses of this runtime function.
296 decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
297 decltype(UsesMap)::iterator end() { return UsesMap.end(); }
298 };
299
300 /// An OpenMP-IR-Builder instance
301 OpenMPIRBuilder OMPBuilder;
302
303 /// Map from runtime function kind to the runtime function description.
304 EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
305 RuntimeFunction::OMPRTL___last>
306 RFIs;
307
308 /// Map from function declarations/definitions to their runtime enum type.
309 DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
310
311 /// Map from ICV kind to the ICV description.
312 EnumeratedArray<InternalControlVarInfo, InternalControlVar,
313 InternalControlVar::ICV___last>
314 ICVs;
315
316 /// Helper to initialize all internal control variable information for those
317 /// defined in OMPKinds.def.
318 void initializeInternalControlVars() {
319#define ICV_RT_SET(_Name, RTL) \
320 { \
321 auto &ICV = ICVs[_Name]; \
322 ICV.Setter = RTL; \
323 }
324#define ICV_RT_GET(Name, RTL) \
325 { \
326 auto &ICV = ICVs[Name]; \
327 ICV.Getter = RTL; \
328 }
329#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
330 { \
331 auto &ICV = ICVs[Enum]; \
332 ICV.Name = _Name; \
333 ICV.Kind = Enum; \
334 ICV.InitKind = Init; \
335 ICV.EnvVarName = _EnvVarName; \
336 switch (ICV.InitKind) { \
337 case ICV_IMPLEMENTATION_DEFINED: \
338 ICV.InitValue = nullptr; \
339 break; \
340 case ICV_ZERO: \
341 ICV.InitValue = ConstantInt::get( \
342 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
343 break; \
344 case ICV_FALSE: \
345 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
346 break; \
347 case ICV_LAST: \
348 break; \
349 } \
350 }
351#include "llvm/Frontend/OpenMP/OMPKinds.def"
352 }
353
354 /// Returns true if the function declaration \p F matches the runtime
355 /// function types, that is, return type \p RTFRetType, and argument types
356 /// \p RTFArgTypes.
357 static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
358 SmallVector<Type *, 8> &RTFArgTypes) {
359 // TODO: We should output information to the user (under debug output
360 // and via remarks).
361
362 if (!F)
363 return false;
364 if (F->getReturnType() != RTFRetType)
365 return false;
366 if (F->arg_size() != RTFArgTypes.size())
367 return false;
368
369 auto *RTFTyIt = RTFArgTypes.begin();
370 for (Argument &Arg : F->args()) {
371 if (Arg.getType() != *RTFTyIt)
372 return false;
373
374 ++RTFTyIt;
375 }
376
377 return true;
378 }
379
380 // Helper to collect all uses of the declaration in the UsesMap.
381 unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
382 unsigned NumUses = 0;
383 if (!RFI.Declaration)
384 return NumUses;
385 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
386
387 if (CollectStats) {
388 NumOpenMPRuntimeFunctionsIdentified += 1;
389 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
390 }
391
392 // TODO: We directly convert uses into proper calls and unknown uses.
393 for (Use &U : RFI.Declaration->uses()) {
394 if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
395 if (ModuleSlice.count(UserI->getFunction())) {
396 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
397 ++NumUses;
398 }
399 } else {
400 RFI.getOrCreateUseVector(nullptr).push_back(&U);
401 ++NumUses;
402 }
403 }
404 return NumUses;
405 }
406
407 // Helper function to recollect uses of a runtime function.
408 void recollectUsesForFunction(RuntimeFunction RTF) {
409 auto &RFI = RFIs[RTF];
410 RFI.clearUsesMap();
411 collectUses(RFI, /*CollectStats*/ false);
412 }
413
414 // Helper function to recollect uses of all runtime functions.
415 void recollectUses() {
416 for (int Idx = 0; Idx < RFIs.size(); ++Idx)
417 recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
418 }
419
420 /// Helper to initialize all runtime function information for those defined
421 /// in OpenMPKinds.def.
422 void initializeRuntimeFunctions() {
423 Module &M = *((*ModuleSlice.begin())->getParent());
424
425 // Helper macros for handling __VA_ARGS__ in OMP_RTL
426#define OMP_TYPE(VarName, ...) \
427 Type *VarName = OMPBuilder.VarName; \
428 (void)VarName;
429
430#define OMP_ARRAY_TYPE(VarName, ...) \
431 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
432 (void)VarName##Ty; \
433 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
434 (void)VarName##PtrTy;
435
436#define OMP_FUNCTION_TYPE(VarName, ...) \
437 FunctionType *VarName = OMPBuilder.VarName; \
438 (void)VarName; \
439 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
440 (void)VarName##Ptr;
441
442#define OMP_STRUCT_TYPE(VarName, ...) \
443 StructType *VarName = OMPBuilder.VarName; \
444 (void)VarName; \
445 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
446 (void)VarName##Ptr;
447
448#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
449 { \
450 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
451 Function *F = M.getFunction(_Name); \
452 RTLFunctions.insert(F); \
453 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
454 RuntimeFunctionIDMap[F] = _Enum; \
455 F->removeFnAttr(Attribute::NoInline); \
456 auto &RFI = RFIs[_Enum]; \
457 RFI.Kind = _Enum; \
458 RFI.Name = _Name; \
459 RFI.IsVarArg = _IsVarArg; \
460 RFI.ReturnType = OMPBuilder._ReturnType; \
461 RFI.ArgumentTypes = std::move(ArgsTypes); \
462 RFI.Declaration = F; \
463 unsigned NumUses = collectUses(RFI); \
464 (void)NumUses; \
465 LLVM_DEBUG({ \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
466 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
467 << " found\n"; \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
468 if (RFI.Declaration) \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
469 dbgs() << TAG << "-> got " << NumUses << " uses in " \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
470 << RFI.getNumFunctionsWithUses() \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
471 << " different functions.\n"; \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
472 })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
; \
473 } \
474 }
475#include "llvm/Frontend/OpenMP/OMPKinds.def"
476
477 // TODO: We should attach the attributes defined in OMPKinds.def.
478 }
479
480 /// Collection of known kernels (\see Kernel) in the module.
481 SmallPtrSetImpl<Kernel> &Kernels;
482
483 /// Collection of known OpenMP runtime functions.
484 DenseSet<const Function *> RTLFunctions;
485};
486
487template <typename Ty, bool InsertInvalidates = true>
488struct BooleanStateWithSetVector : public BooleanState {
489 bool contains(const Ty &Elem) const { return Set.contains(Elem); }
490 bool insert(const Ty &Elem) {
491 if (InsertInvalidates)
492 BooleanState::indicatePessimisticFixpoint();
493 return Set.insert(Elem);
494 }
495
496 const Ty &operator[](int Idx) const { return Set[Idx]; }
497 bool operator==(const BooleanStateWithSetVector &RHS) const {
498 return BooleanState::operator==(RHS) && Set == RHS.Set;
499 }
500 bool operator!=(const BooleanStateWithSetVector &RHS) const {
501 return !(*this == RHS);
502 }
503
504 bool empty() const { return Set.empty(); }
505 size_t size() const { return Set.size(); }
506
507 /// "Clamp" this state with \p RHS.
508 BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
509 BooleanState::operator^=(RHS);
510 Set.insert(RHS.Set.begin(), RHS.Set.end());
511 return *this;
512 }
513
514private:
515 /// A set to keep track of elements.
516 SetVector<Ty> Set;
517
518public:
519 typename decltype(Set)::iterator begin() { return Set.begin(); }
520 typename decltype(Set)::iterator end() { return Set.end(); }
521 typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
522 typename decltype(Set)::const_iterator end() const { return Set.end(); }
523};
524
525template <typename Ty, bool InsertInvalidates = true>
526using BooleanStateWithPtrSetVector =
527 BooleanStateWithSetVector<Ty *, InsertInvalidates>;
528
529struct KernelInfoState : AbstractState {
530 /// Flag to track if we reached a fixpoint.
531 bool IsAtFixpoint = false;
532
533 /// The parallel regions (identified by the outlined parallel functions) that
534 /// can be reached from the associated function.
535 BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
536 ReachedKnownParallelRegions;
537
538 /// State to track what parallel region we might reach.
539 BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
540
541 /// State to track if we are in SPMD-mode, assumed or known, and why we decided
542 /// we cannot be. If it is assumed, then RequiresFullRuntime should also be
543 /// false.
544 BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
545
546 /// The __kmpc_target_init call in this kernel, if any. If we find more than
547 /// one we abort as the kernel is malformed.
548 CallBase *KernelInitCB = nullptr;
549
550 /// The __kmpc_target_deinit call in this kernel, if any. If we find more than
551 /// one we abort as the kernel is malformed.
552 CallBase *KernelDeinitCB = nullptr;
553
554 /// Flag to indicate if the associated function is a kernel entry.
555 bool IsKernelEntry = false;
556
557 /// State to track what kernel entries can reach the associated function.
558 BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
559
560 /// State to indicate if we can track parallel level of the associated
561 /// function. We will give up tracking if we encounter unknown caller or the
562 /// caller is __kmpc_parallel_51.
563 BooleanStateWithSetVector<uint8_t> ParallelLevels;
564
565 /// Abstract State interface
566 ///{
567
568 KernelInfoState() {}
569 KernelInfoState(bool BestState) {
570 if (!BestState)
571 indicatePessimisticFixpoint();
572 }
573
574 /// See AbstractState::isValidState(...)
575 bool isValidState() const override { return true; }
576
577 /// See AbstractState::isAtFixpoint(...)
578 bool isAtFixpoint() const override { return IsAtFixpoint; }
579
580 /// See AbstractState::indicatePessimisticFixpoint(...)
581 ChangeStatus indicatePessimisticFixpoint() override {
582 IsAtFixpoint = true;
583 ReachingKernelEntries.indicatePessimisticFixpoint();
584 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
585 ReachedKnownParallelRegions.indicatePessimisticFixpoint();
586 ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
587 return ChangeStatus::CHANGED;
588 }
589
590 /// See AbstractState::indicateOptimisticFixpoint(...)
591 ChangeStatus indicateOptimisticFixpoint() override {
592 IsAtFixpoint = true;
593 ReachingKernelEntries.indicateOptimisticFixpoint();
594 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
595 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
596 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
597 return ChangeStatus::UNCHANGED;
598 }
599
600 /// Return the assumed state
601 KernelInfoState &getAssumed() { return *this; }
602 const KernelInfoState &getAssumed() const { return *this; }
603
604 bool operator==(const KernelInfoState &RHS) const {
605 if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
606 return false;
607 if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
608 return false;
609 if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
610 return false;
611 if (ReachingKernelEntries != RHS.ReachingKernelEntries)
612 return false;
613 return true;
614 }
615
616 /// Returns true if this kernel contains any OpenMP parallel regions.
617 bool mayContainParallelRegion() {
618 return !ReachedKnownParallelRegions.empty() ||
619 !ReachedUnknownParallelRegions.empty();
620 }
621
622 /// Return empty set as the best state of potential values.
623 static KernelInfoState getBestState() { return KernelInfoState(true); }
624
625 static KernelInfoState getBestState(KernelInfoState &KIS) {
626 return getBestState();
627 }
628
629 /// Return full set as the worst state of potential values.
630 static KernelInfoState getWorstState() { return KernelInfoState(false); }
631
632 /// "Clamp" this state with \p KIS.
633 KernelInfoState operator^=(const KernelInfoState &KIS) {
634 // Do not merge two different _init and _deinit call sites.
635 if (KIS.KernelInitCB) {
636 if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
637 llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "::llvm::llvm_unreachable_internal("Kernel that calls another kernel violates OpenMP-Opt "
"assumptions.", "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 638
)
638 "assumptions.")::llvm::llvm_unreachable_internal("Kernel that calls another kernel violates OpenMP-Opt "
"assumptions.", "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 638
)
;
639 KernelInitCB = KIS.KernelInitCB;
640 }
641 if (KIS.KernelDeinitCB) {
642 if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
643 llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "::llvm::llvm_unreachable_internal("Kernel that calls another kernel violates OpenMP-Opt "
"assumptions.", "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 644
)
644 "assumptions.")::llvm::llvm_unreachable_internal("Kernel that calls another kernel violates OpenMP-Opt "
"assumptions.", "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 644
)
;
645 KernelDeinitCB = KIS.KernelDeinitCB;
646 }
647 SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
648 ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
649 ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
650 return *this;
651 }
652
653 KernelInfoState operator&=(const KernelInfoState &KIS) {
654 return (*this ^= KIS);
655 }
656
657 ///}
658};
659
660/// Used to map the values physically (in the IR) stored in an offload
661/// array, to a vector in memory.
662struct OffloadArray {
663 /// Physical array (in the IR).
664 AllocaInst *Array = nullptr;
665 /// Mapped values.
666 SmallVector<Value *, 8> StoredValues;
667 /// Last stores made in the offload array.
668 SmallVector<StoreInst *, 8> LastAccesses;
669
670 OffloadArray() = default;
671
672 /// Initializes the OffloadArray with the values stored in \p Array before
673 /// instruction \p Before is reached. Returns false if the initialization
674 /// fails.
675 /// This MUST be used immediately after the construction of the object.
676 bool initialize(AllocaInst &Array, Instruction &Before) {
677 if (!Array.getAllocatedType()->isArrayTy())
678 return false;
679
680 if (!getValues(Array, Before))
681 return false;
682
683 this->Array = &Array;
684 return true;
685 }
686
687 static const unsigned DeviceIDArgNum = 1;
688 static const unsigned BasePtrsArgNum = 3;
689 static const unsigned PtrsArgNum = 4;
690 static const unsigned SizesArgNum = 5;
691
692private:
693 /// Traverses the BasicBlock where \p Array is, collecting the stores made to
694 /// \p Array, leaving StoredValues with the values stored before the
695 /// instruction \p Before is reached.
696 bool getValues(AllocaInst &Array, Instruction &Before) {
697 // Initialize container.
698 const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
699 StoredValues.assign(NumValues, nullptr);
700 LastAccesses.assign(NumValues, nullptr);
701
702 // TODO: This assumes the instruction \p Before is in the same
703 // BasicBlock as Array. Make it general, for any control flow graph.
704 BasicBlock *BB = Array.getParent();
705 if (BB != Before.getParent())
706 return false;
707
708 const DataLayout &DL = Array.getModule()->getDataLayout();
709 const unsigned int PointerSize = DL.getPointerSize();
710
711 for (Instruction &I : *BB) {
712 if (&I == &Before)
713 break;
714
715 if (!isa<StoreInst>(&I))
716 continue;
717
718 auto *S = cast<StoreInst>(&I);
719 int64_t Offset = -1;
720 auto *Dst =
721 GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
722 if (Dst == &Array) {
723 int64_t Idx = Offset / PointerSize;
724 StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
725 LastAccesses[Idx] = S;
726 }
727 }
728
729 return isFilled();
730 }
731
732 /// Returns true if all values in StoredValues and
733 /// LastAccesses are not nullptrs.
734 bool isFilled() {
735 const unsigned NumValues = StoredValues.size();
736 for (unsigned I = 0; I < NumValues; ++I) {
737 if (!StoredValues[I] || !LastAccesses[I])
738 return false;
739 }
740
741 return true;
742 }
743};
744
745struct OpenMPOpt {
746
747 using OptimizationRemarkGetter =
748 function_ref<OptimizationRemarkEmitter &(Function *)>;
749
/// Create the optimizer for the functions in \p SCC. All arguments are stored
/// in the corresponding members. The module `M` is taken from the parent of
/// the first function in \p SCC, so \p SCC is dereferenced here and has to
/// contain at least one function.
750 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
751 OptimizationRemarkGetter OREGetter,
752 OMPInformationCache &OMPInfoCache, Attributor &A)
753 : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
754 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
755
756 /// Check if any remarks are enabled for openmp-opt
757 bool remarksEnabled() {
758 auto &Ctx = M.getContext();
759 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE"openmp-opt");
760 }
761
762 /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
763 bool run(bool IsModulePass) {
764 if (SCC.empty())
765 return false;
766
767 bool Changed = false;
768
769 LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Run on SCC with "
<< SCC.size() << " functions in a slice with " <<
OMPInfoCache.ModuleSlice.size() << " functions\n"; } }
while (false)
770 << " functions in a slice with "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Run on SCC with "
<< SCC.size() << " functions in a slice with " <<
OMPInfoCache.ModuleSlice.size() << " functions\n"; } }
while (false)
771 << OMPInfoCache.ModuleSlice.size() << " functions\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Run on SCC with "
<< SCC.size() << " functions in a slice with " <<
OMPInfoCache.ModuleSlice.size() << " functions\n"; } }
while (false)
;
772
773 if (IsModulePass) {
774 Changed |= runAttributor(IsModulePass);
775
776 // Recollect uses, in case Attributor deleted any.
777 OMPInfoCache.recollectUses();
778
779 // TODO: This should be folded into buildCustomStateMachine.
780 Changed |= rewriteDeviceCodeStateMachine();
781
782 if (remarksEnabled())
783 analysisGlobalization();
784 } else {
785 if (PrintICVValues)
786 printICVs();
787 if (PrintOpenMPKernels)
788 printKernels();
789
790 Changed |= runAttributor(IsModulePass);
791
792 // Recollect uses, in case Attributor deleted any.
793 OMPInfoCache.recollectUses();
794
795 Changed |= deleteParallelRegions();
796
797 if (HideMemoryTransferLatency)
798 Changed |= hideMemTransfersLatency();
799 Changed |= deduplicateRuntimeCalls();
800 if (EnableParallelRegionMerging) {
801 if (mergeParallelRegions()) {
802 deduplicateRuntimeCalls();
803 Changed = true;
804 }
805 }
806 }
807
808 return Changed;
809 }
810
811 /// Print initial ICV values for testing.
812 /// FIXME: This should be done from the Attributor once it is added.
813 void printICVs() const {
814 InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
815 ICV_proc_bind};
816
817 for (Function *F : OMPInfoCache.ModuleSlice) {
818 for (auto ICV : ICVs) {
819 auto ICVInfo = OMPInfoCache.ICVs[ICV];
820 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
821 return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
822 << " Value: "
823 << (ICVInfo.InitValue
824 ? toString(ICVInfo.InitValue->getValue(), 10, true)
825 : "IMPLEMENTATION_DEFINED");
826 };
827
828 emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);
829 }
830 }
831 }
832
833 /// Print OpenMP GPU kernels for testing.
834 void printKernels() const {
835 for (Function *F : SCC) {
836 if (!OMPInfoCache.Kernels.count(F))
837 continue;
838
839 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
840 return ORA << "OpenMP GPU kernel "
841 << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
842 };
843
844 emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);
845 }
846 }
847
848 /// Return the call if \p U is a callee use in a regular call. If \p RFI is
849 /// given it has to be the callee or a nullptr is returned.
850 static CallInst *getCallIfRegularCall(
851 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
852 CallInst *CI = dyn_cast<CallInst>(U.getUser());
853 if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
854 (!RFI ||
855 (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
856 return CI;
857 return nullptr;
858 }
859
860 /// Return the call if \p V is a regular call. If \p RFI is given it has to be
861 /// the callee or a nullptr is returned.
862 static CallInst *getCallIfRegularCall(
863 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
864 CallInst *CI = dyn_cast<CallInst>(&V);
865 if (CI && !CI->hasOperandBundles() &&
866 (!RFI ||
867 (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
868 return CI;
869 return nullptr;
870 }
871
872private:
873 /// Merge parallel regions when it is safe.
874 bool mergeParallelRegions() {
875 const unsigned CallbackCalleeOperand = 2;
876 const unsigned CallbackFirstArgOperand = 3;
877 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
878
879 // Check if there are any __kmpc_fork_call calls to merge.
880 OMPInformationCache::RuntimeFunctionInfo &RFI =
881 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
882
883 if (!RFI.Declaration)
884 return false;
885
886 // Unmergable calls that prevent merging a parallel region.
887 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
888 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
889 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
890 };
891
892 bool Changed = false;
893 LoopInfo *LI = nullptr;
894 DominatorTree *DT = nullptr;
895
896 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
897
898 BasicBlock *StartBB = nullptr, *EndBB = nullptr;
899 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
900 BasicBlock &ContinuationIP) {
901 BasicBlock *CGStartBB = CodeGenIP.getBlock();
902 BasicBlock *CGEndBB =
903 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
904 assert(StartBB != nullptr && "StartBB should not be null")(static_cast <bool> (StartBB != nullptr && "StartBB should not be null"
) ? void (0) : __assert_fail ("StartBB != nullptr && \"StartBB should not be null\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 904, __extension__
__PRETTY_FUNCTION__))
;
905 CGStartBB->getTerminator()->setSuccessor(0, StartBB);
906 assert(EndBB != nullptr && "EndBB should not be null")(static_cast <bool> (EndBB != nullptr && "EndBB should not be null"
) ? void (0) : __assert_fail ("EndBB != nullptr && \"EndBB should not be null\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 906, __extension__
__PRETTY_FUNCTION__))
;
907 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
908 };
909
910 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
911 Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
912 ReplacementValue = &Inner;
913 return CodeGenIP;
914 };
915
916 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
917
918 /// Create a sequential execution region within a merged parallel region,
919 /// encapsulated in a master construct with a barrier for synchronization.
920 auto CreateSequentialRegion = [&](Function *OuterFn,
921 BasicBlock *OuterPredBB,
922 Instruction *SeqStartI,
923 Instruction *SeqEndI) {
924 // Isolate the instructions of the sequential region to a separate
925 // block.
926 BasicBlock *ParentBB = SeqStartI->getParent();
927 BasicBlock *SeqEndBB =
928 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
929 BasicBlock *SeqAfterBB =
930 SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
931 BasicBlock *SeqStartBB =
932 SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
933
934 assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&(static_cast <bool> (ParentBB->getUniqueSuccessor() ==
SeqStartBB && "Expected a different CFG") ? void (0)
: __assert_fail ("ParentBB->getUniqueSuccessor() == SeqStartBB && \"Expected a different CFG\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 935, __extension__
__PRETTY_FUNCTION__))
935 "Expected a different CFG")(static_cast <bool> (ParentBB->getUniqueSuccessor() ==
SeqStartBB && "Expected a different CFG") ? void (0)
: __assert_fail ("ParentBB->getUniqueSuccessor() == SeqStartBB && \"Expected a different CFG\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 935, __extension__
__PRETTY_FUNCTION__))
;
936 const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
937 ParentBB->getTerminator()->eraseFromParent();
938
939 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
940 BasicBlock &ContinuationIP) {
941 BasicBlock *CGStartBB = CodeGenIP.getBlock();
942 BasicBlock *CGEndBB =
943 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
944 assert(SeqStartBB != nullptr && "SeqStartBB should not be null")(static_cast <bool> (SeqStartBB != nullptr && "SeqStartBB should not be null"
) ? void (0) : __assert_fail ("SeqStartBB != nullptr && \"SeqStartBB should not be null\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 944, __extension__
__PRETTY_FUNCTION__))
;
945 CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
946 assert(SeqEndBB != nullptr && "SeqEndBB should not be null")(static_cast <bool> (SeqEndBB != nullptr && "SeqEndBB should not be null"
) ? void (0) : __assert_fail ("SeqEndBB != nullptr && \"SeqEndBB should not be null\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 946, __extension__
__PRETTY_FUNCTION__))
;
947 SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
948 };
949 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
950
951 // Find outputs from the sequential region to outside users and
952 // broadcast their values to them.
953 for (Instruction &I : *SeqStartBB) {
954 SmallPtrSet<Instruction *, 4> OutsideUsers;
955 for (User *Usr : I.users()) {
956 Instruction &UsrI = *cast<Instruction>(Usr);
957 // Ignore outputs to LT intrinsics, code extraction for the merged
958 // parallel region will fix them.
959 if (UsrI.isLifetimeStartOrEnd())
960 continue;
961
962 if (UsrI.getParent() != SeqStartBB)
963 OutsideUsers.insert(&UsrI);
964 }
965
966 if (OutsideUsers.empty())
967 continue;
968
969 // Emit an alloca in the outer region to store the broadcasted
970 // value.
971 const DataLayout &DL = M.getDataLayout();
972 AllocaInst *AllocaI = new AllocaInst(
973 I.getType(), DL.getAllocaAddrSpace(), nullptr,
974 I.getName() + ".seq.output.alloc", &OuterFn->front().front());
975
976 // Emit a store instruction in the sequential BB to update the
977 // value.
978 new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
979
980 // Emit a load instruction and replace the use of the output value
981 // with it.
982 for (Instruction *UsrI : OutsideUsers) {
983 LoadInst *LoadI = new LoadInst(
984 I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI);
985 UsrI->replaceUsesOfWith(&I, LoadI);
986 }
987 }
988
989 OpenMPIRBuilder::LocationDescription Loc(
990 InsertPointTy(ParentBB, ParentBB->end()), DL);
991 InsertPointTy SeqAfterIP =
992 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
993
994 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
995
996 BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
997
998 LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFndo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "After sequential inlining "
<< *OuterFn << "\n"; } } while (false)
999 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "After sequential inlining "
<< *OuterFn << "\n"; } } while (false)
;
1000 };
1001
1002 // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
1003 // contained in BB and only separated by instructions that can be
1004 // redundantly executed in parallel. The block BB is split before the first
1005 // call (in MergableCIs) and after the last so the entire region we merge
1006 // into a single parallel region is contained in a single basic block
1007 // without any other instructions. We use the OpenMPIRBuilder to outline
1008 // that block and call the resulting function via __kmpc_fork_call.
1009 auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
1010 // TODO: Change the interface to allow single CIs expanded, e.g, to
1011 // include an outer loop.
1012 assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs")(static_cast <bool> (MergableCIs.size() > 1 &&
"Assumed multiple mergable CIs") ? void (0) : __assert_fail (
"MergableCIs.size() > 1 && \"Assumed multiple mergable CIs\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1012, __extension__
__PRETTY_FUNCTION__))
;
1013
1014 auto Remark = [&](OptimizationRemark OR) {
1015 OR << "Parallel region merged with parallel region"
1016 << (MergableCIs.size() > 2 ? "s" : "") << " at ";
1017 for (auto *CI : llvm::drop_begin(MergableCIs)) {
1018 OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
1019 if (CI != MergableCIs.back())
1020 OR << ", ";
1021 }
1022 return OR << ".";
1023 };
1024
1025 emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);
1026
1027 Function *OriginalFn = BB->getParent();
1028 LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Merge " <<
MergableCIs.size() << " parallel regions in " <<
OriginalFn->getName() << "\n"; } } while (false)
1029 << " parallel regions in " << OriginalFn->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Merge " <<
MergableCIs.size() << " parallel regions in " <<
OriginalFn->getName() << "\n"; } } while (false)
1030 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Merge " <<
MergableCIs.size() << " parallel regions in " <<
OriginalFn->getName() << "\n"; } } while (false)
;
1031
1032 // Isolate the calls to merge in a separate block.
1033 EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
1034 BasicBlock *AfterBB =
1035 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1036 StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
1037 "omp.par.merged");
1038
1039 assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG")(static_cast <bool> (BB->getUniqueSuccessor() == StartBB
&& "Expected a different CFG") ? void (0) : __assert_fail
("BB->getUniqueSuccessor() == StartBB && \"Expected a different CFG\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1039, __extension__
__PRETTY_FUNCTION__))
;
1040 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1041 BB->getTerminator()->eraseFromParent();
1042
1043 // Create sequential regions for sequential instructions that are
1044 // in-between mergable parallel regions.
1045 for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
1046 It != End; ++It) {
1047 Instruction *ForkCI = *It;
1048 Instruction *NextForkCI = *(It + 1);
1049
1050 // Continue if there are not in-between instructions.
1051 if (ForkCI->getNextNode() == NextForkCI)
1052 continue;
1053
1054 CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
1055 NextForkCI->getPrevNode());
1056 }
1057
1058 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
1059 DL);
1060 IRBuilder<>::InsertPoint AllocaIP(
1061 &OriginalFn->getEntryBlock(),
1062 OriginalFn->getEntryBlock().getFirstInsertionPt());
1063 // Create the merged parallel region with default proc binding, to
1064 // avoid overriding binding settings, and without explicit cancellation.
1065 InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
1066 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
1067 OMP_PROC_BIND_default, /* IsCancellable */ false);
1068 BranchInst::Create(AfterBB, AfterIP.getBlock());
1069
1070 // Perform the actual outlining.
1071 OMPInfoCache.OMPBuilder.finalize(OriginalFn,
1072 /* AllowExtractorSinking */ true);
1073
1074 Function *OutlinedFn = MergableCIs.front()->getCaller();
1075
1076 // Replace the __kmpc_fork_call calls with direct calls to the outlined
1077 // callbacks.
1078 SmallVector<Value *, 8> Args;
1079 for (auto *CI : MergableCIs) {
1080 Value *Callee =
1081 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
1082 FunctionType *FT =
1083 cast<FunctionType>(Callee->getType()->getPointerElementType());
1084 Args.clear();
1085 Args.push_back(OutlinedFn->getArg(0));
1086 Args.push_back(OutlinedFn->getArg(1));
1087 for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1088 ++U)
1089 Args.push_back(CI->getArgOperand(U));
1090
1091 CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
1092 if (CI->getDebugLoc())
1093 NewCI->setDebugLoc(CI->getDebugLoc());
1094
1095 // Forward parameter attributes from the callback to the callee.
1096 for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1097 ++U)
1098 for (const Attribute &A : CI->getAttributes().getParamAttrs(U))
1099 NewCI->addParamAttr(
1100 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1101
1102 // Emit an explicit barrier to replace the implicit fork-join barrier.
1103 if (CI != MergableCIs.back()) {
1104 // TODO: Remove barrier if the merged parallel region includes the
1105 // 'nowait' clause.
1106 OMPInfoCache.OMPBuilder.createBarrier(
1107 InsertPointTy(NewCI->getParent(),
1108 NewCI->getNextNode()->getIterator()),
1109 OMPD_parallel);
1110 }
1111
1112 CI->eraseFromParent();
1113 }
1114
1115 assert(OutlinedFn != OriginalFn && "Outlining failed")(static_cast <bool> (OutlinedFn != OriginalFn &&
"Outlining failed") ? void (0) : __assert_fail ("OutlinedFn != OriginalFn && \"Outlining failed\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1115, __extension__
__PRETTY_FUNCTION__))
;
1116 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1117 CGUpdater.reanalyzeFunction(*OriginalFn);
1118
1119 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1120
1121 return true;
1122 };
1123
1124 // Helper function that identifes sequences of
1125 // __kmpc_fork_call uses in a basic block.
1126 auto DetectPRsCB = [&](Use &U, Function &F) {
1127 CallInst *CI = getCallIfRegularCall(U, &RFI);
1128 BB2PRMap[CI->getParent()].insert(CI);
1129
1130 return false;
1131 };
1132
1133 BB2PRMap.clear();
1134 RFI.foreachUse(SCC, DetectPRsCB);
1135 SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
1136 // Find mergable parallel regions within a basic block that are
1137 // safe to merge, that is any in-between instructions can safely
1138 // execute in parallel after merging.
1139 // TODO: support merging across basic-blocks.
1140 for (auto &It : BB2PRMap) {
1141 auto &CIs = It.getSecond();
1142 if (CIs.size() < 2)
1143 continue;
1144
1145 BasicBlock *BB = It.getFirst();
1146 SmallVector<CallInst *, 4> MergableCIs;
1147
1148 /// Returns true if the instruction is mergable, false otherwise.
1149 /// A terminator instruction is unmergable by definition since merging
1150 /// works within a BB. Instructions before the mergable region are
1151 /// mergable if they are not calls to OpenMP runtime functions that may
1152 /// set different execution parameters for subsequent parallel regions.
1153 /// Instructions in-between parallel regions are mergable if they are not
1154 /// calls to any non-intrinsic function since that may call a non-mergable
1155 /// OpenMP runtime function.
1156 auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
1157 // We do not merge across BBs, hence return false (unmergable) if the
1158 // instruction is a terminator.
1159 if (I.isTerminator())
1160 return false;
1161
1162 if (!isa<CallInst>(&I))
1163 return true;
1164
1165 CallInst *CI = cast<CallInst>(&I);
1166 if (IsBeforeMergableRegion) {
1167 Function *CalledFunction = CI->getCalledFunction();
1168 if (!CalledFunction)
1169 return false;
1170 // Return false (unmergable) if the call before the parallel
1171 // region calls an explicit affinity (proc_bind) or number of
1172 // threads (num_threads) compiler-generated function. Those settings
1173 // may be incompatible with following parallel regions.
1174 // TODO: ICV tracking to detect compatibility.
1175 for (const auto &RFI : UnmergableCallsInfo) {
1176 if (CalledFunction == RFI.Declaration)
1177 return false;
1178 }
1179 } else {
1180 // Return false (unmergable) if there is a call instruction
1181 // in-between parallel regions when it is not an intrinsic. It
1182 // may call an unmergable OpenMP runtime function in its callpath.
1183 // TODO: Keep track of possible OpenMP calls in the callpath.
1184 if (!isa<IntrinsicInst>(CI))
1185 return false;
1186 }
1187
1188 return true;
1189 };
1190 // Find maximal number of parallel region CIs that are safe to merge.
1191 for (auto It = BB->begin(), End = BB->end(); It != End;) {
1192 Instruction &I = *It;
1193 ++It;
1194
1195 if (CIs.count(&I)) {
1196 MergableCIs.push_back(cast<CallInst>(&I));
1197 continue;
1198 }
1199
1200 // Continue expanding if the instruction is mergable.
1201 if (IsMergable(I, MergableCIs.empty()))
1202 continue;
1203
1204 // Forward the instruction iterator to skip the next parallel region
1205 // since there is an unmergable instruction which can affect it.
1206 for (; It != End; ++It) {
1207 Instruction &SkipI = *It;
1208 if (CIs.count(&SkipI)) {
1209 LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Skip parallel region "
<< SkipI << " due to " << I << "\n";
} } while (false)
1210 << " due to " << I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Skip parallel region "
<< SkipI << " due to " << I << "\n";
} } while (false)
;
1211 ++It;
1212 break;
1213 }
1214 }
1215
1216 // Store mergable regions found.
1217 if (MergableCIs.size() > 1) {
1218 MergableCIsVector.push_back(MergableCIs);
1219 LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
MergableCIs.size() << " parallel regions in block " <<
BB->getName() << " of function " << BB->getParent
()->getName() << "\n";; } } while (false)
1220 << " parallel regions in block " << BB->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
MergableCIs.size() << " parallel regions in block " <<
BB->getName() << " of function " << BB->getParent
()->getName() << "\n";; } } while (false)
1221 << " of function " << BB->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
MergableCIs.size() << " parallel regions in block " <<
BB->getName() << " of function " << BB->getParent
()->getName() << "\n";; } } while (false)
1222 << "\n";)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
MergableCIs.size() << " parallel regions in block " <<
BB->getName() << " of function " << BB->getParent
()->getName() << "\n";; } } while (false)
;
1223 }
1224
1225 MergableCIs.clear();
1226 }
1227
1228 if (!MergableCIsVector.empty()) {
1229 Changed = true;
1230
1231 for (auto &MergableCIs : MergableCIsVector)
1232 Merge(MergableCIs, BB);
1233 MergableCIsVector.clear();
1234 }
1235 }
1236
1237 if (Changed) {
1238 /// Re-collect use for fork calls, emitted barrier calls, and
1239 /// any emitted master/end_master calls.
1240 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1241 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1242 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1243 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1244 }
1245
1246 return Changed;
1247 }
1248
1249 /// Try to delete parallel regions if possible.
1250 bool deleteParallelRegions() {
1251 const unsigned CallbackCalleeOperand = 2;
1252
1253 OMPInformationCache::RuntimeFunctionInfo &RFI =
1254 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1255
1256 if (!RFI.Declaration)
1257 return false;
1258
1259 bool Changed = false;
1260 auto DeleteCallCB = [&](Use &U, Function &) {
1261 CallInst *CI = getCallIfRegularCall(U);
1262 if (!CI)
1263 return false;
1264 auto *Fn = dyn_cast<Function>(
1265 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
1266 if (!Fn)
1267 return false;
1268 if (!Fn->onlyReadsMemory())
1269 return false;
1270 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1271 return false;
1272
1273 LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Delete read-only parallel region in "
<< CI->getCaller()->getName() << "\n"; } }
while (false)
1274 << CI->getCaller()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Delete read-only parallel region in "
<< CI->getCaller()->getName() << "\n"; } }
while (false)
;
1275
1276 auto Remark = [&](OptimizationRemark OR) {
1277 return OR << "Removing parallel region with no side-effects.";
1278 };
1279 emitRemark<OptimizationRemark>(CI, "OMP160", Remark);
1280
1281 CGUpdater.removeCallSite(*CI);
1282 CI->eraseFromParent();
1283 Changed = true;
1284 ++NumOpenMPParallelRegionsDeleted;
1285 return true;
1286 };
1287
1288 RFI.foreachUse(SCC, DeleteCallCB);
1289
1290 return Changed;
1291 }
1292
1293 /// Try to eliminate runtime calls by reusing existing ones.
1294 bool deduplicateRuntimeCalls() {
1295 bool Changed = false;
1296
1297 RuntimeFunction DeduplicableRuntimeCallIDs[] = {
1298 OMPRTL_omp_get_num_threads,
1299 OMPRTL_omp_in_parallel,
1300 OMPRTL_omp_get_cancellation,
1301 OMPRTL_omp_get_thread_limit,
1302 OMPRTL_omp_get_supported_active_levels,
1303 OMPRTL_omp_get_level,
1304 OMPRTL_omp_get_ancestor_thread_num,
1305 OMPRTL_omp_get_team_size,
1306 OMPRTL_omp_get_active_level,
1307 OMPRTL_omp_in_final,
1308 OMPRTL_omp_get_proc_bind,
1309 OMPRTL_omp_get_num_places,
1310 OMPRTL_omp_get_num_procs,
1311 OMPRTL_omp_get_place_num,
1312 OMPRTL_omp_get_partition_num_places,
1313 OMPRTL_omp_get_partition_place_nums};
1314
1315 // Global-tid is handled separately.
1316 SmallSetVector<Value *, 16> GTIdArgs;
1317 collectGlobalThreadIdArguments(GTIdArgs);
1318 LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
GTIdArgs.size() << " global thread ID arguments\n"; } }
while (false)
1319 << " global thread ID arguments\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
GTIdArgs.size() << " global thread ID arguments\n"; } }
while (false)
;
1320
1321 for (Function *F : SCC) {
1322 for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1323 Changed |= deduplicateRuntimeCalls(
1324 *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1325
1326 // __kmpc_global_thread_num is special as we can replace it with an
1327 // argument in enough cases to make it worth trying.
1328 Value *GTIdArg = nullptr;
1329 for (Argument &Arg : F->args())
1330 if (GTIdArgs.count(&Arg)) {
1331 GTIdArg = &Arg;
1332 break;
1333 }
1334 Changed |= deduplicateRuntimeCalls(
1335 *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1336 }
1337
1338 return Changed;
1339 }
1340
1341 /// Tries to hide the latency of runtime calls that involve host to
1342 /// device memory transfers by splitting them into their "issue" and "wait"
1343 /// versions. The "issue" is moved upwards as much as possible. The "wait" is
1344 /// moved downards as much as possible. The "issue" issues the memory transfer
1345 /// asynchronously, returning a handle. The "wait" waits in the returned
1346 /// handle for the memory transfer to finish.
1347 bool hideMemTransfersLatency() {
1348 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1349 bool Changed = false;
1350 auto SplitMemTransfers = [&](Use &U, Function &Decl) {
1351 auto *RTCall = getCallIfRegularCall(U, &RFI);
1352 if (!RTCall)
1353 return false;
1354
1355 OffloadArray OffloadArrays[3];
1356 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1357 return false;
1358
1359 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dumpValuesInOffloadArrays(OffloadArrays); }
} while (false)
;
1360
1361 // TODO: Check if can be moved upwards.
1362 bool WasSplit = false;
1363 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1364 if (WaitMovementPoint)
1365 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1366
1367 Changed |= WasSplit;
1368 return WasSplit;
1369 };
1370 RFI.foreachUse(SCC, SplitMemTransfers);
1371
1372 return Changed;
1373 }
1374
1375 void analysisGlobalization() {
1376 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1377
1378 auto CheckGlobalization = [&](Use &U, Function &Decl) {
1379 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1380 auto Remark = [&](OptimizationRemarkMissed ORM) {
1381 return ORM
1382 << "Found thread data sharing on the GPU. "
1383 << "Expect degraded performance due to data globalization.";
1384 };
1385 emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);
1386 }
1387
1388 return false;
1389 };
1390
1391 RFI.foreachUse(SCC, CheckGlobalization);
1392 }
1393
1394 /// Maps the values stored in the offload arrays passed as arguments to
1395 /// \p RuntimeCall into the offload arrays in \p OAs.
1396 bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1397 MutableArrayRef<OffloadArray> OAs) {
1398 assert(OAs.size() == 3 && "Need space for three offload arrays!")(static_cast <bool> (OAs.size() == 3 && "Need space for three offload arrays!"
) ? void (0) : __assert_fail ("OAs.size() == 3 && \"Need space for three offload arrays!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1398, __extension__
__PRETTY_FUNCTION__))
;
1399
1400 // A runtime call that involves memory offloading looks something like:
1401 // call void @__tgt_target_data_begin_mapper(arg0, arg1,
1402 // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
1403 // ...)
1404 // So, the idea is to access the allocas that allocate space for these
1405 // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
1406 // Therefore:
1407 // i8** %offload_baseptrs.
1408 Value *BasePtrsArg =
1409 RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
1410 // i8** %offload_ptrs.
1411 Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
1412 // i8** %offload_sizes.
1413 Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
1414
1415 // Get values stored in **offload_baseptrs.
1416 auto *V = getUnderlyingObject(BasePtrsArg);
1417 if (!isa<AllocaInst>(V))
1418 return false;
1419 auto *BasePtrsArray = cast<AllocaInst>(V);
1420 if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
1421 return false;
1422
1423 // Get values stored in **offload_baseptrs.
1424 V = getUnderlyingObject(PtrsArg);
1425 if (!isa<AllocaInst>(V))
1426 return false;
1427 auto *PtrsArray = cast<AllocaInst>(V);
1428 if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
1429 return false;
1430
1431 // Get values stored in **offload_sizes.
1432 V = getUnderlyingObject(SizesArg);
1433 // If it's a [constant] global array don't analyze it.
1434 if (isa<GlobalValue>(V))
1435 return isa<Constant>(V);
1436 if (!isa<AllocaInst>(V))
1437 return false;
1438
1439 auto *SizesArray = cast<AllocaInst>(V);
1440 if (!OAs[2].initialize(*SizesArray, RuntimeCall))
1441 return false;
1442
1443 return true;
1444 }
1445
1446 /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
1447 /// For now this is a way to test that the function getValuesInOffloadArrays
1448 /// is working properly.
1449 /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
1450 void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
1451 assert(OAs.size() == 3 && "There are three offload arrays to debug!")(static_cast <bool> (OAs.size() == 3 && "There are three offload arrays to debug!"
) ? void (0) : __assert_fail ("OAs.size() == 3 && \"There are three offload arrays to debug!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1451, __extension__
__PRETTY_FUNCTION__))
;
1452
1453 LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << " Successfully got offload values:\n"
; } } while (false)
;
1454 std::string ValuesStr;
1455 raw_string_ostream Printer(ValuesStr);
1456 std::string Separator = " --- ";
1457
1458 for (auto *BP : OAs[0].StoredValues) {
1459 BP->print(Printer);
1460 Printer << Separator;
1461 }
1462 LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "\t\toffload_baseptrs: " <<
Printer.str() << "\n"; } } while (false)
;
1463 ValuesStr.clear();
1464
1465 for (auto *P : OAs[1].StoredValues) {
1466 P->print(Printer);
1467 Printer << Separator;
1468 }
1469 LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "\t\toffload_ptrs: " <<
Printer.str() << "\n"; } } while (false)
;
1470 ValuesStr.clear();
1471
1472 for (auto *S : OAs[2].StoredValues) {
1473 S->print(Printer);
1474 Printer << Separator;
1475 }
1476 LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "\t\toffload_sizes: " <<
Printer.str() << "\n"; } } while (false)
;
1477 }
1478
1479 /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
1480 /// moved. Returns nullptr if the movement is not possible, or not worth it.
1481 Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1482 // FIXME: This traverses only the BasicBlock where RuntimeCall is.
1483 // Make it traverse the CFG.
1484
1485 Instruction *CurrentI = &RuntimeCall;
1486 bool IsWorthIt = false;
1487 while ((CurrentI = CurrentI->getNextNode())) {
1488
1489 // TODO: Once we detect the regions to be offloaded we should use the
1490 // alias analysis manager to check if CurrentI may modify one of
1491 // the offloaded regions.
1492 if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
1493 if (IsWorthIt)
1494 return CurrentI;
1495
1496 return nullptr;
1497 }
1498
1499 // FIXME: For now if we move it over anything without side effect
1500 // is worth it.
1501 IsWorthIt = true;
1502 }
1503
1504 // Return end of BasicBlock.
1505 return RuntimeCall.getParent()->getTerminator();
1506 }
1507
1508 /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
1509 bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1510 Instruction &WaitMovementPoint) {
1511 // Create stack allocated handle (__tgt_async_info) at the beginning of the
1512 // function. Used for storing information of the async transfer, allowing to
1513 // wait on it later.
1514 auto &IRBuilder = OMPInfoCache.OMPBuilder;
1515 auto *F = RuntimeCall.getCaller();
1516 Instruction *FirstInst = &(F->getEntryBlock().front());
1517 AllocaInst *Handle = new AllocaInst(
1518 IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);
1519
1520 // Add "issue" runtime call declaration:
1521 // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
1522 // i8**, i8**, i64*, i64*)
1523 FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
1524 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1525
1526 // Change RuntimeCall call site for its asynchronous version.
1527 SmallVector<Value *, 16> Args;
1528 for (auto &Arg : RuntimeCall.args())
1529 Args.push_back(Arg.get());
1530 Args.push_back(Handle);
1531
1532 CallInst *IssueCallsite =
1533 CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
1534 RuntimeCall.eraseFromParent();
1535
1536 // Add "wait" runtime call declaration:
1537 // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
1538 FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
1539 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1540
1541 Value *WaitParams[2] = {
1542 IssueCallsite->getArgOperand(
1543 OffloadArray::DeviceIDArgNum), // device_id.
1544 Handle // handle to wait on.
1545 };
1546 CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
1547
1548 return true;
1549 }
1550
1551 static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
1552 bool GlobalOnly, bool &SingleChoice) {
1553 if (CurrentIdent == NextIdent)
1554 return CurrentIdent;
1555
1556 // TODO: Figure out how to actually combine multiple debug locations. For
1557 // now we just keep an existing one if there is a single choice.
1558 if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
1559 SingleChoice = !CurrentIdent;
1560 return NextIdent;
1561 }
1562 return nullptr;
1563 }
1564
1565 /// Return an `struct ident_t*` value that represents the ones used in the
1566 /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
1567 /// return a local `struct ident_t*`. For now, if we cannot find a suitable
1568 /// return value we create one from scratch. We also do not yet combine
1569 /// information, e.g., the source locations, see combinedIdentStruct.
1570 Value *
1571 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1572 Function &F, bool GlobalOnly) {
1573 bool SingleChoice = true;
1574 Value *Ident = nullptr;
1575 auto CombineIdentStruct = [&](Use &U, Function &Caller) {
1576 CallInst *CI = getCallIfRegularCall(U, &RFI);
1577 if (!CI || &F != &Caller)
1578 return false;
1579 Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
1580 /* GlobalOnly */ true, SingleChoice);
1581 return false;
1582 };
1583 RFI.foreachUse(SCC, CombineIdentStruct);
1584
1585 if (!Ident || !SingleChoice) {
1586 // The IRBuilder uses the insertion block to get to the module, this is
1587 // unfortunate but we work around it for now.
1588 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1589 OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
1590 &F.getEntryBlock(), F.getEntryBlock().begin()));
1591 // Create a fallback location if non was found.
1592 // TODO: Use the debug locations of the calls instead.
1593 uint32_t SrcLocStrSize;
1594 Constant *Loc =
1595 OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1596 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
1597 }
1598 return Ident;
1599 }
1600
1601 /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
1602 /// \p ReplVal if given.
1603 bool deduplicateRuntimeCalls(Function &F,
1604 OMPInformationCache::RuntimeFunctionInfo &RFI,
1605 Value *ReplVal = nullptr) {
1606 auto *UV = RFI.getUseVector(F);
1607 if (!UV || UV->size() + (ReplVal != nullptr) < 2)
1608 return false;
1609
1610 LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Deduplicate "
<< UV->size() << " uses of " << RFI.Name
<< (ReplVal ? " with an existing value\n" : "\n") <<
"\n"; } } while (false)
1611 dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Namedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Deduplicate "
<< UV->size() << " uses of " << RFI.Name
<< (ReplVal ? " with an existing value\n" : "\n") <<
"\n"; } } while (false)
1612 << (ReplVal ? " with an existing value\n" : "\n") << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Deduplicate "
<< UV->size() << " uses of " << RFI.Name
<< (ReplVal ? " with an existing value\n" : "\n") <<
"\n"; } } while (false)
;
1613
1614 assert((!ReplVal || (isa<Argument>(ReplVal) &&(static_cast <bool> ((!ReplVal || (isa<Argument>(
ReplVal) && cast<Argument>(ReplVal)->getParent
() == &F)) && "Unexpected replacement value!") ? void
(0) : __assert_fail ("(!ReplVal || (isa<Argument>(ReplVal) && cast<Argument>(ReplVal)->getParent() == &F)) && \"Unexpected replacement value!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1616, __extension__
__PRETTY_FUNCTION__))
1615 cast<Argument>(ReplVal)->getParent() == &F)) &&(static_cast <bool> ((!ReplVal || (isa<Argument>(
ReplVal) && cast<Argument>(ReplVal)->getParent
() == &F)) && "Unexpected replacement value!") ? void
(0) : __assert_fail ("(!ReplVal || (isa<Argument>(ReplVal) && cast<Argument>(ReplVal)->getParent() == &F)) && \"Unexpected replacement value!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1616, __extension__
__PRETTY_FUNCTION__))
1616 "Unexpected replacement value!")(static_cast <bool> ((!ReplVal || (isa<Argument>(
ReplVal) && cast<Argument>(ReplVal)->getParent
() == &F)) && "Unexpected replacement value!") ? void
(0) : __assert_fail ("(!ReplVal || (isa<Argument>(ReplVal) && cast<Argument>(ReplVal)->getParent() == &F)) && \"Unexpected replacement value!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1616, __extension__
__PRETTY_FUNCTION__))
;
1617
1618 // TODO: Use dominance to find a good position instead.
1619 auto CanBeMoved = [this](CallBase &CB) {
1620 unsigned NumArgs = CB.arg_size();
1621 if (NumArgs == 0)
1622 return true;
1623 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1624 return false;
1625 for (unsigned U = 1; U < NumArgs; ++U)
1626 if (isa<Instruction>(CB.getArgOperand(U)))
1627 return false;
1628 return true;
1629 };
1630
1631 if (!ReplVal) {
1632 for (Use *U : *UV)
1633 if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1634 if (!CanBeMoved(*CI))
1635 continue;
1636
1637 // If the function is a kernel, dedup will move
1638 // the runtime call right after the kernel init callsite. Otherwise,
1639 // it will move it to the beginning of the caller function.
1640 if (isKernel(F)) {
1641 auto &KernelInitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
1642 auto *KernelInitUV = KernelInitRFI.getUseVector(F);
1643
1644 if (KernelInitUV->empty())
1645 continue;
1646
1647 assert(KernelInitUV->size() == 1 &&(static_cast <bool> (KernelInitUV->size() == 1 &&
"Expected a single __kmpc_target_init in kernel\n") ? void (
0) : __assert_fail ("KernelInitUV->size() == 1 && \"Expected a single __kmpc_target_init in kernel\\n\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1648, __extension__
__PRETTY_FUNCTION__))
1648 "Expected a single __kmpc_target_init in kernel\n")(static_cast <bool> (KernelInitUV->size() == 1 &&
"Expected a single __kmpc_target_init in kernel\n") ? void (
0) : __assert_fail ("KernelInitUV->size() == 1 && \"Expected a single __kmpc_target_init in kernel\\n\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1648, __extension__
__PRETTY_FUNCTION__))
;
1649
1650 CallInst *KernelInitCI =
1651 getCallIfRegularCall(*KernelInitUV->front(), &KernelInitRFI);
1652 assert(KernelInitCI &&(static_cast <bool> (KernelInitCI && "Expected a call to __kmpc_target_init in kernel\n"
) ? void (0) : __assert_fail ("KernelInitCI && \"Expected a call to __kmpc_target_init in kernel\\n\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1653, __extension__
__PRETTY_FUNCTION__))
1653 "Expected a call to __kmpc_target_init in kernel\n")(static_cast <bool> (KernelInitCI && "Expected a call to __kmpc_target_init in kernel\n"
) ? void (0) : __assert_fail ("KernelInitCI && \"Expected a call to __kmpc_target_init in kernel\\n\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1653, __extension__
__PRETTY_FUNCTION__))
;
1654
1655 CI->moveAfter(KernelInitCI);
1656 } else
1657 CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
1658 ReplVal = CI;
1659 break;
1660 }
1661 if (!ReplVal)
1662 return false;
1663 }
1664
1665 // If we use a call as a replacement value we need to make sure the ident is
1666 // valid at the new location. For now we just pick a global one, either
1667 // existing and used by one of the calls, or created from scratch.
1668 if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1669 if (!CI->arg_empty() &&
1670 CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
1671 Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
1672 /* GlobalOnly */ true);
1673 CI->setArgOperand(0, Ident);
1674 }
1675 }
1676
1677 bool Changed = false;
1678 auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
1679 CallInst *CI = getCallIfRegularCall(U, &RFI);
1680 if (!CI || CI == ReplVal || &F != &Caller)
1681 return false;
1682 assert(CI->getCaller() == &F && "Unexpected call!")(static_cast <bool> (CI->getCaller() == &F &&
"Unexpected call!") ? void (0) : __assert_fail ("CI->getCaller() == &F && \"Unexpected call!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 1682, __extension__
__PRETTY_FUNCTION__))
;
1683
1684 auto Remark = [&](OptimizationRemark OR) {
1685 return OR << "OpenMP runtime call "
1686 << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
1687 };
1688 if (CI->getDebugLoc())
1689 emitRemark<OptimizationRemark>(CI, "OMP170", Remark);
1690 else
1691 emitRemark<OptimizationRemark>(&F, "OMP170", Remark);
1692
1693 CGUpdater.removeCallSite(*CI);
1694 CI->replaceAllUsesWith(ReplVal);
1695 CI->eraseFromParent();
1696 ++NumOpenMPRuntimeCallsDeduplicated;
1697 Changed = true;
1698 return true;
1699 };
1700 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1701
1702 return Changed;
1703 }
1704
1705 /// Collect arguments that represent the global thread id in \p GTIdArgs.
1706 void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
1707 // TODO: Below we basically perform a fixpoint iteration with a pessimistic
1708 // initialization. We could define an AbstractAttribute instead and
1709 // run the Attributor here once it can be run as an SCC pass.
1710
1711 // Helper to check the argument \p ArgNo at all call sites of \p F for
1712 // a GTId.
1713 auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
1714 if (!F.hasLocalLinkage())
1715 return false;
1716 for (Use &U : F.uses()) {
1717 if (CallInst *CI = getCallIfRegularCall(U)) {
1718 Value *ArgOp = CI->getArgOperand(ArgNo);
1719 if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
1720 getCallIfRegularCall(
1721 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1722 continue;
1723 }
1724 return false;
1725 }
1726 return true;
1727 };
1728
1729 // Helper to identify uses of a GTId as GTId arguments.
1730 auto AddUserArgs = [&](Value &GTId) {
1731 for (Use &U : GTId.uses())
1732 if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
1733 if (CI->isArgOperand(&U))
1734 if (Function *Callee = CI->getCalledFunction())
1735 if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
1736 GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
1737 };
1738
1739 // The argument users of __kmpc_global_thread_num calls are GTIds.
1740 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1741 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1742
1743 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
1744 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1745 AddUserArgs(*CI);
1746 return false;
1747 });
1748
1749 // Transitively search for more arguments by looking at the users of the
1750 // ones we know already. During the search the GTIdArgs vector is extended
1751 // so we cannot cache the size nor can we use a range based for.
1752 for (unsigned U = 0; U < GTIdArgs.size(); ++U)
1753 AddUserArgs(*GTIdArgs[U]);
1754 }
1755
1756 /// Kernel (=GPU) optimizations and utility functions
1757 ///
1758 ///{{
1759
1760 /// Check if \p F is a kernel, hence entry point for target offloading.
1761 bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
1762
1763 /// Cache to remember the unique kernel for a function.
1764 DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
1765
1766 /// Find the unique kernel that will execute \p F, if any.
1767 Kernel getUniqueKernelFor(Function &F);
1768
1769 /// Find the unique kernel that will execute \p I, if any.
1770 Kernel getUniqueKernelFor(Instruction &I) {
1771 return getUniqueKernelFor(*I.getFunction());
1772 }
1773
1774 /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
1775 /// the cases where we can avoid taking the address of a function.
1776 bool rewriteDeviceCodeStateMachine();
1777
1778 ///
1779 ///}}
1780
1781 /// Emit a remark generically
1782 ///
1783 /// This template function can be used to generically emit a remark. The
1784 /// RemarkKind should be one of the following:
1785 /// - OptimizationRemark to indicate a successful optimization attempt
1786 /// - OptimizationRemarkMissed to report a failed optimization attempt
1787 /// - OptimizationRemarkAnalysis to provide additional information about an
1788 /// optimization attempt
1789 ///
1790 /// The remark is built using a callback function provided by the caller that
1791 /// takes a RemarkKind as input and returns a RemarkKind.
1792 template <typename RemarkKind, typename RemarkCallBack>
1793 void emitRemark(Instruction *I, StringRef RemarkName,
1794 RemarkCallBack &&RemarkCB) const {
1795 Function *F = I->getParent()->getParent();
1796 auto &ORE = OREGetter(F);
1797
1798 if (RemarkName.startswith("OMP"))
1799 ORE.emit([&]() {
1800 return RemarkCB(RemarkKind(DEBUG_TYPE"openmp-opt", RemarkName, I))
1801 << " [" << RemarkName << "]";
1802 });
1803 else
1804 ORE.emit(
1805 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE"openmp-opt", RemarkName, I)); });
1806 }
1807
1808 /// Emit a remark on a function.
1809 template <typename RemarkKind, typename RemarkCallBack>
1810 void emitRemark(Function *F, StringRef RemarkName,
1811 RemarkCallBack &&RemarkCB) const {
1812 auto &ORE = OREGetter(F);
1813
1814 if (RemarkName.startswith("OMP"))
1815 ORE.emit([&]() {
1816 return RemarkCB(RemarkKind(DEBUG_TYPE"openmp-opt", RemarkName, F))
1817 << " [" << RemarkName << "]";
1818 });
1819 else
1820 ORE.emit(
1821 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE"openmp-opt", RemarkName, F)); });
1822 }
1823
1824 /// RAII struct to temporarily change an RTL function's linkage to external.
1825 /// This prevents it from being mistakenly removed by other optimizations.
1826 struct ExternalizationRAII {
1827 ExternalizationRAII(OMPInformationCache &OMPInfoCache,
1828 RuntimeFunction RFKind)
1829 : Declaration(OMPInfoCache.RFIs[RFKind].Declaration) {
1830 if (!Declaration)
1831 return;
1832
1833 LinkageType = Declaration->getLinkage();
1834 Declaration->setLinkage(GlobalValue::ExternalLinkage);
1835 }
1836
1837 ~ExternalizationRAII() {
1838 if (!Declaration)
1839 return;
1840
1841 Declaration->setLinkage(LinkageType);
1842 }
1843
1844 Function *Declaration;
1845 GlobalValue::LinkageTypes LinkageType;
1846 };
1847
1848 /// The underlying module.
1849 Module &M;
1850
1851 /// The SCC we are operating on.
1852 SmallVectorImpl<Function *> &SCC;
1853
1854 /// Callback to update the call graph, the first argument is a removed call,
1855 /// the second an optional replacement call.
1856 CallGraphUpdater &CGUpdater;
1857
1858 /// Callback to get an OptimizationRemarkEmitter from a Function *
1859 OptimizationRemarkGetter OREGetter;
1860
1861 /// OpenMP-specific information cache. Also Used for Attributor runs.
1862 OMPInformationCache &OMPInfoCache;
1863
1864 /// Attributor instance.
1865 Attributor &A;
1866
1867 /// Helper function to run Attributor on SCC.
1868 bool runAttributor(bool IsModulePass) {
1869 if (SCC.empty())
1870 return false;
1871
1872 // Temporarily make these function have external linkage so the Attributor
1873 // doesn't remove them when we try to look them up later.
1874 ExternalizationRAII Parallel(OMPInfoCache, OMPRTL___kmpc_kernel_parallel);
1875 ExternalizationRAII EndParallel(OMPInfoCache,
1876 OMPRTL___kmpc_kernel_end_parallel);
1877 ExternalizationRAII BarrierSPMD(OMPInfoCache,
1878 OMPRTL___kmpc_barrier_simple_spmd);
1879 ExternalizationRAII BarrierGeneric(OMPInfoCache,
1880 OMPRTL___kmpc_barrier_simple_generic);
1881 ExternalizationRAII ThreadId(OMPInfoCache,
1882 OMPRTL___kmpc_get_hardware_thread_id_in_block);
1883 ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size);
1884
1885 registerAAs(IsModulePass);
1886
1887 ChangeStatus Changed = A.run();
1888
1889 LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "[Attributor] Done with " <<
SCC.size() << " functions, result: " << Changed <<
".\n"; } } while (false)
1890 << " functions, result: " << Changed << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "[Attributor] Done with " <<
SCC.size() << " functions, result: " << Changed <<
".\n"; } } while (false)
;
1891
1892 return Changed == ChangeStatus::CHANGED;
1893 }
1894
1895 void registerFoldRuntimeCall(RuntimeFunction RF);
1896
1897 /// Populate the Attributor with abstract attribute opportunities in the
1898 /// function.
1899 void registerAAs(bool IsModulePass);
1900};
1901
1902Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
1903 if (!OMPInfoCache.ModuleSlice.count(&F))
1904 return nullptr;
1905
1906 // Use a scope to keep the lifetime of the CachedKernel short.
1907 {
1908 Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
1909 if (CachedKernel)
1910 return *CachedKernel;
1911
1912 // TODO: We should use an AA to create an (optimistic and callback
1913 // call-aware) call graph. For now we stick to simple patterns that
1914 // are less powerful, basically the worst fixpoint.
1915 if (isKernel(F)) {
1916 CachedKernel = Kernel(&F);
1917 return *CachedKernel;
1918 }
1919
1920 CachedKernel = nullptr;
1921 if (!F.hasLocalLinkage()) {
1922
1923 // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
1924 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1925 return ORA << "Potentially unknown OpenMP target region caller.";
1926 };
1927 emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);
1928
1929 return nullptr;
1930 }
1931 }
1932
1933 auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
1934 if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
1935 // Allow use in equality comparisons.
1936 if (Cmp->isEquality())
1937 return getUniqueKernelFor(*Cmp);
1938 return nullptr;
1939 }
1940 if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
1941 // Allow direct calls.
1942 if (CB->isCallee(&U))
1943 return getUniqueKernelFor(*CB);
1944
1945 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
1946 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
1947 // Allow the use in __kmpc_parallel_51 calls.
1948 if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
1949 return getUniqueKernelFor(*CB);
1950 return nullptr;
1951 }
1952 // Disallow every other use.
1953 return nullptr;
1954 };
1955
1956 // TODO: In the future we want to track more than just a unique kernel.
1957 SmallPtrSet<Kernel, 2> PotentialKernels;
1958 OMPInformationCache::foreachUse(F, [&](const Use &U) {
1959 PotentialKernels.insert(GetUniqueKernelForUse(U));
1960 });
1961
1962 Kernel K = nullptr;
1963 if (PotentialKernels.size() == 1)
1964 K = *PotentialKernels.begin();
1965
1966 // Cache the result.
1967 UniqueKernelMap[&F] = K;
1968
1969 return K;
1970}
1971
1972bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
1973 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
1974 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
1975
1976 bool Changed = false;
1977 if (!KernelParallelRFI)
1978 return Changed;
1979
1980 // If we have disabled state machine changes, exit
1981 if (DisableOpenMPOptStateMachineRewrite)
1982 return Changed;
1983
1984 for (Function *F : SCC) {
1985
1986 // Check if the function is a use in a __kmpc_parallel_51 call at
1987 // all.
1988 bool UnknownUse = false;
1989 bool KernelParallelUse = false;
1990 unsigned NumDirectCalls = 0;
1991
1992 SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
1993 OMPInformationCache::foreachUse(*F, [&](Use &U) {
1994 if (auto *CB = dyn_cast<CallBase>(U.getUser()))
1995 if (CB->isCallee(&U)) {
1996 ++NumDirectCalls;
1997 return;
1998 }
1999
2000 if (isa<ICmpInst>(U.getUser())) {
2001 ToBeReplacedStateMachineUses.push_back(&U);
2002 return;
2003 }
2004
2005 // Find wrapper functions that represent parallel kernels.
2006 CallInst *CI =
2007 OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
2008 const unsigned int WrapperFunctionArgNo = 6;
2009 if (!KernelParallelUse && CI &&
2010 CI->getArgOperandNo(&U) == WrapperFunctionArgNo) {
2011 KernelParallelUse = true;
2012 ToBeReplacedStateMachineUses.push_back(&U);
2013 return;
2014 }
2015 UnknownUse = true;
2016 });
2017
2018 // Do not emit a remark if we haven't seen a __kmpc_parallel_51
2019 // use.
2020 if (!KernelParallelUse)
2021 continue;
2022
2023 // If this ever hits, we should investigate.
2024 // TODO: Checking the number of uses is not a necessary restriction and
2025 // should be lifted.
2026 if (UnknownUse || NumDirectCalls != 1 ||
2027 ToBeReplacedStateMachineUses.size() > 2) {
2028 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2029 return ORA << "Parallel region is used in "
2030 << (UnknownUse ? "unknown" : "unexpected")
2031 << " ways. Will not attempt to rewrite the state machine.";
2032 };
2033 emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);
2034 continue;
2035 }
2036
2037 // Even if we have __kmpc_parallel_51 calls, we (for now) give
2038 // up if the function is not called from a unique kernel.
2039 Kernel K = getUniqueKernelFor(*F);
2040 if (!K) {
2041 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2042 return ORA << "Parallel region is not called from a unique kernel. "
2043 "Will not attempt to rewrite the state machine.";
2044 };
2045 emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);
2046 continue;
2047 }
2048
2049 // We now know F is a parallel body function called only from the kernel K.
2050 // We also identified the state machine uses in which we replace the
2051 // function pointer by a new global symbol for identification purposes. This
2052 // ensures only direct calls to the function are left.
2053
2054 Module &M = *F->getParent();
2055 Type *Int8Ty = Type::getInt8Ty(M.getContext());
2056
2057 auto *ID = new GlobalVariable(
2058 M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
2059 UndefValue::get(Int8Ty), F->getName() + ".ID");
2060
2061 for (Use *U : ToBeReplacedStateMachineUses)
2062 U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2063 ID, U->get()->getType()));
2064
2065 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2066
2067 Changed = true;
2068 }
2069
2070 return Changed;
2071}
2072
2073/// Abstract Attribute for tracking ICV values.
2074struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
2075 using Base = StateWrapper<BooleanState, AbstractAttribute>;
2076 AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2077
2078 void initialize(Attributor &A) override {
2079 Function *F = getAnchorScope();
2080 if (!F || !A.isFunctionIPOAmendable(*F))
2081 indicatePessimisticFixpoint();
2082 }
2083
2084 /// Returns true if value is assumed to be tracked.
2085 bool isAssumedTracked() const { return getAssumed(); }
2086
2087 /// Returns true if value is known to be tracked.
2088 bool isKnownTracked() const { return getAssumed(); }
2089
2090 /// Create an abstract attribute biew for the position \p IRP.
2091 static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
2092
2093 /// Return the value with which \p I can be replaced for specific \p ICV.
2094 virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
2095 const Instruction *I,
2096 Attributor &A) const {
2097 return None;
2098 }
2099
2100 /// Return an assumed unique ICV value if a single candidate is found. If
2101 /// there cannot be one, return a nullptr. If it is not clear yet, return the
2102 /// Optional::NoneType.
2103 virtual Optional<Value *>
2104 getUniqueReplacementValue(InternalControlVar ICV) const = 0;
2105
2106 // Currently only nthreads is being tracked.
2107 // this array will only grow with time.
2108 InternalControlVar TrackableICVs[1] = {ICV_nthreads};
2109
2110 /// See AbstractAttribute::getName()
2111 const std::string getName() const override { return "AAICVTracker"; }
2112
2113 /// See AbstractAttribute::getIdAddr()
2114 const char *getIdAddr() const override { return &ID; }
2115
2116 /// This function should return true if the type of the \p AA is AAICVTracker
2117 static bool classof(const AbstractAttribute *AA) {
2118 return (AA->getIdAddr() == &ID);
2119 }
2120
2121 static const char ID;
2122};
2123
2124struct AAICVTrackerFunction : public AAICVTracker {
2125 AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
2126 : AAICVTracker(IRP, A) {}
2127
2128 // FIXME: come up with better string.
2129 const std::string getAsStr() const override { return "ICVTrackerFunction"; }
2130
2131 // FIXME: come up with some stats.
2132 void trackStatistics() const override {}
2133
2134 /// We don't manifest anything for this AA.
2135 ChangeStatus manifest(Attributor &A) override {
2136 return ChangeStatus::UNCHANGED;
2137 }
2138
2139 // Map of ICV to their values at specific program point.
2140 EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
2141 InternalControlVar::ICV___last>
2142 ICVReplacementValuesMap;
2143
2144 ChangeStatus updateImpl(Attributor &A) override {
2145 ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
2146
2147 Function *F = getAnchorScope();
2148
2149 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2150
2151 for (InternalControlVar ICV : TrackableICVs) {
2152 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2153
2154 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2155 auto TrackValues = [&](Use &U, Function &) {
2156 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2157 if (!CI)
2158 return false;
2159
2160 // FIXME: handle setters with more that 1 arguments.
2161 /// Track new value.
2162 if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
2163 HasChanged = ChangeStatus::CHANGED;
2164
2165 return false;
2166 };
2167
2168 auto CallCheck = [&](Instruction &I) {
2169 Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
2170 if (ReplVal.hasValue() &&
2171 ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
2172 HasChanged = ChangeStatus::CHANGED;
2173
2174 return true;
2175 };
2176
2177 // Track all changes of an ICV.
2178 SetterRFI.foreachUse(TrackValues, F);
2179
2180 bool UsedAssumedInformation = false;
2181 A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
2182 UsedAssumedInformation,
2183 /* CheckBBLivenessOnly */ true);
2184
2185 /// TODO: Figure out a way to avoid adding entry in
2186 /// ICVReplacementValuesMap
2187 Instruction *Entry = &F->getEntryBlock().front();
2188 if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
2189 ValuesMap.insert(std::make_pair(Entry, nullptr));
2190 }
2191
2192 return HasChanged;
2193 }
2194
2195 /// Hepler to check if \p I is a call and get the value for it if it is
2196 /// unique.
2197 Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
2198 InternalControlVar &ICV) const {
2199
2200 const auto *CB = dyn_cast<CallBase>(I);
2201 if (!CB || CB->hasFnAttr("no_openmp") ||
2202 CB->hasFnAttr("no_openmp_routines"))
2203 return None;
2204
2205 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2206 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2207 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2208 Function *CalledFunction = CB->getCalledFunction();
2209
2210 // Indirect call, assume ICV changes.
2211 if (CalledFunction == nullptr)
2212 return nullptr;
2213 if (CalledFunction == GetterRFI.Declaration)
2214 return None;
2215 if (CalledFunction == SetterRFI.Declaration) {
2216 if (ICVReplacementValuesMap[ICV].count(I))
2217 return ICVReplacementValuesMap[ICV].lookup(I);
2218
2219 return nullptr;
2220 }
2221
2222 // Since we don't know, assume it changes the ICV.
2223 if (CalledFunction->isDeclaration())
2224 return nullptr;
2225
2226 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2227 *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
2228
2229 if (ICVTrackingAA.isAssumedTracked())
2230 return ICVTrackingAA.getUniqueReplacementValue(ICV);
2231
2232 // If we don't know, assume it changes.
2233 return nullptr;
2234 }
2235
2236 // We don't check unique value for a function, so return None.
2237 Optional<Value *>
2238 getUniqueReplacementValue(InternalControlVar ICV) const override {
2239 return None;
2240 }
2241
2242 /// Return the value with which \p I can be replaced for specific \p ICV.
2243 Optional<Value *> getReplacementValue(InternalControlVar ICV,
2244 const Instruction *I,
2245 Attributor &A) const override {
2246 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2247 if (ValuesMap.count(I))
2248 return ValuesMap.lookup(I);
2249
2250 SmallVector<const Instruction *, 16> Worklist;
2251 SmallPtrSet<const Instruction *, 16> Visited;
2252 Worklist.push_back(I);
2253
2254 Optional<Value *> ReplVal;
2255
2256 while (!Worklist.empty()) {
2257 const Instruction *CurrInst = Worklist.pop_back_val();
2258 if (!Visited.insert(CurrInst).second)
2259 continue;
2260
2261 const BasicBlock *CurrBB = CurrInst->getParent();
2262
2263 // Go up and look for all potential setters/calls that might change the
2264 // ICV.
2265 while ((CurrInst = CurrInst->getPrevNode())) {
2266 if (ValuesMap.count(CurrInst)) {
2267 Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2268 // Unknown value, track new.
2269 if (!ReplVal.hasValue()) {
2270 ReplVal = NewReplVal;
2271 break;
2272 }
2273
2274 // If we found a new value, we can't know the icv value anymore.
2275 if (NewReplVal.hasValue())
2276 if (ReplVal != NewReplVal)
2277 return nullptr;
2278
2279 break;
2280 }
2281
2282 Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
2283 if (!NewReplVal.hasValue())
2284 continue;
2285
2286 // Unknown value, track new.
2287 if (!ReplVal.hasValue()) {
2288 ReplVal = NewReplVal;
2289 break;
2290 }
2291
2292 // if (NewReplVal.hasValue())
2293 // We found a new value, we can't know the icv value anymore.
2294 if (ReplVal != NewReplVal)
2295 return nullptr;
2296 }
2297
2298 // If we are in the same BB and we have a value, we are done.
2299 if (CurrBB == I->getParent() && ReplVal.hasValue())
2300 return ReplVal;
2301
2302 // Go through all predecessors and add terminators for analysis.
2303 for (const BasicBlock *Pred : predecessors(CurrBB))
2304 if (const Instruction *Terminator = Pred->getTerminator())
2305 Worklist.push_back(Terminator);
2306 }
2307
2308 return ReplVal;
2309 }
2310};
2311
2312struct AAICVTrackerFunctionReturned : AAICVTracker {
2313 AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2314 : AAICVTracker(IRP, A) {}
2315
2316 // FIXME: come up with better string.
2317 const std::string getAsStr() const override {
2318 return "ICVTrackerFunctionReturned";
2319 }
2320
2321 // FIXME: come up with some stats.
2322 void trackStatistics() const override {}
2323
2324 /// We don't manifest anything for this AA.
2325 ChangeStatus manifest(Attributor &A) override {
2326 return ChangeStatus::UNCHANGED;
2327 }
2328
2329 // Map of ICV to their values at specific program point.
2330 EnumeratedArray<Optional<Value *>, InternalControlVar,
2331 InternalControlVar::ICV___last>
2332 ICVReplacementValuesMap;
2333
2334 /// Return the value with which \p I can be replaced for specific \p ICV.
2335 Optional<Value *>
2336 getUniqueReplacementValue(InternalControlVar ICV) const override {
2337 return ICVReplacementValuesMap[ICV];
2338 }
2339
2340 ChangeStatus updateImpl(Attributor &A) override {
2341 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2342 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2343 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2344
2345 if (!ICVTrackingAA.isAssumedTracked())
2346 return indicatePessimisticFixpoint();
2347
2348 for (InternalControlVar ICV : TrackableICVs) {
2349 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2350 Optional<Value *> UniqueICVValue;
2351
2352 auto CheckReturnInst = [&](Instruction &I) {
2353 Optional<Value *> NewReplVal =
2354 ICVTrackingAA.getReplacementValue(ICV, &I, A);
2355
2356 // If we found a second ICV value there is no unique returned value.
2357 if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
2358 return false;
2359
2360 UniqueICVValue = NewReplVal;
2361
2362 return true;
2363 };
2364
2365 bool UsedAssumedInformation = false;
2366 if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
2367 UsedAssumedInformation,
2368 /* CheckBBLivenessOnly */ true))
2369 UniqueICVValue = nullptr;
2370
2371 if (UniqueICVValue == ReplVal)
2372 continue;
2373
2374 ReplVal = UniqueICVValue;
2375 Changed = ChangeStatus::CHANGED;
2376 }
2377
2378 return Changed;
2379 }
2380};
2381
2382struct AAICVTrackerCallSite : AAICVTracker {
2383 AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
2384 : AAICVTracker(IRP, A) {}
2385
2386 void initialize(Attributor &A) override {
2387 Function *F = getAnchorScope();
2388 if (!F || !A.isFunctionIPOAmendable(*F))
2389 indicatePessimisticFixpoint();
2390
2391 // We only initialize this AA for getters, so we need to know which ICV it
2392 // gets.
2393 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2394 for (InternalControlVar ICV : TrackableICVs) {
2395 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2396 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2397 if (Getter.Declaration == getAssociatedFunction()) {
2398 AssociatedICV = ICVInfo.Kind;
2399 return;
2400 }
2401 }
2402
2403 /// Unknown ICV.
2404 indicatePessimisticFixpoint();
2405 }
2406
2407 ChangeStatus manifest(Attributor &A) override {
2408 if (!ReplVal.hasValue() || !ReplVal.getValue())
2409 return ChangeStatus::UNCHANGED;
2410
2411 A.changeValueAfterManifest(*getCtxI(), **ReplVal);
2412 A.deleteAfterManifest(*getCtxI());
2413
2414 return ChangeStatus::CHANGED;
2415 }
2416
2417 // FIXME: come up with better string.
2418 const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
2419
2420 // FIXME: come up with some stats.
2421 void trackStatistics() const override {}
2422
2423 InternalControlVar AssociatedICV;
2424 Optional<Value *> ReplVal;
2425
2426 ChangeStatus updateImpl(Attributor &A) override {
2427 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2428 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2429
2430 // We don't have any information, so we assume it changes the ICV.
2431 if (!ICVTrackingAA.isAssumedTracked())
2432 return indicatePessimisticFixpoint();
2433
2434 Optional<Value *> NewReplVal =
2435 ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
2436
2437 if (ReplVal == NewReplVal)
2438 return ChangeStatus::UNCHANGED;
2439
2440 ReplVal = NewReplVal;
2441 return ChangeStatus::CHANGED;
2442 }
2443
2444 // Return the value with which associated value can be replaced for specific
2445 // \p ICV.
2446 Optional<Value *>
2447 getUniqueReplacementValue(InternalControlVar ICV) const override {
2448 return ReplVal;
2449 }
2450};
2451
2452struct AAICVTrackerCallSiteReturned : AAICVTracker {
2453 AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2454 : AAICVTracker(IRP, A) {}
2455
2456 // FIXME: come up with better string.
2457 const std::string getAsStr() const override {
2458 return "ICVTrackerCallSiteReturned";
2459 }
2460
2461 // FIXME: come up with some stats.
2462 void trackStatistics() const override {}
2463
2464 /// We don't manifest anything for this AA.
2465 ChangeStatus manifest(Attributor &A) override {
2466 return ChangeStatus::UNCHANGED;
2467 }
2468
2469 // Map of ICV to their values at specific program point.
2470 EnumeratedArray<Optional<Value *>, InternalControlVar,
2471 InternalControlVar::ICV___last>
2472 ICVReplacementValuesMap;
2473
2474 /// Return the value with which associated value can be replaced for specific
2475 /// \p ICV.
2476 Optional<Value *>
2477 getUniqueReplacementValue(InternalControlVar ICV) const override {
2478 return ICVReplacementValuesMap[ICV];
2479 }
2480
2481 ChangeStatus updateImpl(Attributor &A) override {
2482 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2483 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2484 *this, IRPosition::returned(*getAssociatedFunction()),
2485 DepClassTy::REQUIRED);
2486
2487 // We don't have any information, so we assume it changes the ICV.
2488 if (!ICVTrackingAA.isAssumedTracked())
2489 return indicatePessimisticFixpoint();
2490
2491 for (InternalControlVar ICV : TrackableICVs) {
2492 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2493 Optional<Value *> NewReplVal =
2494 ICVTrackingAA.getUniqueReplacementValue(ICV);
2495
2496 if (ReplVal == NewReplVal)
2497 continue;
2498
2499 ReplVal = NewReplVal;
2500 Changed = ChangeStatus::CHANGED;
2501 }
2502 return Changed;
2503 }
2504};
2505
2506struct AAExecutionDomainFunction : public AAExecutionDomain {
2507 AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
2508 : AAExecutionDomain(IRP, A) {}
2509
2510 const std::string getAsStr() const override {
2511 return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) +
2512 "/" + std::to_string(NumBBs) + " BBs thread 0 only.";
2513 }
2514
2515 /// See AbstractAttribute::trackStatistics().
2516 void trackStatistics() const override {}
2517
2518 void initialize(Attributor &A) override {
2519 Function *F = getAnchorScope();
2520 for (const auto &BB : *F)
2521 SingleThreadedBBs.insert(&BB);
2522 NumBBs = SingleThreadedBBs.size();
2523 }
2524
2525 ChangeStatus manifest(Attributor &A) override {
2526 LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
2527 for (const BasicBlock *BB : SingleThreadedBBs)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
2528 dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
2529 << BB->getName() << " is executed by a single thread.\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
2530 })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
;
2531 return ChangeStatus::UNCHANGED;
2532 }
2533
2534 ChangeStatus updateImpl(Attributor &A) override;
2535
2536 /// Check if an instruction is executed by a single thread.
2537 bool isExecutedByInitialThreadOnly(const Instruction &I) const override {
2538 return isExecutedByInitialThreadOnly(*I.getParent());
2539 }
2540
2541 bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
2542 return isValidState() && SingleThreadedBBs.contains(&BB);
2543 }
2544
2545 /// Set of basic blocks that are executed by a single thread.
2546 SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs;
2547
2548 /// Total number of basic blocks in this function.
2549 long unsigned NumBBs;
2550};
2551
2552ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
2553 Function *F = getAnchorScope();
2554 ReversePostOrderTraversal<Function *> RPOT(F);
2555 auto NumSingleThreadedBBs = SingleThreadedBBs.size();
2556
2557 bool AllCallSitesKnown;
2558 auto PredForCallSite = [&](AbstractCallSite ACS) {
2559 const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
2560 *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
2561 DepClassTy::REQUIRED);
2562 return ACS.isDirectCall() &&
2563 ExecutionDomainAA.isExecutedByInitialThreadOnly(
2564 *ACS.getInstruction());
2565 };
2566
2567 if (!A.checkForAllCallSites(PredForCallSite, *this,
2568 /* RequiresAllCallSites */ true,
2569 AllCallSitesKnown))
2570 SingleThreadedBBs.remove(&F->getEntryBlock());
2571
2572 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2573 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2574
2575 // Check if the edge into the successor block contains a condition that only
2576 // lets the main thread execute it.
2577 auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
2578 if (!Edge || !Edge->isConditional())
2579 return false;
2580 if (Edge->getSuccessor(0) != SuccessorBB)
2581 return false;
2582
2583 auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());
2584 if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
2585 return false;
2586
2587 ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
2588 if (!C)
2589 return false;
2590
2591 // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
2592 if (C->isAllOnesValue()) {
2593 auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
2594 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2595 if (!CB)
2596 return false;
2597 const int InitModeArgNo = 1;
2598 auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitModeArgNo));
2599 return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC);
2600 }
2601
2602 if (C->isZero()) {
2603 // Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x()
2604 if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
2605 if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2606 return true;
2607
2608 // Match: 0 == llvm.amdgcn.workitem.id.x()
2609 if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
2610 if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2611 return true;
2612 }
2613
2614 return false;
2615 };
2616
2617 // Merge all the predecessor states into the current basic block. A basic
2618 // block is executed by a single thread if all of its predecessors are.
2619 auto MergePredecessorStates = [&](BasicBlock *BB) {
2620 if (pred_empty(BB))
2621 return SingleThreadedBBs.contains(BB);
2622
2623 bool IsInitialThread = true;
2624 for (BasicBlock *PredBB : predecessors(BB)) {
2625 if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()),
2626 BB))
2627 IsInitialThread &= SingleThreadedBBs.contains(PredBB);
2628 }
2629
2630 return IsInitialThread;
2631 };
2632
2633 for (auto *BB : RPOT) {
2634 if (!MergePredecessorStates(BB))
2635 SingleThreadedBBs.remove(BB);
2636 }
2637
2638 return (NumSingleThreadedBBs == SingleThreadedBBs.size())
2639 ? ChangeStatus::UNCHANGED
2640 : ChangeStatus::CHANGED;
2641}
2642
2643/// Try to replace memory allocation calls called by a single thread with a
2644/// static buffer of shared memory.
2645struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
2646 using Base = StateWrapper<BooleanState, AbstractAttribute>;
2647 AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2648
2649 /// Create an abstract attribute view for the position \p IRP.
2650 static AAHeapToShared &createForPosition(const IRPosition &IRP,
2651 Attributor &A);
2652
2653 /// Returns true if HeapToShared conversion is assumed to be possible.
2654 virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;
2655
2656 /// Returns true if HeapToShared conversion is assumed and the CB is a
2657 /// callsite to a free operation to be removed.
2658 virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;
2659
2660 /// See AbstractAttribute::getName().
2661 const std::string getName() const override { return "AAHeapToShared"; }
2662
2663 /// See AbstractAttribute::getIdAddr().
2664 const char *getIdAddr() const override { return &ID; }
2665
2666 /// This function should return true if the type of the \p AA is
2667 /// AAHeapToShared.
2668 static bool classof(const AbstractAttribute *AA) {
2669 return (AA->getIdAddr() == &ID);
2670 }
2671
2672 /// Unique ID (due to the unique address)
2673 static const char ID;
2674};
2675
2676struct AAHeapToSharedFunction : public AAHeapToShared {
2677 AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
2678 : AAHeapToShared(IRP, A) {}
2679
2680 const std::string getAsStr() const override {
2681 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
2682 " malloc calls eligible.";
2683 }
2684
2685 /// See AbstractAttribute::trackStatistics().
2686 void trackStatistics() const override {}
2687
2688 /// This functions finds free calls that will be removed by the
2689 /// HeapToShared transformation.
2690 void findPotentialRemovedFreeCalls(Attributor &A) {
2691 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2692 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
2693
2694 PotentialRemovedFreeCalls.clear();
2695 // Update free call users of found malloc calls.
2696 for (CallBase *CB : MallocCalls) {
2697 SmallVector<CallBase *, 4> FreeCalls;
2698 for (auto *U : CB->users()) {
2699 CallBase *C = dyn_cast<CallBase>(U);
2700 if (C && C->getCalledFunction() == FreeRFI.Declaration)
2701 FreeCalls.push_back(C);
2702 }
2703
2704 if (FreeCalls.size() != 1)
2705 continue;
2706
2707 PotentialRemovedFreeCalls.insert(FreeCalls.front());
2708 }
2709 }
2710
2711 void initialize(Attributor &A) override {
2712 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2713 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
2714
2715 for (User *U : RFI.Declaration->users())
2716 if (CallBase *CB = dyn_cast<CallBase>(U))
2717 MallocCalls.insert(CB);
2718
2719 findPotentialRemovedFreeCalls(A);
2720 }
2721
2722 bool isAssumedHeapToShared(CallBase &CB) const override {
2723 return isValidState() && MallocCalls.count(&CB);
2724 }
2725
2726 bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
2727 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
2728 }
2729
2730 ChangeStatus manifest(Attributor &A) override {
2731 if (MallocCalls.empty())
1
Assuming the condition is false
2
Taking false branch
2732 return ChangeStatus::UNCHANGED;
2733
2734 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2735 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
2736
2737 Function *F = getAnchorScope();
2738 auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,
3
Calling 'Attributor::lookupAAFor'
10
Returning from 'Attributor::lookupAAFor'
2739 DepClassTy::OPTIONAL);
2740
2741 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2742 for (CallBase *CB : MallocCalls) {
11
Assuming '__begin2' is not equal to '__end2'
2743 // Skip replacing this if HeapToStack has already claimed it.
2744 if (HS
11.1
'HS' is non-null
11.1
'HS' is non-null
&& HS->isAssumedHeapToStack(*CB))
12
Assuming the condition is false
13
Taking false branch
2745 continue;
2746
2747 // Find the unique free call to remove it.
2748 SmallVector<CallBase *, 4> FreeCalls;
2749 for (auto *U : CB->users()) {
2750 CallBase *C = dyn_cast<CallBase>(U);
2751 if (C && C->getCalledFunction() == FreeCall.Declaration)
2752 FreeCalls.push_back(C);
2753 }
2754 if (FreeCalls.size() != 1)
14
Assuming the condition is false
15
Taking false branch
2755 continue;
2756
2757 ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
16
Assuming the object is not a 'ConstantInt'
17
'AllocSize' initialized to a null pointer value
2758
2759 LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CBdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replace globalization call "
<< *CB << " with " << AllocSize->getZExtValue
() << " bytes of shared memory\n"; } } while (false)
18
Assuming 'DebugFlag' is false
19
Loop condition is false. Exiting loop
2760 << " with " << AllocSize->getZExtValue()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replace globalization call "
<< *CB << " with " << AllocSize->getZExtValue
() << " bytes of shared memory\n"; } } while (false)
2761 << " bytes of shared memory\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replace globalization call "
<< *CB << " with " << AllocSize->getZExtValue
() << " bytes of shared memory\n"; } } while (false)
;
2762
2763 // Create a new shared memory buffer of the same size as the allocation
2764 // and replace all the uses of the original allocation with it.
2765 Module *M = CB->getModule();
2766 Type *Int8Ty = Type::getInt8Ty(M->getContext());
2767 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
20
Called C++ object pointer is null
2768 auto *SharedMem = new GlobalVariable(
2769 *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
2770 UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
2771 GlobalValue::NotThreadLocal,
2772 static_cast<unsigned>(AddressSpace::Shared));
2773 auto *NewBuffer =
2774 ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());
2775
2776 auto Remark = [&](OptimizationRemark OR) {
2777 return OR << "Replaced globalized variable with "
2778 << ore::NV("SharedMemory", AllocSize->getZExtValue())
2779 << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ")
2780 << "of shared memory.";
2781 };
2782 A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
2783
2784 MaybeAlign Alignment = CB->getRetAlign();
2785 assert(Alignment &&(static_cast <bool> (Alignment && "HeapToShared on allocation without alignment attribute"
) ? void (0) : __assert_fail ("Alignment && \"HeapToShared on allocation without alignment attribute\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 2786, __extension__
__PRETTY_FUNCTION__))
2786 "HeapToShared on allocation without alignment attribute")(static_cast <bool> (Alignment && "HeapToShared on allocation without alignment attribute"
) ? void (0) : __assert_fail ("Alignment && \"HeapToShared on allocation without alignment attribute\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 2786, __extension__
__PRETTY_FUNCTION__))
;
2787 SharedMem->setAlignment(MaybeAlign(Alignment));
2788
2789 A.changeValueAfterManifest(*CB, *NewBuffer);
2790 A.deleteAfterManifest(*CB);
2791 A.deleteAfterManifest(*FreeCalls.front());
2792
2793 NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
2794 Changed = ChangeStatus::CHANGED;
2795 }
2796
2797 return Changed;
2798 }
2799
2800 ChangeStatus updateImpl(Attributor &A) override {
2801 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2802 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
2803 Function *F = getAnchorScope();
2804
2805 auto NumMallocCalls = MallocCalls.size();
2806
2807 // Only consider malloc calls executed by a single thread with a constant.
2808 for (User *U : RFI.Declaration->users()) {
2809 const auto &ED = A.getAAFor<AAExecutionDomain>(
2810 *this, IRPosition::function(*F), DepClassTy::REQUIRED);
2811 if (CallBase *CB = dyn_cast<CallBase>(U))
2812 if (!isa<ConstantInt>(CB->getArgOperand(0)) ||
2813 !ED.isExecutedByInitialThreadOnly(*CB))
2814 MallocCalls.remove(CB);
2815 }
2816
2817 findPotentialRemovedFreeCalls(A);
2818
2819 if (NumMallocCalls != MallocCalls.size())
2820 return ChangeStatus::CHANGED;
2821
2822 return ChangeStatus::UNCHANGED;
2823 }
2824
2825 /// Collection of all malloc calls in a function.
2826 SmallSetVector<CallBase *, 4> MallocCalls;
2827 /// Collection of potentially removed free calls in a function.
2828 SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
2829};
2830
2831struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
2832 using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
2833 AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2834
2835 /// Statistics are tracked as part of manifest for now.
2836 void trackStatistics() const override {}
2837
2838 /// See AbstractAttribute::getAsStr()
2839 const std::string getAsStr() const override {
2840 if (!isValidState())
2841 return "<invalid>";
2842 return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
2843 : "generic") +
2844 std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
2845 : "") +
2846 std::string(" #PRs: ") +
2847 (ReachedKnownParallelRegions.isValidState()
2848 ? std::to_string(ReachedKnownParallelRegions.size())
2849 : "<invalid>") +
2850 ", #Unknown PRs: " +
2851 (ReachedUnknownParallelRegions.isValidState()
2852 ? std::to_string(ReachedUnknownParallelRegions.size())
2853 : "<invalid>") +
2854 ", #Reaching Kernels: " +
2855 (ReachingKernelEntries.isValidState()
2856 ? std::to_string(ReachingKernelEntries.size())
2857 : "<invalid>");
2858 }
2859
2860 /// Create an abstract attribute biew for the position \p IRP.
2861 static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
2862
2863 /// See AbstractAttribute::getName()
2864 const std::string getName() const override { return "AAKernelInfo"; }
2865
2866 /// See AbstractAttribute::getIdAddr()
2867 const char *getIdAddr() const override { return &ID; }
2868
2869 /// This function should return true if the type of the \p AA is AAKernelInfo
2870 static bool classof(const AbstractAttribute *AA) {
2871 return (AA->getIdAddr() == &ID);
2872 }
2873
2874 static const char ID;
2875};
2876
2877/// The function kernel info abstract attribute, basically, what can we say
2878/// about a function with regards to the KernelInfoState.
2879struct AAKernelInfoFunction : AAKernelInfo {
/// Construct the attribute for position \p IRP; both arguments are forwarded
/// unchanged to the AAKernelInfo constructor.
2880 AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
2881 : AAKernelInfo(IRP, A) {}
2882
/// Instructions already recorded for guarding. changeToSPMDMode() checks this
/// set before processing an instruction and inserts every instruction it
/// places into a guarded region candidate.
2883 SmallPtrSet<Instruction *, 4> GuardedInstructions;
2884
/// Mutable access to the set of already recorded guarded instructions.
2885 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
2886 return GuardedInstructions;
2887 }
2888
2889 /// See AbstractAttribute::initialize(...).
2890 void initialize(Attributor &A) override {
2891 // This is a high-level transform that might change the constant arguments
2892 // of the init and dinit calls. We need to tell the Attributor about this
2893 // to avoid other parts using the current constant value for simpliication.
2894 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2895
2896 Function *Fn = getAnchorScope();
2897 if (!OMPInfoCache.Kernels.count(Fn))
2898 return;
2899
2900 // Add itself to the reaching kernel and set IsKernelEntry.
2901 ReachingKernelEntries.insert(Fn);
2902 IsKernelEntry = true;
2903
2904 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
2905 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2906 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
2907 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
2908
2909 // For kernels we perform more initialization work, first we find the init
2910 // and deinit calls.
2911 auto StoreCallBase = [](Use &U,
2912 OMPInformationCache::RuntimeFunctionInfo &RFI,
2913 CallBase *&Storage) {
2914 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
2915 assert(CB &&(static_cast <bool> (CB && "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!"
) ? void (0) : __assert_fail ("CB && \"Unexpected use of __kmpc_target_init or __kmpc_target_deinit!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 2916, __extension__
__PRETTY_FUNCTION__))
2916 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!")(static_cast <bool> (CB && "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!"
) ? void (0) : __assert_fail ("CB && \"Unexpected use of __kmpc_target_init or __kmpc_target_deinit!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 2916, __extension__
__PRETTY_FUNCTION__))
;
2917 assert(!Storage &&(static_cast <bool> (!Storage && "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!"
) ? void (0) : __assert_fail ("!Storage && \"Multiple uses of __kmpc_target_init or __kmpc_target_deinit!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 2918, __extension__
__PRETTY_FUNCTION__))
2918 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!")(static_cast <bool> (!Storage && "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!"
) ? void (0) : __assert_fail ("!Storage && \"Multiple uses of __kmpc_target_init or __kmpc_target_deinit!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 2918, __extension__
__PRETTY_FUNCTION__))
;
2919 Storage = CB;
2920 return false;
2921 };
2922 InitRFI.foreachUse(
2923 [&](Use &U, Function &) {
2924 StoreCallBase(U, InitRFI, KernelInitCB);
2925 return false;
2926 },
2927 Fn);
2928 DeinitRFI.foreachUse(
2929 [&](Use &U, Function &) {
2930 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
2931 return false;
2932 },
2933 Fn);
2934
2935 // Ignore kernels without initializers such as global constructors.
2936 if (!KernelInitCB || !KernelDeinitCB) {
2937 indicateOptimisticFixpoint();
2938 return;
2939 }
2940
2941 // For kernels we might need to initialize/finalize the IsSPMD state and
2942 // we need to register a simplification callback so that the Attributor
2943 // knows the constant arguments to __kmpc_target_init and
2944 // __kmpc_target_deinit might actually change.
2945
2946 Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
2947 [&](const IRPosition &IRP, const AbstractAttribute *AA,
2948 bool &UsedAssumedInformation) -> Optional<Value *> {
2949 // IRP represents the "use generic state machine" argument of an
2950 // __kmpc_target_init call. We will answer this one with the internal
2951 // state. As long as we are not in an invalid state, we will create a
2952 // custom state machine so the value should be a `i1 false`. If we are
2953 // in an invalid state, we won't change the value that is in the IR.
2954 if (!ReachedKnownParallelRegions.isValidState())
2955 return nullptr;
2956 // If we have disabled state machine rewrites, don't make a custom one.
2957 if (DisableOpenMPOptStateMachineRewrite)
2958 return nullptr;
2959 if (AA)
2960 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
2961 UsedAssumedInformation = !isAtFixpoint();
2962 auto *FalseVal =
2963 ConstantInt::getBool(IRP.getAnchorValue().getContext(), false);
2964 return FalseVal;
2965 };
2966
2967 Attributor::SimplifictionCallbackTy ModeSimplifyCB =
2968 [&](const IRPosition &IRP, const AbstractAttribute *AA,
2969 bool &UsedAssumedInformation) -> Optional<Value *> {
2970 // IRP represents the "SPMDCompatibilityTracker" argument of an
2971 // __kmpc_target_init or
2972 // __kmpc_target_deinit call. We will answer this one with the internal
2973 // state.
2974 if (!SPMDCompatibilityTracker.isValidState())
2975 return nullptr;
2976 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
2977 if (AA)
2978 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
2979 UsedAssumedInformation = true;
2980 } else {
2981 UsedAssumedInformation = false;
2982 }
2983 auto *Val = ConstantInt::getSigned(
2984 IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()),
2985 SPMDCompatibilityTracker.isAssumed() ? OMP_TGT_EXEC_MODE_SPMD
2986 : OMP_TGT_EXEC_MODE_GENERIC);
2987 return Val;
2988 };
2989
2990 Attributor::SimplifictionCallbackTy IsGenericModeSimplifyCB =
2991 [&](const IRPosition &IRP, const AbstractAttribute *AA,
2992 bool &UsedAssumedInformation) -> Optional<Value *> {
2993 // IRP represents the "RequiresFullRuntime" argument of an
2994 // __kmpc_target_init or __kmpc_target_deinit call. We will answer this
2995 // one with the internal state of the SPMDCompatibilityTracker, so if
2996 // generic then true, if SPMD then false.
2997 if (!SPMDCompatibilityTracker.isValidState())
2998 return nullptr;
2999 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
3000 if (AA)
3001 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
3002 UsedAssumedInformation = true;
3003 } else {
3004 UsedAssumedInformation = false;
3005 }
3006 auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(),
3007 !SPMDCompatibilityTracker.isAssumed());
3008 return Val;
3009 };
3010
3011 constexpr const int InitModeArgNo = 1;
3012 constexpr const int DeinitModeArgNo = 1;
3013 constexpr const int InitUseStateMachineArgNo = 2;
3014 constexpr const int InitRequiresFullRuntimeArgNo = 3;
3015 constexpr const int DeinitRequiresFullRuntimeArgNo = 2;
3016 A.registerSimplificationCallback(
3017 IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
3018 StateMachineSimplifyCB);
3019 A.registerSimplificationCallback(
3020 IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo),
3021 ModeSimplifyCB);
3022 A.registerSimplificationCallback(
3023 IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
3024 ModeSimplifyCB);
3025 A.registerSimplificationCallback(
3026 IRPosition::callsite_argument(*KernelInitCB,
3027 InitRequiresFullRuntimeArgNo),
3028 IsGenericModeSimplifyCB);
3029 A.registerSimplificationCallback(
3030 IRPosition::callsite_argument(*KernelDeinitCB,
3031 DeinitRequiresFullRuntimeArgNo),
3032 IsGenericModeSimplifyCB);
3033
3034 // Check if we know we are in SPMD-mode already.
3035 ConstantInt *ModeArg =
3036 dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
3037 if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
3038 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3039 // This is a generic region but SPMDization is disabled so stop tracking.
3040 else if (DisableOpenMPOptSPMDization)
3041 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3042 }
3043
/// Sanitize the string \p S such that it is a suitable global symbol name.
///
/// ASCII letters, ASCII digits and '_' are kept; every other character is
/// replaced by '.'. The length of the string is unchanged.
static std::string sanitizeForGlobalName(std::string S) {
  // Characters that may appear verbatim in the resulting symbol name.
  auto IsKeptVerbatim = [](const char C) {
    const bool IsLower = C >= 'a' && C <= 'z';
    const bool IsUpper = C >= 'A' && C <= 'Z';
    const bool IsDigit = C >= '0' && C <= '9';
    return IsLower || IsUpper || IsDigit || C == '_';
  };

  for (char &C : S)
    if (!IsKeptVerbatim(C))
      C = '.';
  return S;
}
3055
3056 /// Modify the IR based on the KernelInfoState as the fixpoint iteration is
3057 /// finished now.
3058 ChangeStatus manifest(Attributor &A) override {
3059 // If we are not looking at a kernel with __kmpc_target_init and
3060 // __kmpc_target_deinit call we cannot actually manifest the information.
3061 if (!KernelInitCB || !KernelDeinitCB)
3062 return ChangeStatus::UNCHANGED;
3063
3064 // If we can we change the execution mode to SPMD-mode otherwise we build a
3065 // custom state machine.
3066 ChangeStatus Changed = ChangeStatus::UNCHANGED;
3067 if (!changeToSPMDMode(A, Changed))
3068 return buildCustomStateMachine(A);
3069
3070 return Changed;
3071 }
3072
/// Try to switch this kernel to SPMD execution mode.
///
/// If the SPMDCompatibilityTracker is not assumed anymore, an OMP121 analysis
/// remark is emitted for every recorded incompatible instruction (null
/// entries and calls to known OpenMP runtime functions are skipped) and false
/// is returned. If the kernel's `<kernel name>_exec_mode` global already holds
/// something other than generic mode, true is returned without modifying the
/// IR. Otherwise the instructions in the tracker are wrapped in guarded
/// regions, the exec mode global and the arguments of the
/// __kmpc_target_init / __kmpc_target_deinit calls are rewritten, and an
/// OMP120 remark is emitted.
///
/// \param A       The Attributor used for remarks and deferred IR changes.
/// \param Changed Set to ChangeStatus::CHANGED once the IR is going to be
///                modified; left untouched on the early-return paths.
/// \returns true if the kernel is (now) in SPMD mode, false otherwise.
3073 bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
3074 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3075
3076 if (!SPMDCompatibilityTracker.isAssumed()) {
3077 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
3078 if (!NonCompatibleI)
3079 continue;
3080
3081 // Skip diagnostics on calls to known OpenMP runtime functions for now.
3082 if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
3083 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
3084 continue;
3085
3086 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
3087 ORA << "Value has potential side effects preventing SPMD-mode "
3088 "execution";
3089 if (isa<CallBase>(NonCompatibleI)) {
3090 ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
3091 "the called function to override";
3092 }
3093 return ORA << ".";
3094 };
3095 A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
3096 Remark);
3097
3098 LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "SPMD-incompatible side-effect: "
<< *NonCompatibleI << "\n"; } } while (false)
3099 << *NonCompatibleI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "SPMD-incompatible side-effect: "
<< *NonCompatibleI << "\n"; } } while (false)
;
3100 }
3101
3102 return false;
3103 }
3104
3105 // Check if the kernel is already in SPMD mode, if so, return success.
3106 Function *Kernel = getAnchorScope();
3107 GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
3108 (Kernel->getName() + "_exec_mode").str());
3109 assert(ExecMode && "Kernel without exec mode?")(static_cast <bool> (ExecMode && "Kernel without exec mode?"
) ? void (0) : __assert_fail ("ExecMode && \"Kernel without exec mode?\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3109, __extension__
__PRETTY_FUNCTION__))
;
3110 assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!")(static_cast <bool> (ExecMode->getInitializer() &&
"ExecMode doesn't have initializer!") ? void (0) : __assert_fail
("ExecMode->getInitializer() && \"ExecMode doesn't have initializer!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3110, __extension__
__PRETTY_FUNCTION__))
;
3111
3112 // Set the global exec mode flag to indicate SPMD-Generic mode.
3113 assert(isa<ConstantInt>(ExecMode->getInitializer()) &&(static_cast <bool> (isa<ConstantInt>(ExecMode->
getInitializer()) && "ExecMode is not an integer!") ?
void (0) : __assert_fail ("isa<ConstantInt>(ExecMode->getInitializer()) && \"ExecMode is not an integer!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3114, __extension__
__PRETTY_FUNCTION__))
3114 "ExecMode is not an integer!")(static_cast <bool> (isa<ConstantInt>(ExecMode->
getInitializer()) && "ExecMode is not an integer!") ?
void (0) : __assert_fail ("isa<ConstantInt>(ExecMode->getInitializer()) && \"ExecMode is not an integer!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3114, __extension__
__PRETTY_FUNCTION__))
;
3115 const int8_t ExecModeVal =
3116 cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
3117 if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
3118 return true;
3119
3120 // We will now unconditionally modify the IR, indicate a change.
3121 Changed = ChangeStatus::CHANGED;
3122
// Wraps the instructions [RegionStartI, RegionEndI] of one basic block in a
// region that is only entered if the hardware thread id is zero, followed by
// a barrier. Values defined in the region and used outside of it are passed
// through a newly created global in the Shared address space.
3123 auto CreateGuardedRegion = [&](Instruction *RegionStartI,
3124 Instruction *RegionEndI) {
3125 LoopInfo *LI = nullptr;
3126 DominatorTree *DT = nullptr;
3127 MemorySSAUpdater *MSU = nullptr;
3128 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
3129
3130 BasicBlock *ParentBB = RegionStartI->getParent();
3131 Function *Fn = ParentBB->getParent();
3132 Module &M = *Fn->getParent();
3133
3134 // Create all the blocks and logic.
3135 // ParentBB:
3136 // goto RegionCheckTidBB
3137 // RegionCheckTidBB:
3138 // Tid = __kmpc_hardware_thread_id()
3139 // if (Tid != 0)
3140 // goto RegionBarrierBB
3141 // RegionStartBB:
3142 // <execute instructions guarded>
3143 // goto RegionEndBB
3144 // RegionEndBB:
3145 // <store escaping values to shared mem>
3146 // goto RegionBarrierBB
3147 // RegionBarrierBB:
3148 // __kmpc_simple_barrier_spmd()
3149 // // second barrier is omitted if lacking escaping values.
3150 // <load escaping values from shared mem>
3151 // __kmpc_simple_barrier_spmd()
3152 // goto RegionExitBB
3153 // RegionExitBB:
3154 // <execute rest of instructions>
3155
// The tail of the block is split off first (end, barrier, exit), then the
// guarded part itself, and finally the thread id check block.
3156 BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),
3157 DT, LI, MSU, "region.guarded.end");
3158 BasicBlock *RegionBarrierBB =
3159 SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,
3160 MSU, "region.barrier");
3161 BasicBlock *RegionExitBB =
3162 SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),
3163 DT, LI, MSU, "region.exit");
3164 BasicBlock *RegionStartBB =
3165 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
3166
3167 assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&(static_cast <bool> (ParentBB->getUniqueSuccessor() ==
RegionStartBB && "Expected a different CFG") ? void (
0) : __assert_fail ("ParentBB->getUniqueSuccessor() == RegionStartBB && \"Expected a different CFG\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3168, __extension__
__PRETTY_FUNCTION__))
3168 "Expected a different CFG")(static_cast <bool> (ParentBB->getUniqueSuccessor() ==
RegionStartBB && "Expected a different CFG") ? void (
0) : __assert_fail ("ParentBB->getUniqueSuccessor() == RegionStartBB && \"Expected a different CFG\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3168, __extension__
__PRETTY_FUNCTION__))
;
3169
3170 BasicBlock *RegionCheckTidBB = SplitBlock(
3171 ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
3172
3173 // Register basic blocks with the Attributor.
3174 A.registerManifestAddedBasicBlock(*RegionEndBB);
3175 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
3176 A.registerManifestAddedBasicBlock(*RegionExitBB);
3177 A.registerManifestAddedBasicBlock(*RegionStartBB);
3178 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
3179
3180 bool HasBroadcastValues = false;
3181 // Find escaping outputs from the guarded region to outside users and
3182 // broadcast their values to them.
3183 for (Instruction &I : *RegionStartBB) {
3184 SmallPtrSet<Instruction *, 4> OutsideUsers;
3185 for (User *Usr : I.users()) {
3186 Instruction &UsrI = *cast<Instruction>(Usr);
3187 if (UsrI.getParent() != RegionStartBB)
3188 OutsideUsers.insert(&UsrI);
3189 }
3190
3191 if (OutsideUsers.empty())
3192 continue;
3193
3194 HasBroadcastValues = true;
3195
3196 // Emit a global variable in shared memory to store the broadcasted
3197 // value.
3198 auto *SharedMem = new GlobalVariable(
3199 M, I.getType(), /* IsConstant */ false,
3200 GlobalValue::InternalLinkage, UndefValue::get(I.getType()),
3201 sanitizeForGlobalName(
3202 (I.getName() + ".guarded.output.alloc").str()),
3203 nullptr, GlobalValue::NotThreadLocal,
3204 static_cast<unsigned>(AddressSpace::Shared));
3205
3206 // Emit a store instruction to update the value.
3207 new StoreInst(&I, SharedMem, RegionEndBB->getTerminator());
3208
3209 LoadInst *LoadI = new LoadInst(I.getType(), SharedMem,
3210 I.getName() + ".guarded.output.load",
3211 RegionBarrierBB->getTerminator());
3212
3213 // Emit a load instruction and replace uses of the output value.
3214 for (Instruction *UsrI : OutsideUsers)
3215 UsrI->replaceUsesOfWith(&I, LoadI);
3216 }
3217
// NOTE: this local shadows the OMPInfoCache of the enclosing function; both
// are obtained from A.getInfoCache().
3218 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3219
3220 // Go to tid check BB in ParentBB.
3221 const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
3222 ParentBB->getTerminator()->eraseFromParent();
3223 OpenMPIRBuilder::LocationDescription Loc(
3224 InsertPointTy(ParentBB, ParentBB->end()), DL);
3225 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
3226 uint32_t SrcLocStrSize;
3227 auto *SrcLocStr =
3228 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3229 Value *Ident =
3230 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3231 BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
3232
3233 // Add check for Tid in RegionCheckTidBB
3234 RegionCheckTidBB->getTerminator()->eraseFromParent();
3235 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
3236 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
3237 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
3238 FunctionCallee HardwareTidFn =
3239 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3240 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
3241 Value *Tid =
3242 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
3243 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
3244 OMPInfoCache.OMPBuilder.Builder
3245 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
3246 ->setDebugLoc(DL);
3247
3248 // First barrier for synchronization, ensures main thread has updated
3249 // values.
3250 FunctionCallee BarrierFn =
3251 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3252 M, OMPRTL___kmpc_barrier_simple_spmd);
3253 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
3254 RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
3255 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid})
3256 ->setDebugLoc(DL);
3257
3258 // Second barrier ensures workers have read broadcast values.
3259 if (HasBroadcastValues)
3260 CallInst::Create(BarrierFn, {Ident, Tid}, "",
3261 RegionBarrierBB->getTerminator())
3262 ->setDebugLoc(DL);
3263 };
3264
// Reordering step, once per basic block that holds a tracked instruction:
// walk the block backwards (starting before its last instruction) and move a
// tracked, user-less instruction with effects directly in front of the
// previously seen one. Calls to __kmpc_alloc_shared are stepped over; any
// other instruction with effects that is untracked or has users resets the
// chain.
3265 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3266 SmallPtrSet<BasicBlock *, 8> Visited;
3267 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
3268 BasicBlock *BB = GuardedI->getParent();
3269 if (!Visited.insert(BB).second)
3270 continue;
3271
3272 SmallVector<std::pair<Instruction *, Instruction *>> Reorders;
3273 Instruction *LastEffect = nullptr;
3274 BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();
3275 while (++IP != IPEnd) {
3276 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
3277 continue;
3278 Instruction *I = &*IP;
3279 if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
3280 continue;
3281 if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
3282 LastEffect = nullptr;
3283 continue;
3284 }
3285 if (LastEffect)
3286 Reorders.push_back({I, LastEffect});
3287 LastEffect = &*IP;
3288 }
// The moves are applied only after the walk so the iteration above is not
// disturbed.
3289 for (auto &Reorder : Reorders)
3290 Reorder.first->moveBefore(Reorder.second);
3291 }
3292
// Collect runs of consecutive tracked instructions as (first, last) pairs. A
// run is recorded when the first following instruction that is not tracked
// is reached.
3293 SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;
3294
3295 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
3296 BasicBlock *BB = GuardedI->getParent();
3297 auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
3298 IRPosition::function(*GuardedI->getFunction()), nullptr,
3299 DepClassTy::NONE);
3300 assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo")(static_cast <bool> (CalleeAA != nullptr && "Expected Callee AAKernelInfo"
) ? void (0) : __assert_fail ("CalleeAA != nullptr && \"Expected Callee AAKernelInfo\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3300, __extension__
__PRETTY_FUNCTION__))
;
3301 auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
3302 // Continue if instruction is already guarded.
3303 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
3304 continue;
3305
3306 Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
3307 for (Instruction &I : *BB) {
3308 // If instruction I needs to be guarded update the guarded region
3309 // bounds.
3310 if (SPMDCompatibilityTracker.contains(&I)) {
3311 CalleeAAFunction.getGuardedInstructions().insert(&I);
3312 if (GuardedRegionStart)
3313 GuardedRegionEnd = &I;
3314 else
3315 GuardedRegionStart = GuardedRegionEnd = &I;
3316
3317 continue;
3318 }
3319
3320 // Instruction I does not need guarding, store
3321 // any region found and reset bounds.
3322 if (GuardedRegionStart) {
3323 GuardedRegions.push_back(
3324 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
3325 GuardedRegionStart = nullptr;
3326 GuardedRegionEnd = nullptr;
3327 }
3328 }
3329 }
3330
// The IR is only restructured after all regions have been collected.
3331 for (auto &GR : GuardedRegions)
3332 CreateGuardedRegion(GR.first, GR.second);
3333
3334 // Adjust the global exec mode flag that tells the runtime what mode this
3335 // kernel is executed in.
3336 assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&(static_cast <bool> (ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC
&& "Initially non-SPMD kernel has SPMD exec mode!") ?
void (0) : __assert_fail ("ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC && \"Initially non-SPMD kernel has SPMD exec mode!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3337, __extension__
__PRETTY_FUNCTION__))
3337 "Initially non-SPMD kernel has SPMD exec mode!")(static_cast <bool> (ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC
&& "Initially non-SPMD kernel has SPMD exec mode!") ?
void (0) : __assert_fail ("ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC && \"Initially non-SPMD kernel has SPMD exec mode!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3337, __extension__
__PRETTY_FUNCTION__))
;
3338 ExecMode->setInitializer(
3339 ConstantInt::get(ExecMode->getInitializer()->getType(),
3340 ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
3341
3342 // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
// These argument positions have the same values as the constants of the same
// names in initialize().
3343 const int InitModeArgNo = 1;
3344 const int DeinitModeArgNo = 1;
3345 const int InitUseStateMachineArgNo = 2;
3346 const int InitRequiresFullRuntimeArgNo = 3;
3347 const int DeinitRequiresFullRuntimeArgNo = 2;
3348
3349 auto &Ctx = getAnchorValue().getContext();
3350 A.changeUseAfterManifest(
3351 KernelInitCB->getArgOperandUse(InitModeArgNo),
3352 *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
3353 OMP_TGT_EXEC_MODE_SPMD));
3354 A.changeUseAfterManifest(
3355 KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
3356 *ConstantInt::getBool(Ctx, false));
3357 A.changeUseAfterManifest(
3358 KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
3359 *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
3360 OMP_TGT_EXEC_MODE_SPMD));
3361 A.changeUseAfterManifest(
3362 KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
3363 *ConstantInt::getBool(Ctx, false));
3364 A.changeUseAfterManifest(
3365 KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo),
3366 *ConstantInt::getBool(Ctx, false));
3367
3368 ++NumOpenMPTargetRegionKernelsSPMD;
3369
3370 auto Remark = [&](OptimizationRemark OR) {
3371 return OR << "Transformed generic-mode kernel to SPMD-mode.";
3372 };
3373 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark);
3374 return true;
3375 };
3376
3377 ChangeStatus buildCustomStateMachine(Attributor &A) {
3378 // If we have disabled state machine rewrites, don't make a custom one
3379 if (DisableOpenMPOptStateMachineRewrite)
3380 return ChangeStatus::UNCHANGED;
3381
3382 // Don't rewrite the state machine if we are not in a valid state.
3383 if (!ReachedKnownParallelRegions.isValidState())
3384 return ChangeStatus::UNCHANGED;
3385
3386 const int InitModeArgNo = 1;
3387 const int InitUseStateMachineArgNo = 2;
3388
3389 // Check if the current configuration is non-SPMD and generic state machine.
3390 // If we already have SPMD mode or a custom state machine we do not need to
3391 // go any further. If it is anything but a constant something is weird and
3392 // we give up.
3393 ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
3394 KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
3395 ConstantInt *Mode =
3396 dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
3397
3398 // If we are stuck with generic mode, try to create a custom device (=GPU)
3399 // state machine which is specialized for the parallel regions that are
3400 // reachable by the kernel.
3401 if (!UseStateMachine || UseStateMachine->isZero() || !Mode ||
3402 (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
3403 return ChangeStatus::UNCHANGED;
3404
3405 // If not SPMD mode, indicate we use a custom state machine now.
3406 auto &Ctx = getAnchorValue().getContext();
3407 auto *FalseVal = ConstantInt::getBool(Ctx, false);
3408 A.changeUseAfterManifest(
3409 KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
3410
3411 // If we don't actually need a state machine we are done here. This can
3412 // happen if there simply are no parallel regions. In the resulting kernel
3413 // all worker threads will simply exit right away, leaving the main thread
3414 // to do the work alone.
3415 if (!mayContainParallelRegion()) {
3416 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
3417
3418 auto Remark = [&](OptimizationRemark OR) {
3419 return OR << "Removing unused state machine from generic-mode kernel.";
3420 };
3421 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP130", Remark);
3422
3423 return ChangeStatus::CHANGED;
3424 }
3425
3426 // Keep track in the statistics of our new shiny custom state machine.
3427 if (ReachedUnknownParallelRegions.empty()) {
3428 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
3429
3430 auto Remark = [&](OptimizationRemark OR) {
3431 return OR << "Rewriting generic-mode kernel with a customized state "
3432 "machine.";
3433 };
3434 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP131", Remark);
3435 } else {
3436 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
3437
3438 auto Remark = [&](OptimizationRemarkAnalysis OR) {
3439 return OR << "Generic-mode kernel is executed with a customized state "
3440 "machine that requires a fallback.";
3441 };
3442 A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB, "OMP132", Remark);
3443
3444 // Tell the user why we ended up with a fallback.
3445 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
3446 if (!UnknownParallelRegionCB)
3447 continue;
3448 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
3449 return ORA << "Call may contain unknown parallel regions. Use "
3450 << "`__attribute__((assume(\"omp_no_parallelism\")))` to "
3451 "override.";
3452 };
3453 A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
3454 "OMP133", Remark);
3455 }
3456 }
3457
3458 // Create all the blocks:
3459 //
3460 // InitCB = __kmpc_target_init(...)
3461 // BlockHwSize =
3462 // __kmpc_get_hardware_num_threads_in_block();
3463 // WarpSize = __kmpc_get_warp_size();
3464 // BlockSize = BlockHwSize - WarpSize;
3465 // if (InitCB >= BlockSize) return;
3466 // IsWorkerCheckBB: bool IsWorker = InitCB >= 0;
3467 // if (IsWorker) {
3468 // SMBeginBB: __kmpc_barrier_simple_generic(...);
3469 // void *WorkFn;
3470 // bool Active = __kmpc_kernel_parallel(&WorkFn);
3471 // if (!WorkFn) return;
3472 // SMIsActiveCheckBB: if (Active) {
3473 // SMIfCascadeCurrentBB: if (WorkFn == <ParFn0>)
3474 // ParFn0(...);
3475 // SMIfCascadeCurrentBB: else if (WorkFn == <ParFn1>)
3476 // ParFn1(...);
3477 // ...
3478 // SMIfCascadeCurrentBB: else
3479 // ((WorkFnTy*)WorkFn)(...);
3480 // SMEndParallelBB: __kmpc_kernel_end_parallel(...);
3481 // }
3482 // SMDoneBB: __kmpc_barrier_simple_generic(...);
3483 // goto SMBeginBB;
3484 // }
3485 // UserCodeEntryBB: // user code
3486 // __kmpc_target_deinit(...)
3487 //
3488 Function *Kernel = getAssociatedFunction();
3489 assert(Kernel && "Expected an associated function!")(static_cast <bool> (Kernel && "Expected an associated function!"
) ? void (0) : __assert_fail ("Kernel && \"Expected an associated function!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3489, __extension__
__PRETTY_FUNCTION__))
;
3490
3491 BasicBlock *InitBB = KernelInitCB->getParent();
3492 BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
3493 KernelInitCB->getNextNode(), "thread.user_code.check");
3494 BasicBlock *IsWorkerCheckBB =
3495 BasicBlock::Create(Ctx, "is_worker_check", Kernel, UserCodeEntryBB);
3496 BasicBlock *StateMachineBeginBB = BasicBlock::Create(
3497 Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
3498 BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
3499 Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
3500 BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
3501 Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
3502 BasicBlock *StateMachineIfCascadeCurrentBB =
3503 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
3504 Kernel, UserCodeEntryBB);
3505 BasicBlock *StateMachineEndParallelBB =
3506 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",
3507 Kernel, UserCodeEntryBB);
3508 BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
3509 Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
3510 A.registerManifestAddedBasicBlock(*InitBB);
3511 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
3512 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
3513 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
3514 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
3515 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
3516 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
3517 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
3518 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
3519
3520 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
3521 ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);
3522 InitBB->getTerminator()->eraseFromParent();
3523
3524 Module &M = *Kernel->getParent();
3525 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3526 FunctionCallee BlockHwSizeFn =
3527 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3528 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
3529 FunctionCallee WarpSizeFn =
3530 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3531 M, OMPRTL___kmpc_get_warp_size);
3532 Instruction *BlockHwSize =
3533 CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB);
3534 BlockHwSize->setDebugLoc(DLoc);
3535 Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB);
3536 WarpSize->setDebugLoc(DLoc);
3537 Instruction *BlockSize =
3538 BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB);
3539 BlockSize->setDebugLoc(DLoc);
3540 Instruction *IsMainOrWorker =
3541 ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB,
3542 BlockSize, "thread.is_main_or_worker", InitBB);
3543 IsMainOrWorker->setDebugLoc(DLoc);
3544 BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker,
3545 InitBB);
3546
3547 Instruction *IsWorker =
3548 ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
3549 ConstantInt::get(KernelInitCB->getType(), -1),
3550 "thread.is_worker", IsWorkerCheckBB);
3551 IsWorker->setDebugLoc(DLoc);
3552 BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker,
3553 IsWorkerCheckBB);
3554
3555 // Create local storage for the work function pointer.
3556 const DataLayout &DL = M.getDataLayout();
3557 Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
3558 Instruction *WorkFnAI =
3559 new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
3560 "worker.work_fn.addr", &Kernel->getEntryBlock().front());
3561 WorkFnAI->setDebugLoc(DLoc);
3562
3563 OMPInfoCache.OMPBuilder.updateToLocation(
3564 OpenMPIRBuilder::LocationDescription(
3565 IRBuilder<>::InsertPoint(StateMachineBeginBB,
3566 StateMachineBeginBB->end()),
3567 DLoc));
3568
3569 Value *Ident = KernelInitCB->getArgOperand(0);
3570 Value *GTid = KernelInitCB;
3571
3572 FunctionCallee BarrierFn =
3573 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3574 M, OMPRTL___kmpc_barrier_simple_generic);
3575 CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
3576 ->setDebugLoc(DLoc);
3577
3578 if (WorkFnAI->getType()->getPointerAddressSpace() !=
3579 (unsigned int)AddressSpace::Generic) {
3580 WorkFnAI = new AddrSpaceCastInst(
3581 WorkFnAI,
3582 PointerType::getWithSamePointeeType(
3583 cast<PointerType>(WorkFnAI->getType()),
3584 (unsigned int)AddressSpace::Generic),
3585 WorkFnAI->getName() + ".generic", StateMachineBeginBB);
3586 WorkFnAI->setDebugLoc(DLoc);
3587 }
3588
3589 FunctionCallee KernelParallelFn =
3590 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3591 M, OMPRTL___kmpc_kernel_parallel);
3592 Instruction *IsActiveWorker = CallInst::Create(
3593 KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
3594 IsActiveWorker->setDebugLoc(DLoc);
3595 Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
3596 StateMachineBeginBB);
3597 WorkFn->setDebugLoc(DLoc);
3598
3599 FunctionType *ParallelRegionFnTy = FunctionType::get(
3600 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
3601 false);
3602 Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
3603 WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast",
3604 StateMachineBeginBB);
3605
3606 Instruction *IsDone =
3607 ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn,
3608 Constant::getNullValue(VoidPtrTy), "worker.is_done",
3609 StateMachineBeginBB);
3610 IsDone->setDebugLoc(DLoc);
3611 BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
3612 IsDone, StateMachineBeginBB)
3613 ->setDebugLoc(DLoc);
3614
3615 BranchInst::Create(StateMachineIfCascadeCurrentBB,
3616 StateMachineDoneBarrierBB, IsActiveWorker,
3617 StateMachineIsActiveCheckBB)
3618 ->setDebugLoc(DLoc);
3619
3620 Value *ZeroArg =
3621 Constant::getNullValue(ParallelRegionFnTy->getParamType(0));
3622
3623 // Now that we have most of the CFG skeleton it is time for the if-cascade
3624 // that checks the function pointer we got from the runtime against the
3625 // parallel regions we expect, if there are any.
3626 for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
3627 auto *ParallelRegion = ReachedKnownParallelRegions[I];
3628 BasicBlock *PRExecuteBB = BasicBlock::Create(
3629 Ctx, "worker_state_machine.parallel_region.execute", Kernel,
3630 StateMachineEndParallelBB);
3631 CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
3632 ->setDebugLoc(DLoc);
3633 BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)
3634 ->setDebugLoc(DLoc);
3635
3636 BasicBlock *PRNextBB =
3637 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
3638 Kernel, StateMachineEndParallelBB);
3639
3640 // Check if we need to compare the pointer at all or if we can just
3641 // call the parallel region function.
3642 Value *IsPR;
3643 if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
3644 Instruction *CmpI = ICmpInst::Create(
3645 ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
3646 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
3647 CmpI->setDebugLoc(DLoc);
3648 IsPR = CmpI;
3649 } else {
3650 IsPR = ConstantInt::getTrue(Ctx);
3651 }
3652
3653 BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
3654 StateMachineIfCascadeCurrentBB)
3655 ->setDebugLoc(DLoc);
3656 StateMachineIfCascadeCurrentBB = PRNextBB;
3657 }
3658
3659 // At the end of the if-cascade we place the indirect function pointer call
3660 // in case we might need it, that is if there can be parallel regions we
3661 // have not handled in the if-cascade above.
3662 if (!ReachedUnknownParallelRegions.empty()) {
3663 StateMachineIfCascadeCurrentBB->setName(
3664 "worker_state_machine.parallel_region.fallback.execute");
3665 CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "",
3666 StateMachineIfCascadeCurrentBB)
3667 ->setDebugLoc(DLoc);
3668 }
3669 BranchInst::Create(StateMachineEndParallelBB,
3670 StateMachineIfCascadeCurrentBB)
3671 ->setDebugLoc(DLoc);
3672
3673 CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3674 M, OMPRTL___kmpc_kernel_end_parallel),
3675 {}, "", StateMachineEndParallelBB)
3676 ->setDebugLoc(DLoc);
3677 BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
3678 ->setDebugLoc(DLoc);
3679
3680 CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
3681 ->setDebugLoc(DLoc);
3682 BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
3683 ->setDebugLoc(DLoc);
3684
3685 return ChangeStatus::CHANGED;
3686 }
3687
3688 /// Fixpoint iteration update function. Will be called every time a dependence
3689 /// changed its state (and in the beginning).
3690 ChangeStatus updateImpl(Attributor &A) override {
3691 KernelInfoState StateBefore = getState();
3692
3693 // Callback to check a read/write instruction.
3694 auto CheckRWInst = [&](Instruction &I) {
3695 // We handle calls later.
3696 if (isa<CallBase>(I))
3697 return true;
3698 // We only care about write effects.
3699 if (!I.mayWriteToMemory())
3700 return true;
3701 if (auto *SI = dyn_cast<StoreInst>(&I)) {
3702 SmallVector<const Value *> Objects;
3703 getUnderlyingObjects(SI->getPointerOperand(), Objects);
3704 if (llvm::all_of(Objects,
3705 [](const Value *Obj) { return isa<AllocaInst>(Obj); }))
3706 return true;
3707 // Check for AAHeapToStack moved objects which must not be guarded.
3708 auto &HS = A.getAAFor<AAHeapToStack>(
3709 *this, IRPosition::function(*I.getFunction()),
3710 DepClassTy::OPTIONAL);
3711 if (llvm::all_of(Objects, [&HS](const Value *Obj) {
3712 auto *CB = dyn_cast<CallBase>(Obj);
3713 if (!CB)
3714 return false;
3715 return HS.isAssumedHeapToStack(*CB);
3716 })) {
3717 return true;
3718 }
3719 }
3720
3721 // Insert instruction that needs guarding.
3722 SPMDCompatibilityTracker.insert(&I);
3723 return true;
3724 };
3725
3726 bool UsedAssumedInformationInCheckRWInst = false;
3727 if (!SPMDCompatibilityTracker.isAtFixpoint())
3728 if (!A.checkForAllReadWriteInstructions(
3729 CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
3730 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3731
3732 bool UsedAssumedInformationFromReachingKernels = false;
3733 if (!IsKernelEntry) {
3734 updateParallelLevels(A);
3735
3736 bool AllReachingKernelsKnown = true;
3737 updateReachingKernelEntries(A, AllReachingKernelsKnown);
3738 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
3739
3740 if (!ParallelLevels.isValidState())
3741 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3742 else if (!ReachingKernelEntries.isValidState())
3743 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3744 else if (!SPMDCompatibilityTracker.empty()) {
3745 // Check if all reaching kernels agree on the mode as we can otherwise
3746 // not guard instructions. We might not be sure about the mode so we
3747 // we cannot fix the internal spmd-zation state either.
3748 int SPMD = 0, Generic = 0;
3749 for (auto *Kernel : ReachingKernelEntries) {
3750 auto &CBAA = A.getAAFor<AAKernelInfo>(
3751 *this, IRPosition::function(*Kernel), DepClassTy::OPTIONAL);
3752 if (CBAA.SPMDCompatibilityTracker.isValidState() &&
3753 CBAA.SPMDCompatibilityTracker.isAssumed())
3754 ++SPMD;
3755 else
3756 ++Generic;
3757 if (!CBAA.SPMDCompatibilityTracker.isAtFixpoint())
3758 UsedAssumedInformationFromReachingKernels = true;
3759 }
3760 if (SPMD != 0 && Generic != 0)
3761 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3762 }
3763 }
3764
3765 // Callback to check a call instruction.
3766 bool AllParallelRegionStatesWereFixed = true;
3767 bool AllSPMDStatesWereFixed = true;
3768 auto CheckCallInst = [&](Instruction &I) {
3769 auto &CB = cast<CallBase>(I);
3770 auto &CBAA = A.getAAFor<AAKernelInfo>(
3771 *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
3772 getState() ^= CBAA.getState();
3773 AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
3774 AllParallelRegionStatesWereFixed &=
3775 CBAA.ReachedKnownParallelRegions.isAtFixpoint();
3776 AllParallelRegionStatesWereFixed &=
3777 CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
3778 return true;
3779 };
3780
3781 bool UsedAssumedInformationInCheckCallInst = false;
3782 if (!A.checkForAllCallLikeInstructions(
3783 CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
3784 LLVM_DEBUG(dbgs() << TAGdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Failed to visit all call-like instructions!\n"
;; } } while (false)
3785 << "Failed to visit all call-like instructions!\n";)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Failed to visit all call-like instructions!\n"
;; } } while (false)
;
3786 return indicatePessimisticFixpoint();
3787 }
3788
3789 // If we haven't used any assumed information for the reached parallel
3790 // region states we can fix it.
3791 if (!UsedAssumedInformationInCheckCallInst &&
3792 AllParallelRegionStatesWereFixed) {
3793 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
3794 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
3795 }
3796
3797 // If we are sure there are no parallel regions in the kernel we do not
3798 // want SPMD mode.
3799 if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
3800 ReachedKnownParallelRegions.isAtFixpoint() &&
3801 ReachedUnknownParallelRegions.isValidState() &&
3802 ReachedKnownParallelRegions.isValidState() &&
3803 !mayContainParallelRegion())
3804 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3805
3806 // If we haven't used any assumed information for the SPMD state we can fix
3807 // it.
3808 if (!UsedAssumedInformationInCheckRWInst &&
3809 !UsedAssumedInformationInCheckCallInst &&
3810 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
3811 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3812
3813 return StateBefore == getState() ? ChangeStatus::UNCHANGED
3814 : ChangeStatus::CHANGED;
3815 }
3816
3817private:
3818 /// Update info regarding reaching kernels.
3819 void updateReachingKernelEntries(Attributor &A,
3820 bool &AllReachingKernelsKnown) {
3821 auto PredCallSite = [&](AbstractCallSite ACS) {
3822 Function *Caller = ACS.getInstruction()->getFunction();
3823
3824 assert(Caller && "Caller is nullptr")(static_cast <bool> (Caller && "Caller is nullptr"
) ? void (0) : __assert_fail ("Caller && \"Caller is nullptr\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3824, __extension__
__PRETTY_FUNCTION__))
;
3825
3826 auto &CAA = A.getOrCreateAAFor<AAKernelInfo>(
3827 IRPosition::function(*Caller), this, DepClassTy::REQUIRED);
3828 if (CAA.ReachingKernelEntries.isValidState()) {
3829 ReachingKernelEntries ^= CAA.ReachingKernelEntries;
3830 return true;
3831 }
3832
3833 // We lost track of the caller of the associated function, any kernel
3834 // could reach now.
3835 ReachingKernelEntries.indicatePessimisticFixpoint();
3836
3837 return true;
3838 };
3839
3840 if (!A.checkForAllCallSites(PredCallSite, *this,
3841 true /* RequireAllCallSites */,
3842 AllReachingKernelsKnown))
3843 ReachingKernelEntries.indicatePessimisticFixpoint();
3844 }
3845
3846 /// Update info regarding parallel levels.
3847 void updateParallelLevels(Attributor &A) {
3848 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3849 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
3850 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
3851
3852 auto PredCallSite = [&](AbstractCallSite ACS) {
3853 Function *Caller = ACS.getInstruction()->getFunction();
3854
3855 assert(Caller && "Caller is nullptr")(static_cast <bool> (Caller && "Caller is nullptr"
) ? void (0) : __assert_fail ("Caller && \"Caller is nullptr\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 3855, __extension__
__PRETTY_FUNCTION__))
;
3856
3857 auto &CAA =
3858 A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));
3859 if (CAA.ParallelLevels.isValidState()) {
3860 // Any function that is called by `__kmpc_parallel_51` will not be
3861 // folded as the parallel level in the function is updated. In order to
3862 // get it right, all the analysis would depend on the implentation. That
3863 // said, if in the future any change to the implementation, the analysis
3864 // could be wrong. As a consequence, we are just conservative here.
3865 if (Caller == Parallel51RFI.Declaration) {
3866 ParallelLevels.indicatePessimisticFixpoint();
3867 return true;
3868 }
3869
3870 ParallelLevels ^= CAA.ParallelLevels;
3871
3872 return true;
3873 }
3874
3875 // We lost track of the caller of the associated function, any kernel
3876 // could reach now.
3877 ParallelLevels.indicatePessimisticFixpoint();
3878
3879 return true;
3880 };
3881
3882 bool AllCallSitesKnown = true;
3883 if (!A.checkForAllCallSites(PredCallSite, *this,
3884 true /* RequireAllCallSites */,
3885 AllCallSitesKnown))
3886 ParallelLevels.indicatePessimisticFixpoint();
3887 }
3888};
3889
3890/// The call site kernel info abstract attribute, basically, what can we say
3891/// about a call site with regards to the KernelInfoState. For now this simply
3892/// forwards the information from the callee.
3893struct AAKernelInfoCallSite : AAKernelInfo {
3894 AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
3895 : AAKernelInfo(IRP, A) {}
3896
3897 /// See AbstractAttribute::initialize(...).
3898 void initialize(Attributor &A) override {
3899 AAKernelInfo::initialize(A);
3900
3901 CallBase &CB = cast<CallBase>(getAssociatedValue());
3902 Function *Callee = getAssociatedFunction();
3903
3904 auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>(
3905 *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
3906
3907 // Check for SPMD-mode assumptions.
3908 if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) {
3909 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3910 indicateOptimisticFixpoint();
3911 }
3912
3913 // First weed out calls we do not care about, that is readonly/readnone
3914 // calls, intrinsics, and "no_openmp" calls. Neither of these can reach a
3915 // parallel region or anything else we are looking for.
3916 if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {
3917 indicateOptimisticFixpoint();
3918 return;
3919 }
3920
3921 // Next we check if we know the callee. If it is a known OpenMP function
3922 // we will handle them explicitly in the switch below. If it is not, we
3923 // will use an AAKernelInfo object on the callee to gather information and
3924 // merge that into the current state. The latter happens in the updateImpl.
3925 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3926 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
3927 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
3928 // Unknown caller or declarations are not analyzable, we give up.
3929 if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
3930
3931 // Unknown callees might contain parallel regions, except if they have
3932 // an appropriate assumption attached.
3933 if (!(AssumptionAA.hasAssumption("omp_no_openmp") ||
3934 AssumptionAA.hasAssumption("omp_no_parallelism")))
3935 ReachedUnknownParallelRegions.insert(&CB);
3936
3937 // If SPMDCompatibilityTracker is not fixed, we need to give up on the
3938 // idea we can run something unknown in SPMD-mode.
3939 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
3940 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3941 SPMDCompatibilityTracker.insert(&CB);
3942 }
3943
3944 // We have updated the state for this unknown call properly, there won't
3945 // be any change so we indicate a fixpoint.
3946 indicateOptimisticFixpoint();
3947 }
3948 // If the callee is known and can be used in IPO, we will update the state
3949 // based on the callee state in updateImpl.
3950 return;
3951 }
3952
3953 const unsigned int WrapperFunctionArgNo = 6;
3954 RuntimeFunction RF = It->getSecond();
3955 switch (RF) {
3956 // All the functions we know are compatible with SPMD mode.
3957 case OMPRTL___kmpc_is_spmd_exec_mode:
3958 case OMPRTL___kmpc_distribute_static_fini:
3959 case OMPRTL___kmpc_for_static_fini:
3960 case OMPRTL___kmpc_global_thread_num:
3961 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
3962 case OMPRTL___kmpc_get_hardware_num_blocks:
3963 case OMPRTL___kmpc_single:
3964 case OMPRTL___kmpc_end_single:
3965 case OMPRTL___kmpc_master:
3966 case OMPRTL___kmpc_end_master:
3967 case OMPRTL___kmpc_barrier:
3968 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
3969 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
3970 case OMPRTL___kmpc_nvptx_end_reduce_nowait:
3971 break;
3972 case OMPRTL___kmpc_distribute_static_init_4:
3973 case OMPRTL___kmpc_distribute_static_init_4u:
3974 case OMPRTL___kmpc_distribute_static_init_8:
3975 case OMPRTL___kmpc_distribute_static_init_8u:
3976 case OMPRTL___kmpc_for_static_init_4:
3977 case OMPRTL___kmpc_for_static_init_4u:
3978 case OMPRTL___kmpc_for_static_init_8:
3979 case OMPRTL___kmpc_for_static_init_8u: {
3980 // Check the schedule and allow static schedule in SPMD mode.
3981 unsigned ScheduleArgOpNo = 2;
3982 auto *ScheduleTypeCI =
3983 dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
3984 unsigned ScheduleTypeVal =
3985 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
3986 switch (OMPScheduleType(ScheduleTypeVal)) {
3987 case OMPScheduleType::Static:
3988 case OMPScheduleType::StaticChunked:
3989 case OMPScheduleType::Distribute:
3990 case OMPScheduleType::DistributeChunked:
3991 break;
3992 default:
3993 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3994 SPMDCompatibilityTracker.insert(&CB);
3995 break;
3996 };
3997 } break;
3998 case OMPRTL___kmpc_target_init:
3999 KernelInitCB = &CB;
4000 break;
4001 case OMPRTL___kmpc_target_deinit:
4002 KernelDeinitCB = &CB;
4003 break;
4004 case OMPRTL___kmpc_parallel_51:
4005 if (auto *ParallelRegion = dyn_cast<Function>(
4006 CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
4007 ReachedKnownParallelRegions.insert(ParallelRegion);
4008 break;
4009 }
4010 // The condition above should usually get the parallel region function
4011 // pointer and record it. In the off chance it doesn't we assume the
4012 // worst.
4013 ReachedUnknownParallelRegions.insert(&CB);
4014 break;
4015 case OMPRTL___kmpc_omp_task:
4016 // We do not look into tasks right now, just give up.
4017 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4018 SPMDCompatibilityTracker.insert(&CB);
4019 ReachedUnknownParallelRegions.insert(&CB);
4020 break;
4021 case OMPRTL___kmpc_alloc_shared:
4022 case OMPRTL___kmpc_free_shared:
4023 // Return without setting a fixpoint, to be resolved in updateImpl.
4024 return;
4025 default:
4026 // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
4027 // generally. However, they do not hide parallel regions.
4028 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4029 SPMDCompatibilityTracker.insert(&CB);
4030 break;
4031 }
4032 // All other OpenMP runtime calls will not reach parallel regions so they
4033 // can be safely ignored for now. Since it is a known OpenMP runtime call we
4034 // have now modeled all effects and there is no need for any update.
4035 indicateOptimisticFixpoint();
4036 }
4037
4038 ChangeStatus updateImpl(Attributor &A) override {
4039 // TODO: Once we have call site specific value information we can provide
4040 // call site specific liveness information and then it makes
4041 // sense to specialize attributes for call sites arguments instead of
4042 // redirecting requests to the callee argument.
4043 Function *F = getAssociatedFunction();
4044
4045 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4046 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);
4047
4048 // If F is not a runtime function, propagate the AAKernelInfo of the callee.
4049 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4050 const IRPosition &FnPos = IRPosition::function(*F);
4051 auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
4052 if (getState() == FnAA.getState())
4053 return ChangeStatus::UNCHANGED;
4054 getState() = FnAA.getState();
4055 return ChangeStatus::CHANGED;
4056 }
4057
4058 // F is a runtime function that allocates or frees memory, check
4059 // AAHeapToStack and AAHeapToShared.
4060 KernelInfoState StateBefore = getState();
4061 assert((It->getSecond() == OMPRTL___kmpc_alloc_shared ||(static_cast <bool> ((It->getSecond() == OMPRTL___kmpc_alloc_shared
|| It->getSecond() == OMPRTL___kmpc_free_shared) &&
"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"
) ? void (0) : __assert_fail ("(It->getSecond() == OMPRTL___kmpc_alloc_shared || It->getSecond() == OMPRTL___kmpc_free_shared) && \"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4063, __extension__
__PRETTY_FUNCTION__))
4062 It->getSecond() == OMPRTL___kmpc_free_shared) &&(static_cast <bool> ((It->getSecond() == OMPRTL___kmpc_alloc_shared
|| It->getSecond() == OMPRTL___kmpc_free_shared) &&
"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"
) ? void (0) : __assert_fail ("(It->getSecond() == OMPRTL___kmpc_alloc_shared || It->getSecond() == OMPRTL___kmpc_free_shared) && \"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4063, __extension__
__PRETTY_FUNCTION__))
4063 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call")(static_cast <bool> ((It->getSecond() == OMPRTL___kmpc_alloc_shared
|| It->getSecond() == OMPRTL___kmpc_free_shared) &&
"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"
) ? void (0) : __assert_fail ("(It->getSecond() == OMPRTL___kmpc_alloc_shared || It->getSecond() == OMPRTL___kmpc_free_shared) && \"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4063, __extension__
__PRETTY_FUNCTION__))
;
4064
4065 CallBase &CB = cast<CallBase>(getAssociatedValue());
4066
4067 auto &HeapToStackAA = A.getAAFor<AAHeapToStack>(
4068 *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
4069 auto &HeapToSharedAA = A.getAAFor<AAHeapToShared>(
4070 *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
4071
4072 RuntimeFunction RF = It->getSecond();
4073
4074 switch (RF) {
4075 // If neither HeapToStack nor HeapToShared assume the call is removed,
4076 // assume SPMD incompatibility.
4077 case OMPRTL___kmpc_alloc_shared:
4078 if (!HeapToStackAA.isAssumedHeapToStack(CB) &&
4079 !HeapToSharedAA.isAssumedHeapToShared(CB))
4080 SPMDCompatibilityTracker.insert(&CB);
4081 break;
4082 case OMPRTL___kmpc_free_shared:
4083 if (!HeapToStackAA.isAssumedHeapToStackRemovedFree(CB) &&
4084 !HeapToSharedAA.isAssumedHeapToSharedRemovedFree(CB))
4085 SPMDCompatibilityTracker.insert(&CB);
4086 break;
4087 default:
4088 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4089 SPMDCompatibilityTracker.insert(&CB);
4090 }
4091
4092 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4093 : ChangeStatus::CHANGED;
4094 }
4095};
4096
4097struct AAFoldRuntimeCall
4098 : public StateWrapper<BooleanState, AbstractAttribute> {
4099 using Base = StateWrapper<BooleanState, AbstractAttribute>;
4100
4101 AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
4102
4103 /// Statistics are tracked as part of manifest for now.
4104 void trackStatistics() const override {}
4105
4106 /// Create an abstract attribute biew for the position \p IRP.
4107 static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
4108 Attributor &A);
4109
4110 /// See AbstractAttribute::getName()
4111 const std::string getName() const override { return "AAFoldRuntimeCall"; }
4112
4113 /// See AbstractAttribute::getIdAddr()
4114 const char *getIdAddr() const override { return &ID; }
4115
4116 /// This function should return true if the type of the \p AA is
4117 /// AAFoldRuntimeCall
4118 static bool classof(const AbstractAttribute *AA) {
4119 return (AA->getIdAddr() == &ID);
4120 }
4121
4122 static const char ID;
4123};
4124
4125struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
4126 AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
4127 : AAFoldRuntimeCall(IRP, A) {}
4128
4129 /// See AbstractAttribute::getAsStr()
4130 const std::string getAsStr() const override {
4131 if (!isValidState())
4132 return "<invalid>";
4133
4134 std::string Str("simplified value: ");
4135
4136 if (!SimplifiedValue.hasValue())
4137 return Str + std::string("none");
4138
4139 if (!SimplifiedValue.getValue())
4140 return Str + std::string("nullptr");
4141
4142 if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.getValue()))
4143 return Str + std::to_string(CI->getSExtValue());
4144
4145 return Str + std::string("unknown");
4146 }
4147
4148 void initialize(Attributor &A) override {
4149 if (DisableOpenMPOptFolding)
4150 indicatePessimisticFixpoint();
4151
4152 Function *Callee = getAssociatedFunction();
4153
4154 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4155 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4156 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&(static_cast <bool> (It != OMPInfoCache.RuntimeFunctionIDMap
.end() && "Expected a known OpenMP runtime function")
? void (0) : __assert_fail ("It != OMPInfoCache.RuntimeFunctionIDMap.end() && \"Expected a known OpenMP runtime function\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4157, __extension__
__PRETTY_FUNCTION__))
4157 "Expected a known OpenMP runtime function")(static_cast <bool> (It != OMPInfoCache.RuntimeFunctionIDMap
.end() && "Expected a known OpenMP runtime function")
? void (0) : __assert_fail ("It != OMPInfoCache.RuntimeFunctionIDMap.end() && \"Expected a known OpenMP runtime function\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4157, __extension__
__PRETTY_FUNCTION__))
;
4158
4159 RFKind = It->getSecond();
4160
4161 CallBase &CB = cast<CallBase>(getAssociatedValue());
4162 A.registerSimplificationCallback(
4163 IRPosition::callsite_returned(CB),
4164 [&](const IRPosition &IRP, const AbstractAttribute *AA,
4165 bool &UsedAssumedInformation) -> Optional<Value *> {
4166 assert((isValidState() || (SimplifiedValue.hasValue() &&(static_cast <bool> ((isValidState() || (SimplifiedValue
.hasValue() && SimplifiedValue.getValue() == nullptr)
) && "Unexpected invalid state!") ? void (0) : __assert_fail
("(isValidState() || (SimplifiedValue.hasValue() && SimplifiedValue.getValue() == nullptr)) && \"Unexpected invalid state!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4168, __extension__
__PRETTY_FUNCTION__))
4167 SimplifiedValue.getValue() == nullptr)) &&(static_cast <bool> ((isValidState() || (SimplifiedValue
.hasValue() && SimplifiedValue.getValue() == nullptr)
) && "Unexpected invalid state!") ? void (0) : __assert_fail
("(isValidState() || (SimplifiedValue.hasValue() && SimplifiedValue.getValue() == nullptr)) && \"Unexpected invalid state!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4168, __extension__
__PRETTY_FUNCTION__))
4168 "Unexpected invalid state!")(static_cast <bool> ((isValidState() || (SimplifiedValue
.hasValue() && SimplifiedValue.getValue() == nullptr)
) && "Unexpected invalid state!") ? void (0) : __assert_fail
("(isValidState() || (SimplifiedValue.hasValue() && SimplifiedValue.getValue() == nullptr)) && \"Unexpected invalid state!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4168, __extension__
__PRETTY_FUNCTION__))
;
4169
4170 if (!isAtFixpoint()) {
4171 UsedAssumedInformation = true;
4172 if (AA)
4173 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
4174 }
4175 return SimplifiedValue;
4176 });
4177 }
4178
4179 ChangeStatus updateImpl(Attributor &A) override {
4180 ChangeStatus Changed = ChangeStatus::UNCHANGED;
4181 switch (RFKind) {
4182 case OMPRTL___kmpc_is_spmd_exec_mode:
4183 Changed |= foldIsSPMDExecMode(A);
4184 break;
4185 case OMPRTL___kmpc_is_generic_main_thread_id:
4186 Changed |= foldIsGenericMainThread(A);
4187 break;
4188 case OMPRTL___kmpc_parallel_level:
4189 Changed |= foldParallelLevel(A);
4190 break;
4191 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4192 Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
4193 break;
4194 case OMPRTL___kmpc_get_hardware_num_blocks:
4195 Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
4196 break;
4197 default:
4198 llvm_unreachable("Unhandled OpenMP runtime function!")::llvm::llvm_unreachable_internal("Unhandled OpenMP runtime function!"
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4198)
;
4199 }
4200
4201 return Changed;
4202 }
4203
4204 ChangeStatus manifest(Attributor &A) override {
4205 ChangeStatus Changed = ChangeStatus::UNCHANGED;
4206
4207 if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) {
4208 Instruction &I = *getCtxI();
4209 A.changeValueAfterManifest(I, **SimplifiedValue);
4210 A.deleteAfterManifest(I);
4211
4212 CallBase *CB = dyn_cast<CallBase>(&I);
4213 auto Remark = [&](OptimizationRemark OR) {
4214 if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
4215 return OR << "Replacing OpenMP runtime call "
4216 << CB->getCalledFunction()->getName() << " with "
4217 << ore::NV("FoldedValue", C->getZExtValue()) << ".";
4218 return OR << "Replacing OpenMP runtime call "
4219 << CB->getCalledFunction()->getName() << ".";
4220 };
4221
4222 if (CB && EnableVerboseRemarks)
4223 A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark);
4224
4225 LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replacing runtime call: "
<< I << " with " << **SimplifiedValue <<
"\n"; } } while (false)
4226 << **SimplifiedValue << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replacing runtime call: "
<< I << " with " << **SimplifiedValue <<
"\n"; } } while (false)
;
4227
4228 Changed = ChangeStatus::CHANGED;
4229 }
4230
4231 return Changed;
4232 }
4233
4234 ChangeStatus indicatePessimisticFixpoint() override {
4235 SimplifiedValue = nullptr;
4236 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
4237 }
4238
4239private:
4240 /// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
4241 ChangeStatus foldIsSPMDExecMode(Attributor &A) {
4242 Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4243
4244 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
4245 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
4246 auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4247 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4248
4249 if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4250 return indicatePessimisticFixpoint();
4251
4252 for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4253 auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
4254 DepClassTy::REQUIRED);
4255
4256 if (!AA.isValidState()) {
4257 SimplifiedValue = nullptr;
4258 return indicatePessimisticFixpoint();
4259 }
4260
4261 if (AA.SPMDCompatibilityTracker.isAssumed()) {
4262 if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4263 ++KnownSPMDCount;
4264 else
4265 ++AssumedSPMDCount;
4266 } else {
4267 if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4268 ++KnownNonSPMDCount;
4269 else
4270 ++AssumedNonSPMDCount;
4271 }
4272 }
4273
4274 if ((AssumedSPMDCount + KnownSPMDCount) &&
4275 (AssumedNonSPMDCount + KnownNonSPMDCount))
4276 return indicatePessimisticFixpoint();
4277
4278 auto &Ctx = getAnchorValue().getContext();
4279 if (KnownSPMDCount || AssumedSPMDCount) {
4280 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&(static_cast <bool> (KnownNonSPMDCount == 0 && AssumedNonSPMDCount
== 0 && "Expected only SPMD kernels!") ? void (0) : __assert_fail
("KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && \"Expected only SPMD kernels!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4281, __extension__
__PRETTY_FUNCTION__))
4281 "Expected only SPMD kernels!")(static_cast <bool> (KnownNonSPMDCount == 0 && AssumedNonSPMDCount
== 0 && "Expected only SPMD kernels!") ? void (0) : __assert_fail
("KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && \"Expected only SPMD kernels!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4281, __extension__
__PRETTY_FUNCTION__))
;
4282 // All reaching kernels are in SPMD mode. Update all function calls to
4283 // __kmpc_is_spmd_exec_mode to 1.
4284 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
4285 } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
4286 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&(static_cast <bool> (KnownSPMDCount == 0 && AssumedSPMDCount
== 0 && "Expected only non-SPMD kernels!") ? void (0
) : __assert_fail ("KnownSPMDCount == 0 && AssumedSPMDCount == 0 && \"Expected only non-SPMD kernels!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4287, __extension__
__PRETTY_FUNCTION__))
4287 "Expected only non-SPMD kernels!")(static_cast <bool> (KnownSPMDCount == 0 && AssumedSPMDCount
== 0 && "Expected only non-SPMD kernels!") ? void (0
) : __assert_fail ("KnownSPMDCount == 0 && AssumedSPMDCount == 0 && \"Expected only non-SPMD kernels!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4287, __extension__
__PRETTY_FUNCTION__))
;
4288 // All reaching kernels are in non-SPMD mode. Update all function
4289 // calls to __kmpc_is_spmd_exec_mode to 0.
4290 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
4291 } else {
4292 // We have empty reaching kernels, therefore we cannot tell if the
4293 // associated call site can be folded. At this moment, SimplifiedValue
4294 // must be none.
4295 assert(!SimplifiedValue.hasValue() && "SimplifiedValue should be none")(static_cast <bool> (!SimplifiedValue.hasValue() &&
"SimplifiedValue should be none") ? void (0) : __assert_fail
("!SimplifiedValue.hasValue() && \"SimplifiedValue should be none\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4295, __extension__
__PRETTY_FUNCTION__))
;
4296 }
4297
4298 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4299 : ChangeStatus::CHANGED;
4300 }
4301
4302 /// Fold __kmpc_is_generic_main_thread_id into a constant if possible.
4303 ChangeStatus foldIsGenericMainThread(Attributor &A) {
4304 Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4305
4306 CallBase &CB = cast<CallBase>(getAssociatedValue());
4307 Function *F = CB.getFunction();
4308 const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
4309 *this, IRPosition::function(*F), DepClassTy::REQUIRED);
4310
4311 if (!ExecutionDomainAA.isValidState())
4312 return indicatePessimisticFixpoint();
4313
4314 auto &Ctx = getAnchorValue().getContext();
4315 if (ExecutionDomainAA.isExecutedByInitialThreadOnly(CB))
4316 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
4317 else
4318 return indicatePessimisticFixpoint();
4319
4320 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4321 : ChangeStatus::CHANGED;
4322 }
4323
4324 /// Fold __kmpc_parallel_level into a constant if possible.
4325 ChangeStatus foldParallelLevel(Attributor &A) {
4326 Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4327
4328 auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4329 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4330
4331 if (!CallerKernelInfoAA.ParallelLevels.isValidState())
4332 return indicatePessimisticFixpoint();
4333
4334 if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4335 return indicatePessimisticFixpoint();
4336
4337 if (CallerKernelInfoAA.ReachingKernelEntries.empty()) {
4338 assert(!SimplifiedValue.hasValue() &&(static_cast <bool> (!SimplifiedValue.hasValue() &&
"SimplifiedValue should keep none at this point") ? void (0)
: __assert_fail ("!SimplifiedValue.hasValue() && \"SimplifiedValue should keep none at this point\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4339, __extension__
__PRETTY_FUNCTION__))
4339 "SimplifiedValue should keep none at this point")(static_cast <bool> (!SimplifiedValue.hasValue() &&
"SimplifiedValue should keep none at this point") ? void (0)
: __assert_fail ("!SimplifiedValue.hasValue() && \"SimplifiedValue should keep none at this point\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4339, __extension__
__PRETTY_FUNCTION__))
;
4340 return ChangeStatus::UNCHANGED;
4341 }
4342
4343 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
4344 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
4345 for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4346 auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
4347 DepClassTy::REQUIRED);
4348 if (!AA.SPMDCompatibilityTracker.isValidState())
4349 return indicatePessimisticFixpoint();
4350
4351 if (AA.SPMDCompatibilityTracker.isAssumed()) {
4352 if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4353 ++KnownSPMDCount;
4354 else
4355 ++AssumedSPMDCount;
4356 } else {
4357 if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4358 ++KnownNonSPMDCount;
4359 else
4360 ++AssumedNonSPMDCount;
4361 }
4362 }
4363
4364 if ((AssumedSPMDCount + KnownSPMDCount) &&
4365 (AssumedNonSPMDCount + KnownNonSPMDCount))
4366 return indicatePessimisticFixpoint();
4367
4368 auto &Ctx = getAnchorValue().getContext();
4369 // If the caller can only be reached by SPMD kernel entries, the parallel
4370 // level is 1. Similarly, if the caller can only be reached by non-SPMD
4371 // kernel entries, it is 0.
4372 if (AssumedSPMDCount || KnownSPMDCount) {
4373 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&(static_cast <bool> (KnownNonSPMDCount == 0 && AssumedNonSPMDCount
== 0 && "Expected only SPMD kernels!") ? void (0) : __assert_fail
("KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && \"Expected only SPMD kernels!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4374, __extension__
__PRETTY_FUNCTION__))
4374 "Expected only SPMD kernels!")(static_cast <bool> (KnownNonSPMDCount == 0 && AssumedNonSPMDCount
== 0 && "Expected only SPMD kernels!") ? void (0) : __assert_fail
("KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && \"Expected only SPMD kernels!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4374, __extension__
__PRETTY_FUNCTION__))
;
4375 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
4376 } else {
4377 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&(static_cast <bool> (KnownSPMDCount == 0 && AssumedSPMDCount
== 0 && "Expected only non-SPMD kernels!") ? void (0
) : __assert_fail ("KnownSPMDCount == 0 && AssumedSPMDCount == 0 && \"Expected only non-SPMD kernels!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4378, __extension__
__PRETTY_FUNCTION__))
4378 "Expected only non-SPMD kernels!")(static_cast <bool> (KnownSPMDCount == 0 && AssumedSPMDCount
== 0 && "Expected only non-SPMD kernels!") ? void (0
) : __assert_fail ("KnownSPMDCount == 0 && AssumedSPMDCount == 0 && \"Expected only non-SPMD kernels!\""
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4378, __extension__
__PRETTY_FUNCTION__))
;
4379 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
4380 }
4381 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4382 : ChangeStatus::CHANGED;
4383 }
4384
4385 ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
4386 // Specialize only if all the calls agree with the attribute constant value
4387 int32_t CurrentAttrValue = -1;
4388 Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4389
4390 auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4391 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4392
4393 if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4394 return indicatePessimisticFixpoint();
4395
4396 // Iterate over the kernels that reach this function
4397 for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4398 int32_t NextAttrVal = -1;
4399 if (K->hasFnAttribute(Attr))
4400 NextAttrVal =
4401 std::stoi(K->getFnAttribute(Attr).getValueAsString().str());
4402
4403 if (NextAttrVal == -1 ||
4404 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
4405 return indicatePessimisticFixpoint();
4406 CurrentAttrValue = NextAttrVal;
4407 }
4408
4409 if (CurrentAttrValue != -1) {
4410 auto &Ctx = getAnchorValue().getContext();
4411 SimplifiedValue =
4412 ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
4413 }
4414 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4415 : ChangeStatus::CHANGED;
4416 }
4417
4418 /// An optional value the associated value is assumed to fold to. That is, we
4419 /// assume the associated value (which is a call) can be replaced by this
4420 /// simplified value.
4421 Optional<Value *> SimplifiedValue;
4422
4423 /// The runtime function kind of the callee of the associated call site.
4424 RuntimeFunction RFKind;
4425};
4426
4427} // namespace
4428
4429/// Register folding callsite
4430void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
4431 auto &RFI = OMPInfoCache.RFIs[RF];
4432 RFI.foreachUse(SCC, [&](Use &U, Function &F) {
4433 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
4434 if (!CI)
4435 return false;
4436 A.getOrCreateAAFor<AAFoldRuntimeCall>(
4437 IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
4438 DepClassTy::NONE, /* ForceUpdate */ false,
4439 /* UpdateAfterInit */ false);
4440 return false;
4441 });
4442}
4443
4444void OpenMPOpt::registerAAs(bool IsModulePass) {
4445 if (SCC.empty())
4446
4447 return;
4448 if (IsModulePass) {
4449 // Ensure we create the AAKernelInfo AAs first and without triggering an
4450 // update. This will make sure we register all value simplification
4451 // callbacks before any other AA has the chance to create an AAValueSimplify
4452 // or similar.
4453 for (Function *Kernel : OMPInfoCache.Kernels)
4454 A.getOrCreateAAFor<AAKernelInfo>(
4455 IRPosition::function(*Kernel), /* QueryingAA */ nullptr,
4456 DepClassTy::NONE, /* ForceUpdate */ false,
4457 /* UpdateAfterInit */ false);
4458
4459 registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id);
4460 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
4461 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
4462 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
4463 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
4464 }
4465
4466 // Create CallSite AA for all Getters.
4467 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
4468 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
4469
4470 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
4471
4472 auto CreateAA = [&](Use &U, Function &Caller) {
4473 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
4474 if (!CI)
4475 return false;
4476
4477 auto &CB = cast<CallBase>(*CI);
4478
4479 IRPosition CBPos = IRPosition::callsite_function(CB);
4480 A.getOrCreateAAFor<AAICVTracker>(CBPos);
4481 return false;
4482 };
4483
4484 GetterRFI.foreachUse(SCC, CreateAA);
4485 }
4486 auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4487 auto CreateAA = [&](Use &U, Function &F) {
4488 A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
4489 return false;
4490 };
4491 if (!DisableOpenMPOptDeglobalization)
4492 GlobalizationRFI.foreachUse(SCC, CreateAA);
4493
4494 // Create an ExecutionDomain AA for every function and a HeapToStack AA for
4495 // every function if there is a device kernel.
4496 if (!isOpenMPDevice(M))
4497 return;
4498
4499 for (auto *F : SCC) {
4500 if (F->isDeclaration())
4501 continue;
4502
4503 A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
4504 if (!DisableOpenMPOptDeglobalization)
4505 A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
4506
4507 for (auto &I : instructions(*F)) {
4508 if (auto *LI = dyn_cast<LoadInst>(&I)) {
4509 bool UsedAssumedInformation = false;
4510 A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
4511 UsedAssumedInformation);
4512 } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
4513 A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
4514 }
4515 }
4516 }
4517}
4518
// Out-of-line definitions of the static `ID` members of the OpenMP-specific
// abstract attribute classes.
// NOTE(review): presumably only the address of each ID is significant (as a
// unique identifier for the AA class), not its value - confirm against
// Attributor.h.
4519const char AAICVTracker::ID = 0;
4520const char AAKernelInfo::ID = 0;
4521const char AAExecutionDomain::ID = 0;
4522const char AAHeapToShared::ID = 0;
4523const char AAFoldRuntimeCall::ID = 0;
4524
4525AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
4526 Attributor &A) {
4527 AAICVTracker *AA = nullptr;
4528 switch (IRP.getPositionKind()) {
4529 case IRPosition::IRP_INVALID:
4530 case IRPosition::IRP_FLOAT:
4531 case IRPosition::IRP_ARGUMENT:
4532 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4533 llvm_unreachable("ICVTracker can only be created for function position!")::llvm::llvm_unreachable_internal("ICVTracker can only be created for function position!"
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4533)
;
4534 case IRPosition::IRP_RETURNED:
4535 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
4536 break;
4537 case IRPosition::IRP_CALL_SITE_RETURNED:
4538 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
4539 break;
4540 case IRPosition::IRP_CALL_SITE:
4541 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
4542 break;
4543 case IRPosition::IRP_FUNCTION:
4544 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
4545 break;
4546 }
4547
4548 return *AA;
4549}
4550
4551AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
4552 Attributor &A) {
4553 AAExecutionDomainFunction *AA = nullptr;
4554 switch (IRP.getPositionKind()) {
4555 case IRPosition::IRP_INVALID:
4556 case IRPosition::IRP_FLOAT:
4557 case IRPosition::IRP_ARGUMENT:
4558 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4559 case IRPosition::IRP_RETURNED:
4560 case IRPosition::IRP_CALL_SITE_RETURNED:
4561 case IRPosition::IRP_CALL_SITE:
4562 llvm_unreachable(::llvm::llvm_unreachable_internal("AAExecutionDomain can only be created for function position!"
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4563)
4563 "AAExecutionDomain can only be created for function position!")::llvm::llvm_unreachable_internal("AAExecutionDomain can only be created for function position!"
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4563)
;
4564 case IRPosition::IRP_FUNCTION:
4565 AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
4566 break;
4567 }
4568
4569 return *AA;
4570}
4571
4572AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
4573 Attributor &A) {
4574 AAHeapToSharedFunction *AA = nullptr;
4575 switch (IRP.getPositionKind()) {
4576 case IRPosition::IRP_INVALID:
4577 case IRPosition::IRP_FLOAT:
4578 case IRPosition::IRP_ARGUMENT:
4579 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4580 case IRPosition::IRP_RETURNED:
4581 case IRPosition::IRP_CALL_SITE_RETURNED:
4582 case IRPosition::IRP_CALL_SITE:
4583 llvm_unreachable(::llvm::llvm_unreachable_internal("AAHeapToShared can only be created for function position!"
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4584)
4584 "AAHeapToShared can only be created for function position!")::llvm::llvm_unreachable_internal("AAHeapToShared can only be created for function position!"
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4584)
;
4585 case IRPosition::IRP_FUNCTION:
4586 AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
4587 break;
4588 }
4589
4590 return *AA;
4591}
4592
4593AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
4594 Attributor &A) {
4595 AAKernelInfo *AA = nullptr;
4596 switch (IRP.getPositionKind()) {
4597 case IRPosition::IRP_INVALID:
4598 case IRPosition::IRP_FLOAT:
4599 case IRPosition::IRP_ARGUMENT:
4600 case IRPosition::IRP_RETURNED:
4601 case IRPosition::IRP_CALL_SITE_RETURNED:
4602 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4603 llvm_unreachable("KernelInfo can only be created for function position!")::llvm::llvm_unreachable_internal("KernelInfo can only be created for function position!"
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4603)
;
4604 case IRPosition::IRP_CALL_SITE:
4605 AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
4606 break;
4607 case IRPosition::IRP_FUNCTION:
4608 AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
4609 break;
4610 }
4611
4612 return *AA;
4613}
4614
4615AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
4616 Attributor &A) {
4617 AAFoldRuntimeCall *AA = nullptr;
4618 switch (IRP.getPositionKind()) {
4619 case IRPosition::IRP_INVALID:
4620 case IRPosition::IRP_FLOAT:
4621 case IRPosition::IRP_ARGUMENT:
4622 case IRPosition::IRP_RETURNED:
4623 case IRPosition::IRP_FUNCTION:
4624 case IRPosition::IRP_CALL_SITE:
4625 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4626 llvm_unreachable("KernelInfo can only be created for call site position!")::llvm::llvm_unreachable_internal("KernelInfo can only be created for call site position!"
, "llvm/lib/Transforms/IPO/OpenMPOpt.cpp", 4626)
;
4627 case IRPosition::IRP_CALL_SITE_RETURNED:
4628 AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
4629 break;
4630 }
4631
4632 return *AA;
4633}
4634
4635PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
4636 if (!containsOpenMP(M))
4637 return PreservedAnalyses::all();
4638 if (DisableOpenMPOptimizations)
4639 return PreservedAnalyses::all();
4640
4641 FunctionAnalysisManager &FAM =
4642 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4643 KernelSet Kernels = getDeviceKernels(M);
4644
4645 auto IsCalled = [&](Function &F) {
4646 if (Kernels.contains(&F))
4647 return true;
4648 for (const User *U : F.users())
4649 if (!isa<BlockAddress>(U))
4650 return true;
4651 return false;
4652 };
4653
4654 auto EmitRemark = [&](Function &F) {
4655 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
4656 ORE.emit([&]() {
4657 OptimizationRemarkAnalysis ORA(DEBUG_TYPE"openmp-opt", "OMP140", &F);
4658 return ORA << "Could not internalize function. "
4659 << "Some optimizations may not be possible. [OMP140]";
4660 });
4661 };
4662
4663 // Create internal copies of each function if this is a kernel Module. This
4664 // allows iterprocedural passes to see every call edge.
4665 DenseMap<Function *, Function *> InternalizedMap;
4666 if (isOpenMPDevice(M)) {
4667 SmallPtrSet<Function *, 16> InternalizeFns;
4668 for (Function &F : M)
4669 if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
4670 !DisableInternalization) {
4671 if (Attributor::isInternalizable(F)) {
4672 InternalizeFns.insert(&F);
4673 } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
4674 EmitRemark(F);
4675 }
4676 }
4677
4678 Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
4679 }
4680
4681 // Look at every function in the Module unless it was internalized.
4682 SmallVector<Function *, 16> SCC;
4683 for (Function &F : M)
4684 if (!F.isDeclaration() && !InternalizedMap.lookup(&F))
4685 SCC.push_back(&F);
4686
4687 if (SCC.empty())
4688 return PreservedAnalyses::all();
4689
4690 AnalysisGetter AG(FAM);
4691
4692 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
4693 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4694 };
4695
4696 BumpPtrAllocator Allocator;
4697 CallGraphUpdater CGUpdater;
4698
4699 SetVector<Function *> Functions(SCC.begin(), SCC.end());
4700 OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels);
4701
4702 unsigned MaxFixpointIterations =
4703 (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4704 Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false,
4705 MaxFixpointIterations, OREGetter, DEBUG_TYPE"openmp-opt");
4706
4707 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4708 bool Changed = OMPOpt.run(true);
4709
4710 // Optionally inline device functions for potentially better performance.
4711 if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))
4712 for (Function &F : M)
4713 if (!F.isDeclaration() && !Kernels.contains(&F) &&
4714 !F.hasFnAttribute(Attribute::NoInline))
4715 F.addFnAttr(Attribute::AlwaysInline);
4716
4717 if (PrintModuleAfterOptimizations)
4718 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n"
<< M; } } while (false)
;
4719
4720 if (Changed)
4721 return PreservedAnalyses::none();
4722
4723 return PreservedAnalyses::all();
4724}
4725
4726PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
4727 CGSCCAnalysisManager &AM,
4728 LazyCallGraph &CG,
4729 CGSCCUpdateResult &UR) {
4730 if (!containsOpenMP(*C.begin()->getFunction().getParent()))
4731 return PreservedAnalyses::all();
4732 if (DisableOpenMPOptimizations)
4733 return PreservedAnalyses::all();
4734
4735 SmallVector<Function *, 16> SCC;
4736 // If there are kernels in the module, we have to run on all SCC's.
4737 for (LazyCallGraph::Node &N : C) {
4738 Function *Fn = &N.getFunction();
4739 SCC.push_back(Fn);
4740 }
4741
4742 if (SCC.empty())
4743 return PreservedAnalyses::all();
4744
4745 Module &M = *C.begin()->getFunction().getParent();
4746
4747 KernelSet Kernels = getDeviceKernels(M);
4748
4749 FunctionAnalysisManager &FAM =
4750 AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
4751
4752 AnalysisGetter AG(FAM);
4753
4754 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
4755 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4756 };
4757
4758 BumpPtrAllocator Allocator;
4759 CallGraphUpdater CGUpdater;
4760 CGUpdater.initialize(CG, C, AM, UR);
4761
4762 SetVector<Function *> Functions(SCC.begin(), SCC.end());
4763 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
4764 /*CGSCC*/ Functions, Kernels);
4765
4766 unsigned MaxFixpointIterations =
4767 (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4768 Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
4769 MaxFixpointIterations, OREGetter, DEBUG_TYPE"openmp-opt");
4770
4771 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4772 bool Changed = OMPOpt.run(false);
4773
4774 if (PrintModuleAfterOptimizations)
4775 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n"
<< M; } } while (false)
;
4776
4777 if (Changed)
4778 return PreservedAnalyses::none();
4779
4780 return PreservedAnalyses::all();
4781}
4782
4783namespace {
4784
4785struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass {
4786 CallGraphUpdater CGUpdater;
4787 static char ID;
4788
4789 OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) {
4790 initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
4791 }
4792
4793 void getAnalysisUsage(AnalysisUsage &AU) const override {
4794 CallGraphSCCPass::getAnalysisUsage(AU);
4795 }
4796
4797 bool runOnSCC(CallGraphSCC &CGSCC) override {
4798 if (!containsOpenMP(CGSCC.getCallGraph().getModule()))
4799 return false;
4800 if (DisableOpenMPOptimizations || skipSCC(CGSCC))
4801 return false;
4802
4803 SmallVector<Function *, 16> SCC;
4804 // If there are kernels in the module, we have to run on all SCC's.
4805 for (CallGraphNode *CGN : CGSCC) {
4806 Function *Fn = CGN->getFunction();
4807 if (!Fn || Fn->isDeclaration())
4808 continue;
4809 SCC.push_back(Fn);
4810 }
4811
4812 if (SCC.empty())
4813 return false;
4814
4815 Module &M = CGSCC.getCallGraph().getModule();
4816 KernelSet Kernels = getDeviceKernels(M);
4817
4818 CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
4819 CGUpdater.initialize(CG, CGSCC);
4820
4821 // Maintain a map of functions to avoid rebuilding the ORE
4822 DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
4823 auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
4824 std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
4825 if (!ORE)
4826 ORE = std::make_unique<OptimizationRemarkEmitter>(F);
4827 return *ORE;
4828 };
4829
4830 AnalysisGetter AG;
4831 SetVector<Function *> Functions(SCC.begin(), SCC.end());
4832 BumpPtrAllocator Allocator;
4833 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG,
4834 Allocator,
4835 /*CGSCC*/ Functions, Kernels);
4836
4837 unsigned MaxFixpointIterations =
4838 (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4839 Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
4840 MaxFixpointIterations, OREGetter, DEBUG_TYPE"openmp-opt");
4841
4842 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4843 bool Result = OMPOpt.run(false);
4844
4845 if (PrintModuleAfterOptimizations)
4846 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n"
<< M; } } while (false)
;
4847
4848 return Result;
4849 }
4850
4851 bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
4852};
4853
4854} // end anonymous namespace
4855
4856KernelSet llvm::omp::getDeviceKernels(Module &M) {
4857 // TODO: Create a more cross-platform way of determining device kernels.
4858 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4859 KernelSet Kernels;
4860
4861 if (!MD)
4862 return Kernels;
4863
4864 for (auto *Op : MD->operands()) {
4865 if (Op->getNumOperands() < 2)
4866 continue;
4867 MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
4868 if (!KindID || KindID->getString() != "kernel")
4869 continue;
4870
4871 Function *KernelFn =
4872 mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
4873 if (!KernelFn)
4874 continue;
4875
4876 ++NumOpenMPTargetRegionKernels;
4877
4878 Kernels.insert(KernelFn);
4879 }
4880
4881 return Kernels;
4882}
4883
4884bool llvm::omp::containsOpenMP(Module &M) {
4885 Metadata *MD = M.getModuleFlag("openmp");
4886 if (!MD)
4887 return false;
4888
4889 return true;
4890}
4891
4892bool llvm::omp::isOpenMPDevice(Module &M) {
4893 Metadata *MD = M.getModuleFlag("openmp-device");
4894 if (!MD)
4895 return false;
4896
4897 return true;
4898}
4899
4900char OpenMPOptCGSCCLegacyPass::ID = 0;
4901
4902INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",static void *initializeOpenMPOptCGSCCLegacyPassPassOnce(PassRegistry
&Registry) {
4903 "OpenMP specific optimizations", false, false)static void *initializeOpenMPOptCGSCCLegacyPassPassOnce(PassRegistry
&Registry) {
4904INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)initializeCallGraphWrapperPassPass(Registry);
4905INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",PassInfo *PI = new PassInfo( "OpenMP specific optimizations",
"openmp-opt-cgscc", &OpenMPOptCGSCCLegacyPass::ID, PassInfo
::NormalCtor_t(callDefaultCtor<OpenMPOptCGSCCLegacyPass>
), false, false); Registry.registerPass(*PI, true); return PI
; } static llvm::once_flag InitializeOpenMPOptCGSCCLegacyPassPassFlag
; void llvm::initializeOpenMPOptCGSCCLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeOpenMPOptCGSCCLegacyPassPassFlag
, initializeOpenMPOptCGSCCLegacyPassPassOnce, std::ref(Registry
)); }
4906 "OpenMP specific optimizations", false, false)PassInfo *PI = new PassInfo( "OpenMP specific optimizations",
"openmp-opt-cgscc", &OpenMPOptCGSCCLegacyPass::ID, PassInfo
::NormalCtor_t(callDefaultCtor<OpenMPOptCGSCCLegacyPass>
), false, false); Registry.registerPass(*PI, true); return PI
; } static llvm::once_flag InitializeOpenMPOptCGSCCLegacyPassPassFlag
; void llvm::initializeOpenMPOptCGSCCLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeOpenMPOptCGSCCLegacyPassPassFlag
, initializeOpenMPOptCGSCCLegacyPassPassOnce, std::ref(Registry
)); }
4907
4908Pass *llvm::createOpenMPOptCGSCCLegacyPass() {
4909 return new OpenMPOptCGSCCLegacyPass();
4910}

/build/llvm-toolchain-snapshot-14~++20220108111521+9345ab3a4550/llvm/include/llvm/Transforms/IPO/Attributor.h

1//===- Attributor.h --- Module-wide attribute deduction ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Attributor: An inter procedural (abstract) "attribute" deduction framework.
10//
11// The Attributor framework is an inter procedural abstract analysis (fixpoint
12// iteration analysis). The goal is to allow easy deduction of new attributes as
13// well as information exchange between abstract attributes in-flight.
14//
15// The Attributor class is the driver and the link between the various abstract
16// attributes. The Attributor will iterate until a fixpoint state is reached by
17// all abstract attributes in-flight, or until it will enforce a pessimistic fix
18// point because an iteration limit is reached.
19//
20// Abstract attributes, derived from the AbstractAttribute class, actually
21// describe properties of the code. They can correspond to actual LLVM-IR
22// attributes, or they can be more general, ultimately unrelated to LLVM-IR
23// attributes. The latter is useful when an abstract attribute provides
24// information to other abstract attributes in-flight but we might not want to
25// manifest the information. The Attributor allows querying in-flight abstract
26// attributes through the `Attributor::getAAFor` method (see the method
27// description for an example). If the method is used by an abstract attribute
28// P, and it results in an abstract attribute Q, the Attributor will
29// automatically capture a potential dependence from Q to P. This dependence
30// will cause P to be reevaluated whenever Q changes in the future.
31//
32// The Attributor will only reevaluate abstract attributes that might have
33// changed since the last iteration. That means that the Attributor will not
34// revisit all instructions/blocks/functions in the module but only query
35// an update from a subset of the abstract attributes.
36//
37// The update method `AbstractAttribute::updateImpl` is implemented by the
38// specific "abstract attribute" subclasses. The method is invoked whenever the
39// currently assumed state (see the AbstractState class) might not be valid
40// anymore. This can, for example, happen if the state was dependent on another
41// abstract attribute that changed. In every invocation, the update method has
42// to adjust the internal state of an abstract attribute to a point that is
43// justifiable by the underlying IR and the current state of abstract attributes
44// in-flight. Since the IR is given and assumed to be valid, the information
45// derived from it can be assumed to hold. However, information derived from
46// other abstract attributes is conditional on various things. If the justifying
47// state changed, the `updateImpl` has to revisit the situation and potentially
48// find another justification or limit the optimistic assumptions made.
49//
50// Change is the key in this framework. Until a state of no-change, thus a
51// fixpoint, is reached, the Attributor will query the abstract attributes
52// in-flight to re-evaluate their state. If the (current) state is too
53// optimistic, hence it cannot be justified anymore through other abstract
54// attributes or the state of the IR, the state of the abstract attribute will
55// have to change. Generally, we assume abstract attribute state to be a finite
56// height lattice and the update function to be monotone. However, these
57// conditions are not enforced because the iteration limit will guarantee
58// termination. If an optimistic fixpoint is reached, or a pessimistic fix
59// point is enforced after a timeout, the abstract attributes are tasked to
60// manifest their result in the IR for passes to come.
61//
62// Attribute manifestation is not mandatory. If desired, there is support to
63// generate a single or multiple LLVM-IR attributes already in the helper struct
64// IRAttribute. In the simplest case, a subclass inherits from IRAttribute with
65// a proper Attribute::AttrKind as template parameter. The Attributor
66// manifestation framework will then create and place a new attribute if it is
67// allowed to do so (based on the abstract state). Other use cases can be
68// achieved by overriding AbstractAttribute or IRAttribute methods.
69//
70//
71// The "mechanics" of adding a new "abstract attribute":
72// - Define a class (transitively) inheriting from AbstractAttribute and one
73// (which could be the same) that (transitively) inherits from AbstractState.
74// For the latter, consider the already available BooleanState and
75// {Inc,Dec,Bit}IntegerState if they fit your needs, e.g., you require only a
76// number tracking or bit-encoding.
77// - Implement all pure methods. Also use overriding if the attribute is not
78// conforming with the "default" behavior: A (set of) LLVM-IR attribute(s) for
79// an argument, call site argument, function return value, or function. See
80// the class and method descriptions for more information on the two
81// "Abstract" classes and their respective methods.
82// - Register opportunities for the new abstract attribute in the
83// `Attributor::identifyDefaultAbstractAttributes` method if it should be
84// counted as a 'default' attribute.
85// - Add sufficient tests.
86// - Add a Statistics object for bookkeeping. If it is a simple (set of)
87// attribute(s) manifested through the Attributor manifestation framework, see
88// the bookkeeping function in Attributor.cpp.
89// - If instructions with a certain opcode are interesting to the attribute, add
90// that opcode to the switch in `Attributor::identifyAbstractAttributes`. This
91// will make it possible to query all those instructions through the
92// `InformationCache::getOpcodeInstMapForFunction` interface and eliminate the
93// need to traverse the IR repeatedly.
94//
95//===----------------------------------------------------------------------===//
96
97#ifndef LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H
98#define LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H
99
100#include "llvm/ADT/DenseSet.h"
101#include "llvm/ADT/GraphTraits.h"
102#include "llvm/ADT/MapVector.h"
103#include "llvm/ADT/STLExtras.h"
104#include "llvm/ADT/SetOperations.h"
105#include "llvm/ADT/SetVector.h"
106#include "llvm/ADT/Triple.h"
107#include "llvm/ADT/iterator.h"
108#include "llvm/Analysis/AssumeBundleQueries.h"
109#include "llvm/Analysis/CFG.h"
110#include "llvm/Analysis/CGSCCPassManager.h"
111#include "llvm/Analysis/LazyCallGraph.h"
112#include "llvm/Analysis/LoopInfo.h"
113#include "llvm/Analysis/MustExecute.h"
114#include "llvm/Analysis/OptimizationRemarkEmitter.h"
115#include "llvm/Analysis/PostDominators.h"
116#include "llvm/Analysis/TargetLibraryInfo.h"
117#include "llvm/IR/AbstractCallSite.h"
118#include "llvm/IR/ConstantRange.h"
119#include "llvm/IR/PassManager.h"
120#include "llvm/Support/Allocator.h"
121#include "llvm/Support/Casting.h"
122#include "llvm/Support/GraphWriter.h"
123#include "llvm/Support/TimeProfiler.h"
124#include "llvm/Transforms/Utils/CallGraphUpdater.h"
125
126namespace llvm {
127
128struct AADepGraphNode;
129struct AADepGraph;
130struct Attributor;
131struct AbstractAttribute;
132struct InformationCache;
133struct AAIsDead;
134struct AttributorCallGraph;
135
136class AAResults;
137class Function;
138
139/// Abstract Attribute helper functions.
140namespace AA {
141
142/// Return true if \p V is dynamically unique, that is, there are no two
143/// "instances" of \p V at runtime with different values.
144bool isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA,
145 const Value &V);
146
147/// Return true if \p V is a valid value in \p Scope, that is a constant or an
148/// instruction/argument of \p Scope.
149bool isValidInScope(const Value &V, const Function *Scope);
150
151/// Return true if \p V is a valid value at position \p CtxI, that is a
152/// constant, an argument of the same function as \p CtxI, or an instruction in
153/// that function that dominates \p CtxI.
154bool isValidAtPosition(const Value &V, const Instruction &CtxI,
155 InformationCache &InfoCache);
156
157/// Try to convert \p V to type \p Ty without introducing new instructions. If
158/// this is not possible return `nullptr`. Note: this function basically knows
159/// how to cast various constants.
160Value *getWithType(Value &V, Type &Ty);
161
162/// Return the combination of \p A and \p B such that the result is a possible
163/// value of both. \p B is potentially casted to match the type \p Ty or the
164/// type of \p A if \p Ty is null.
165///
166/// Examples:
167/// X + none => X
168/// not_none + undef => not_none
169/// V1 + V2 => nullptr
170Optional<Value *>
171combineOptionalValuesInAAValueLatice(const Optional<Value *> &A,
172 const Optional<Value *> &B, Type *Ty);
173
174/// Return the initial value of \p Obj with type \p Ty if that is a constant.
175Constant *getInitialValueForObj(Value &Obj, Type &Ty,
176 const TargetLibraryInfo *TLI);
177
178/// Collect all potential underlying objects of \p Ptr at position \p CtxI in
179/// \p Objects. Assumed information is used and dependences onto \p QueryingAA
180/// are added appropriately.
181///
182/// \returns True if \p Objects contains all assumed underlying objects, and
183/// false if something went wrong and the objects could not be
184/// determined.
185bool getAssumedUnderlyingObjects(Attributor &A, const Value &Ptr,
186 SmallVectorImpl<Value *> &Objects,
187 const AbstractAttribute &QueryingAA,
188 const Instruction *CtxI);
189
190/// Collect all potential values of the one stored by \p SI into
191/// \p PotentialCopies. That is, the only copies that were made via the
192/// store are assumed to be known and all in \p PotentialCopies. Dependences
193/// onto \p QueryingAA are properly tracked, \p UsedAssumedInformation will
194/// inform the caller if assumed information was used.
195///
196/// \returns True if the assumed potential copies are all in \p PotentialCopies,
197/// false if something went wrong and the copies could not be
198/// determined.
199bool getPotentialCopiesOfStoredValue(
200 Attributor &A, StoreInst &SI, SmallSetVector<Value *, 4> &PotentialCopies,
201 const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation);
202
203} // namespace AA
204
205/// The value passed to the command line option that defines the maximal initialization
206/// chain length.
207extern unsigned MaxInitializationChainLength;
208
209///{
210enum class ChangeStatus {
211 CHANGED,
212 UNCHANGED,
213};
214
215ChangeStatus operator|(ChangeStatus l, ChangeStatus r);
216ChangeStatus &operator|=(ChangeStatus &l, ChangeStatus r);
217ChangeStatus operator&(ChangeStatus l, ChangeStatus r);
218ChangeStatus &operator&=(ChangeStatus &l, ChangeStatus r);
219
220enum class DepClassTy {
221 REQUIRED, ///< The target cannot be valid if the source is not.
222 OPTIONAL, ///< The target may be valid if the source is not.
223 NONE, ///< Do not track a dependence between source and target.
224};
225///}
226
227/// The data structure for the nodes of a dependency graph
228struct AADepGraphNode {
229public:
230 virtual ~AADepGraphNode(){};
231 using DepTy = PointerIntPair<AADepGraphNode *, 1>;
232
233protected:
234 /// Set of dependency graph nodes which should be updated if this one
235 /// is updated. The bit encodes if it is optional.
236 TinyPtrVector<DepTy> Deps;
237
238 static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); }
239 static AbstractAttribute *DepGetValAA(DepTy &DT) {
240 return cast<AbstractAttribute>(DT.getPointer());
241 }
242
243 operator AbstractAttribute *() { return cast<AbstractAttribute>(this); }
244
245public:
246 using iterator =
247 mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetVal)>;
248 using aaiterator =
249 mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetValAA)>;
250
251 aaiterator begin() { return aaiterator(Deps.begin(), &DepGetValAA); }
252 aaiterator end() { return aaiterator(Deps.end(), &DepGetValAA); }
253 iterator child_begin() { return iterator(Deps.begin(), &DepGetVal); }
254 iterator child_end() { return iterator(Deps.end(), &DepGetVal); }
255
256 virtual void print(raw_ostream &OS) const { OS << "AADepNode Impl\n"; }
257 TinyPtrVector<DepTy> &getDeps() { return Deps; }
258
259 friend struct Attributor;
260 friend struct AADepGraph;
261};
262
263/// The data structure for the dependency graph
264///
265/// Note that in this graph if there is an edge from A to B (A -> B),
266/// then it means that B depends on A, and when the state of A is
267/// updated, node B should also be updated
268struct AADepGraph {
269 AADepGraph() {}
270 ~AADepGraph() {}
271
272 using DepTy = AADepGraphNode::DepTy;
273 static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); }
274 using iterator =
275 mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetVal)>;
276
277 /// There is no root node for the dependency graph. But the SCCIterator
278 /// requires a single entry point, so we maintain a fake("synthetic") root
279 /// node that depends on every node.
280 AADepGraphNode SyntheticRoot;
281 AADepGraphNode *GetEntryNode() { return &SyntheticRoot; }
282
283 iterator begin() { return SyntheticRoot.child_begin(); }
284 iterator end() { return SyntheticRoot.child_end(); }
285
286 void viewGraph();
287
288 /// Dump graph to file
289 void dumpGraph();
290
291 /// Print dependency graph
292 void print();
293};
294
295/// Helper to describe and deal with positions in the LLVM-IR.
296///
297/// A position in the IR is described by an anchor value and an "offset" that
298/// could be the argument number, for call sites and arguments, or an indicator
299/// of the "position kind". The kinds, specified in the Kind enum below, include
300/// the locations in the attribute list, i.a., function scope and return value,
301/// as well as a distinction between call sites and functions. Finally, there
302/// are floating values that do not have a corresponding attribute list
303/// position.
304struct IRPosition {
305 // NOTE: In the future this definition can be changed to support recursive
306 // functions.
307 using CallBaseContext = CallBase;
308
309 /// The positions we distinguish in the IR.
310 enum Kind : char {
311 IRP_INVALID, ///< An invalid position.
312 IRP_FLOAT, ///< A position that is not associated with a spot suitable
313 ///< for attributes. This could be any value or instruction.
314 IRP_RETURNED, ///< An attribute for the function return value.
315 IRP_CALL_SITE_RETURNED, ///< An attribute for a call site return value.
316 IRP_FUNCTION, ///< An attribute for a function (scope).
317 IRP_CALL_SITE, ///< An attribute for a call site (function scope).
318 IRP_ARGUMENT, ///< An attribute for a function argument.
319 IRP_CALL_SITE_ARGUMENT, ///< An attribute for a call site argument.
320 };
321
322 /// Default constructor available to create invalid positions implicitly. All
323 /// other positions need to be created explicitly through the appropriate
324 /// static member function.
325 IRPosition() : Enc(nullptr, ENC_VALUE) { verify(); }
326
327 /// Create a position describing the value of \p V.
328 static const IRPosition value(const Value &V,
329 const CallBaseContext *CBContext = nullptr) {
330 if (auto *Arg = dyn_cast<Argument>(&V))
331 return IRPosition::argument(*Arg, CBContext);
332 if (auto *CB = dyn_cast<CallBase>(&V))
333 return IRPosition::callsite_returned(*CB);
334 return IRPosition(const_cast<Value &>(V), IRP_FLOAT, CBContext);
335 }
336
337 /// Create a position describing the function scope of \p F.
338 /// \p CBContext is used for call base specific analysis.
339 static const IRPosition function(const Function &F,
340 const CallBaseContext *CBContext = nullptr) {
341 return IRPosition(const_cast<Function &>(F), IRP_FUNCTION, CBContext);
342 }
343
344 /// Create a position describing the returned value of \p F.
345 /// \p CBContext is used for call base specific analysis.
346 static const IRPosition returned(const Function &F,
347 const CallBaseContext *CBContext = nullptr) {
348 return IRPosition(const_cast<Function &>(F), IRP_RETURNED, CBContext);
349 }
350
351 /// Create a position describing the argument \p Arg.
352 /// \p CBContext is used for call base specific analysis.
353 static const IRPosition argument(const Argument &Arg,
354 const CallBaseContext *CBContext = nullptr) {
355 return IRPosition(const_cast<Argument &>(Arg), IRP_ARGUMENT, CBContext);
356 }
357
358 /// Create a position describing the function scope of \p CB.
359 static const IRPosition callsite_function(const CallBase &CB) {
360 return IRPosition(const_cast<CallBase &>(CB), IRP_CALL_SITE);
361 }
362
363 /// Create a position describing the returned value of \p CB.
364 static const IRPosition callsite_returned(const CallBase &CB) {
365 return IRPosition(const_cast<CallBase &>(CB), IRP_CALL_SITE_RETURNED);
366 }
367
368 /// Create a position describing the argument of \p CB at position \p ArgNo.
369 static const IRPosition callsite_argument(const CallBase &CB,
370 unsigned ArgNo) {
371 return IRPosition(const_cast<Use &>(CB.getArgOperandUse(ArgNo)),
372 IRP_CALL_SITE_ARGUMENT);
373 }
374
375 /// Create a position describing the argument of \p ACS at position \p ArgNo.
376 static const IRPosition callsite_argument(AbstractCallSite ACS,
377 unsigned ArgNo) {
378 if (ACS.getNumArgOperands() <= ArgNo)
379 return IRPosition();
380 int CSArgNo = ACS.getCallArgOperandNo(ArgNo);
381 if (CSArgNo >= 0)
382 return IRPosition::callsite_argument(
383 cast<CallBase>(*ACS.getInstruction()), CSArgNo);
384 return IRPosition();
385 }
386
387 /// Create a position with function scope matching the "context" of \p IRP.
388 /// If \p IRP is a call site (see isAnyCallSitePosition()) then the result
389 /// will be a call site position, otherwise the function position of the
390 /// associated function.
391 static const IRPosition
392 function_scope(const IRPosition &IRP,
393 const CallBaseContext *CBContext = nullptr) {
394 if (IRP.isAnyCallSitePosition()) {
395 return IRPosition::callsite_function(
396 cast<CallBase>(IRP.getAnchorValue()));
397 }
398 assert(IRP.getAssociatedFunction())(static_cast <bool> (IRP.getAssociatedFunction()) ? void
(0) : __assert_fail ("IRP.getAssociatedFunction()", "llvm/include/llvm/Transforms/IPO/Attributor.h"
, 398, __extension__ __PRETTY_FUNCTION__))
;
399 return IRPosition::function(*IRP.getAssociatedFunction(), CBContext);
400 }
401
402 bool operator==(const IRPosition &RHS) const {
403 return Enc == RHS.Enc && RHS.CBContext == CBContext;
404 }
405 bool operator!=(const IRPosition &RHS) const { return !(*this == RHS); }
406
407 /// Return the value this abstract attribute is anchored with.
408 ///
409 /// The anchor value might not be the associated value if the latter is not
410 /// sufficient to determine where arguments will be manifested. This is, so
411 /// far, only the case for call site arguments as the value is not sufficient
412 /// to pinpoint them. Instead, we can use the call site as an anchor.
413 Value &getAnchorValue() const {
414 switch (getEncodingBits()) {
415 case ENC_VALUE:
416 case ENC_RETURNED_VALUE:
417 case ENC_FLOATING_FUNCTION:
418 return *getAsValuePtr();
419 case ENC_CALL_SITE_ARGUMENT_USE:
420 return *(getAsUsePtr()->getUser());
421 default:
422 llvm_unreachable("Unkown encoding!")::llvm::llvm_unreachable_internal("Unkown encoding!", "llvm/include/llvm/Transforms/IPO/Attributor.h"
, 422)
;
423 };
424 }
425
426 /// Return the associated function, if any.
427 Function *getAssociatedFunction() const {
428 if (auto *CB = dyn_cast<CallBase>(&getAnchorValue())) {
429 // We reuse the logic that associates callback callees to arguments of a
430 // call site here to identify the callback callee as the associated
431 // function.
432 if (Argument *Arg = getAssociatedArgument())
433 return Arg->getParent();
434 return CB->getCalledFunction();
435 }
436 return getAnchorScope();
437 }
438
439 /// Return the associated argument, if any.
440 Argument *getAssociatedArgument() const;
441
442 /// Return true if the position refers to a function interface, that is the
443 /// function scope, the function return, or an argument.
444 bool isFnInterfaceKind() const {
445 switch (getPositionKind()) {
446 case IRPosition::IRP_FUNCTION:
447 case IRPosition::IRP_RETURNED:
448 case IRPosition::IRP_ARGUMENT:
449 return true;
450 default:
451 return false;
452 }
453 }
454
455 /// Return the Function surrounding the anchor value.
456 Function *getAnchorScope() const {
457 Value &V = getAnchorValue();
458 if (isa<Function>(V))
459 return &cast<Function>(V);
460 if (isa<Argument>(V))
461 return cast<Argument>(V).getParent();
462 if (isa<Instruction>(V))
463 return cast<Instruction>(V).getFunction();
464 return nullptr;
465 }
466
467 /// Return the context instruction, if any.
468 Instruction *getCtxI() const {
469 Value &V = getAnchorValue();
470 if (auto *I = dyn_cast<Instruction>(&V))
471 return I;
472 if (auto *Arg = dyn_cast<Argument>(&V))
473 if (!Arg->getParent()->isDeclaration())
474 return &Arg->getParent()->getEntryBlock().front();
475 if (auto *F = dyn_cast<Function>(&V))
476 if (!F->isDeclaration())
477 return &(F->getEntryBlock().front());
478 return nullptr;
479 }
480
481 /// Return the value this abstract attribute is associated with.
482 Value &getAssociatedValue() const {
483 if (getCallSiteArgNo() < 0 || isa<Argument>(&getAnchorValue()))
484 return getAnchorValue();
485 assert(isa<CallBase>(&getAnchorValue()) && "Expected a call base!")(static_cast <bool> (isa<CallBase>(&getAnchorValue
()) && "Expected a call base!") ? void (0) : __assert_fail
("isa<CallBase>(&getAnchorValue()) && \"Expected a call base!\""
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 485, __extension__
__PRETTY_FUNCTION__))
;
486 return *cast<CallBase>(&getAnchorValue())
487 ->getArgOperand(getCallSiteArgNo());
488 }
489
490 /// Return the type this abstract attribute is associated with.
491 Type *getAssociatedType() const {
492 if (getPositionKind() == IRPosition::IRP_RETURNED)
493 return getAssociatedFunction()->getReturnType();
494 return getAssociatedValue().getType();
495 }
496
497 /// Return the callee argument number of the associated value if it is an
498 /// argument or call site argument, otherwise a negative value. In contrast to
499 /// `getCallSiteArgNo` this method will always return the "argument number"
500 /// from the perspective of the callee. This may not be the same as the call site
501 /// if this is a callback call.
502 int getCalleeArgNo() const {
503 return getArgNo(/* CallbackCalleeArgIfApplicable */ true);
504 }
505
506 /// Return the call site argument number of the associated value if it is an
507 /// argument or call site argument, otherwise a negative value. In contrast to
508 /// `getCalleeArgNo` this method will always return the "operand number" from
509 /// the perspective of the call site. This may not be the same as the callee
510 /// perspective if this is a callback call.
511 int getCallSiteArgNo() const {
512 return getArgNo(/* CallbackCalleeArgIfApplicable */ false);
513 }
514
515 /// Return the index in the attribute list for this position.
516 unsigned getAttrIdx() const {
517 switch (getPositionKind()) {
518 case IRPosition::IRP_INVALID:
519 case IRPosition::IRP_FLOAT:
520 break;
521 case IRPosition::IRP_FUNCTION:
522 case IRPosition::IRP_CALL_SITE:
523 return AttributeList::FunctionIndex;
524 case IRPosition::IRP_RETURNED:
525 case IRPosition::IRP_CALL_SITE_RETURNED:
526 return AttributeList::ReturnIndex;
527 case IRPosition::IRP_ARGUMENT:
528 case IRPosition::IRP_CALL_SITE_ARGUMENT:
529 return getCallSiteArgNo() + AttributeList::FirstArgIndex;
530 }
531 llvm_unreachable(::llvm::llvm_unreachable_internal("There is no attribute index for a floating or invalid position!"
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 532)
532 "There is no attribute index for a floating or invalid position!")::llvm::llvm_unreachable_internal("There is no attribute index for a floating or invalid position!"
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 532)
;
533 }
534
535 /// Return the associated position kind.
536 Kind getPositionKind() const {
537 char EncodingBits = getEncodingBits();
538 if (EncodingBits == ENC_CALL_SITE_ARGUMENT_USE)
539 return IRP_CALL_SITE_ARGUMENT;
540 if (EncodingBits == ENC_FLOATING_FUNCTION)
541 return IRP_FLOAT;
542
543 Value *V = getAsValuePtr();
544 if (!V)
545 return IRP_INVALID;
546 if (isa<Argument>(V))
547 return IRP_ARGUMENT;
548 if (isa<Function>(V))
549 return isReturnPosition(EncodingBits) ? IRP_RETURNED : IRP_FUNCTION;
550 if (isa<CallBase>(V))
551 return isReturnPosition(EncodingBits) ? IRP_CALL_SITE_RETURNED
552 : IRP_CALL_SITE;
553 return IRP_FLOAT;
554 }
555
556 /// TODO: Figure out if the attribute related helper functions should live
557 /// here or somewhere else.
558
559 /// Return true if any kind in \p AKs exists in the IR at a position that
560 /// will affect this one. See also getAttrs(...).
561 /// \param IgnoreSubsumingPositions Flag to determine if subsuming positions,
562 /// e.g., the function position if this is an
563 /// argument position, should be ignored.
564 bool hasAttr(ArrayRef<Attribute::AttrKind> AKs,
565 bool IgnoreSubsumingPositions = false,
566 Attributor *A = nullptr) const;
567
568 /// Return the attributes of any kind in \p AKs existing in the IR at a
569 /// position that will affect this one. While each position can only have a
570 /// single attribute of any kind in \p AKs, there are "subsuming" positions
571 /// that could have an attribute as well. This method returns all attributes
572 /// found in \p Attrs.
573 /// \param IgnoreSubsumingPositions Flag to determine if subsuming positions,
574 /// e.g., the function position if this is an
575 /// argument position, should be ignored.
576 void getAttrs(ArrayRef<Attribute::AttrKind> AKs,
577 SmallVectorImpl<Attribute> &Attrs,
578 bool IgnoreSubsumingPositions = false,
579 Attributor *A = nullptr) const;
580
581 /// Remove the attributes of the kinds in \p AKs existing in the IR at this position.
582 void removeAttrs(ArrayRef<Attribute::AttrKind> AKs) const {
583 if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT)
584 return;
585
586 AttributeList AttrList;
587 auto *CB = dyn_cast<CallBase>(&getAnchorValue());
588 if (CB)
589 AttrList = CB->getAttributes();
590 else
591 AttrList = getAssociatedFunction()->getAttributes();
592
593 LLVMContext &Ctx = getAnchorValue().getContext();
594 for (Attribute::AttrKind AK : AKs)
595 AttrList = AttrList.removeAttributeAtIndex(Ctx, getAttrIdx(), AK);
596
597 if (CB)
598 CB->setAttributes(AttrList);
599 else
600 getAssociatedFunction()->setAttributes(AttrList);
601 }
602
603 bool isAnyCallSitePosition() const {
604 switch (getPositionKind()) {
605 case IRPosition::IRP_CALL_SITE:
606 case IRPosition::IRP_CALL_SITE_RETURNED:
607 case IRPosition::IRP_CALL_SITE_ARGUMENT:
608 return true;
609 default:
610 return false;
611 }
612 }
613
614 /// Return true if the position is an argument or call site argument.
615 bool isArgumentPosition() const {
616 switch (getPositionKind()) {
617 case IRPosition::IRP_ARGUMENT:
618 case IRPosition::IRP_CALL_SITE_ARGUMENT:
619 return true;
620 default:
621 return false;
622 }
623 }
624
625 /// Return the same position without the call base context.
626 IRPosition stripCallBaseContext() const {
627 IRPosition Result = *this;
628 Result.CBContext = nullptr;
629 return Result;
630 }
631
632 /// Get the call base context from the position.
633 const CallBaseContext *getCallBaseContext() const { return CBContext; }
634
635 /// Check if the position has any call base context.
636 bool hasCallBaseContext() const { return CBContext != nullptr; }
637
638 /// Special DenseMap key values.
639 ///
640 ///{
641 static const IRPosition EmptyKey;
642 static const IRPosition TombstoneKey;
643 ///}
644
645 /// Conversion into a void * to allow reuse of pointer hashing.
646 operator void *() const { return Enc.getOpaqueValue(); }
647
648private:
649 /// Private constructor for special values only!
650 explicit IRPosition(void *Ptr, const CallBaseContext *CBContext = nullptr)
651 : CBContext(CBContext) {
652 Enc.setFromOpaqueValue(Ptr);
653 }
654
655 /// IRPosition anchored at \p AnchorVal with kind/argument number \p PK.
656 explicit IRPosition(Value &AnchorVal, Kind PK,
657 const CallBaseContext *CBContext = nullptr)
658 : CBContext(CBContext) {
659 switch (PK) {
660 case IRPosition::IRP_INVALID:
661 llvm_unreachable("Cannot create invalid IRP with an anchor value!")::llvm::llvm_unreachable_internal("Cannot create invalid IRP with an anchor value!"
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 661)
;
662 break;
663 case IRPosition::IRP_FLOAT:
664 // Special case for floating functions.
665 if (isa<Function>(AnchorVal))
666 Enc = {&AnchorVal, ENC_FLOATING_FUNCTION};
667 else
668 Enc = {&AnchorVal, ENC_VALUE};
669 break;
670 case IRPosition::IRP_FUNCTION:
671 case IRPosition::IRP_CALL_SITE:
672 Enc = {&AnchorVal, ENC_VALUE};
673 break;
674 case IRPosition::IRP_RETURNED:
675 case IRPosition::IRP_CALL_SITE_RETURNED:
676 Enc = {&AnchorVal, ENC_RETURNED_VALUE};
677 break;
678 case IRPosition::IRP_ARGUMENT:
679 Enc = {&AnchorVal, ENC_VALUE};
680 break;
681 case IRPosition::IRP_CALL_SITE_ARGUMENT:
682 llvm_unreachable(::llvm::llvm_unreachable_internal("Cannot create call site argument IRP with an anchor value!"
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 683)
683 "Cannot create call site argument IRP with an anchor value!")::llvm::llvm_unreachable_internal("Cannot create call site argument IRP with an anchor value!"
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 683)
;
684 break;
685 }
686 verify();
687 }
688
689 /// Return the callee argument number of the associated value if it is an
690 /// argument or call site argument. See also `getCalleeArgNo` and
691 /// `getCallSiteArgNo`.
692 int getArgNo(bool CallbackCalleeArgIfApplicable) const {
693 if (CallbackCalleeArgIfApplicable)
694 if (Argument *Arg = getAssociatedArgument())
695 return Arg->getArgNo();
696 switch (getPositionKind()) {
697 case IRPosition::IRP_ARGUMENT:
698 return cast<Argument>(getAsValuePtr())->getArgNo();
699 case IRPosition::IRP_CALL_SITE_ARGUMENT: {
700 Use &U = *getAsUsePtr();
701 return cast<CallBase>(U.getUser())->getArgOperandNo(&U);
702 }
703 default:
704 return -1;
705 }
706 }
707
708 /// IRPosition for the use \p U. The position kind \p PK needs to be
709 /// IRP_CALL_SITE_ARGUMENT, the anchor value is the user, the associated value
710 /// the used value.
711 explicit IRPosition(Use &U, Kind PK) {
712 assert(PK == IRP_CALL_SITE_ARGUMENT &&(static_cast <bool> (PK == IRP_CALL_SITE_ARGUMENT &&
"Use constructor is for call site arguments only!") ? void (
0) : __assert_fail ("PK == IRP_CALL_SITE_ARGUMENT && \"Use constructor is for call site arguments only!\""
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 713, __extension__
__PRETTY_FUNCTION__))
713 "Use constructor is for call site arguments only!")(static_cast <bool> (PK == IRP_CALL_SITE_ARGUMENT &&
"Use constructor is for call site arguments only!") ? void (
0) : __assert_fail ("PK == IRP_CALL_SITE_ARGUMENT && \"Use constructor is for call site arguments only!\""
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 713, __extension__
__PRETTY_FUNCTION__))
;
714 Enc = {&U, ENC_CALL_SITE_ARGUMENT_USE};
715 verify();
716 }
717
718 /// Verify internal invariants.
719 void verify();
720
721 /// Return the attributes of kind \p AK existing in the IR as attribute.
722 bool getAttrsFromIRAttr(Attribute::AttrKind AK,
723 SmallVectorImpl<Attribute> &Attrs) const;
724
725 /// Return the attributes of kind \p AK existing in the IR as operand bundles
726 /// of an llvm.assume.
727 bool getAttrsFromAssumes(Attribute::AttrKind AK,
728 SmallVectorImpl<Attribute> &Attrs,
729 Attributor &A) const;
730
731 /// Return the underlying pointer as Value *, valid for all positions but
732 /// IRP_CALL_SITE_ARGUMENT.
733 Value *getAsValuePtr() const {
734 assert(getEncodingBits() != ENC_CALL_SITE_ARGUMENT_USE &&(static_cast <bool> (getEncodingBits() != ENC_CALL_SITE_ARGUMENT_USE
&& "Not a value pointer!") ? void (0) : __assert_fail
("getEncodingBits() != ENC_CALL_SITE_ARGUMENT_USE && \"Not a value pointer!\""
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 735, __extension__
__PRETTY_FUNCTION__))
735 "Not a value pointer!")(static_cast <bool> (getEncodingBits() != ENC_CALL_SITE_ARGUMENT_USE
&& "Not a value pointer!") ? void (0) : __assert_fail
("getEncodingBits() != ENC_CALL_SITE_ARGUMENT_USE && \"Not a value pointer!\""
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 735, __extension__
__PRETTY_FUNCTION__))
;
736 return reinterpret_cast<Value *>(Enc.getPointer());
737 }
738
739 /// Return the underlying pointer as Use *, valid only for
740 /// IRP_CALL_SITE_ARGUMENT positions.
741 Use *getAsUsePtr() const {
742 assert(getEncodingBits() == ENC_CALL_SITE_ARGUMENT_USE &&(static_cast <bool> (getEncodingBits() == ENC_CALL_SITE_ARGUMENT_USE
&& "Not a value pointer!") ? void (0) : __assert_fail
("getEncodingBits() == ENC_CALL_SITE_ARGUMENT_USE && \"Not a value pointer!\""
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 743, __extension__
__PRETTY_FUNCTION__))
743 "Not a value pointer!")(static_cast <bool> (getEncodingBits() == ENC_CALL_SITE_ARGUMENT_USE
&& "Not a value pointer!") ? void (0) : __assert_fail
("getEncodingBits() == ENC_CALL_SITE_ARGUMENT_USE && \"Not a value pointer!\""
, "llvm/include/llvm/Transforms/IPO/Attributor.h", 743, __extension__
__PRETTY_FUNCTION__))
;
744 return reinterpret_cast<Use *>(Enc.getPointer());
745 }
746
747 /// Return true if \p EncodingBits describe a returned or call site returned
748 /// position.
749 static bool isReturnPosition(char EncodingBits) {
750 return EncodingBits == ENC_RETURNED_VALUE;
751 }
752
753 /// Return true if the encoding bits describe a returned or call site returned
754 /// position.
755 bool isReturnPosition() const { return isReturnPosition(getEncodingBits()); }
756
757 /// The encoding of the IRPosition is a combination of a pointer and two
758 /// encoding bits. The values of the encoding bits are defined in the enum
759 /// below. The pointer is either a Value* (for the first three encoding bit
760 /// combinations) or Use* (for ENC_CALL_SITE_ARGUMENT_USE).
761 ///
762 ///{
763 enum {
764 ENC_VALUE = 0b00,
765 ENC_RETURNED_VALUE = 0b01,
766 ENC_FLOATING_FUNCTION = 0b10,
767 ENC_CALL_SITE_ARGUMENT_USE = 0b11,
768 };
769
770 // Reserve the maximal amount of bits so there is no need to mask out the
771 // remaining ones. We will not encode anything else in the pointer anyway.
772 static constexpr int NumEncodingBits =
773 PointerLikeTypeTraits<void *>::NumLowBitsAvailable;
774 static_assert(NumEncodingBits >= 2, "At least two bits are required!");
775
776 /// The pointer with the encoding bits.
777 PointerIntPair<void *, NumEncodingBits, char> Enc;
778 ///}
779
780 /// Call base context. Used for callsite specific analysis.