Bug Summary

File: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
Warning: line 2753, column 7
Called C++ object pointer is null

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name OpenMPOpt.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/build-llvm -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Transforms/IPO -I /build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO -I include -I /build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem 
/usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-command-line-argument -Wno-unknown-warning-option -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/build-llvm -ferror-limit 19 -fvisibility-inlines-hidden -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-10-17-004846-21170-1 -x c++ /build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp

/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp

1//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// OpenMP specific optimizations:
10//
11// - Deduplication of runtime calls, e.g., omp_get_thread_num.
12// - Replacing globalized device memory with stack memory.
13// - Replacing globalized device memory with shared memory.
14// - Parallel region merging.
15// - Transforming generic-mode device kernels to SPMD mode.
16// - Specializing the state machine for generic-mode device kernels.
17//
18//===----------------------------------------------------------------------===//
19
20#include "llvm/Transforms/IPO/OpenMPOpt.h"
21
22#include "llvm/ADT/EnumeratedArray.h"
23#include "llvm/ADT/PostOrderIterator.h"
24#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/CallGraph.h"
26#include "llvm/Analysis/CallGraphSCCPass.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/ValueTracking.h"
29#include "llvm/Frontend/OpenMP/OMPConstants.h"
30#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
31#include "llvm/IR/Assumptions.h"
32#include "llvm/IR/DiagnosticInfo.h"
33#include "llvm/IR/GlobalValue.h"
34#include "llvm/IR/Instruction.h"
35#include "llvm/IR/IntrinsicInst.h"
36#include "llvm/IR/IntrinsicsAMDGPU.h"
37#include "llvm/IR/IntrinsicsNVPTX.h"
38#include "llvm/InitializePasses.h"
39#include "llvm/Support/CommandLine.h"
40#include "llvm/Transforms/IPO.h"
41#include "llvm/Transforms/IPO/Attributor.h"
42#include "llvm/Transforms/Utils/BasicBlockUtils.h"
43#include "llvm/Transforms/Utils/CallGraphUpdater.h"
44#include "llvm/Transforms/Utils/CodeExtractor.h"
45
46using namespace llvm;
47using namespace omp;
48
49#define DEBUG_TYPE"openmp-opt" "openmp-opt"
50
51static cl::opt<bool> DisableOpenMPOptimizations(
52 "openmp-opt-disable", cl::ZeroOrMore,
53 cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
54 cl::init(false));
55
56static cl::opt<bool> EnableParallelRegionMerging(
57 "openmp-opt-enable-merging", cl::ZeroOrMore,
58 cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
59 cl::init(false));
60
61static cl::opt<bool>
62 DisableInternalization("openmp-opt-disable-internalization", cl::ZeroOrMore,
63 cl::desc("Disable function internalization."),
64 cl::Hidden, cl::init(false));
65
66static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
67 cl::Hidden);
68static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
69 cl::init(false), cl::Hidden);
70
71static cl::opt<bool> HideMemoryTransferLatency(
72 "openmp-hide-memory-transfer-latency",
73 cl::desc("[WIP] Tries to hide the latency of host to device memory"
74 " transfers"),
75 cl::Hidden, cl::init(false));
76
77static cl::opt<bool> DisableOpenMPOptDeglobalization(
78 "openmp-opt-disable-deglobalization", cl::ZeroOrMore,
79 cl::desc("Disable OpenMP optimizations involving deglobalization."),
80 cl::Hidden, cl::init(false));
81
82static cl::opt<bool> DisableOpenMPOptSPMDization(
83 "openmp-opt-disable-spmdization", cl::ZeroOrMore,
84 cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
85 cl::Hidden, cl::init(false));
86
87static cl::opt<bool> DisableOpenMPOptFolding(
88 "openmp-opt-disable-folding", cl::ZeroOrMore,
89 cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
90 cl::init(false));
91
92static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
93 "openmp-opt-disable-state-machine-rewrite", cl::ZeroOrMore,
94 cl::desc("Disable OpenMP optimizations that replace the state machine."),
95 cl::Hidden, cl::init(false));
96
97static cl::opt<bool> PrintModuleAfterOptimizations(
98 "openmp-opt-print-module", cl::ZeroOrMore,
99 cl::desc("Print the current module after OpenMP optimizations."),
100 cl::Hidden, cl::init(false));
101
102static cl::opt<bool> AlwaysInlineDeviceFunctions(
103 "openmp-opt-inline-device", cl::ZeroOrMore,
104 cl::desc("Inline all applicible functions on the device."), cl::Hidden,
105 cl::init(false));
106
107static cl::opt<bool>
108 EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::ZeroOrMore,
109 cl::desc("Enables more verbose remarks."), cl::Hidden,
110 cl::init(false));
111
112static cl::opt<unsigned>
113 SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
114 cl::desc("Maximal number of attributor iterations."),
115 cl::init(256));
116
117STATISTIC(NumOpenMPRuntimeCallsDeduplicated,static llvm::Statistic NumOpenMPRuntimeCallsDeduplicated = {"openmp-opt"
, "NumOpenMPRuntimeCallsDeduplicated", "Number of OpenMP runtime calls deduplicated"
}
118 "Number of OpenMP runtime calls deduplicated")static llvm::Statistic NumOpenMPRuntimeCallsDeduplicated = {"openmp-opt"
, "NumOpenMPRuntimeCallsDeduplicated", "Number of OpenMP runtime calls deduplicated"
}
;
119STATISTIC(NumOpenMPParallelRegionsDeleted,static llvm::Statistic NumOpenMPParallelRegionsDeleted = {"openmp-opt"
, "NumOpenMPParallelRegionsDeleted", "Number of OpenMP parallel regions deleted"
}
120 "Number of OpenMP parallel regions deleted")static llvm::Statistic NumOpenMPParallelRegionsDeleted = {"openmp-opt"
, "NumOpenMPParallelRegionsDeleted", "Number of OpenMP parallel regions deleted"
}
;
121STATISTIC(NumOpenMPRuntimeFunctionsIdentified,static llvm::Statistic NumOpenMPRuntimeFunctionsIdentified = {
"openmp-opt", "NumOpenMPRuntimeFunctionsIdentified", "Number of OpenMP runtime functions identified"
}
122 "Number of OpenMP runtime functions identified")static llvm::Statistic NumOpenMPRuntimeFunctionsIdentified = {
"openmp-opt", "NumOpenMPRuntimeFunctionsIdentified", "Number of OpenMP runtime functions identified"
}
;
123STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,static llvm::Statistic NumOpenMPRuntimeFunctionUsesIdentified
= {"openmp-opt", "NumOpenMPRuntimeFunctionUsesIdentified", "Number of OpenMP runtime function uses identified"
}
124 "Number of OpenMP runtime function uses identified")static llvm::Statistic NumOpenMPRuntimeFunctionUsesIdentified
= {"openmp-opt", "NumOpenMPRuntimeFunctionUsesIdentified", "Number of OpenMP runtime function uses identified"
}
;
125STATISTIC(NumOpenMPTargetRegionKernels,static llvm::Statistic NumOpenMPTargetRegionKernels = {"openmp-opt"
, "NumOpenMPTargetRegionKernels", "Number of OpenMP target region entry points (=kernels) identified"
}
126 "Number of OpenMP target region entry points (=kernels) identified")static llvm::Statistic NumOpenMPTargetRegionKernels = {"openmp-opt"
, "NumOpenMPTargetRegionKernels", "Number of OpenMP target region entry points (=kernels) identified"
}
;
127STATISTIC(NumOpenMPTargetRegionKernelsSPMD,static llvm::Statistic NumOpenMPTargetRegionKernelsSPMD = {"openmp-opt"
, "NumOpenMPTargetRegionKernelsSPMD", "Number of OpenMP target region entry points (=kernels) executed in "
"SPMD-mode instead of generic-mode"}
128 "Number of OpenMP target region entry points (=kernels) executed in "static llvm::Statistic NumOpenMPTargetRegionKernelsSPMD = {"openmp-opt"
, "NumOpenMPTargetRegionKernelsSPMD", "Number of OpenMP target region entry points (=kernels) executed in "
"SPMD-mode instead of generic-mode"}
129 "SPMD-mode instead of generic-mode")static llvm::Statistic NumOpenMPTargetRegionKernelsSPMD = {"openmp-opt"
, "NumOpenMPTargetRegionKernelsSPMD", "Number of OpenMP target region entry points (=kernels) executed in "
"SPMD-mode instead of generic-mode"}
;
130STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,static llvm::Statistic NumOpenMPTargetRegionKernelsWithoutStateMachine
= {"openmp-opt", "NumOpenMPTargetRegionKernelsWithoutStateMachine"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machines"}
131 "Number of OpenMP target region entry points (=kernels) executed in "static llvm::Statistic NumOpenMPTargetRegionKernelsWithoutStateMachine
= {"openmp-opt", "NumOpenMPTargetRegionKernelsWithoutStateMachine"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machines"}
132 "generic-mode without a state machines")static llvm::Statistic NumOpenMPTargetRegionKernelsWithoutStateMachine
= {"openmp-opt", "NumOpenMPTargetRegionKernelsWithoutStateMachine"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machines"}
;
133STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines with fallback"}
134 "Number of OpenMP target region entry points (=kernels) executed in "static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines with fallback"}
135 "generic-mode with customized state machines with fallback")static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines with fallback"}
;
136STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines without fallback"
}
137 "Number of OpenMP target region entry points (=kernels) executed in "static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines without fallback"
}
138 "generic-mode with customized state machines without fallback")static llvm::Statistic NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback
= {"openmp-opt", "NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback"
, "Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines without fallback"
}
;
139STATISTIC(static llvm::Statistic NumOpenMPParallelRegionsReplacedInGPUStateMachine
= {"openmp-opt", "NumOpenMPParallelRegionsReplacedInGPUStateMachine"
, "Number of OpenMP parallel regions replaced with ID in GPU state machines"
}
140 NumOpenMPParallelRegionsReplacedInGPUStateMachine,static llvm::Statistic NumOpenMPParallelRegionsReplacedInGPUStateMachine
= {"openmp-opt", "NumOpenMPParallelRegionsReplacedInGPUStateMachine"
, "Number of OpenMP parallel regions replaced with ID in GPU state machines"
}
141 "Number of OpenMP parallel regions replaced with ID in GPU state machines")static llvm::Statistic NumOpenMPParallelRegionsReplacedInGPUStateMachine
= {"openmp-opt", "NumOpenMPParallelRegionsReplacedInGPUStateMachine"
, "Number of OpenMP parallel regions replaced with ID in GPU state machines"
}
;
142STATISTIC(NumOpenMPParallelRegionsMerged,static llvm::Statistic NumOpenMPParallelRegionsMerged = {"openmp-opt"
, "NumOpenMPParallelRegionsMerged", "Number of OpenMP parallel regions merged"
}
143 "Number of OpenMP parallel regions merged")static llvm::Statistic NumOpenMPParallelRegionsMerged = {"openmp-opt"
, "NumOpenMPParallelRegionsMerged", "Number of OpenMP parallel regions merged"
}
;
144STATISTIC(NumBytesMovedToSharedMemory,static llvm::Statistic NumBytesMovedToSharedMemory = {"openmp-opt"
, "NumBytesMovedToSharedMemory", "Amount of memory pushed to shared memory"
}
145 "Amount of memory pushed to shared memory")static llvm::Statistic NumBytesMovedToSharedMemory = {"openmp-opt"
, "NumBytesMovedToSharedMemory", "Amount of memory pushed to shared memory"
}
;
146
147#if !defined(NDEBUG)
148static constexpr auto TAG = "[" DEBUG_TYPE"openmp-opt" "]";
149#endif
150
151namespace {
152
153enum class AddressSpace : unsigned {
154 Generic = 0,
155 Global = 1,
156 Shared = 3,
157 Constant = 4,
158 Local = 5,
159};
160
161struct AAHeapToShared;
162
163struct AAICVTracker;
164
165/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
166/// Attributor runs.
167struct OMPInformationCache : public InformationCache {
168 OMPInformationCache(Module &M, AnalysisGetter &AG,
169 BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
170 SmallPtrSetImpl<Kernel> &Kernels)
171 : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
172 Kernels(Kernels) {
173
174 OMPBuilder.initialize();
175 initializeRuntimeFunctions();
176 initializeInternalControlVars();
177 }
178
179 /// Generic information that describes an internal control variable (ICV).
180 struct InternalControlVarInfo {
181 /// The kind, as described by InternalControlVar enum.
182 InternalControlVar Kind;
183
184 /// The name of the ICV.
185 StringRef Name;
186
187 /// Environment variable associated with this ICV.
188 StringRef EnvVarName;
189
190 /// Initial value kind.
191 ICVInitValue InitKind;
192
193 /// Initial value; set to nullptr when InitKind is ICV_IMPLEMENTATION_DEFINED.
194 ConstantInt *InitValue;
195
196 /// Setter RTL function associated with this ICV.
197 RuntimeFunction Setter;
198
199 /// Getter RTL function associated with this ICV.
200 RuntimeFunction Getter;
201
202 /// RTL Function corresponding to the override clause of this ICV
203 RuntimeFunction Clause;
204 };
205
206 /// Generic information that describes a runtime function and tracks its uses.
207 struct RuntimeFunctionInfo {
208
209 /// The kind, as described by the RuntimeFunction enum.
210 RuntimeFunction Kind;
211
212 /// The name of the function.
213 StringRef Name;
214
215 /// Flag to indicate a variadic function.
216 bool IsVarArg;
217
218 /// The return type of the function.
219 Type *ReturnType;
220
221 /// The argument types of the function.
222 SmallVector<Type *, 8> ArgumentTypes;
223
224 /// The declaration if available; nullptr means the function was not found.
225 Function *Declaration = nullptr;
226
227 /// Container holding the uses of this runtime function found in one function.
228 using UseVector = SmallVector<Use *, 16>;
229
230 /// Clear UsesMap for runtime function.
231 void clearUsesMap() { UsesMap.clear(); }
232
233 /// Boolean conversion that is true if the runtime function was found.
234 operator bool() const { return Declaration; }
235
236 /// Return the vector of uses in function \p F, creating an empty one if none exists yet.
237 UseVector &getOrCreateUseVector(Function *F) {
238 std::shared_ptr<UseVector> &UV = UsesMap[F];
239 if (!UV)
240 UV = std::make_shared<UseVector>();
241 return *UV;
242 }
243
244 /// Return the vector of uses in function \p F or `nullptr` if no vector was
245 /// ever created for it. Unlike getOrCreateUseVector this never inserts.
246 const UseVector *getUseVector(Function &F) const {
247 auto I = UsesMap.find(&F);
248 if (I != UsesMap.end())
249 return I->second.get();
250 return nullptr;
251 }
252
253 /// Return how many functions contain uses of this runtime function.
254 size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
255
256 /// Return the number of arguments (or the minimal number for variadic
257 /// functions).
258 size_t getNumArgs() const { return ArgumentTypes.size(); }
259
260 /// Run the callback \p CB on each use in every function of \p SCC and forget
261 /// the use if the result is true. The callback will be fed the function in
262 /// which the use was encountered as second argument.
263 void foreachUse(SmallVectorImpl<Function *> &SCC,
264 function_ref<bool(Use &, Function &)> CB) {
265 for (Function *F : SCC)
266 foreachUse(CB, F);
267 }
268
269 /// Run the callback \p CB on each use within the function \p F and forget
270 /// the use if the result is true. NOTE(review): \p F is dereferenced for every recorded use, so a null \p F is only safe while its use vector is empty.
271 void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
272 SmallVector<unsigned, 8> ToBeDeleted;
273 ToBeDeleted.clear();
274
275 unsigned Idx = 0;
276 UseVector &UV = getOrCreateUseVector(F);
277
278 for (Use *U : UV) {
279 if (CB(*U, *F))
280 ToBeDeleted.push_back(Idx);
281 ++Idx;
282 }
283
284 // Remove the to-be-deleted indices in reverse order so the smaller indices
285 // stay valid. Each removal overwrites the slot with the last element, so the order of the remaining uses is not preserved.
286 while (!ToBeDeleted.empty()) {
287 unsigned Idx = ToBeDeleted.pop_back_val();
288 UV[Idx] = UV.back();
289 UV.pop_back();
290 }
291 }
292
293 private:
294 /// Map from functions to all uses of this runtime function contained in
295 /// them.
296 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
297
298 public:
299 /// Iterators over the (function, use vector) entries of UsesMap.
300 decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
301 decltype(UsesMap)::iterator end() { return UsesMap.end(); }
302 };
303
304 /// An OpenMP-IR-Builder instance
305 OpenMPIRBuilder OMPBuilder;
306
307 /// Map from runtime function kind to the runtime function description.
308 EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
309 RuntimeFunction::OMPRTL___last>
310 RFIs;
311
312 /// Map from function declarations/definitions to their runtime enum type.
313 DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
314
315 /// Map from ICV kind to the ICV description.
316 EnumeratedArray<InternalControlVarInfo, InternalControlVar,
317 InternalControlVar::ICV___last>
318 ICVs;
319
320 /// Helper to fill the ICVs array through the ICV_* macros defined here and expanded by OMPKinds.def:
321 /// ICV_DATA_ENV sets name, kind, env-var name and initial value (untouched for ICV_LAST); ICV_RT_SET / ICV_RT_GET record the setter / getter.
322 void initializeInternalControlVars() {
323#define ICV_RT_SET(_Name, RTL) \
324 { \
325 auto &ICV = ICVs[_Name]; \
326 ICV.Setter = RTL; \
327 }
328#define ICV_RT_GET(Name, RTL) \
329 { \
330 auto &ICV = ICVs[Name]; \
331 ICV.Getter = RTL; \
332 }
333#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
334 { \
335 auto &ICV = ICVs[Enum]; \
336 ICV.Name = _Name; \
337 ICV.Kind = Enum; \
338 ICV.InitKind = Init; \
339 ICV.EnvVarName = _EnvVarName; \
340 switch (ICV.InitKind) { \
341 case ICV_IMPLEMENTATION_DEFINED: \
342 ICV.InitValue = nullptr; \
343 break; \
344 case ICV_ZERO: \
345 ICV.InitValue = ConstantInt::get( \
346 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
347 break; \
348 case ICV_FALSE: \
349 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
350 break; \
351 case ICV_LAST: \
352 break; \
353 } \
354 }
355#include "llvm/Frontend/OpenMP/OMPKinds.def"
356 }
357
358 /// Returns true if the function declaration \p F matches the runtime
359 /// function types, that is, return type \p RTFRetType, and argument types
360 /// \p RTFArgTypes (same count, same order, identical Type pointers). A null \p F never matches.
361 static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
362 SmallVector<Type *, 8> &RTFArgTypes) {
363 // TODO: We should output information to the user (under debug output
364 // and via remarks).
365
366 if (!F)
367 return false;
368 if (F->getReturnType() != RTFRetType)
369 return false;
370 if (F->arg_size() != RTFArgTypes.size())
371 return false;
372
373 auto *RTFTyIt = RTFArgTypes.begin();
374 for (Argument &Arg : F->args()) {
375 if (Arg.getType() != *RTFTyIt)
376 return false;
377
378 ++RTFTyIt;
379 }
380
381 return true;
382 }
383
384 // Helper to collect all uses of the declaration in the UsesMap. Returns the number of uses recorded (0 if there is no declaration); statistics are bumped only when \p CollectStats is set.
385 unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
386 unsigned NumUses = 0;
387 if (!RFI.Declaration)
388 return NumUses;
389 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
390
391 if (CollectStats) {
392 NumOpenMPRuntimeFunctionsIdentified += 1;
393 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
394 }
395
396 // TODO: We directly convert uses into proper calls and unknown uses.
397 for (Use &U : RFI.Declaration->uses()) {
398 if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
399 if (ModuleSlice.count(UserI->getFunction())) { // Instruction uses outside ModuleSlice are ignored.
400 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
401 ++NumUses;
402 }
403 } else { // Non-instruction users are filed under the nullptr function key.
404 RFI.getOrCreateUseVector(nullptr).push_back(&U);
405 ++NumUses;
406 }
407 }
408 return NumUses;
409 }
410
411 // Helper function to recollect uses of a runtime function: drops the recorded uses of \p RTF and collects them again without updating statistics.
412 void recollectUsesForFunction(RuntimeFunction RTF) {
413 auto &RFI = RFIs[RTF];
414 RFI.clearUsesMap();
415 collectUses(RFI, /*CollectStats*/ false);
416 }
417
418 // Helper function to recollect uses of all runtime functions, visiting every index of RFIs in order.
419 void recollectUses() {
420 for (int Idx = 0; Idx < RFIs.size(); ++Idx)
421 recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
422 }
423
424 /// Helper to initialize all runtime function information for those defined
425 /// in OMPKinds.def.
426 void initializeRuntimeFunctions() {
427 Module &M = *((*ModuleSlice.begin())->getParent());
428
429 // Helper macros for handling __VA_ARGS__ in OMP_RTL
430#define OMP_TYPE(VarName, ...) \
431 Type *VarName = OMPBuilder.VarName; \
432 (void)VarName;
433
434#define OMP_ARRAY_TYPE(VarName, ...) \
435 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
436 (void)VarName##Ty; \
437 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
438 (void)VarName##PtrTy;
439
440#define OMP_FUNCTION_TYPE(VarName, ...) \
441 FunctionType *VarName = OMPBuilder.VarName; \
442 (void)VarName; \
443 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
444 (void)VarName##Ptr;
445
446#define OMP_STRUCT_TYPE(VarName, ...) \
447 StructType *VarName = OMPBuilder.VarName; \
448 (void)VarName; \
449 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
450 (void)VarName##Ptr;
451
452#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
453 { \
454 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
455 Function *F = M.getFunction(_Name); \
456 RTLFunctions.insert(F); \
457 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
458 RuntimeFunctionIDMap[F] = _Enum; \
459 F->removeFnAttr(Attribute::NoInline); \
460 auto &RFI = RFIs[_Enum]; \
461 RFI.Kind = _Enum; \
462 RFI.Name = _Name; \
463 RFI.IsVarArg = _IsVarArg; \
464 RFI.ReturnType = OMPBuilder._ReturnType; \
465 RFI.ArgumentTypes = std::move(ArgsTypes); \
466 RFI.Declaration = F; \
467 unsigned NumUses = collectUses(RFI); \
468 (void)NumUses; \
469 LLVM_DEBUG({ \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
470 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
471 << " found\n"; \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
472 if (RFI.Declaration) \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
473 dbgs() << TAG << "-> got " << NumUses << " uses in " \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
474 << RFI.getNumFunctionsWithUses() \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
475 << " different functions.\n"; \do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
476 })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { dbgs() << TAG << RFI.Name <<
(RFI.Declaration ? "" : " not") << " found\n"; if (RFI
.Declaration) dbgs() << TAG << "-> got " <<
NumUses << " uses in " << RFI.getNumFunctionsWithUses
() << " different functions.\n"; }; } } while (false)
; \
477 } \
478 }
479#include "llvm/Frontend/OpenMP/OMPKinds.def"
480
481 // TODO: We should attach the attributes defined in OMPKinds.def.
482 }
483
484 /// Collection of known kernels (\see Kernel) in the module.
485 SmallPtrSetImpl<Kernel> &Kernels;
486
487 /// Collection of known OpenMP runtime functions.
488 DenseSet<const Function *> RTLFunctions;
489};
490
491template <typename Ty, bool InsertInvalidates = true>
492struct BooleanStateWithSetVector : public BooleanState { // A BooleanState paired with a SetVector of Ty elements.
493 bool contains(const Ty &Elem) const { return Set.contains(Elem); }
494 bool insert(const Ty &Elem) { // Forwards the result of SetVector::insert.
495 if (InsertInvalidates) // When enabled, any insertion attempt also forces the pessimistic fixpoint.
496 BooleanState::indicatePessimisticFixpoint();
497 return Set.insert(Elem);
498 }
499
500 const Ty &operator[](int Idx) const { return Set[Idx]; }
501 bool operator==(const BooleanStateWithSetVector &RHS) const { // Equal only if both the boolean state and the element set match.
502 return BooleanState::operator==(RHS) && Set == RHS.Set;
503 }
504 bool operator!=(const BooleanStateWithSetVector &RHS) const {
505 return !(*this == RHS);
506 }
507
508 bool empty() const { return Set.empty(); }
509 size_t size() const { return Set.size(); }
510
511 /// "Clamp" this state with \p RHS: combine the boolean parts via BooleanState::operator^= and add every element of \p RHS to this set.
512 BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
513 BooleanState::operator^=(RHS);
514 Set.insert(RHS.Set.begin(), RHS.Set.end());
515 return *this;
516 }
517
518private:
519 /// A set to keep track of elements.
520 SetVector<Ty> Set;
521
522public:
523 typename decltype(Set)::iterator begin() { return Set.begin(); }
524 typename decltype(Set)::iterator end() { return Set.end(); }
525 typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
526 typename decltype(Set)::const_iterator end() const { return Set.end(); }
527};
528
529template <typename Ty, bool InsertInvalidates = true>
530using BooleanStateWithPtrSetVector =
531 BooleanStateWithSetVector<Ty *, InsertInvalidates>;
532
533struct KernelInfoState : AbstractState {
534 /// Flag to track if we reached a fixpoint.
535 bool IsAtFixpoint = false;
536
537 /// The parallel regions (identified by the outlined parallel functions) that
538 /// can be reached from the associated function.
539 BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
540 ReachedKnownParallelRegions;
541
542 /// State to track what parallel region we might reach.
543 BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
544
545 /// State to track if we are in SPMD-mode, assumed or known, and why we decided
546 /// we cannot be. If it is assumed, then RequiresFullRuntime should also be
547 /// false.
548 BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
549
550 /// The __kmpc_target_init call in this kernel, if any. If we find more than
551 /// one we abort as the kernel is malformed.
552 CallBase *KernelInitCB = nullptr;
553
554 /// The __kmpc_target_deinit call in this kernel, if any. If we find more than
555 /// one we abort as the kernel is malformed.
556 CallBase *KernelDeinitCB = nullptr;
557
558 /// Flag to indicate if the associated function is a kernel entry.
559 bool IsKernelEntry = false;
560
561 /// State to track what kernel entries can reach the associated function.
562 BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
563
564 /// State to indicate if we can track parallel level of the associated
565 /// function. We will give up tracking if we encounter unknown caller or the
566 /// caller is __kmpc_parallel_51.
567 BooleanStateWithSetVector<uint8_t> ParallelLevels;
568
569 /// Abstract State interface
570 ///{
571
572 KernelInfoState() {}
573 KernelInfoState(bool BestState) {
574 if (!BestState)
575 indicatePessimisticFixpoint();
576 }
577
578 /// See AbstractState::isValidState(...)
579 bool isValidState() const override { return true; }
580
581 /// See AbstractState::isAtFixpoint(...)
582 bool isAtFixpoint() const override { return IsAtFixpoint; }
583
584 /// See AbstractState::indicatePessimisticFixpoint(...)
585 ChangeStatus indicatePessimisticFixpoint() override {
586 IsAtFixpoint = true;
587 ReachingKernelEntries.indicatePessimisticFixpoint();
588 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
589 ReachedKnownParallelRegions.indicatePessimisticFixpoint();
590 ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
591 return ChangeStatus::CHANGED;
592 }
593
594 /// See AbstractState::indicateOptimisticFixpoint(...)
595 ChangeStatus indicateOptimisticFixpoint() override {
596 IsAtFixpoint = true;
597 return ChangeStatus::UNCHANGED;
598 }
599
600 /// Return the assumed state
601 KernelInfoState &getAssumed() { return *this; }
602 const KernelInfoState &getAssumed() const { return *this; }
603
604 bool operator==(const KernelInfoState &RHS) const {
605 if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
606 return false;
607 if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
608 return false;
609 if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
610 return false;
611 if (ReachingKernelEntries != RHS.ReachingKernelEntries)
612 return false;
613 return true;
614 }
615
616 /// Returns true if this kernel contains any OpenMP parallel regions.
617 bool mayContainParallelRegion() {
618 return !ReachedKnownParallelRegions.empty() ||
619 !ReachedUnknownParallelRegions.empty();
620 }
621
622 /// Return empty set as the best state of potential values.
623 static KernelInfoState getBestState() { return KernelInfoState(true); }
624
625 static KernelInfoState getBestState(KernelInfoState &KIS) {
626 return getBestState();
627 }
628
629 /// Return full set as the worst state of potential values.
630 static KernelInfoState getWorstState() { return KernelInfoState(false); }
631
632 /// "Clamp" this state with \p KIS.
633 KernelInfoState operator^=(const KernelInfoState &KIS) {
634 // Do not merge two different _init and _deinit call sites.
635 if (KIS.KernelInitCB) {
636 if(KernelInitCB && KernelInitCB != KIS.KernelInitCB)
637 llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt assumptions.")::llvm::llvm_unreachable_internal("Kernel that calls another kernel violates OpenMP-Opt assumptions."
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 637)
;
638 KernelInitCB = KIS.KernelInitCB;
639 }
640 if (KIS.KernelDeinitCB) {
641 if(KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
642 llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt assumptions.")::llvm::llvm_unreachable_internal("Kernel that calls another kernel violates OpenMP-Opt assumptions."
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 642)
;
643 KernelDeinitCB = KIS.KernelDeinitCB;
644 }
645 SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
646 ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
647 ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
648 return *this;
649 }
650
651 KernelInfoState operator&=(const KernelInfoState &KIS) {
652 return (*this ^= KIS);
653 }
654
655 ///}
656};
657
658/// Used to map the values physically (in the IR) stored in an offload
659/// array, to a vector in memory.
660struct OffloadArray {
661 /// Physical array (in the IR).
662 AllocaInst *Array = nullptr;
663 /// Mapped values.
664 SmallVector<Value *, 8> StoredValues;
665 /// Last stores made in the offload array.
666 SmallVector<StoreInst *, 8> LastAccesses;
667
668 OffloadArray() = default;
669
670 /// Initializes the OffloadArray with the values stored in \p Array before
671 /// instruction \p Before is reached. Returns false if the initialization
672 /// fails.
673 /// This MUST be used immediately after the construction of the object.
674 bool initialize(AllocaInst &Array, Instruction &Before) {
675 if (!Array.getAllocatedType()->isArrayTy())
676 return false;
677
678 if (!getValues(Array, Before))
679 return false;
680
681 this->Array = &Array;
682 return true;
683 }
684
685 static const unsigned DeviceIDArgNum = 1;
686 static const unsigned BasePtrsArgNum = 3;
687 static const unsigned PtrsArgNum = 4;
688 static const unsigned SizesArgNum = 5;
689
690private:
691 /// Traverses the BasicBlock where \p Array is, collecting the stores made to
692 /// \p Array, leaving StoredValues with the values stored before the
693 /// instruction \p Before is reached.
694 bool getValues(AllocaInst &Array, Instruction &Before) {
695 // Initialize container.
696 const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
697 StoredValues.assign(NumValues, nullptr);
698 LastAccesses.assign(NumValues, nullptr);
699
700 // TODO: This assumes the instruction \p Before is in the same
701 // BasicBlock as Array. Make it general, for any control flow graph.
702 BasicBlock *BB = Array.getParent();
703 if (BB != Before.getParent())
704 return false;
705
706 const DataLayout &DL = Array.getModule()->getDataLayout();
707 const unsigned int PointerSize = DL.getPointerSize();
708
709 for (Instruction &I : *BB) {
710 if (&I == &Before)
711 break;
712
713 if (!isa<StoreInst>(&I))
714 continue;
715
716 auto *S = cast<StoreInst>(&I);
717 int64_t Offset = -1;
718 auto *Dst =
719 GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
720 if (Dst == &Array) {
721 int64_t Idx = Offset / PointerSize;
722 StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
723 LastAccesses[Idx] = S;
724 }
725 }
726
727 return isFilled();
728 }
729
730 /// Returns true if all values in StoredValues and
731 /// LastAccesses are not nullptrs.
732 bool isFilled() {
733 const unsigned NumValues = StoredValues.size();
734 for (unsigned I = 0; I < NumValues; ++I) {
735 if (!StoredValues[I] || !LastAccesses[I])
736 return false;
737 }
738
739 return true;
740 }
741};
742
743struct OpenMPOpt {
744
/// Callback type that yields the OptimizationRemarkEmitter to use for a given
/// function.
745 using OptimizationRemarkGetter =
746 function_ref<OptimizationRemarkEmitter &(Function *)>;
747
/// Bind the optimizer to \p SCC and the helpers it works with. The module is
/// taken from the parent of the first function in \p SCC, so \p SCC must not
/// be empty when this constructor runs.
748 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
749 OptimizationRemarkGetter OREGetter,
750 OMPInformationCache &OMPInfoCache, Attributor &A)
751 : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
752 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
753
754 /// Check if any remarks are enabled for openmp-opt
755 bool remarksEnabled() {
756 auto &Ctx = M.getContext();
757 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE"openmp-opt");
758 }
759
760 /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
761 bool run(bool IsModulePass) {
762 if (SCC.empty())
763 return false;
764
765 bool Changed = false;
766
767 LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Run on SCC with "
<< SCC.size() << " functions in a slice with " <<
OMPInfoCache.ModuleSlice.size() << " functions\n"; } }
while (false)
768 << " functions in a slice with "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Run on SCC with "
<< SCC.size() << " functions in a slice with " <<
OMPInfoCache.ModuleSlice.size() << " functions\n"; } }
while (false)
769 << OMPInfoCache.ModuleSlice.size() << " functions\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Run on SCC with "
<< SCC.size() << " functions in a slice with " <<
OMPInfoCache.ModuleSlice.size() << " functions\n"; } }
while (false)
;
770
771 if (IsModulePass) {
772 Changed |= runAttributor(IsModulePass);
773
774 // Recollect uses, in case Attributor deleted any.
775 OMPInfoCache.recollectUses();
776
777 // TODO: This should be folded into buildCustomStateMachine.
778 Changed |= rewriteDeviceCodeStateMachine();
779
780 if (remarksEnabled())
781 analysisGlobalization();
782 } else {
783 if (PrintICVValues)
784 printICVs();
785 if (PrintOpenMPKernels)
786 printKernels();
787
788 Changed |= runAttributor(IsModulePass);
789
790 // Recollect uses, in case Attributor deleted any.
791 OMPInfoCache.recollectUses();
792
793 Changed |= deleteParallelRegions();
794
795 if (HideMemoryTransferLatency)
796 Changed |= hideMemTransfersLatency();
797 Changed |= deduplicateRuntimeCalls();
798 if (EnableParallelRegionMerging) {
799 if (mergeParallelRegions()) {
800 deduplicateRuntimeCalls();
801 Changed = true;
802 }
803 }
804 }
805
806 return Changed;
807 }
808
809 /// Print initial ICV values for testing.
810 /// FIXME: This should be done from the Attributor once it is added.
811 void printICVs() const {
812 InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
813 ICV_proc_bind};
814
815 for (Function *F : OMPInfoCache.ModuleSlice) {
816 for (auto ICV : ICVs) {
817 auto ICVInfo = OMPInfoCache.ICVs[ICV];
818 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
819 return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
820 << " Value: "
821 << (ICVInfo.InitValue
822 ? toString(ICVInfo.InitValue->getValue(), 10, true)
823 : "IMPLEMENTATION_DEFINED");
824 };
825
826 emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);
827 }
828 }
829 }
830
831 /// Print OpenMP GPU kernels for testing.
832 void printKernels() const {
833 for (Function *F : SCC) {
834 if (!OMPInfoCache.Kernels.count(F))
835 continue;
836
837 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
838 return ORA << "OpenMP GPU kernel "
839 << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
840 };
841
842 emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);
843 }
844 }
845
846 /// Return the call if \p U is a callee use in a regular call. If \p RFI is
847 /// given it has to be the callee or a nullptr is returned.
848 static CallInst *getCallIfRegularCall(
849 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
850 CallInst *CI = dyn_cast<CallInst>(U.getUser());
851 if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
852 (!RFI ||
853 (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
854 return CI;
855 return nullptr;
856 }
857
858 /// Return the call if \p V is a regular call. If \p RFI is given it has to be
859 /// the callee or a nullptr is returned.
860 static CallInst *getCallIfRegularCall(
861 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
862 CallInst *CI = dyn_cast<CallInst>(&V);
863 if (CI && !CI->hasOperandBundles() &&
864 (!RFI ||
865 (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
866 return CI;
867 return nullptr;
868 }
869
870private:
871 /// Merge parallel regions when it is safe.
872 bool mergeParallelRegions() {
873 const unsigned CallbackCalleeOperand = 2;
874 const unsigned CallbackFirstArgOperand = 3;
875 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
876
877 // Check if there are any __kmpc_fork_call calls to merge.
878 OMPInformationCache::RuntimeFunctionInfo &RFI =
879 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
880
881 if (!RFI.Declaration)
882 return false;
883
884 // Unmergable calls that prevent merging a parallel region.
885 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
886 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
887 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
888 };
889
890 bool Changed = false;
891 LoopInfo *LI = nullptr;
892 DominatorTree *DT = nullptr;
893
894 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
895
896 BasicBlock *StartBB = nullptr, *EndBB = nullptr;
897 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
898 BasicBlock &ContinuationIP) {
899 BasicBlock *CGStartBB = CodeGenIP.getBlock();
900 BasicBlock *CGEndBB =
901 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
902 assert(StartBB != nullptr && "StartBB should not be null")(static_cast <bool> (StartBB != nullptr && "StartBB should not be null"
) ? void (0) : __assert_fail ("StartBB != nullptr && \"StartBB should not be null\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 902, __extension__ __PRETTY_FUNCTION__))
;
903 CGStartBB->getTerminator()->setSuccessor(0, StartBB);
904 assert(EndBB != nullptr && "EndBB should not be null")(static_cast <bool> (EndBB != nullptr && "EndBB should not be null"
) ? void (0) : __assert_fail ("EndBB != nullptr && \"EndBB should not be null\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 904, __extension__ __PRETTY_FUNCTION__))
;
905 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
906 };
907
908 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
909 Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
910 ReplacementValue = &Inner;
911 return CodeGenIP;
912 };
913
914 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
915
916 /// Create a sequential execution region within a merged parallel region,
917 /// encapsulated in a master construct with a barrier for synchronization.
918 auto CreateSequentialRegion = [&](Function *OuterFn,
919 BasicBlock *OuterPredBB,
920 Instruction *SeqStartI,
921 Instruction *SeqEndI) {
922 // Isolate the instructions of the sequential region to a separate
923 // block.
924 BasicBlock *ParentBB = SeqStartI->getParent();
925 BasicBlock *SeqEndBB =
926 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
927 BasicBlock *SeqAfterBB =
928 SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
929 BasicBlock *SeqStartBB =
930 SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
931
932 assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&(static_cast <bool> (ParentBB->getUniqueSuccessor() ==
SeqStartBB && "Expected a different CFG") ? void (0)
: __assert_fail ("ParentBB->getUniqueSuccessor() == SeqStartBB && \"Expected a different CFG\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 933, __extension__ __PRETTY_FUNCTION__))
933 "Expected a different CFG")(static_cast <bool> (ParentBB->getUniqueSuccessor() ==
SeqStartBB && "Expected a different CFG") ? void (0)
: __assert_fail ("ParentBB->getUniqueSuccessor() == SeqStartBB && \"Expected a different CFG\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 933, __extension__ __PRETTY_FUNCTION__))
;
934 const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
935 ParentBB->getTerminator()->eraseFromParent();
936
937 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
938 BasicBlock &ContinuationIP) {
939 BasicBlock *CGStartBB = CodeGenIP.getBlock();
940 BasicBlock *CGEndBB =
941 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
942 assert(SeqStartBB != nullptr && "SeqStartBB should not be null")(static_cast <bool> (SeqStartBB != nullptr && "SeqStartBB should not be null"
) ? void (0) : __assert_fail ("SeqStartBB != nullptr && \"SeqStartBB should not be null\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 942, __extension__ __PRETTY_FUNCTION__))
;
943 CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
944 assert(SeqEndBB != nullptr && "SeqEndBB should not be null")(static_cast <bool> (SeqEndBB != nullptr && "SeqEndBB should not be null"
) ? void (0) : __assert_fail ("SeqEndBB != nullptr && \"SeqEndBB should not be null\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 944, __extension__ __PRETTY_FUNCTION__))
;
945 SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
946 };
947 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
948
949 // Find outputs from the sequential region to outside users and
950 // broadcast their values to them.
951 for (Instruction &I : *SeqStartBB) {
952 SmallPtrSet<Instruction *, 4> OutsideUsers;
953 for (User *Usr : I.users()) {
954 Instruction &UsrI = *cast<Instruction>(Usr);
955 // Ignore outputs to LT intrinsics, code extraction for the merged
956 // parallel region will fix them.
957 if (UsrI.isLifetimeStartOrEnd())
958 continue;
959
960 if (UsrI.getParent() != SeqStartBB)
961 OutsideUsers.insert(&UsrI);
962 }
963
964 if (OutsideUsers.empty())
965 continue;
966
967 // Emit an alloca in the outer region to store the broadcasted
968 // value.
969 const DataLayout &DL = M.getDataLayout();
970 AllocaInst *AllocaI = new AllocaInst(
971 I.getType(), DL.getAllocaAddrSpace(), nullptr,
972 I.getName() + ".seq.output.alloc", &OuterFn->front().front());
973
974 // Emit a store instruction in the sequential BB to update the
975 // value.
976 new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
977
978 // Emit a load instruction and replace the use of the output value
979 // with it.
980 for (Instruction *UsrI : OutsideUsers) {
981 LoadInst *LoadI = new LoadInst(
982 I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI);
983 UsrI->replaceUsesOfWith(&I, LoadI);
984 }
985 }
986
987 OpenMPIRBuilder::LocationDescription Loc(
988 InsertPointTy(ParentBB, ParentBB->end()), DL);
989 InsertPointTy SeqAfterIP =
990 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
991
992 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
993
994 BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
995
996 LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFndo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "After sequential inlining "
<< *OuterFn << "\n"; } } while (false)
997 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "After sequential inlining "
<< *OuterFn << "\n"; } } while (false)
;
998 };
999
1000 // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
1001 // contained in BB and only separated by instructions that can be
1002 // redundantly executed in parallel. The block BB is split before the first
1003 // call (in MergableCIs) and after the last so the entire region we merge
1004 // into a single parallel region is contained in a single basic block
1005 // without any other instructions. We use the OpenMPIRBuilder to outline
1006 // that block and call the resulting function via __kmpc_fork_call.
1007 auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
1008 // TODO: Change the interface to allow single CIs expanded, e.g, to
1009 // include an outer loop.
1010 assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs")(static_cast <bool> (MergableCIs.size() > 1 &&
"Assumed multiple mergable CIs") ? void (0) : __assert_fail (
"MergableCIs.size() > 1 && \"Assumed multiple mergable CIs\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1010, __extension__ __PRETTY_FUNCTION__))
;
1011
1012 auto Remark = [&](OptimizationRemark OR) {
1013 OR << "Parallel region merged with parallel region"
1014 << (MergableCIs.size() > 2 ? "s" : "") << " at ";
1015 for (auto *CI : llvm::drop_begin(MergableCIs)) {
1016 OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
1017 if (CI != MergableCIs.back())
1018 OR << ", ";
1019 }
1020 return OR << ".";
1021 };
1022
1023 emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);
1024
1025 Function *OriginalFn = BB->getParent();
1026 LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Merge " <<
MergableCIs.size() << " parallel regions in " <<
OriginalFn->getName() << "\n"; } } while (false)
1027 << " parallel regions in " << OriginalFn->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Merge " <<
MergableCIs.size() << " parallel regions in " <<
OriginalFn->getName() << "\n"; } } while (false)
1028 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Merge " <<
MergableCIs.size() << " parallel regions in " <<
OriginalFn->getName() << "\n"; } } while (false)
;
1029
1030 // Isolate the calls to merge in a separate block.
1031 EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
1032 BasicBlock *AfterBB =
1033 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1034 StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
1035 "omp.par.merged");
1036
1037 assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG")(static_cast <bool> (BB->getUniqueSuccessor() == StartBB
&& "Expected a different CFG") ? void (0) : __assert_fail
("BB->getUniqueSuccessor() == StartBB && \"Expected a different CFG\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1037, __extension__ __PRETTY_FUNCTION__))
;
1038 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1039 BB->getTerminator()->eraseFromParent();
1040
1041 // Create sequential regions for sequential instructions that are
1042 // in-between mergable parallel regions.
1043 for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
1044 It != End; ++It) {
1045 Instruction *ForkCI = *It;
1046 Instruction *NextForkCI = *(It + 1);
1047
1048 // Continue if there are not in-between instructions.
1049 if (ForkCI->getNextNode() == NextForkCI)
1050 continue;
1051
1052 CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
1053 NextForkCI->getPrevNode());
1054 }
1055
1056 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
1057 DL);
1058 IRBuilder<>::InsertPoint AllocaIP(
1059 &OriginalFn->getEntryBlock(),
1060 OriginalFn->getEntryBlock().getFirstInsertionPt());
1061 // Create the merged parallel region with default proc binding, to
1062 // avoid overriding binding settings, and without explicit cancellation.
1063 InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
1064 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
1065 OMP_PROC_BIND_default, /* IsCancellable */ false);
1066 BranchInst::Create(AfterBB, AfterIP.getBlock());
1067
1068 // Perform the actual outlining.
1069 OMPInfoCache.OMPBuilder.finalize(OriginalFn,
1070 /* AllowExtractorSinking */ true);
1071
1072 Function *OutlinedFn = MergableCIs.front()->getCaller();
1073
1074 // Replace the __kmpc_fork_call calls with direct calls to the outlined
1075 // callbacks.
1076 SmallVector<Value *, 8> Args;
1077 for (auto *CI : MergableCIs) {
1078 Value *Callee =
1079 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
1080 FunctionType *FT =
1081 cast<FunctionType>(Callee->getType()->getPointerElementType());
1082 Args.clear();
1083 Args.push_back(OutlinedFn->getArg(0));
1084 Args.push_back(OutlinedFn->getArg(1));
1085 for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1086 ++U)
1087 Args.push_back(CI->getArgOperand(U));
1088
1089 CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
1090 if (CI->getDebugLoc())
1091 NewCI->setDebugLoc(CI->getDebugLoc());
1092
1093 // Forward parameter attributes from the callback to the callee.
1094 for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1095 ++U)
1096 for (const Attribute &A : CI->getAttributes().getParamAttrs(U))
1097 NewCI->addParamAttr(
1098 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1099
1100 // Emit an explicit barrier to replace the implicit fork-join barrier.
1101 if (CI != MergableCIs.back()) {
1102 // TODO: Remove barrier if the merged parallel region includes the
1103 // 'nowait' clause.
1104 OMPInfoCache.OMPBuilder.createBarrier(
1105 InsertPointTy(NewCI->getParent(),
1106 NewCI->getNextNode()->getIterator()),
1107 OMPD_parallel);
1108 }
1109
1110 CI->eraseFromParent();
1111 }
1112
1113 assert(OutlinedFn != OriginalFn && "Outlining failed")(static_cast <bool> (OutlinedFn != OriginalFn &&
"Outlining failed") ? void (0) : __assert_fail ("OutlinedFn != OriginalFn && \"Outlining failed\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1113, __extension__ __PRETTY_FUNCTION__))
;
1114 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1115 CGUpdater.reanalyzeFunction(*OriginalFn);
1116
1117 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1118
1119 return true;
1120 };
1121
1122 // Helper function that identifes sequences of
1123 // __kmpc_fork_call uses in a basic block.
1124 auto DetectPRsCB = [&](Use &U, Function &F) {
1125 CallInst *CI = getCallIfRegularCall(U, &RFI);
1126 BB2PRMap[CI->getParent()].insert(CI);
1127
1128 return false;
1129 };
1130
1131 BB2PRMap.clear();
1132 RFI.foreachUse(SCC, DetectPRsCB);
1133 SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
1134 // Find mergable parallel regions within a basic block that are
1135 // safe to merge, that is any in-between instructions can safely
1136 // execute in parallel after merging.
1137 // TODO: support merging across basic-blocks.
1138 for (auto &It : BB2PRMap) {
1139 auto &CIs = It.getSecond();
1140 if (CIs.size() < 2)
1141 continue;
1142
1143 BasicBlock *BB = It.getFirst();
1144 SmallVector<CallInst *, 4> MergableCIs;
1145
1146 /// Returns true if the instruction is mergable, false otherwise.
1147 /// A terminator instruction is unmergable by definition since merging
1148 /// works within a BB. Instructions before the mergable region are
1149 /// mergable if they are not calls to OpenMP runtime functions that may
1150 /// set different execution parameters for subsequent parallel regions.
1151 /// Instructions in-between parallel regions are mergable if they are not
1152 /// calls to any non-intrinsic function since that may call a non-mergable
1153 /// OpenMP runtime function.
1154 auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
1155 // We do not merge across BBs, hence return false (unmergable) if the
1156 // instruction is a terminator.
1157 if (I.isTerminator())
1158 return false;
1159
1160 if (!isa<CallInst>(&I))
1161 return true;
1162
1163 CallInst *CI = cast<CallInst>(&I);
1164 if (IsBeforeMergableRegion) {
1165 Function *CalledFunction = CI->getCalledFunction();
1166 if (!CalledFunction)
1167 return false;
1168 // Return false (unmergable) if the call before the parallel
1169 // region calls an explicit affinity (proc_bind) or number of
1170 // threads (num_threads) compiler-generated function. Those settings
1171 // may be incompatible with following parallel regions.
1172 // TODO: ICV tracking to detect compatibility.
1173 for (const auto &RFI : UnmergableCallsInfo) {
1174 if (CalledFunction == RFI.Declaration)
1175 return false;
1176 }
1177 } else {
1178 // Return false (unmergable) if there is a call instruction
1179 // in-between parallel regions when it is not an intrinsic. It
1180 // may call an unmergable OpenMP runtime function in its callpath.
1181 // TODO: Keep track of possible OpenMP calls in the callpath.
1182 if (!isa<IntrinsicInst>(CI))
1183 return false;
1184 }
1185
1186 return true;
1187 };
1188 // Find maximal number of parallel region CIs that are safe to merge.
1189 for (auto It = BB->begin(), End = BB->end(); It != End;) {
1190 Instruction &I = *It;
1191 ++It;
1192
1193 if (CIs.count(&I)) {
1194 MergableCIs.push_back(cast<CallInst>(&I));
1195 continue;
1196 }
1197
1198 // Continue expanding if the instruction is mergable.
1199 if (IsMergable(I, MergableCIs.empty()))
1200 continue;
1201
1202 // Forward the instruction iterator to skip the next parallel region
1203 // since there is an unmergable instruction which can affect it.
1204 for (; It != End; ++It) {
1205 Instruction &SkipI = *It;
1206 if (CIs.count(&SkipI)) {
1207 LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Skip parallel region "
<< SkipI << " due to " << I << "\n";
} } while (false)
1208 << " due to " << I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Skip parallel region "
<< SkipI << " due to " << I << "\n";
} } while (false)
;
1209 ++It;
1210 break;
1211 }
1212 }
1213
1214 // Store mergable regions found.
1215 if (MergableCIs.size() > 1) {
1216 MergableCIsVector.push_back(MergableCIs);
1217 LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
MergableCIs.size() << " parallel regions in block " <<
BB->getName() << " of function " << BB->getParent
()->getName() << "\n";; } } while (false)
1218 << " parallel regions in block " << BB->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
MergableCIs.size() << " parallel regions in block " <<
BB->getName() << " of function " << BB->getParent
()->getName() << "\n";; } } while (false)
1219 << " of function " << BB->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
MergableCIs.size() << " parallel regions in block " <<
BB->getName() << " of function " << BB->getParent
()->getName() << "\n";; } } while (false)
1220 << "\n";)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
MergableCIs.size() << " parallel regions in block " <<
BB->getName() << " of function " << BB->getParent
()->getName() << "\n";; } } while (false)
;
1221 }
1222
1223 MergableCIs.clear();
1224 }
1225
1226 if (!MergableCIsVector.empty()) {
1227 Changed = true;
1228
1229 for (auto &MergableCIs : MergableCIsVector)
1230 Merge(MergableCIs, BB);
1231 MergableCIsVector.clear();
1232 }
1233 }
1234
1235 if (Changed) {
1236 /// Re-collect use for fork calls, emitted barrier calls, and
1237 /// any emitted master/end_master calls.
1238 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1239 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1240 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1241 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1242 }
1243
1244 return Changed;
1245 }
1246
1247 /// Try to delete parallel regions if possible.
1248 bool deleteParallelRegions() {
1249 const unsigned CallbackCalleeOperand = 2;
1250
1251 OMPInformationCache::RuntimeFunctionInfo &RFI =
1252 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1253
1254 if (!RFI.Declaration)
1255 return false;
1256
1257 bool Changed = false;
1258 auto DeleteCallCB = [&](Use &U, Function &) {
1259 CallInst *CI = getCallIfRegularCall(U);
1260 if (!CI)
1261 return false;
1262 auto *Fn = dyn_cast<Function>(
1263 CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
1264 if (!Fn)
1265 return false;
1266 if (!Fn->onlyReadsMemory())
1267 return false;
1268 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1269 return false;
1270
1271 LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Delete read-only parallel region in "
<< CI->getCaller()->getName() << "\n"; } }
while (false)
1272 << CI->getCaller()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Delete read-only parallel region in "
<< CI->getCaller()->getName() << "\n"; } }
while (false)
;
1273
1274 auto Remark = [&](OptimizationRemark OR) {
1275 return OR << "Removing parallel region with no side-effects.";
1276 };
1277 emitRemark<OptimizationRemark>(CI, "OMP160", Remark);
1278
1279 CGUpdater.removeCallSite(*CI);
1280 CI->eraseFromParent();
1281 Changed = true;
1282 ++NumOpenMPParallelRegionsDeleted;
1283 return true;
1284 };
1285
1286 RFI.foreachUse(SCC, DeleteCallCB);
1287
1288 return Changed;
1289 }
1290
1291 /// Try to eliminate runtime calls by reusing existing ones.
1292 bool deduplicateRuntimeCalls() {
1293 bool Changed = false;
1294
1295 RuntimeFunction DeduplicableRuntimeCallIDs[] = {
1296 OMPRTL_omp_get_num_threads,
1297 OMPRTL_omp_in_parallel,
1298 OMPRTL_omp_get_cancellation,
1299 OMPRTL_omp_get_thread_limit,
1300 OMPRTL_omp_get_supported_active_levels,
1301 OMPRTL_omp_get_level,
1302 OMPRTL_omp_get_ancestor_thread_num,
1303 OMPRTL_omp_get_team_size,
1304 OMPRTL_omp_get_active_level,
1305 OMPRTL_omp_in_final,
1306 OMPRTL_omp_get_proc_bind,
1307 OMPRTL_omp_get_num_places,
1308 OMPRTL_omp_get_num_procs,
1309 OMPRTL_omp_get_place_num,
1310 OMPRTL_omp_get_partition_num_places,
1311 OMPRTL_omp_get_partition_place_nums};
1312
1313 // Global-tid is handled separately.
1314 SmallSetVector<Value *, 16> GTIdArgs;
1315 collectGlobalThreadIdArguments(GTIdArgs);
1316 LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
GTIdArgs.size() << " global thread ID arguments\n"; } }
while (false)
1317 << " global thread ID arguments\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Found " <<
GTIdArgs.size() << " global thread ID arguments\n"; } }
while (false)
;
1318
1319 for (Function *F : SCC) {
1320 for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1321 Changed |= deduplicateRuntimeCalls(
1322 *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1323
1324 // __kmpc_global_thread_num is special as we can replace it with an
1325 // argument in enough cases to make it worth trying.
1326 Value *GTIdArg = nullptr;
1327 for (Argument &Arg : F->args())
1328 if (GTIdArgs.count(&Arg)) {
1329 GTIdArg = &Arg;
1330 break;
1331 }
1332 Changed |= deduplicateRuntimeCalls(
1333 *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1334 }
1335
1336 return Changed;
1337 }
1338
1339 /// Tries to hide the latency of runtime calls that involve host to
1340 /// device memory transfers by splitting them into their "issue" and "wait"
1341 /// versions. The "issue" is moved upwards as much as possible. The "wait" is
1342 /// moved downards as much as possible. The "issue" issues the memory transfer
1343 /// asynchronously, returning a handle. The "wait" waits in the returned
1344 /// handle for the memory transfer to finish.
1345 bool hideMemTransfersLatency() {
1346 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1347 bool Changed = false;
1348 auto SplitMemTransfers = [&](Use &U, Function &Decl) {
1349 auto *RTCall = getCallIfRegularCall(U, &RFI);
1350 if (!RTCall)
1351 return false;
1352
1353 OffloadArray OffloadArrays[3];
1354 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1355 return false;
1356
1357 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dumpValuesInOffloadArrays(OffloadArrays); }
} while (false)
;
1358
1359 // TODO: Check if can be moved upwards.
1360 bool WasSplit = false;
1361 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1362 if (WaitMovementPoint)
1363 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1364
1365 Changed |= WasSplit;
1366 return WasSplit;
1367 };
1368 RFI.foreachUse(SCC, SplitMemTransfers);
1369
1370 return Changed;
1371 }
1372
1373 void analysisGlobalization() {
1374 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1375
1376 auto CheckGlobalization = [&](Use &U, Function &Decl) {
1377 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1378 auto Remark = [&](OptimizationRemarkMissed ORM) {
1379 return ORM
1380 << "Found thread data sharing on the GPU. "
1381 << "Expect degraded performance due to data globalization.";
1382 };
1383 emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);
1384 }
1385
1386 return false;
1387 };
1388
1389 RFI.foreachUse(SCC, CheckGlobalization);
1390 }
1391
1392 /// Maps the values stored in the offload arrays passed as arguments to
1393 /// \p RuntimeCall into the offload arrays in \p OAs.
1394 bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1395 MutableArrayRef<OffloadArray> OAs) {
1396 assert(OAs.size() == 3 && "Need space for three offload arrays!")(static_cast <bool> (OAs.size() == 3 && "Need space for three offload arrays!"
) ? void (0) : __assert_fail ("OAs.size() == 3 && \"Need space for three offload arrays!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1396, __extension__ __PRETTY_FUNCTION__))
;
1397
1398 // A runtime call that involves memory offloading looks something like:
1399 // call void @__tgt_target_data_begin_mapper(arg0, arg1,
1400 // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
1401 // ...)
1402 // So, the idea is to access the allocas that allocate space for these
1403 // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
1404 // Therefore:
1405 // i8** %offload_baseptrs.
1406 Value *BasePtrsArg =
1407 RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
1408 // i8** %offload_ptrs.
1409 Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
1410 // i8** %offload_sizes.
1411 Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
1412
1413 // Get values stored in **offload_baseptrs.
1414 auto *V = getUnderlyingObject(BasePtrsArg);
1415 if (!isa<AllocaInst>(V))
1416 return false;
1417 auto *BasePtrsArray = cast<AllocaInst>(V);
1418 if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
1419 return false;
1420
1421 // Get values stored in **offload_baseptrs.
1422 V = getUnderlyingObject(PtrsArg);
1423 if (!isa<AllocaInst>(V))
1424 return false;
1425 auto *PtrsArray = cast<AllocaInst>(V);
1426 if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
1427 return false;
1428
1429 // Get values stored in **offload_sizes.
1430 V = getUnderlyingObject(SizesArg);
1431 // If it's a [constant] global array don't analyze it.
1432 if (isa<GlobalValue>(V))
1433 return isa<Constant>(V);
1434 if (!isa<AllocaInst>(V))
1435 return false;
1436
1437 auto *SizesArray = cast<AllocaInst>(V);
1438 if (!OAs[2].initialize(*SizesArray, RuntimeCall))
1439 return false;
1440
1441 return true;
1442 }
1443
1444 /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
1445 /// For now this is a way to test that the function getValuesInOffloadArrays
1446 /// is working properly.
1447 /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
1448 void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
1449 assert(OAs.size() == 3 && "There are three offload arrays to debug!")(static_cast <bool> (OAs.size() == 3 && "There are three offload arrays to debug!"
) ? void (0) : __assert_fail ("OAs.size() == 3 && \"There are three offload arrays to debug!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1449, __extension__ __PRETTY_FUNCTION__))
;
1450
1451 LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << " Successfully got offload values:\n"
; } } while (false)
;
1452 std::string ValuesStr;
1453 raw_string_ostream Printer(ValuesStr);
1454 std::string Separator = " --- ";
1455
1456 for (auto *BP : OAs[0].StoredValues) {
1457 BP->print(Printer);
1458 Printer << Separator;
1459 }
1460 LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "\t\toffload_baseptrs: " <<
Printer.str() << "\n"; } } while (false)
;
1461 ValuesStr.clear();
1462
1463 for (auto *P : OAs[1].StoredValues) {
1464 P->print(Printer);
1465 Printer << Separator;
1466 }
1467 LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "\t\toffload_ptrs: " <<
Printer.str() << "\n"; } } while (false)
;
1468 ValuesStr.clear();
1469
1470 for (auto *S : OAs[2].StoredValues) {
1471 S->print(Printer);
1472 Printer << Separator;
1473 }
1474 LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "\t\toffload_sizes: " <<
Printer.str() << "\n"; } } while (false)
;
1475 }
1476
1477 /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
1478 /// moved. Returns nullptr if the movement is not possible, or not worth it.
1479 Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1480 // FIXME: This traverses only the BasicBlock where RuntimeCall is.
1481 // Make it traverse the CFG.
1482
1483 Instruction *CurrentI = &RuntimeCall;
1484 bool IsWorthIt = false;
1485 while ((CurrentI = CurrentI->getNextNode())) {
1486
1487 // TODO: Once we detect the regions to be offloaded we should use the
1488 // alias analysis manager to check if CurrentI may modify one of
1489 // the offloaded regions.
1490 if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
1491 if (IsWorthIt)
1492 return CurrentI;
1493
1494 return nullptr;
1495 }
1496
1497 // FIXME: For now if we move it over anything without side effect
1498 // is worth it.
1499 IsWorthIt = true;
1500 }
1501
1502 // Return end of BasicBlock.
1503 return RuntimeCall.getParent()->getTerminator();
1504 }
1505
1506 /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
1507 bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1508 Instruction &WaitMovementPoint) {
1509 // Create stack allocated handle (__tgt_async_info) at the beginning of the
1510 // function. Used for storing information of the async transfer, allowing to
1511 // wait on it later.
1512 auto &IRBuilder = OMPInfoCache.OMPBuilder;
1513 auto *F = RuntimeCall.getCaller();
1514 Instruction *FirstInst = &(F->getEntryBlock().front());
1515 AllocaInst *Handle = new AllocaInst(
1516 IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);
1517
1518 // Add "issue" runtime call declaration:
1519 // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
1520 // i8**, i8**, i64*, i64*)
1521 FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
1522 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1523
1524 // Change RuntimeCall call site for its asynchronous version.
1525 SmallVector<Value *, 16> Args;
1526 for (auto &Arg : RuntimeCall.args())
1527 Args.push_back(Arg.get());
1528 Args.push_back(Handle);
1529
1530 CallInst *IssueCallsite =
1531 CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
1532 RuntimeCall.eraseFromParent();
1533
1534 // Add "wait" runtime call declaration:
1535 // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
1536 FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
1537 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1538
1539 Value *WaitParams[2] = {
1540 IssueCallsite->getArgOperand(
1541 OffloadArray::DeviceIDArgNum), // device_id.
1542 Handle // handle to wait on.
1543 };
1544 CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
1545
1546 return true;
1547 }
1548
1549 static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
1550 bool GlobalOnly, bool &SingleChoice) {
1551 if (CurrentIdent == NextIdent)
1552 return CurrentIdent;
1553
1554 // TODO: Figure out how to actually combine multiple debug locations. For
1555 // now we just keep an existing one if there is a single choice.
1556 if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
1557 SingleChoice = !CurrentIdent;
1558 return NextIdent;
1559 }
1560 return nullptr;
1561 }
1562
1563 /// Return an `struct ident_t*` value that represents the ones used in the
1564 /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
1565 /// return a local `struct ident_t*`. For now, if we cannot find a suitable
1566 /// return value we create one from scratch. We also do not yet combine
1567 /// information, e.g., the source locations, see combinedIdentStruct.
1568 Value *
1569 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1570 Function &F, bool GlobalOnly) {
1571 bool SingleChoice = true;
1572 Value *Ident = nullptr;
1573 auto CombineIdentStruct = [&](Use &U, Function &Caller) {
1574 CallInst *CI = getCallIfRegularCall(U, &RFI);
1575 if (!CI || &F != &Caller)
1576 return false;
1577 Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
1578 /* GlobalOnly */ true, SingleChoice);
1579 return false;
1580 };
1581 RFI.foreachUse(SCC, CombineIdentStruct);
1582
1583 if (!Ident || !SingleChoice) {
1584 // The IRBuilder uses the insertion block to get to the module, this is
1585 // unfortunate but we work around it for now.
1586 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1587 OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
1588 &F.getEntryBlock(), F.getEntryBlock().begin()));
1589 // Create a fallback location if non was found.
1590 // TODO: Use the debug locations of the calls instead.
1591 Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
1592 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
1593 }
1594 return Ident;
1595 }
1596
1597 /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
1598 /// \p ReplVal if given.
1599 bool deduplicateRuntimeCalls(Function &F,
1600 OMPInformationCache::RuntimeFunctionInfo &RFI,
1601 Value *ReplVal = nullptr) {
1602 auto *UV = RFI.getUseVector(F);
1603 if (!UV || UV->size() + (ReplVal != nullptr) < 2)
1604 return false;
1605
1606 LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Deduplicate "
<< UV->size() << " uses of " << RFI.Name
<< (ReplVal ? " with an existing value\n" : "\n") <<
"\n"; } } while (false)
1607 dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Namedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Deduplicate "
<< UV->size() << " uses of " << RFI.Name
<< (ReplVal ? " with an existing value\n" : "\n") <<
"\n"; } } while (false)
1608 << (ReplVal ? " with an existing value\n" : "\n") << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Deduplicate "
<< UV->size() << " uses of " << RFI.Name
<< (ReplVal ? " with an existing value\n" : "\n") <<
"\n"; } } while (false)
;
1609
1610 assert((!ReplVal || (isa<Argument>(ReplVal) &&(static_cast <bool> ((!ReplVal || (isa<Argument>(
ReplVal) && cast<Argument>(ReplVal)->getParent
() == &F)) && "Unexpected replacement value!") ? void
(0) : __assert_fail ("(!ReplVal || (isa<Argument>(ReplVal) && cast<Argument>(ReplVal)->getParent() == &F)) && \"Unexpected replacement value!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1612, __extension__ __PRETTY_FUNCTION__))
1611 cast<Argument>(ReplVal)->getParent() == &F)) &&(static_cast <bool> ((!ReplVal || (isa<Argument>(
ReplVal) && cast<Argument>(ReplVal)->getParent
() == &F)) && "Unexpected replacement value!") ? void
(0) : __assert_fail ("(!ReplVal || (isa<Argument>(ReplVal) && cast<Argument>(ReplVal)->getParent() == &F)) && \"Unexpected replacement value!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1612, __extension__ __PRETTY_FUNCTION__))
1612 "Unexpected replacement value!")(static_cast <bool> ((!ReplVal || (isa<Argument>(
ReplVal) && cast<Argument>(ReplVal)->getParent
() == &F)) && "Unexpected replacement value!") ? void
(0) : __assert_fail ("(!ReplVal || (isa<Argument>(ReplVal) && cast<Argument>(ReplVal)->getParent() == &F)) && \"Unexpected replacement value!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1612, __extension__ __PRETTY_FUNCTION__))
;
1613
1614 // TODO: Use dominance to find a good position instead.
1615 auto CanBeMoved = [this](CallBase &CB) {
1616 unsigned NumArgs = CB.arg_size();
1617 if (NumArgs == 0)
1618 return true;
1619 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1620 return false;
1621 for (unsigned U = 1; U < NumArgs; ++U)
1622 if (isa<Instruction>(CB.getArgOperand(U)))
1623 return false;
1624 return true;
1625 };
1626
1627 if (!ReplVal) {
1628 for (Use *U : *UV)
1629 if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1630 if (!CanBeMoved(*CI))
1631 continue;
1632
1633 // If the function is a kernel, dedup will move
1634 // the runtime call right after the kernel init callsite. Otherwise,
1635 // it will move it to the beginning of the caller function.
1636 if (isKernel(F)) {
1637 auto &KernelInitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
1638 auto *KernelInitUV = KernelInitRFI.getUseVector(F);
1639
1640 if (KernelInitUV->empty())
1641 continue;
1642
1643 assert(KernelInitUV->size() == 1 &&(static_cast <bool> (KernelInitUV->size() == 1 &&
"Expected a single __kmpc_target_init in kernel\n") ? void (
0) : __assert_fail ("KernelInitUV->size() == 1 && \"Expected a single __kmpc_target_init in kernel\\n\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1644, __extension__ __PRETTY_FUNCTION__))
1644 "Expected a single __kmpc_target_init in kernel\n")(static_cast <bool> (KernelInitUV->size() == 1 &&
"Expected a single __kmpc_target_init in kernel\n") ? void (
0) : __assert_fail ("KernelInitUV->size() == 1 && \"Expected a single __kmpc_target_init in kernel\\n\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1644, __extension__ __PRETTY_FUNCTION__))
;
1645
1646 CallInst *KernelInitCI =
1647 getCallIfRegularCall(*KernelInitUV->front(), &KernelInitRFI);
1648 assert(KernelInitCI &&(static_cast <bool> (KernelInitCI && "Expected a call to __kmpc_target_init in kernel\n"
) ? void (0) : __assert_fail ("KernelInitCI && \"Expected a call to __kmpc_target_init in kernel\\n\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1649, __extension__ __PRETTY_FUNCTION__))
1649 "Expected a call to __kmpc_target_init in kernel\n")(static_cast <bool> (KernelInitCI && "Expected a call to __kmpc_target_init in kernel\n"
) ? void (0) : __assert_fail ("KernelInitCI && \"Expected a call to __kmpc_target_init in kernel\\n\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1649, __extension__ __PRETTY_FUNCTION__))
;
1650
1651 CI->moveAfter(KernelInitCI);
1652 } else
1653 CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
1654 ReplVal = CI;
1655 break;
1656 }
1657 if (!ReplVal)
1658 return false;
1659 }
1660
1661 // If we use a call as a replacement value we need to make sure the ident is
1662 // valid at the new location. For now we just pick a global one, either
1663 // existing and used by one of the calls, or created from scratch.
1664 if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1665 if (!CI->arg_empty() &&
1666 CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
1667 Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
1668 /* GlobalOnly */ true);
1669 CI->setArgOperand(0, Ident);
1670 }
1671 }
1672
1673 bool Changed = false;
1674 auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
1675 CallInst *CI = getCallIfRegularCall(U, &RFI);
1676 if (!CI || CI == ReplVal || &F != &Caller)
1677 return false;
1678 assert(CI->getCaller() == &F && "Unexpected call!")(static_cast <bool> (CI->getCaller() == &F &&
"Unexpected call!") ? void (0) : __assert_fail ("CI->getCaller() == &F && \"Unexpected call!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 1678, __extension__ __PRETTY_FUNCTION__))
;
1679
1680 auto Remark = [&](OptimizationRemark OR) {
1681 return OR << "OpenMP runtime call "
1682 << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
1683 };
1684 if (CI->getDebugLoc())
1685 emitRemark<OptimizationRemark>(CI, "OMP170", Remark);
1686 else
1687 emitRemark<OptimizationRemark>(&F, "OMP170", Remark);
1688
1689 CGUpdater.removeCallSite(*CI);
1690 CI->replaceAllUsesWith(ReplVal);
1691 CI->eraseFromParent();
1692 ++NumOpenMPRuntimeCallsDeduplicated;
1693 Changed = true;
1694 return true;
1695 };
1696 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1697
1698 return Changed;
1699 }
1700
1701 /// Collect arguments that represent the global thread id in \p GTIdArgs.
1702 void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
1703 // TODO: Below we basically perform a fixpoint iteration with a pessimistic
1704 // initialization. We could define an AbstractAttribute instead and
1705 // run the Attributor here once it can be run as an SCC pass.
1706
1707 // Helper to check the argument \p ArgNo at all call sites of \p F for
1708 // a GTId.
1709 auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
1710 if (!F.hasLocalLinkage())
1711 return false;
1712 for (Use &U : F.uses()) {
1713 if (CallInst *CI = getCallIfRegularCall(U)) {
1714 Value *ArgOp = CI->getArgOperand(ArgNo);
1715 if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
1716 getCallIfRegularCall(
1717 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1718 continue;
1719 }
1720 return false;
1721 }
1722 return true;
1723 };
1724
1725 // Helper to identify uses of a GTId as GTId arguments.
1726 auto AddUserArgs = [&](Value &GTId) {
1727 for (Use &U : GTId.uses())
1728 if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
1729 if (CI->isArgOperand(&U))
1730 if (Function *Callee = CI->getCalledFunction())
1731 if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
1732 GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
1733 };
1734
1735 // The argument users of __kmpc_global_thread_num calls are GTIds.
1736 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1737 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1738
1739 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
1740 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1741 AddUserArgs(*CI);
1742 return false;
1743 });
1744
1745 // Transitively search for more arguments by looking at the users of the
1746 // ones we know already. During the search the GTIdArgs vector is extended
1747 // so we cannot cache the size nor can we use a range based for.
1748 for (unsigned U = 0; U < GTIdArgs.size(); ++U)
1749 AddUserArgs(*GTIdArgs[U]);
1750 }
1751
1752 /// Kernel (=GPU) optimizations and utility functions
1753 ///
1754 ///{{
1755
1756 /// Check if \p F is a kernel, hence entry point for target offloading.
1757 bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
1758
1759 /// Cache to remember the unique kernel for a function.
1760 DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
1761
1762 /// Find the unique kernel that will execute \p F, if any.
1763 Kernel getUniqueKernelFor(Function &F);
1764
1765 /// Find the unique kernel that will execute \p I, if any.
1766 Kernel getUniqueKernelFor(Instruction &I) {
1767 return getUniqueKernelFor(*I.getFunction());
1768 }
1769
1770 /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
1771 /// the cases we can avoid taking the address of a function.
1772 bool rewriteDeviceCodeStateMachine();
1773
1774 ///
1775 ///}}
1776
1777 /// Emit a remark generically
1778 ///
1779 /// This template function can be used to generically emit a remark. The
1780 /// RemarkKind should be one of the following:
1781 /// - OptimizationRemark to indicate a successful optimization attempt
1782 /// - OptimizationRemarkMissed to report a failed optimization attempt
1783 /// - OptimizationRemarkAnalysis to provide additional information about an
1784 /// optimization attempt
1785 ///
1786 /// The remark is built using a callback function provided by the caller that
1787 /// takes a RemarkKind as input and returns a RemarkKind.
1788 template <typename RemarkKind, typename RemarkCallBack>
1789 void emitRemark(Instruction *I, StringRef RemarkName,
1790 RemarkCallBack &&RemarkCB) const {
1791 Function *F = I->getParent()->getParent();
1792 auto &ORE = OREGetter(F);
1793
1794 if (RemarkName.startswith("OMP"))
1795 ORE.emit([&]() {
1796 return RemarkCB(RemarkKind(DEBUG_TYPE"openmp-opt", RemarkName, I))
1797 << " [" << RemarkName << "]";
1798 });
1799 else
1800 ORE.emit(
1801 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE"openmp-opt", RemarkName, I)); });
1802 }
1803
1804 /// Emit a remark on a function.
1805 template <typename RemarkKind, typename RemarkCallBack>
1806 void emitRemark(Function *F, StringRef RemarkName,
1807 RemarkCallBack &&RemarkCB) const {
1808 auto &ORE = OREGetter(F);
1809
1810 if (RemarkName.startswith("OMP"))
1811 ORE.emit([&]() {
1812 return RemarkCB(RemarkKind(DEBUG_TYPE"openmp-opt", RemarkName, F))
1813 << " [" << RemarkName << "]";
1814 });
1815 else
1816 ORE.emit(
1817 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE"openmp-opt", RemarkName, F)); });
1818 }
1819
1820 /// RAII struct to temporarily change an RTL function's linkage to external.
1821 /// This prevents it from being mistakenly removed by other optimizations.
1822 struct ExternalizationRAII {
1823 ExternalizationRAII(OMPInformationCache &OMPInfoCache,
1824 RuntimeFunction RFKind)
1825 : Declaration(OMPInfoCache.RFIs[RFKind].Declaration) {
1826 if (!Declaration)
1827 return;
1828
1829 LinkageType = Declaration->getLinkage();
1830 Declaration->setLinkage(GlobalValue::ExternalLinkage);
1831 }
1832
1833 ~ExternalizationRAII() {
1834 if (!Declaration)
1835 return;
1836
1837 Declaration->setLinkage(LinkageType);
1838 }
1839
1840 Function *Declaration;
1841 GlobalValue::LinkageTypes LinkageType;
1842 };
1843
1844 /// The underlying module.
1845 Module &M;
1846
1847 /// The SCC we are operating on.
1848 SmallVectorImpl<Function *> &SCC;
1849
1850 /// Callback to update the call graph, the first argument is a removed call,
1851 /// the second an optional replacement call.
1852 CallGraphUpdater &CGUpdater;
1853
1854 /// Callback to get an OptimizationRemarkEmitter from a Function *
1855 OptimizationRemarkGetter OREGetter;
1856
1857 /// OpenMP-specific information cache. Also used for Attributor runs.
1858 OMPInformationCache &OMPInfoCache;
1859
1860 /// Attributor instance.
1861 Attributor &A;
1862
1863 /// Helper function to run Attributor on SCC.
1864 bool runAttributor(bool IsModulePass) {
1865 if (SCC.empty())
1866 return false;
1867
1868 // Temporarily make these function have external linkage so the Attributor
1869 // doesn't remove them when we try to look them up later.
1870 ExternalizationRAII Parallel(OMPInfoCache, OMPRTL___kmpc_kernel_parallel);
1871 ExternalizationRAII EndParallel(OMPInfoCache,
1872 OMPRTL___kmpc_kernel_end_parallel);
1873 ExternalizationRAII BarrierSPMD(OMPInfoCache,
1874 OMPRTL___kmpc_barrier_simple_spmd);
1875 ExternalizationRAII ThreadId(OMPInfoCache,
1876 OMPRTL___kmpc_get_hardware_thread_id_in_block);
1877
1878 registerAAs(IsModulePass);
1879
1880 ChangeStatus Changed = A.run();
1881
1882 LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "[Attributor] Done with " <<
SCC.size() << " functions, result: " << Changed <<
".\n"; } } while (false)
1883 << " functions, result: " << Changed << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << "[Attributor] Done with " <<
SCC.size() << " functions, result: " << Changed <<
".\n"; } } while (false)
;
1884
1885 return Changed == ChangeStatus::CHANGED;
1886 }
1887
1888 void registerFoldRuntimeCall(RuntimeFunction RF);
1889
1890 /// Populate the Attributor with abstract attribute opportunities in the
1891 /// function.
1892 void registerAAs(bool IsModulePass);
1893};
1894
1895Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
1896 if (!OMPInfoCache.ModuleSlice.count(&F))
1897 return nullptr;
1898
1899 // Use a scope to keep the lifetime of the CachedKernel short.
1900 {
1901 Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
1902 if (CachedKernel)
1903 return *CachedKernel;
1904
1905 // TODO: We should use an AA to create an (optimistic and callback
1906 // call-aware) call graph. For now we stick to simple patterns that
1907 // are less powerful, basically the worst fixpoint.
1908 if (isKernel(F)) {
1909 CachedKernel = Kernel(&F);
1910 return *CachedKernel;
1911 }
1912
1913 CachedKernel = nullptr;
1914 if (!F.hasLocalLinkage()) {
1915
1916 // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
1917 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1918 return ORA << "Potentially unknown OpenMP target region caller.";
1919 };
1920 emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);
1921
1922 return nullptr;
1923 }
1924 }
1925
1926 auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
1927 if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
1928 // Allow use in equality comparisons.
1929 if (Cmp->isEquality())
1930 return getUniqueKernelFor(*Cmp);
1931 return nullptr;
1932 }
1933 if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
1934 // Allow direct calls.
1935 if (CB->isCallee(&U))
1936 return getUniqueKernelFor(*CB);
1937
1938 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
1939 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
1940 // Allow the use in __kmpc_parallel_51 calls.
1941 if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
1942 return getUniqueKernelFor(*CB);
1943 return nullptr;
1944 }
1945 // Disallow every other use.
1946 return nullptr;
1947 };
1948
1949 // TODO: In the future we want to track more than just a unique kernel.
1950 SmallPtrSet<Kernel, 2> PotentialKernels;
1951 OMPInformationCache::foreachUse(F, [&](const Use &U) {
1952 PotentialKernels.insert(GetUniqueKernelForUse(U));
1953 });
1954
1955 Kernel K = nullptr;
1956 if (PotentialKernels.size() == 1)
1957 K = *PotentialKernels.begin();
1958
1959 // Cache the result.
1960 UniqueKernelMap[&F] = K;
1961
1962 return K;
1963}
1964
1965bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
1966 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
1967 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
1968
1969 bool Changed = false;
1970 if (!KernelParallelRFI)
1971 return Changed;
1972
1973 // If we have disabled state machine changes, exit
1974 if (DisableOpenMPOptStateMachineRewrite)
1975 return Changed;
1976
1977 for (Function *F : SCC) {
1978
1979 // Check if the function is a use in a __kmpc_parallel_51 call at
1980 // all.
1981 bool UnknownUse = false;
1982 bool KernelParallelUse = false;
1983 unsigned NumDirectCalls = 0;
1984
1985 SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
1986 OMPInformationCache::foreachUse(*F, [&](Use &U) {
1987 if (auto *CB = dyn_cast<CallBase>(U.getUser()))
1988 if (CB->isCallee(&U)) {
1989 ++NumDirectCalls;
1990 return;
1991 }
1992
1993 if (isa<ICmpInst>(U.getUser())) {
1994 ToBeReplacedStateMachineUses.push_back(&U);
1995 return;
1996 }
1997
1998 // Find wrapper functions that represent parallel kernels.
1999 CallInst *CI =
2000 OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
2001 const unsigned int WrapperFunctionArgNo = 6;
2002 if (!KernelParallelUse && CI &&
2003 CI->getArgOperandNo(&U) == WrapperFunctionArgNo) {
2004 KernelParallelUse = true;
2005 ToBeReplacedStateMachineUses.push_back(&U);
2006 return;
2007 }
2008 UnknownUse = true;
2009 });
2010
2011 // Do not emit a remark if we haven't seen a __kmpc_parallel_51
2012 // use.
2013 if (!KernelParallelUse)
2014 continue;
2015
2016 // If this ever hits, we should investigate.
2017 // TODO: Checking the number of uses is not a necessary restriction and
2018 // should be lifted.
2019 if (UnknownUse || NumDirectCalls != 1 ||
2020 ToBeReplacedStateMachineUses.size() > 2) {
2021 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2022 return ORA << "Parallel region is used in "
2023 << (UnknownUse ? "unknown" : "unexpected")
2024 << " ways. Will not attempt to rewrite the state machine.";
2025 };
2026 emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);
2027 continue;
2028 }
2029
2030 // Even if we have __kmpc_parallel_51 calls, we (for now) give
2031 // up if the function is not called from a unique kernel.
2032 Kernel K = getUniqueKernelFor(*F);
2033 if (!K) {
2034 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2035 return ORA << "Parallel region is not called from a unique kernel. "
2036 "Will not attempt to rewrite the state machine.";
2037 };
2038 emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);
2039 continue;
2040 }
2041
2042 // We now know F is a parallel body function called only from the kernel K.
2043 // We also identified the state machine uses in which we replace the
2044 // function pointer by a new global symbol for identification purposes. This
2045 // ensures only direct calls to the function are left.
2046
2047 Module &M = *F->getParent();
2048 Type *Int8Ty = Type::getInt8Ty(M.getContext());
2049
2050 auto *ID = new GlobalVariable(
2051 M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
2052 UndefValue::get(Int8Ty), F->getName() + ".ID");
2053
2054 for (Use *U : ToBeReplacedStateMachineUses)
2055 U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2056 ID, U->get()->getType()));
2057
2058 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2059
2060 Changed = true;
2061 }
2062
2063 return Changed;
2064}
2065
2066/// Abstract Attribute for tracking ICV values.
2067struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
2068 using Base = StateWrapper<BooleanState, AbstractAttribute>;
2069 AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2070
2071 void initialize(Attributor &A) override {
2072 Function *F = getAnchorScope();
2073 if (!F || !A.isFunctionIPOAmendable(*F))
2074 indicatePessimisticFixpoint();
2075 }
2076
2077 /// Returns true if value is assumed to be tracked.
2078 bool isAssumedTracked() const { return getAssumed(); }
2079
2080 /// Returns true if value is known to be tracked.
2081 bool isKnownTracked() const { return getAssumed(); }
2082
2083 /// Create an abstract attribute biew for the position \p IRP.
2084 static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
2085
2086 /// Return the value with which \p I can be replaced for specific \p ICV.
2087 virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
2088 const Instruction *I,
2089 Attributor &A) const {
2090 return None;
2091 }
2092
2093 /// Return an assumed unique ICV value if a single candidate is found. If
2094 /// there cannot be one, return a nullptr. If it is not clear yet, return the
2095 /// Optional::NoneType.
2096 virtual Optional<Value *>
2097 getUniqueReplacementValue(InternalControlVar ICV) const = 0;
2098
2099 // Currently only nthreads is being tracked.
2100 // this array will only grow with time.
2101 InternalControlVar TrackableICVs[1] = {ICV_nthreads};
2102
2103 /// See AbstractAttribute::getName()
2104 const std::string getName() const override { return "AAICVTracker"; }
2105
2106 /// See AbstractAttribute::getIdAddr()
2107 const char *getIdAddr() const override { return &ID; }
2108
2109 /// This function should return true if the type of the \p AA is AAICVTracker
2110 static bool classof(const AbstractAttribute *AA) {
2111 return (AA->getIdAddr() == &ID);
2112 }
2113
2114 static const char ID;
2115};
2116
2117struct AAICVTrackerFunction : public AAICVTracker {
2118 AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
2119 : AAICVTracker(IRP, A) {}
2120
2121 // FIXME: come up with better string.
2122 const std::string getAsStr() const override { return "ICVTrackerFunction"; }
2123
2124 // FIXME: come up with some stats.
2125 void trackStatistics() const override {}
2126
2127 /// We don't manifest anything for this AA.
2128 ChangeStatus manifest(Attributor &A) override {
2129 return ChangeStatus::UNCHANGED;
2130 }
2131
2132 // Map of ICV to their values at specific program point.
2133 EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
2134 InternalControlVar::ICV___last>
2135 ICVReplacementValuesMap;
2136
2137 ChangeStatus updateImpl(Attributor &A) override {
2138 ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
2139
2140 Function *F = getAnchorScope();
2141
2142 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2143
2144 for (InternalControlVar ICV : TrackableICVs) {
2145 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2146
2147 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2148 auto TrackValues = [&](Use &U, Function &) {
2149 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2150 if (!CI)
2151 return false;
2152
2153 // FIXME: handle setters with more that 1 arguments.
2154 /// Track new value.
2155 if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
2156 HasChanged = ChangeStatus::CHANGED;
2157
2158 return false;
2159 };
2160
2161 auto CallCheck = [&](Instruction &I) {
2162 Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
2163 if (ReplVal.hasValue() &&
2164 ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
2165 HasChanged = ChangeStatus::CHANGED;
2166
2167 return true;
2168 };
2169
2170 // Track all changes of an ICV.
2171 SetterRFI.foreachUse(TrackValues, F);
2172
2173 bool UsedAssumedInformation = false;
2174 A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
2175 UsedAssumedInformation,
2176 /* CheckBBLivenessOnly */ true);
2177
2178 /// TODO: Figure out a way to avoid adding entry in
2179 /// ICVReplacementValuesMap
2180 Instruction *Entry = &F->getEntryBlock().front();
2181 if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
2182 ValuesMap.insert(std::make_pair(Entry, nullptr));
2183 }
2184
2185 return HasChanged;
2186 }
2187
2188 /// Hepler to check if \p I is a call and get the value for it if it is
2189 /// unique.
2190 Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
2191 InternalControlVar &ICV) const {
2192
2193 const auto *CB = dyn_cast<CallBase>(I);
2194 if (!CB || CB->hasFnAttr("no_openmp") ||
2195 CB->hasFnAttr("no_openmp_routines"))
2196 return None;
2197
2198 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2199 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2200 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2201 Function *CalledFunction = CB->getCalledFunction();
2202
2203 // Indirect call, assume ICV changes.
2204 if (CalledFunction == nullptr)
2205 return nullptr;
2206 if (CalledFunction == GetterRFI.Declaration)
2207 return None;
2208 if (CalledFunction == SetterRFI.Declaration) {
2209 if (ICVReplacementValuesMap[ICV].count(I))
2210 return ICVReplacementValuesMap[ICV].lookup(I);
2211
2212 return nullptr;
2213 }
2214
2215 // Since we don't know, assume it changes the ICV.
2216 if (CalledFunction->isDeclaration())
2217 return nullptr;
2218
2219 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2220 *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
2221
2222 if (ICVTrackingAA.isAssumedTracked())
2223 return ICVTrackingAA.getUniqueReplacementValue(ICV);
2224
2225 // If we don't know, assume it changes.
2226 return nullptr;
2227 }
2228
2229 // We don't check unique value for a function, so return None.
2230 Optional<Value *>
2231 getUniqueReplacementValue(InternalControlVar ICV) const override {
2232 return None;
2233 }
2234
2235 /// Return the value with which \p I can be replaced for specific \p ICV.
2236 Optional<Value *> getReplacementValue(InternalControlVar ICV,
2237 const Instruction *I,
2238 Attributor &A) const override {
2239 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2240 if (ValuesMap.count(I))
2241 return ValuesMap.lookup(I);
2242
2243 SmallVector<const Instruction *, 16> Worklist;
2244 SmallPtrSet<const Instruction *, 16> Visited;
2245 Worklist.push_back(I);
2246
2247 Optional<Value *> ReplVal;
2248
2249 while (!Worklist.empty()) {
2250 const Instruction *CurrInst = Worklist.pop_back_val();
2251 if (!Visited.insert(CurrInst).second)
2252 continue;
2253
2254 const BasicBlock *CurrBB = CurrInst->getParent();
2255
2256 // Go up and look for all potential setters/calls that might change the
2257 // ICV.
2258 while ((CurrInst = CurrInst->getPrevNode())) {
2259 if (ValuesMap.count(CurrInst)) {
2260 Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2261 // Unknown value, track new.
2262 if (!ReplVal.hasValue()) {
2263 ReplVal = NewReplVal;
2264 break;
2265 }
2266
2267 // If we found a new value, we can't know the icv value anymore.
2268 if (NewReplVal.hasValue())
2269 if (ReplVal != NewReplVal)
2270 return nullptr;
2271
2272 break;
2273 }
2274
2275 Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
2276 if (!NewReplVal.hasValue())
2277 continue;
2278
2279 // Unknown value, track new.
2280 if (!ReplVal.hasValue()) {
2281 ReplVal = NewReplVal;
2282 break;
2283 }
2284
2285 // if (NewReplVal.hasValue())
2286 // We found a new value, we can't know the icv value anymore.
2287 if (ReplVal != NewReplVal)
2288 return nullptr;
2289 }
2290
2291 // If we are in the same BB and we have a value, we are done.
2292 if (CurrBB == I->getParent() && ReplVal.hasValue())
2293 return ReplVal;
2294
2295 // Go through all predecessors and add terminators for analysis.
2296 for (const BasicBlock *Pred : predecessors(CurrBB))
2297 if (const Instruction *Terminator = Pred->getTerminator())
2298 Worklist.push_back(Terminator);
2299 }
2300
2301 return ReplVal;
2302 }
2303};
2304
2305struct AAICVTrackerFunctionReturned : AAICVTracker {
2306 AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2307 : AAICVTracker(IRP, A) {}
2308
2309 // FIXME: come up with better string.
2310 const std::string getAsStr() const override {
2311 return "ICVTrackerFunctionReturned";
2312 }
2313
2314 // FIXME: come up with some stats.
2315 void trackStatistics() const override {}
2316
2317 /// We don't manifest anything for this AA.
2318 ChangeStatus manifest(Attributor &A) override {
2319 return ChangeStatus::UNCHANGED;
2320 }
2321
2322 // Map of ICV to their values at specific program point.
2323 EnumeratedArray<Optional<Value *>, InternalControlVar,
2324 InternalControlVar::ICV___last>
2325 ICVReplacementValuesMap;
2326
2327 /// Return the value with which \p I can be replaced for specific \p ICV.
2328 Optional<Value *>
2329 getUniqueReplacementValue(InternalControlVar ICV) const override {
2330 return ICVReplacementValuesMap[ICV];
2331 }
2332
2333 ChangeStatus updateImpl(Attributor &A) override {
2334 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2335 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2336 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2337
2338 if (!ICVTrackingAA.isAssumedTracked())
2339 return indicatePessimisticFixpoint();
2340
2341 for (InternalControlVar ICV : TrackableICVs) {
2342 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2343 Optional<Value *> UniqueICVValue;
2344
2345 auto CheckReturnInst = [&](Instruction &I) {
2346 Optional<Value *> NewReplVal =
2347 ICVTrackingAA.getReplacementValue(ICV, &I, A);
2348
2349 // If we found a second ICV value there is no unique returned value.
2350 if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
2351 return false;
2352
2353 UniqueICVValue = NewReplVal;
2354
2355 return true;
2356 };
2357
2358 bool UsedAssumedInformation = false;
2359 if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
2360 UsedAssumedInformation,
2361 /* CheckBBLivenessOnly */ true))
2362 UniqueICVValue = nullptr;
2363
2364 if (UniqueICVValue == ReplVal)
2365 continue;
2366
2367 ReplVal = UniqueICVValue;
2368 Changed = ChangeStatus::CHANGED;
2369 }
2370
2371 return Changed;
2372 }
2373};
2374
2375struct AAICVTrackerCallSite : AAICVTracker {
2376 AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
2377 : AAICVTracker(IRP, A) {}
2378
2379 void initialize(Attributor &A) override {
2380 Function *F = getAnchorScope();
2381 if (!F || !A.isFunctionIPOAmendable(*F))
2382 indicatePessimisticFixpoint();
2383
2384 // We only initialize this AA for getters, so we need to know which ICV it
2385 // gets.
2386 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2387 for (InternalControlVar ICV : TrackableICVs) {
2388 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2389 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2390 if (Getter.Declaration == getAssociatedFunction()) {
2391 AssociatedICV = ICVInfo.Kind;
2392 return;
2393 }
2394 }
2395
2396 /// Unknown ICV.
2397 indicatePessimisticFixpoint();
2398 }
2399
2400 ChangeStatus manifest(Attributor &A) override {
2401 if (!ReplVal.hasValue() || !ReplVal.getValue())
2402 return ChangeStatus::UNCHANGED;
2403
2404 A.changeValueAfterManifest(*getCtxI(), **ReplVal);
2405 A.deleteAfterManifest(*getCtxI());
2406
2407 return ChangeStatus::CHANGED;
2408 }
2409
2410 // FIXME: come up with better string.
2411 const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
2412
2413 // FIXME: come up with some stats.
2414 void trackStatistics() const override {}
2415
2416 InternalControlVar AssociatedICV;
2417 Optional<Value *> ReplVal;
2418
2419 ChangeStatus updateImpl(Attributor &A) override {
2420 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2421 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2422
2423 // We don't have any information, so we assume it changes the ICV.
2424 if (!ICVTrackingAA.isAssumedTracked())
2425 return indicatePessimisticFixpoint();
2426
2427 Optional<Value *> NewReplVal =
2428 ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
2429
2430 if (ReplVal == NewReplVal)
2431 return ChangeStatus::UNCHANGED;
2432
2433 ReplVal = NewReplVal;
2434 return ChangeStatus::CHANGED;
2435 }
2436
2437 // Return the value with which associated value can be replaced for specific
2438 // \p ICV.
2439 Optional<Value *>
2440 getUniqueReplacementValue(InternalControlVar ICV) const override {
2441 return ReplVal;
2442 }
2443};
2444
2445struct AAICVTrackerCallSiteReturned : AAICVTracker {
2446 AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2447 : AAICVTracker(IRP, A) {}
2448
2449 // FIXME: come up with better string.
2450 const std::string getAsStr() const override {
2451 return "ICVTrackerCallSiteReturned";
2452 }
2453
2454 // FIXME: come up with some stats.
2455 void trackStatistics() const override {}
2456
2457 /// We don't manifest anything for this AA.
2458 ChangeStatus manifest(Attributor &A) override {
2459 return ChangeStatus::UNCHANGED;
2460 }
2461
2462 // Map of ICV to their values at specific program point.
2463 EnumeratedArray<Optional<Value *>, InternalControlVar,
2464 InternalControlVar::ICV___last>
2465 ICVReplacementValuesMap;
2466
2467 /// Return the value with which associated value can be replaced for specific
2468 /// \p ICV.
2469 Optional<Value *>
2470 getUniqueReplacementValue(InternalControlVar ICV) const override {
2471 return ICVReplacementValuesMap[ICV];
2472 }
2473
2474 ChangeStatus updateImpl(Attributor &A) override {
2475 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2476 const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2477 *this, IRPosition::returned(*getAssociatedFunction()),
2478 DepClassTy::REQUIRED);
2479
2480 // We don't have any information, so we assume it changes the ICV.
2481 if (!ICVTrackingAA.isAssumedTracked())
2482 return indicatePessimisticFixpoint();
2483
2484 for (InternalControlVar ICV : TrackableICVs) {
2485 Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2486 Optional<Value *> NewReplVal =
2487 ICVTrackingAA.getUniqueReplacementValue(ICV);
2488
2489 if (ReplVal == NewReplVal)
2490 continue;
2491
2492 ReplVal = NewReplVal;
2493 Changed = ChangeStatus::CHANGED;
2494 }
2495 return Changed;
2496 }
2497};
2498
2499struct AAExecutionDomainFunction : public AAExecutionDomain {
2500 AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
2501 : AAExecutionDomain(IRP, A) {}
2502
2503 const std::string getAsStr() const override {
2504 return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) +
2505 "/" + std::to_string(NumBBs) + " BBs thread 0 only.";
2506 }
2507
2508 /// See AbstractAttribute::trackStatistics().
2509 void trackStatistics() const override {}
2510
2511 void initialize(Attributor &A) override {
2512 Function *F = getAnchorScope();
2513 for (const auto &BB : *F)
2514 SingleThreadedBBs.insert(&BB);
2515 NumBBs = SingleThreadedBBs.size();
2516 }
2517
2518 ChangeStatus manifest(Attributor &A) override {
2519 LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
2520 for (const BasicBlock *BB : SingleThreadedBBs)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
2521 dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
2522 << BB->getName() << " is executed by a single thread.\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
2523 })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { { for (const BasicBlock *BB : SingleThreadedBBs
) dbgs() << TAG << " Basic block @" << getAnchorScope
()->getName() << " " << BB->getName() <<
" is executed by a single thread.\n"; }; } } while (false)
;
2524 return ChangeStatus::UNCHANGED;
2525 }
2526
2527 ChangeStatus updateImpl(Attributor &A) override;
2528
2529 /// Check if an instruction is executed by a single thread.
2530 bool isExecutedByInitialThreadOnly(const Instruction &I) const override {
2531 return isExecutedByInitialThreadOnly(*I.getParent());
2532 }
2533
2534 bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
2535 return isValidState() && SingleThreadedBBs.contains(&BB);
2536 }
2537
2538 /// Set of basic blocks that are executed by a single thread.
2539 DenseSet<const BasicBlock *> SingleThreadedBBs;
2540
2541 /// Total number of basic blocks in this function.
2542 long unsigned NumBBs;
2543};
2544
2545ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
2546 Function *F = getAnchorScope();
2547 ReversePostOrderTraversal<Function *> RPOT(F);
2548 auto NumSingleThreadedBBs = SingleThreadedBBs.size();
2549
2550 bool AllCallSitesKnown;
2551 auto PredForCallSite = [&](AbstractCallSite ACS) {
2552 const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
2553 *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
2554 DepClassTy::REQUIRED);
2555 return ACS.isDirectCall() &&
2556 ExecutionDomainAA.isExecutedByInitialThreadOnly(
2557 *ACS.getInstruction());
2558 };
2559
2560 if (!A.checkForAllCallSites(PredForCallSite, *this,
2561 /* RequiresAllCallSites */ true,
2562 AllCallSitesKnown))
2563 SingleThreadedBBs.erase(&F->getEntryBlock());
2564
2565 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2566 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2567
2568 // Check if the edge into the successor block contains a condition that only
2569 // lets the main thread execute it.
2570 auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
2571 if (!Edge || !Edge->isConditional())
2572 return false;
2573 if (Edge->getSuccessor(0) != SuccessorBB)
2574 return false;
2575
2576 auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());
2577 if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
2578 return false;
2579
2580 ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
2581 if (!C)
2582 return false;
2583
2584 // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
2585 if (C->isAllOnesValue()) {
2586 auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
2587 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2588 if (!CB)
2589 return false;
2590 const int InitModeArgNo = 1;
2591 auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitModeArgNo));
2592 return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC);
2593 }
2594
2595 if (C->isZero()) {
2596 // Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x()
2597 if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
2598 if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2599 return true;
2600
2601 // Match: 0 == llvm.amdgcn.workitem.id.x()
2602 if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
2603 if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2604 return true;
2605 }
2606
2607 return false;
2608 };
2609
2610 // Merge all the predecessor states into the current basic block. A basic
2611 // block is executed by a single thread if all of its predecessors are.
2612 auto MergePredecessorStates = [&](BasicBlock *BB) {
2613 if (pred_begin(BB) == pred_end(BB))
2614 return SingleThreadedBBs.contains(BB);
2615
2616 bool IsInitialThread = true;
2617 for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB);
2618 PredBB != PredEndBB; ++PredBB) {
2619 if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()),
2620 BB))
2621 IsInitialThread &= SingleThreadedBBs.contains(*PredBB);
2622 }
2623
2624 return IsInitialThread;
2625 };
2626
2627 for (auto *BB : RPOT) {
2628 if (!MergePredecessorStates(BB))
2629 SingleThreadedBBs.erase(BB);
2630 }
2631
2632 return (NumSingleThreadedBBs == SingleThreadedBBs.size())
2633 ? ChangeStatus::UNCHANGED
2634 : ChangeStatus::CHANGED;
2635}
2636
2637/// Try to replace memory allocation calls called by a single thread with a
2638/// static buffer of shared memory.
2639struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
2640 using Base = StateWrapper<BooleanState, AbstractAttribute>;
2641 AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2642
2643 /// Create an abstract attribute view for the position \p IRP.
2644 static AAHeapToShared &createForPosition(const IRPosition &IRP,
2645 Attributor &A);
2646
2647 /// Returns true if HeapToShared conversion is assumed to be possible.
2648 virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;
2649
2650 /// Returns true if HeapToShared conversion is assumed and the CB is a
2651 /// callsite to a free operation to be removed.
2652 virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;
2653
2654 /// See AbstractAttribute::getName().
2655 const std::string getName() const override { return "AAHeapToShared"; }
2656
2657 /// See AbstractAttribute::getIdAddr().
2658 const char *getIdAddr() const override { return &ID; }
2659
2660 /// This function should return true if the type of the \p AA is
2661 /// AAHeapToShared.
2662 static bool classof(const AbstractAttribute *AA) {
2663 return (AA->getIdAddr() == &ID);
2664 }
2665
2666 /// Unique ID (due to the unique address)
2667 static const char ID;
2668};
2669
2670struct AAHeapToSharedFunction : public AAHeapToShared {
2671 AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
2672 : AAHeapToShared(IRP, A) {}
2673
2674 const std::string getAsStr() const override {
2675 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
2676 " malloc calls eligible.";
2677 }
2678
2679 /// See AbstractAttribute::trackStatistics().
2680 void trackStatistics() const override {}
2681
2682 /// This functions finds free calls that will be removed by the
2683 /// HeapToShared transformation.
2684 void findPotentialRemovedFreeCalls(Attributor &A) {
2685 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2686 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
2687
2688 PotentialRemovedFreeCalls.clear();
2689 // Update free call users of found malloc calls.
2690 for (CallBase *CB : MallocCalls) {
2691 SmallVector<CallBase *, 4> FreeCalls;
2692 for (auto *U : CB->users()) {
2693 CallBase *C = dyn_cast<CallBase>(U);
2694 if (C && C->getCalledFunction() == FreeRFI.Declaration)
2695 FreeCalls.push_back(C);
2696 }
2697
2698 if (FreeCalls.size() != 1)
2699 continue;
2700
2701 PotentialRemovedFreeCalls.insert(FreeCalls.front());
2702 }
2703 }
2704
2705 void initialize(Attributor &A) override {
2706 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2707 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
2708
2709 for (User *U : RFI.Declaration->users())
2710 if (CallBase *CB = dyn_cast<CallBase>(U))
2711 MallocCalls.insert(CB);
2712
2713 findPotentialRemovedFreeCalls(A);
2714 }
2715
2716 bool isAssumedHeapToShared(CallBase &CB) const override {
2717 return isValidState() && MallocCalls.count(&CB);
2718 }
2719
2720 bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
2721 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
2722 }
2723
2724 ChangeStatus manifest(Attributor &A) override {
2725 if (MallocCalls.empty())
1
Calling 'SmallPtrSetImplBase::empty'
4
Returning from 'SmallPtrSetImplBase::empty'
5
Taking false branch
2726 return ChangeStatus::UNCHANGED;
2727
2728 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2729 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
2730
2731 Function *F = getAnchorScope();
2732 auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,
6
Calling 'Attributor::lookupAAFor'
13
Returning from 'Attributor::lookupAAFor'
2733 DepClassTy::OPTIONAL);
2734
2735 ChangeStatus Changed = ChangeStatus::UNCHANGED;
2736 for (CallBase *CB : MallocCalls) {
2737 // Skip replacing this if HeapToStack has already claimed it.
2738 if (HS
13.1
'HS' is non-null
13.1
'HS' is non-null
13.1
'HS' is non-null
13.1
'HS' is non-null
&& HS->isAssumedHeapToStack(*CB))
14
Assuming the condition is false
15
Taking false branch
2739 continue;
2740
2741 // Find the unique free call to remove it.
2742 SmallVector<CallBase *, 4> FreeCalls;
2743 for (auto *U : CB->users()) {
2744 CallBase *C = dyn_cast<CallBase>(U);
2745 if (C && C->getCalledFunction() == FreeCall.Declaration)
2746 FreeCalls.push_back(C);
2747 }
2748 if (FreeCalls.size() != 1)
16
Assuming the condition is false
17
Taking false branch
2749 continue;
2750
2751 ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
18
Calling 'CallBase::getArgOperand'
33
Returning from 'CallBase::getArgOperand'
34
Assuming the object is not a 'ConstantInt'
35
'AllocSize' initialized to a null pointer value
2752
2753 LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CBdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replace globalization call "
<< *CB << " with " << AllocSize->getZExtValue
() << " bytes of shared memory\n"; } } while (false)
36
Assuming 'DebugFlag' is true
37
Assuming the condition is true
38
Taking true branch
39
Called C++ object pointer is null
2754 << " with " << AllocSize->getZExtValue()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replace globalization call "
<< *CB << " with " << AllocSize->getZExtValue
() << " bytes of shared memory\n"; } } while (false)
2755 << " bytes of shared memory\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replace globalization call "
<< *CB << " with " << AllocSize->getZExtValue
() << " bytes of shared memory\n"; } } while (false)
;
2756
2757 // Create a new shared memory buffer of the same size as the allocation
2758 // and replace all the uses of the original allocation with it.
2759 Module *M = CB->getModule();
2760 Type *Int8Ty = Type::getInt8Ty(M->getContext());
2761 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
2762 auto *SharedMem = new GlobalVariable(
2763 *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
2764 UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
2765 GlobalValue::NotThreadLocal,
2766 static_cast<unsigned>(AddressSpace::Shared));
2767 auto *NewBuffer =
2768 ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());
2769
2770 auto Remark = [&](OptimizationRemark OR) {
2771 return OR << "Replaced globalized variable with "
2772 << ore::NV("SharedMemory", AllocSize->getZExtValue())
2773 << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ")
2774 << "of shared memory.";
2775 };
2776 A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
2777
2778 SharedMem->setAlignment(MaybeAlign(32));
2779
2780 A.changeValueAfterManifest(*CB, *NewBuffer);
2781 A.deleteAfterManifest(*CB);
2782 A.deleteAfterManifest(*FreeCalls.front());
2783
2784 NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
2785 Changed = ChangeStatus::CHANGED;
2786 }
2787
2788 return Changed;
2789 }
2790
2791 ChangeStatus updateImpl(Attributor &A) override {
2792 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2793 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
2794 Function *F = getAnchorScope();
2795
2796 auto NumMallocCalls = MallocCalls.size();
2797
2798 // Only consider malloc calls executed by a single thread with a constant.
2799 for (User *U : RFI.Declaration->users()) {
2800 const auto &ED = A.getAAFor<AAExecutionDomain>(
2801 *this, IRPosition::function(*F), DepClassTy::REQUIRED);
2802 if (CallBase *CB = dyn_cast<CallBase>(U))
2803 if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) ||
2804 !ED.isExecutedByInitialThreadOnly(*CB))
2805 MallocCalls.erase(CB);
2806 }
2807
2808 findPotentialRemovedFreeCalls(A);
2809
2810 if (NumMallocCalls != MallocCalls.size())
2811 return ChangeStatus::CHANGED;
2812
2813 return ChangeStatus::UNCHANGED;
2814 }
2815
2816 /// Collection of all malloc calls in a function.
2817 SmallPtrSet<CallBase *, 4> MallocCalls;
2818 /// Collection of potentially removed free calls in a function.
2819 SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
2820};
2821
2822struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
2823 using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
2824 AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2825
2826 /// Statistics are tracked as part of manifest for now.
2827 void trackStatistics() const override {}
2828
2829 /// See AbstractAttribute::getAsStr()
2830 const std::string getAsStr() const override {
2831 if (!isValidState())
2832 return "<invalid>";
2833 return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
2834 : "generic") +
2835 std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
2836 : "") +
2837 std::string(" #PRs: ") +
2838 (ReachedKnownParallelRegions.isValidState()
2839 ? std::to_string(ReachedKnownParallelRegions.size())
2840 : "<invalid>") +
2841 ", #Unknown PRs: " +
2842 (ReachedUnknownParallelRegions.isValidState()
2843 ? std::to_string(ReachedUnknownParallelRegions.size())
2844 : "<invalid>") +
2845 ", #Reaching Kernels: " +
2846 (ReachingKernelEntries.isValidState()
2847 ? std::to_string(ReachingKernelEntries.size())
2848 : "<invalid>");
2849 }
2850
2851 /// Create an abstract attribute biew for the position \p IRP.
2852 static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
2853
2854 /// See AbstractAttribute::getName()
2855 const std::string getName() const override { return "AAKernelInfo"; }
2856
2857 /// See AbstractAttribute::getIdAddr()
2858 const char *getIdAddr() const override { return &ID; }
2859
2860 /// This function should return true if the type of the \p AA is AAKernelInfo
2861 static bool classof(const AbstractAttribute *AA) {
2862 return (AA->getIdAddr() == &ID);
2863 }
2864
2865 static const char ID;
2866};
2867
2868/// The function kernel info abstract attribute, basically, what can we say
2869/// about a function with regards to the KernelInfoState.
2870struct AAKernelInfoFunction : AAKernelInfo {
/// Construct the function-level kernel info attribute; both arguments are
/// passed through unchanged to the AAKernelInfo base constructor.
2871 AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
2872 : AAKernelInfo(IRP, A) {}
2873
2874 SmallPtrSet<Instruction *, 4> GuardedInstructions;
2875
/// Mutable access to the GuardedInstructions set of this attribute. The
/// returned reference aliases the member, so insertions made through it are
/// visible to later queries on the same attribute.
2876 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
2877 return GuardedInstructions;
2878 }
2879
2880 /// See AbstractAttribute::initialize(...).
2881 void initialize(Attributor &A) override {
2882 // This is a high-level transform that might change the constant arguments
2883 // of the init and dinit calls. We need to tell the Attributor about this
2884 // to avoid other parts using the current constant value for simpliication.
2885 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2886
2887 Function *Fn = getAnchorScope();
2888 if (!OMPInfoCache.Kernels.count(Fn))
2889 return;
2890
2891 // Add itself to the reaching kernel and set IsKernelEntry.
2892 ReachingKernelEntries.insert(Fn);
2893 IsKernelEntry = true;
2894
2895 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
2896 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2897 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
2898 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
2899
2900 // For kernels we perform more initialization work, first we find the init
2901 // and deinit calls.
2902 auto StoreCallBase = [](Use &U,
2903 OMPInformationCache::RuntimeFunctionInfo &RFI,
2904 CallBase *&Storage) {
2905 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
2906 assert(CB &&(static_cast <bool> (CB && "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!"
) ? void (0) : __assert_fail ("CB && \"Unexpected use of __kmpc_target_init or __kmpc_target_deinit!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 2907, __extension__ __PRETTY_FUNCTION__))
2907 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!")(static_cast <bool> (CB && "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!"
) ? void (0) : __assert_fail ("CB && \"Unexpected use of __kmpc_target_init or __kmpc_target_deinit!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 2907, __extension__ __PRETTY_FUNCTION__))
;
2908 assert(!Storage &&(static_cast <bool> (!Storage && "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!"
) ? void (0) : __assert_fail ("!Storage && \"Multiple uses of __kmpc_target_init or __kmpc_target_deinit!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 2909, __extension__ __PRETTY_FUNCTION__))
2909 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!")(static_cast <bool> (!Storage && "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!"
) ? void (0) : __assert_fail ("!Storage && \"Multiple uses of __kmpc_target_init or __kmpc_target_deinit!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 2909, __extension__ __PRETTY_FUNCTION__))
;
2910 Storage = CB;
2911 return false;
2912 };
2913 InitRFI.foreachUse(
2914 [&](Use &U, Function &) {
2915 StoreCallBase(U, InitRFI, KernelInitCB);
2916 return false;
2917 },
2918 Fn);
2919 DeinitRFI.foreachUse(
2920 [&](Use &U, Function &) {
2921 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
2922 return false;
2923 },
2924 Fn);
2925
2926 // Ignore kernels without initializers such as global constructors.
2927 if (!KernelInitCB || !KernelDeinitCB) {
2928 indicateOptimisticFixpoint();
2929 return;
2930 }
2931
2932 // For kernels we might need to initialize/finalize the IsSPMD state and
2933 // we need to register a simplification callback so that the Attributor
2934 // knows the constant arguments to __kmpc_target_init and
2935 // __kmpc_target_deinit might actually change.
2936
2937 Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
2938 [&](const IRPosition &IRP, const AbstractAttribute *AA,
2939 bool &UsedAssumedInformation) -> Optional<Value *> {
2940 // IRP represents the "use generic state machine" argument of an
2941 // __kmpc_target_init call. We will answer this one with the internal
2942 // state. As long as we are not in an invalid state, we will create a
2943 // custom state machine so the value should be a `i1 false`. If we are
2944 // in an invalid state, we won't change the value that is in the IR.
2945 if (!isValidState())
2946 return nullptr;
2947 // If we have disabled state machine rewrites, don't make a custom one.
2948 if (DisableOpenMPOptStateMachineRewrite)
2949 return nullptr;
2950 if (AA)
2951 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
2952 UsedAssumedInformation = !isAtFixpoint();
2953 auto *FalseVal =
2954 ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0);
2955 return FalseVal;
2956 };
2957
2958 Attributor::SimplifictionCallbackTy ModeSimplifyCB =
2959 [&](const IRPosition &IRP, const AbstractAttribute *AA,
2960 bool &UsedAssumedInformation) -> Optional<Value *> {
2961 // IRP represents the "SPMDCompatibilityTracker" argument of an
2962 // __kmpc_target_init or
2963 // __kmpc_target_deinit call. We will answer this one with the internal
2964 // state.
2965 if (!SPMDCompatibilityTracker.isValidState())
2966 return nullptr;
2967 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
2968 if (AA)
2969 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
2970 UsedAssumedInformation = true;
2971 } else {
2972 UsedAssumedInformation = false;
2973 }
2974 auto *Val = ConstantInt::getSigned(
2975 IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()),
2976 SPMDCompatibilityTracker.isAssumed() ? OMP_TGT_EXEC_MODE_SPMD
2977 : OMP_TGT_EXEC_MODE_GENERIC);
2978 return Val;
2979 };
2980
2981 Attributor::SimplifictionCallbackTy IsGenericModeSimplifyCB =
2982 [&](const IRPosition &IRP, const AbstractAttribute *AA,
2983 bool &UsedAssumedInformation) -> Optional<Value *> {
2984 // IRP represents the "RequiresFullRuntime" argument of an
2985 // __kmpc_target_init or __kmpc_target_deinit call. We will answer this
2986 // one with the internal state of the SPMDCompatibilityTracker, so if
2987 // generic then true, if SPMD then false.
2988 if (!SPMDCompatibilityTracker.isValidState())
2989 return nullptr;
2990 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
2991 if (AA)
2992 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
2993 UsedAssumedInformation = true;
2994 } else {
2995 UsedAssumedInformation = false;
2996 }
2997 auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(),
2998 !SPMDCompatibilityTracker.isAssumed());
2999 return Val;
3000 };
3001
3002 constexpr const int InitModeArgNo = 1;
3003 constexpr const int DeinitModeArgNo = 1;
3004 constexpr const int InitUseStateMachineArgNo = 2;
3005 constexpr const int InitRequiresFullRuntimeArgNo = 3;
3006 constexpr const int DeinitRequiresFullRuntimeArgNo = 2;
3007 A.registerSimplificationCallback(
3008 IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
3009 StateMachineSimplifyCB);
3010 A.registerSimplificationCallback(
3011 IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo),
3012 ModeSimplifyCB);
3013 A.registerSimplificationCallback(
3014 IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
3015 ModeSimplifyCB);
3016 A.registerSimplificationCallback(
3017 IRPosition::callsite_argument(*KernelInitCB,
3018 InitRequiresFullRuntimeArgNo),
3019 IsGenericModeSimplifyCB);
3020 A.registerSimplificationCallback(
3021 IRPosition::callsite_argument(*KernelDeinitCB,
3022 DeinitRequiresFullRuntimeArgNo),
3023 IsGenericModeSimplifyCB);
3024
3025 // Check if we know we are in SPMD-mode already.
3026 ConstantInt *ModeArg =
3027 dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
3028 if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
3029 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3030 // This is a generic region but SPMDization is disabled so stop tracking.
3031 else if (DisableOpenMPOptSPMDization)
3032 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3033 }
3034
3035 /// Modify the IR based on the KernelInfoState as the fixpoint iteration is
3036 /// finished now.
3037 ChangeStatus manifest(Attributor &A) override {
3038 // If we are not looking at a kernel with __kmpc_target_init and
3039 // __kmpc_target_deinit call we cannot actually manifest the information.
3040 if (!KernelInitCB || !KernelDeinitCB)
3041 return ChangeStatus::UNCHANGED;
3042
3043 // Known SPMD-mode kernels need no manifest changes.
3044 if (SPMDCompatibilityTracker.isKnown())
3045 return ChangeStatus::UNCHANGED;
3046
3047 // If we can we change the execution mode to SPMD-mode otherwise we build a
3048 // custom state machine.
3049 if (!mayContainParallelRegion() || !changeToSPMDMode(A))
3050 return buildCustomStateMachine(A);
3051
3052 return ChangeStatus::CHANGED;
3053 }
3054
3055 bool changeToSPMDMode(Attributor &A) {
3056 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3057
3058 if (!SPMDCompatibilityTracker.isAssumed()) {
3059 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
3060 if (!NonCompatibleI)
3061 continue;
3062
3063 // Skip diagnostics on calls to known OpenMP runtime functions for now.
3064 if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
3065 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
3066 continue;
3067
3068 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
3069 ORA << "Value has potential side effects preventing SPMD-mode "
3070 "execution";
3071 if (isa<CallBase>(NonCompatibleI)) {
3072 ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
3073 "the called function to override";
3074 }
3075 return ORA << ".";
3076 };
3077 A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
3078 Remark);
3079
3080 LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "SPMD-incompatible side-effect: "
<< *NonCompatibleI << "\n"; } } while (false)
3081 << *NonCompatibleI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "SPMD-incompatible side-effect: "
<< *NonCompatibleI << "\n"; } } while (false)
;
3082 }
3083
3084 return false;
3085 }
3086
3087 auto CreateGuardedRegion = [&](Instruction *RegionStartI,
3088 Instruction *RegionEndI) {
3089 LoopInfo *LI = nullptr;
3090 DominatorTree *DT = nullptr;
3091 MemorySSAUpdater *MSU = nullptr;
3092 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
3093
3094 BasicBlock *ParentBB = RegionStartI->getParent();
3095 Function *Fn = ParentBB->getParent();
3096 Module &M = *Fn->getParent();
3097
3098 // Create all the blocks and logic.
3099 // ParentBB:
3100 // goto RegionCheckTidBB
3101 // RegionCheckTidBB:
3102 // Tid = __kmpc_hardware_thread_id()
3103 // if (Tid != 0)
3104 // goto RegionBarrierBB
3105 // RegionStartBB:
3106 // <execute instructions guarded>
3107 // goto RegionEndBB
3108 // RegionEndBB:
3109 // <store escaping values to shared mem>
3110 // goto RegionBarrierBB
3111 // RegionBarrierBB:
3112 // __kmpc_simple_barrier_spmd()
3113 // // second barrier is omitted if lacking escaping values.
3114 // <load escaping values from shared mem>
3115 // __kmpc_simple_barrier_spmd()
3116 // goto RegionExitBB
3117 // RegionExitBB:
3118 // <execute rest of instructions>
3119
3120 BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),
3121 DT, LI, MSU, "region.guarded.end");
3122 BasicBlock *RegionBarrierBB =
3123 SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,
3124 MSU, "region.barrier");
3125 BasicBlock *RegionExitBB =
3126 SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),
3127 DT, LI, MSU, "region.exit");
3128 BasicBlock *RegionStartBB =
3129 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
3130
3131 assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&(static_cast <bool> (ParentBB->getUniqueSuccessor() ==
RegionStartBB && "Expected a different CFG") ? void (
0) : __assert_fail ("ParentBB->getUniqueSuccessor() == RegionStartBB && \"Expected a different CFG\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3132, __extension__ __PRETTY_FUNCTION__))
3132 "Expected a different CFG")(static_cast <bool> (ParentBB->getUniqueSuccessor() ==
RegionStartBB && "Expected a different CFG") ? void (
0) : __assert_fail ("ParentBB->getUniqueSuccessor() == RegionStartBB && \"Expected a different CFG\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3132, __extension__ __PRETTY_FUNCTION__))
;
3133
3134 BasicBlock *RegionCheckTidBB = SplitBlock(
3135 ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
3136
3137 // Register basic blocks with the Attributor.
3138 A.registerManifestAddedBasicBlock(*RegionEndBB);
3139 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
3140 A.registerManifestAddedBasicBlock(*RegionExitBB);
3141 A.registerManifestAddedBasicBlock(*RegionStartBB);
3142 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
3143
3144 bool HasBroadcastValues = false;
3145 // Find escaping outputs from the guarded region to outside users and
3146 // broadcast their values to them.
3147 for (Instruction &I : *RegionStartBB) {
3148 SmallPtrSet<Instruction *, 4> OutsideUsers;
3149 for (User *Usr : I.users()) {
3150 Instruction &UsrI = *cast<Instruction>(Usr);
3151 if (UsrI.getParent() != RegionStartBB)
3152 OutsideUsers.insert(&UsrI);
3153 }
3154
3155 if (OutsideUsers.empty())
3156 continue;
3157
3158 HasBroadcastValues = true;
3159
3160 // Emit a global variable in shared memory to store the broadcasted
3161 // value.
3162 auto *SharedMem = new GlobalVariable(
3163 M, I.getType(), /* IsConstant */ false,
3164 GlobalValue::InternalLinkage, UndefValue::get(I.getType()),
3165 I.getName() + ".guarded.output.alloc", nullptr,
3166 GlobalValue::NotThreadLocal,
3167 static_cast<unsigned>(AddressSpace::Shared));
3168
3169 // Emit a store instruction to update the value.
3170 new StoreInst(&I, SharedMem, RegionEndBB->getTerminator());
3171
3172 LoadInst *LoadI = new LoadInst(I.getType(), SharedMem,
3173 I.getName() + ".guarded.output.load",
3174 RegionBarrierBB->getTerminator());
3175
3176 // Emit a load instruction and replace uses of the output value.
3177 for (Instruction *UsrI : OutsideUsers) {
3178 assert(UsrI->getParent() == RegionExitBB &&(static_cast <bool> (UsrI->getParent() == RegionExitBB
&& "Expected escaping users in exit region") ? void (
0) : __assert_fail ("UsrI->getParent() == RegionExitBB && \"Expected escaping users in exit region\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3179, __extension__ __PRETTY_FUNCTION__))
3179 "Expected escaping users in exit region")(static_cast <bool> (UsrI->getParent() == RegionExitBB
&& "Expected escaping users in exit region") ? void (
0) : __assert_fail ("UsrI->getParent() == RegionExitBB && \"Expected escaping users in exit region\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3179, __extension__ __PRETTY_FUNCTION__))
;
3180 UsrI->replaceUsesOfWith(&I, LoadI);
3181 }
3182 }
3183
3184 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3185
3186 // Go to tid check BB in ParentBB.
3187 const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
3188 ParentBB->getTerminator()->eraseFromParent();
3189 OpenMPIRBuilder::LocationDescription Loc(
3190 InsertPointTy(ParentBB, ParentBB->end()), DL);
3191 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
3192 auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc);
3193 Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr);
3194 BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
3195
3196 // Add check for Tid in RegionCheckTidBB
3197 RegionCheckTidBB->getTerminator()->eraseFromParent();
3198 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
3199 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
3200 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
3201 FunctionCallee HardwareTidFn =
3202 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3203 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
3204 Value *Tid =
3205 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
3206 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
3207 OMPInfoCache.OMPBuilder.Builder
3208 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
3209 ->setDebugLoc(DL);
3210
3211 // First barrier for synchronization, ensures main thread has updated
3212 // values.
3213 FunctionCallee BarrierFn =
3214 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3215 M, OMPRTL___kmpc_barrier_simple_spmd);
3216 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
3217 RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
3218 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid})
3219 ->setDebugLoc(DL);
3220
3221 // Second barrier ensures workers have read broadcast values.
3222 if (HasBroadcastValues)
3223 CallInst::Create(BarrierFn, {Ident, Tid}, "",
3224 RegionBarrierBB->getTerminator())
3225 ->setDebugLoc(DL);
3226 };
3227
3228 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3229 SmallPtrSet<BasicBlock *, 8> Visited;
3230 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
3231 BasicBlock *BB = GuardedI->getParent();
3232 if (!Visited.insert(BB).second)
3233 continue;
3234
3235 SmallVector<std::pair<Instruction *, Instruction *>> Reorders;
3236 Instruction *LastEffect = nullptr;
3237 BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();
3238 while (++IP != IPEnd) {
3239 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
3240 continue;
3241 Instruction *I = &*IP;
3242 if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
3243 continue;
3244 if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
3245 LastEffect = nullptr;
3246 continue;
3247 }
3248 if (LastEffect)
3249 Reorders.push_back({I, LastEffect});
3250 LastEffect = &*IP;
3251 }
3252 for (auto &Reorder : Reorders)
3253 Reorder.first->moveBefore(Reorder.second);
3254 }
3255
3256 SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;
3257
3258 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
3259 BasicBlock *BB = GuardedI->getParent();
3260 auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
3261 IRPosition::function(*GuardedI->getFunction()), nullptr,
3262 DepClassTy::NONE);
3263 assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo")(static_cast <bool> (CalleeAA != nullptr && "Expected Callee AAKernelInfo"
) ? void (0) : __assert_fail ("CalleeAA != nullptr && \"Expected Callee AAKernelInfo\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3263, __extension__ __PRETTY_FUNCTION__))
;
3264 auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
3265 // Continue if instruction is already guarded.
3266 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
3267 continue;
3268
3269 Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
3270 for (Instruction &I : *BB) {
3271 // If instruction I needs to be guarded update the guarded region
3272 // bounds.
3273 if (SPMDCompatibilityTracker.contains(&I)) {
3274 CalleeAAFunction.getGuardedInstructions().insert(&I);
3275 if (GuardedRegionStart)
3276 GuardedRegionEnd = &I;
3277 else
3278 GuardedRegionStart = GuardedRegionEnd = &I;
3279
3280 continue;
3281 }
3282
3283 // Instruction I does not need guarding, store
3284 // any region found and reset bounds.
3285 if (GuardedRegionStart) {
3286 GuardedRegions.push_back(
3287 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
3288 GuardedRegionStart = nullptr;
3289 GuardedRegionEnd = nullptr;
3290 }
3291 }
3292 }
3293
3294 for (auto &GR : GuardedRegions)
3295 CreateGuardedRegion(GR.first, GR.second);
3296
3297 // Adjust the global exec mode flag that tells the runtime what mode this
3298 // kernel is executed in.
3299 Function *Kernel = getAnchorScope();
3300 GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
3301 (Kernel->getName() + "_exec_mode").str());
3302 assert(ExecMode && "Kernel without exec mode?")(static_cast <bool> (ExecMode && "Kernel without exec mode?"
) ? void (0) : __assert_fail ("ExecMode && \"Kernel without exec mode?\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3302, __extension__ __PRETTY_FUNCTION__))
;
3303 assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!")(static_cast <bool> (ExecMode->getInitializer() &&
"ExecMode doesn't have initializer!") ? void (0) : __assert_fail
("ExecMode->getInitializer() && \"ExecMode doesn't have initializer!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3303, __extension__ __PRETTY_FUNCTION__))
;
3304
3305 // Set the global exec mode flag to indicate SPMD-Generic mode.
3306 assert(isa<ConstantInt>(ExecMode->getInitializer()) &&(static_cast <bool> (isa<ConstantInt>(ExecMode->
getInitializer()) && "ExecMode is not an integer!") ?
void (0) : __assert_fail ("isa<ConstantInt>(ExecMode->getInitializer()) && \"ExecMode is not an integer!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3307, __extension__ __PRETTY_FUNCTION__))
3307 "ExecMode is not an integer!")(static_cast <bool> (isa<ConstantInt>(ExecMode->
getInitializer()) && "ExecMode is not an integer!") ?
void (0) : __assert_fail ("isa<ConstantInt>(ExecMode->getInitializer()) && \"ExecMode is not an integer!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3307, __extension__ __PRETTY_FUNCTION__))
;
3308 const int8_t ExecModeVal =
3309 cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
3310 assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&(static_cast <bool> (ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC
&& "Initially non-SPMD kernel has SPMD exec mode!") ?
void (0) : __assert_fail ("ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC && \"Initially non-SPMD kernel has SPMD exec mode!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3311, __extension__ __PRETTY_FUNCTION__))
3311 "Initially non-SPMD kernel has SPMD exec mode!")(static_cast <bool> (ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC
&& "Initially non-SPMD kernel has SPMD exec mode!") ?
void (0) : __assert_fail ("ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC && \"Initially non-SPMD kernel has SPMD exec mode!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3311, __extension__ __PRETTY_FUNCTION__))
;
3312 ExecMode->setInitializer(
3313 ConstantInt::get(ExecMode->getInitializer()->getType(),
3314 ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
3315
3316 // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
3317 const int InitModeArgNo = 1;
3318 const int DeinitModeArgNo = 1;
3319 const int InitUseStateMachineArgNo = 2;
3320 const int InitRequiresFullRuntimeArgNo = 3;
3321 const int DeinitRequiresFullRuntimeArgNo = 2;
3322
3323 auto &Ctx = getAnchorValue().getContext();
3324 A.changeUseAfterManifest(
3325 KernelInitCB->getArgOperandUse(InitModeArgNo),
3326 *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
3327 OMP_TGT_EXEC_MODE_SPMD));
3328 A.changeUseAfterManifest(
3329 KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
3330 *ConstantInt::getBool(Ctx, 0));
3331 A.changeUseAfterManifest(
3332 KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
3333 *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
3334 OMP_TGT_EXEC_MODE_SPMD));
3335 A.changeUseAfterManifest(
3336 KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
3337 *ConstantInt::getBool(Ctx, 0));
3338 A.changeUseAfterManifest(
3339 KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo),
3340 *ConstantInt::getBool(Ctx, 0));
3341
3342 ++NumOpenMPTargetRegionKernelsSPMD;
3343
3344 auto Remark = [&](OptimizationRemark OR) {
3345 return OR << "Transformed generic-mode kernel to SPMD-mode.";
3346 };
3347 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark);
3348 return true;
3349 };
3350
3351 ChangeStatus buildCustomStateMachine(Attributor &A) {
3352 // If we have disabled state machine rewrites, don't make a custom one
3353 if (DisableOpenMPOptStateMachineRewrite)
3354 return ChangeStatus::UNCHANGED;
3355
3356 // Don't rewrite the state machine if we are not in a valid state.
3357 if (!ReachedKnownParallelRegions.isValidState())
3358 return ChangeStatus::UNCHANGED;
3359
3360 const int InitModeArgNo = 1;
3361 const int InitUseStateMachineArgNo = 2;
3362
3363 // Check if the current configuration is non-SPMD and generic state machine.
3364 // If we already have SPMD mode or a custom state machine we do not need to
3365 // go any further. If it is anything but a constant something is weird and
3366 // we give up.
3367 ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
3368 KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
3369 ConstantInt *Mode =
3370 dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
3371
3372 // If we are stuck with generic mode, try to create a custom device (=GPU)
3373 // state machine which is specialized for the parallel regions that are
3374 // reachable by the kernel.
3375 if (!UseStateMachine || UseStateMachine->isZero() || !Mode ||
3376 (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
3377 return ChangeStatus::UNCHANGED;
3378
3379 // If not SPMD mode, indicate we use a custom state machine now.
3380 auto &Ctx = getAnchorValue().getContext();
3381 auto *FalseVal = ConstantInt::getBool(Ctx, 0);
3382 A.changeUseAfterManifest(
3383 KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
3384
3385 // If we don't actually need a state machine we are done here. This can
3386 // happen if there simply are no parallel regions. In the resulting kernel
3387 // all worker threads will simply exit right away, leaving the main thread
3388 // to do the work alone.
3389 if (!mayContainParallelRegion()) {
3390 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
3391
3392 auto Remark = [&](OptimizationRemark OR) {
3393 return OR << "Removing unused state machine from generic-mode kernel.";
3394 };
3395 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP130", Remark);
3396
3397 return ChangeStatus::CHANGED;
3398 }
3399
3400 // Keep track in the statistics of our new shiny custom state machine.
3401 if (ReachedUnknownParallelRegions.empty()) {
3402 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
3403
3404 auto Remark = [&](OptimizationRemark OR) {
3405 return OR << "Rewriting generic-mode kernel with a customized state "
3406 "machine.";
3407 };
3408 A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP131", Remark);
3409 } else {
3410 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
3411
3412 auto Remark = [&](OptimizationRemarkAnalysis OR) {
3413 return OR << "Generic-mode kernel is executed with a customized state "
3414 "machine that requires a fallback.";
3415 };
3416 A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB, "OMP132", Remark);
3417
3418 // Tell the user why we ended up with a fallback.
3419 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
3420 if (!UnknownParallelRegionCB)
3421 continue;
3422 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
3423 return ORA << "Call may contain unknown parallel regions. Use "
3424 << "`__attribute__((assume(\"omp_no_parallelism\")))` to "
3425 "override.";
3426 };
3427 A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
3428 "OMP133", Remark);
3429 }
3430 }
3431
3432 // Create all the blocks:
3433 //
3434 // InitCB = __kmpc_target_init(...)
3435 // bool IsWorker = InitCB >= 0;
3436 // if (IsWorker) {
3437 // SMBeginBB: __kmpc_barrier_simple_spmd(...);
3438 // void *WorkFn;
3439 // bool Active = __kmpc_kernel_parallel(&WorkFn);
3440 // if (!WorkFn) return;
3441 // SMIsActiveCheckBB: if (Active) {
3442 // SMIfCascadeCurrentBB: if (WorkFn == <ParFn0>)
3443 // ParFn0(...);
3444 // SMIfCascadeCurrentBB: else if (WorkFn == <ParFn1>)
3445 // ParFn1(...);
3446 // ...
3447 // SMIfCascadeCurrentBB: else
3448 // ((WorkFnTy*)WorkFn)(...);
3449 // SMEndParallelBB: __kmpc_kernel_end_parallel(...);
3450 // }
3451 // SMDoneBB: __kmpc_barrier_simple_spmd(...);
3452 // goto SMBeginBB;
3453 // }
3454 // UserCodeEntryBB: // user code
3455 // __kmpc_target_deinit(...)
3456 //
3457 Function *Kernel = getAssociatedFunction();
3458 assert(Kernel && "Expected an associated function!")(static_cast <bool> (Kernel && "Expected an associated function!"
) ? void (0) : __assert_fail ("Kernel && \"Expected an associated function!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3458, __extension__ __PRETTY_FUNCTION__))
;
3459
3460 BasicBlock *InitBB = KernelInitCB->getParent();
3461 BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
3462 KernelInitCB->getNextNode(), "thread.user_code.check");
3463 BasicBlock *StateMachineBeginBB = BasicBlock::Create(
3464 Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
3465 BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
3466 Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
3467 BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
3468 Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
3469 BasicBlock *StateMachineIfCascadeCurrentBB =
3470 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
3471 Kernel, UserCodeEntryBB);
3472 BasicBlock *StateMachineEndParallelBB =
3473 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",
3474 Kernel, UserCodeEntryBB);
3475 BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
3476 Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
3477 A.registerManifestAddedBasicBlock(*InitBB);
3478 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
3479 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
3480 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
3481 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
3482 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
3483 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
3484 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
3485
3486 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
3487 ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);
3488
3489 InitBB->getTerminator()->eraseFromParent();
3490 Instruction *IsWorker =
3491 ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
3492 ConstantInt::get(KernelInitCB->getType(), -1),
3493 "thread.is_worker", InitBB);
3494 IsWorker->setDebugLoc(DLoc);
3495 BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB);
3496
3497 Module &M = *Kernel->getParent();
3498
3499 // Create local storage for the work function pointer.
3500 const DataLayout &DL = M.getDataLayout();
3501 Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
3502 Instruction *WorkFnAI =
3503 new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
3504 "worker.work_fn.addr", &Kernel->getEntryBlock().front());
3505 WorkFnAI->setDebugLoc(DLoc);
3506
3507 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3508 OMPInfoCache.OMPBuilder.updateToLocation(
3509 OpenMPIRBuilder::LocationDescription(
3510 IRBuilder<>::InsertPoint(StateMachineBeginBB,
3511 StateMachineBeginBB->end()),
3512 DLoc));
3513
3514 Value *Ident = KernelInitCB->getArgOperand(0);
3515 Value *GTid = KernelInitCB;
3516
3517 FunctionCallee BarrierFn =
3518 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3519 M, OMPRTL___kmpc_barrier_simple_spmd);
3520 CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
3521 ->setDebugLoc(DLoc);
3522
3523 if (WorkFnAI->getType()->getPointerAddressSpace() !=
3524 (unsigned int)AddressSpace::Generic) {
3525 WorkFnAI = new AddrSpaceCastInst(
3526 WorkFnAI,
3527 PointerType::getWithSamePointeeType(
3528 cast<PointerType>(WorkFnAI->getType()),
3529 (unsigned int)AddressSpace::Generic),
3530 WorkFnAI->getName() + ".generic", StateMachineBeginBB);
3531 WorkFnAI->setDebugLoc(DLoc);
3532 }
3533
3534 FunctionCallee KernelParallelFn =
3535 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3536 M, OMPRTL___kmpc_kernel_parallel);
3537 Instruction *IsActiveWorker = CallInst::Create(
3538 KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
3539 IsActiveWorker->setDebugLoc(DLoc);
3540 Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
3541 StateMachineBeginBB);
3542 WorkFn->setDebugLoc(DLoc);
3543
3544 FunctionType *ParallelRegionFnTy = FunctionType::get(
3545 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
3546 false);
3547 Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
3548 WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast",
3549 StateMachineBeginBB);
3550
3551 Instruction *IsDone =
3552 ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn,
3553 Constant::getNullValue(VoidPtrTy), "worker.is_done",
3554 StateMachineBeginBB);
3555 IsDone->setDebugLoc(DLoc);
3556 BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
3557 IsDone, StateMachineBeginBB)
3558 ->setDebugLoc(DLoc);
3559
3560 BranchInst::Create(StateMachineIfCascadeCurrentBB,
3561 StateMachineDoneBarrierBB, IsActiveWorker,
3562 StateMachineIsActiveCheckBB)
3563 ->setDebugLoc(DLoc);
3564
3565 Value *ZeroArg =
3566 Constant::getNullValue(ParallelRegionFnTy->getParamType(0));
3567
3568 // Now that we have most of the CFG skeleton it is time for the if-cascade
3569 // that checks the function pointer we got from the runtime against the
3570 // parallel regions we expect, if there are any.
3571 for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
3572 auto *ParallelRegion = ReachedKnownParallelRegions[I];
3573 BasicBlock *PRExecuteBB = BasicBlock::Create(
3574 Ctx, "worker_state_machine.parallel_region.execute", Kernel,
3575 StateMachineEndParallelBB);
3576 CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
3577 ->setDebugLoc(DLoc);
3578 BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)
3579 ->setDebugLoc(DLoc);
3580
3581 BasicBlock *PRNextBB =
3582 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
3583 Kernel, StateMachineEndParallelBB);
3584
3585 // Check if we need to compare the pointer at all or if we can just
3586 // call the parallel region function.
3587 Value *IsPR;
3588 if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
3589 Instruction *CmpI = ICmpInst::Create(
3590 ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
3591 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
3592 CmpI->setDebugLoc(DLoc);
3593 IsPR = CmpI;
3594 } else {
3595 IsPR = ConstantInt::getTrue(Ctx);
3596 }
3597
3598 BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
3599 StateMachineIfCascadeCurrentBB)
3600 ->setDebugLoc(DLoc);
3601 StateMachineIfCascadeCurrentBB = PRNextBB;
3602 }
3603
3604 // At the end of the if-cascade we place the indirect function pointer call
3605 // in case we might need it, that is if there can be parallel regions we
3606 // have not handled in the if-cascade above.
3607 if (!ReachedUnknownParallelRegions.empty()) {
3608 StateMachineIfCascadeCurrentBB->setName(
3609 "worker_state_machine.parallel_region.fallback.execute");
3610 CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "",
3611 StateMachineIfCascadeCurrentBB)
3612 ->setDebugLoc(DLoc);
3613 }
3614 BranchInst::Create(StateMachineEndParallelBB,
3615 StateMachineIfCascadeCurrentBB)
3616 ->setDebugLoc(DLoc);
3617
3618 CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3619 M, OMPRTL___kmpc_kernel_end_parallel),
3620 {}, "", StateMachineEndParallelBB)
3621 ->setDebugLoc(DLoc);
3622 BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
3623 ->setDebugLoc(DLoc);
3624
3625 CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
3626 ->setDebugLoc(DLoc);
3627 BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
3628 ->setDebugLoc(DLoc);
3629
3630 return ChangeStatus::CHANGED;
3631 }
3632
3633 /// Fixpoint iteration update function. Will be called every time a dependence
3634 /// changed its state (and in the beginning).
3635 ChangeStatus updateImpl(Attributor &A) override {
3636 KernelInfoState StateBefore = getState();
3637
3638 // Callback to check a read/write instruction.
3639 auto CheckRWInst = [&](Instruction &I) {
3640 // We handle calls later.
3641 if (isa<CallBase>(I))
3642 return true;
3643 // We only care about write effects.
3644 if (!I.mayWriteToMemory())
3645 return true;
3646 if (auto *SI = dyn_cast<StoreInst>(&I)) {
3647 SmallVector<const Value *> Objects;
3648 getUnderlyingObjects(SI->getPointerOperand(), Objects);
3649 if (llvm::all_of(Objects,
3650 [](const Value *Obj) { return isa<AllocaInst>(Obj); }))
3651 return true;
3652 // Check for AAHeapToStack moved objects which must not be guarded.
3653 auto &HS = A.getAAFor<AAHeapToStack>(
3654 *this, IRPosition::function(*I.getFunction()),
3655 DepClassTy::REQUIRED);
3656 if (llvm::all_of(Objects, [&HS](const Value *Obj) {
3657 auto *CB = dyn_cast<CallBase>(Obj);
3658 if (!CB)
3659 return false;
3660 return HS.isAssumedHeapToStack(*CB);
3661 })) {
3662 return true;
3663 }
3664 }
3665
3666 // Insert instruction that needs guarding.
3667 SPMDCompatibilityTracker.insert(&I);
3668 return true;
3669 };
3670
3671 bool UsedAssumedInformationInCheckRWInst = false;
3672 if (!SPMDCompatibilityTracker.isAtFixpoint())
3673 if (!A.checkForAllReadWriteInstructions(
3674 CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
3675 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3676
3677 if (!IsKernelEntry) {
3678 updateReachingKernelEntries(A);
3679 updateParallelLevels(A);
3680
3681 if (!ParallelLevels.isValidState())
3682 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3683 }
3684
3685 // Callback to check a call instruction.
3686 bool AllSPMDStatesWereFixed = true;
3687 auto CheckCallInst = [&](Instruction &I) {
3688 auto &CB = cast<CallBase>(I);
3689 auto &CBAA = A.getAAFor<AAKernelInfo>(
3690 *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
3691 getState() ^= CBAA.getState();
3692 AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
3693 return true;
3694 };
3695
3696 bool UsedAssumedInformationInCheckCallInst = false;
3697 if (!A.checkForAllCallLikeInstructions(
3698 CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
3699 LLVM_DEBUG(dbgs() << TAG << "Failed to visit all call-like instructions!\n";)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Failed to visit all call-like instructions!\n"
;; } } while (false)
;
3700 return indicatePessimisticFixpoint();
3701 }
3702
3703 // If we haven't used any assumed information for the SPMD state we can fix
3704 // it.
3705 if (!UsedAssumedInformationInCheckRWInst &&
3706 !UsedAssumedInformationInCheckCallInst && AllSPMDStatesWereFixed)
3707 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3708
3709 return StateBefore == getState() ? ChangeStatus::UNCHANGED
3710 : ChangeStatus::CHANGED;
3711 }
3712
3713private:
3714 /// Update info regarding reaching kernels.
3715 void updateReachingKernelEntries(Attributor &A) {
3716 auto PredCallSite = [&](AbstractCallSite ACS) {
3717 Function *Caller = ACS.getInstruction()->getFunction();
3718
3719 assert(Caller && "Caller is nullptr")(static_cast <bool> (Caller && "Caller is nullptr"
) ? void (0) : __assert_fail ("Caller && \"Caller is nullptr\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3719, __extension__ __PRETTY_FUNCTION__))
;
3720
3721 auto &CAA = A.getOrCreateAAFor<AAKernelInfo>(
3722 IRPosition::function(*Caller), this, DepClassTy::REQUIRED);
3723 if (CAA.ReachingKernelEntries.isValidState()) {
3724 ReachingKernelEntries ^= CAA.ReachingKernelEntries;
3725 return true;
3726 }
3727
3728 // We lost track of the caller of the associated function, any kernel
3729 // could reach now.
3730 ReachingKernelEntries.indicatePessimisticFixpoint();
3731
3732 return true;
3733 };
3734
3735 bool AllCallSitesKnown;
3736 if (!A.checkForAllCallSites(PredCallSite, *this,
3737 true /* RequireAllCallSites */,
3738 AllCallSitesKnown))
3739 ReachingKernelEntries.indicatePessimisticFixpoint();
3740 }
3741
3742 /// Update info regarding parallel levels.
3743 void updateParallelLevels(Attributor &A) {
3744 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3745 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
3746 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
3747
3748 auto PredCallSite = [&](AbstractCallSite ACS) {
3749 Function *Caller = ACS.getInstruction()->getFunction();
3750
3751 assert(Caller && "Caller is nullptr")(static_cast <bool> (Caller && "Caller is nullptr"
) ? void (0) : __assert_fail ("Caller && \"Caller is nullptr\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3751, __extension__ __PRETTY_FUNCTION__))
;
3752
3753 auto &CAA =
3754 A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));
3755 if (CAA.ParallelLevels.isValidState()) {
3756 // Any function that is called by `__kmpc_parallel_51` will not be
3757 // folded as the parallel level in the function is updated. In order to
3758 // get it right, all the analysis would depend on the implentation. That
3759 // said, if in the future any change to the implementation, the analysis
3760 // could be wrong. As a consequence, we are just conservative here.
3761 if (Caller == Parallel51RFI.Declaration) {
3762 ParallelLevels.indicatePessimisticFixpoint();
3763 return true;
3764 }
3765
3766 ParallelLevels ^= CAA.ParallelLevels;
3767
3768 return true;
3769 }
3770
3771 // We lost track of the caller of the associated function, any kernel
3772 // could reach now.
3773 ParallelLevels.indicatePessimisticFixpoint();
3774
3775 return true;
3776 };
3777
3778 bool AllCallSitesKnown = true;
3779 if (!A.checkForAllCallSites(PredCallSite, *this,
3780 true /* RequireAllCallSites */,
3781 AllCallSitesKnown))
3782 ParallelLevels.indicatePessimisticFixpoint();
3783 }
3784};
3785
3786/// The call site kernel info abstract attribute, basically, what can we say
3787/// about a call site with regards to the KernelInfoState. For now this simply
3788/// forwards the information from the callee.
3789struct AAKernelInfoCallSite : AAKernelInfo {
3790 AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
3791 : AAKernelInfo(IRP, A) {}
3792
3793 /// See AbstractAttribute::initialize(...).
3794 void initialize(Attributor &A) override {
3795 AAKernelInfo::initialize(A);
3796
3797 CallBase &CB = cast<CallBase>(getAssociatedValue());
3798 Function *Callee = getAssociatedFunction();
3799
3800 // Helper to lookup an assumption string.
3801 auto HasAssumption = [](CallBase &CB, StringRef AssumptionStr) {
3802 return hasAssumption(CB, AssumptionStr);
3803 };
3804
3805 // Check for SPMD-mode assumptions.
3806 if (HasAssumption(CB, "ompx_spmd_amenable")) {
3807 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3808 indicateOptimisticFixpoint();
3809 }
3810
3811 // First weed out calls we do not care about, that is readonly/readnone
3812 // calls, intrinsics, and "no_openmp" calls. Neither of these can reach a
3813 // parallel region or anything else we are looking for.
3814 if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {
3815 indicateOptimisticFixpoint();
3816 return;
3817 }
3818
3819 // Next we check if we know the callee. If it is a known OpenMP function
3820 // we will handle them explicitly in the switch below. If it is not, we
3821 // will use an AAKernelInfo object on the callee to gather information and
3822 // merge that into the current state. The latter happens in the updateImpl.
3823 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3824 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
3825 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
3826 // Unknown caller or declarations are not analyzable, we give up.
3827 if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
3828
3829 // Unknown callees might contain parallel regions, except if they have
3830 // an appropriate assumption attached.
3831 if (!(HasAssumption(CB, "omp_no_openmp") ||
3832 HasAssumption(CB, "omp_no_parallelism")))
3833 ReachedUnknownParallelRegions.insert(&CB);
3834
3835 // If SPMDCompatibilityTracker is not fixed, we need to give up on the
3836 // idea we can run something unknown in SPMD-mode.
3837 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
3838 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3839 SPMDCompatibilityTracker.insert(&CB);
3840 }
3841
3842 // We have updated the state for this unknown call properly, there won't
3843 // be any change so we indicate a fixpoint.
3844 indicateOptimisticFixpoint();
3845 }
3846 // If the callee is known and can be used in IPO, we will update the state
3847 // based on the callee state in updateImpl.
3848 return;
3849 }
3850
3851 const unsigned int WrapperFunctionArgNo = 6;
3852 RuntimeFunction RF = It->getSecond();
3853 switch (RF) {
3854 // All the functions we know are compatible with SPMD mode.
3855 case OMPRTL___kmpc_is_spmd_exec_mode:
3856 case OMPRTL___kmpc_distribute_static_fini:
3857 case OMPRTL___kmpc_for_static_fini:
3858 case OMPRTL___kmpc_global_thread_num:
3859 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
3860 case OMPRTL___kmpc_get_hardware_num_blocks:
3861 case OMPRTL___kmpc_single:
3862 case OMPRTL___kmpc_end_single:
3863 case OMPRTL___kmpc_master:
3864 case OMPRTL___kmpc_end_master:
3865 case OMPRTL___kmpc_barrier:
3866 break;
3867 case OMPRTL___kmpc_distribute_static_init_4:
3868 case OMPRTL___kmpc_distribute_static_init_4u:
3869 case OMPRTL___kmpc_distribute_static_init_8:
3870 case OMPRTL___kmpc_distribute_static_init_8u:
3871 case OMPRTL___kmpc_for_static_init_4:
3872 case OMPRTL___kmpc_for_static_init_4u:
3873 case OMPRTL___kmpc_for_static_init_8:
3874 case OMPRTL___kmpc_for_static_init_8u: {
3875 // Check the schedule and allow static schedule in SPMD mode.
3876 unsigned ScheduleArgOpNo = 2;
3877 auto *ScheduleTypeCI =
3878 dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
3879 unsigned ScheduleTypeVal =
3880 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
3881 switch (OMPScheduleType(ScheduleTypeVal)) {
3882 case OMPScheduleType::Static:
3883 case OMPScheduleType::StaticChunked:
3884 case OMPScheduleType::Distribute:
3885 case OMPScheduleType::DistributeChunked:
3886 break;
3887 default:
3888 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3889 SPMDCompatibilityTracker.insert(&CB);
3890 break;
3891 };
3892 } break;
3893 case OMPRTL___kmpc_target_init:
3894 KernelInitCB = &CB;
3895 break;
3896 case OMPRTL___kmpc_target_deinit:
3897 KernelDeinitCB = &CB;
3898 break;
3899 case OMPRTL___kmpc_parallel_51:
3900 if (auto *ParallelRegion = dyn_cast<Function>(
3901 CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
3902 ReachedKnownParallelRegions.insert(ParallelRegion);
3903 break;
3904 }
3905 // The condition above should usually get the parallel region function
3906 // pointer and record it. In the off chance it doesn't we assume the
3907 // worst.
3908 ReachedUnknownParallelRegions.insert(&CB);
3909 break;
3910 case OMPRTL___kmpc_omp_task:
3911 // We do not look into tasks right now, just give up.
3912 SPMDCompatibilityTracker.insert(&CB);
3913 ReachedUnknownParallelRegions.insert(&CB);
3914 break;
3915 case OMPRTL___kmpc_alloc_shared:
3916 case OMPRTL___kmpc_free_shared:
3917 // Return without setting a fixpoint, to be resolved in updateImpl.
3918 return;
3919 default:
3920 // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
3921 // generally. However, they do not hide parallel regions.
3922 SPMDCompatibilityTracker.insert(&CB);
3923 break;
3924 }
3925 // All other OpenMP runtime calls will not reach parallel regions so they
3926 // can be safely ignored for now. Since it is a known OpenMP runtime call we
3927 // have now modeled all effects and there is no need for any update.
3928 indicateOptimisticFixpoint();
3929 }
3930
3931 ChangeStatus updateImpl(Attributor &A) override {
3932 // TODO: Once we have call site specific value information we can provide
3933 // call site specific liveness information and then it makes
3934 // sense to specialize attributes for call sites arguments instead of
3935 // redirecting requests to the callee argument.
3936 Function *F = getAssociatedFunction();
3937
3938 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3939 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);
3940
3941 // If F is not a runtime function, propagate the AAKernelInfo of the callee.
3942 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
3943 const IRPosition &FnPos = IRPosition::function(*F);
3944 auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
3945 if (getState() == FnAA.getState())
3946 return ChangeStatus::UNCHANGED;
3947 getState() = FnAA.getState();
3948 return ChangeStatus::CHANGED;
3949 }
3950
3951 // F is a runtime function that allocates or frees memory, check
3952 // AAHeapToStack and AAHeapToShared.
3953 KernelInfoState StateBefore = getState();
3954 assert((It->getSecond() == OMPRTL___kmpc_alloc_shared ||(static_cast <bool> ((It->getSecond() == OMPRTL___kmpc_alloc_shared
|| It->getSecond() == OMPRTL___kmpc_free_shared) &&
"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"
) ? void (0) : __assert_fail ("(It->getSecond() == OMPRTL___kmpc_alloc_shared || It->getSecond() == OMPRTL___kmpc_free_shared) && \"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3956, __extension__ __PRETTY_FUNCTION__))
3955 It->getSecond() == OMPRTL___kmpc_free_shared) &&(static_cast <bool> ((It->getSecond() == OMPRTL___kmpc_alloc_shared
|| It->getSecond() == OMPRTL___kmpc_free_shared) &&
"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"
) ? void (0) : __assert_fail ("(It->getSecond() == OMPRTL___kmpc_alloc_shared || It->getSecond() == OMPRTL___kmpc_free_shared) && \"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3956, __extension__ __PRETTY_FUNCTION__))
3956 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call")(static_cast <bool> ((It->getSecond() == OMPRTL___kmpc_alloc_shared
|| It->getSecond() == OMPRTL___kmpc_free_shared) &&
"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"
) ? void (0) : __assert_fail ("(It->getSecond() == OMPRTL___kmpc_alloc_shared || It->getSecond() == OMPRTL___kmpc_free_shared) && \"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 3956, __extension__ __PRETTY_FUNCTION__))
;
3957
3958 CallBase &CB = cast<CallBase>(getAssociatedValue());
3959
3960 auto &HeapToStackAA = A.getAAFor<AAHeapToStack>(
3961 *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
3962 auto &HeapToSharedAA = A.getAAFor<AAHeapToShared>(
3963 *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
3964
3965 RuntimeFunction RF = It->getSecond();
3966
3967 switch (RF) {
3968 // If neither HeapToStack nor HeapToShared assume the call is removed,
3969 // assume SPMD incompatibility.
3970 case OMPRTL___kmpc_alloc_shared:
3971 if (!HeapToStackAA.isAssumedHeapToStack(CB) &&
3972 !HeapToSharedAA.isAssumedHeapToShared(CB))
3973 SPMDCompatibilityTracker.insert(&CB);
3974 break;
3975 case OMPRTL___kmpc_free_shared:
3976 if (!HeapToStackAA.isAssumedHeapToStackRemovedFree(CB) &&
3977 !HeapToSharedAA.isAssumedHeapToSharedRemovedFree(CB))
3978 SPMDCompatibilityTracker.insert(&CB);
3979 break;
3980 default:
3981 SPMDCompatibilityTracker.insert(&CB);
3982 }
3983
3984 return StateBefore == getState() ? ChangeStatus::UNCHANGED
3985 : ChangeStatus::CHANGED;
3986 }
3987};
3988
3989struct AAFoldRuntimeCall
3990 : public StateWrapper<BooleanState, AbstractAttribute> {
3991 using Base = StateWrapper<BooleanState, AbstractAttribute>;
3992
3993 AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
3994
3995 /// Statistics are tracked as part of manifest for now.
3996 void trackStatistics() const override {}
3997
3998 /// Create an abstract attribute biew for the position \p IRP.
3999 static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
4000 Attributor &A);
4001
4002 /// See AbstractAttribute::getName()
4003 const std::string getName() const override { return "AAFoldRuntimeCall"; }
4004
4005 /// See AbstractAttribute::getIdAddr()
4006 const char *getIdAddr() const override { return &ID; }
4007
4008 /// This function should return true if the type of the \p AA is
4009 /// AAFoldRuntimeCall
4010 static bool classof(const AbstractAttribute *AA) {
4011 return (AA->getIdAddr() == &ID);
4012 }
4013
4014 static const char ID;
4015};
4016
4017struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
4018 AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
4019 : AAFoldRuntimeCall(IRP, A) {}
4020
4021 /// See AbstractAttribute::getAsStr()
4022 const std::string getAsStr() const override {
4023 if (!isValidState())
4024 return "<invalid>";
4025
4026 std::string Str("simplified value: ");
4027
4028 if (!SimplifiedValue.hasValue())
4029 return Str + std::string("none");
4030
4031 if (!SimplifiedValue.getValue())
4032 return Str + std::string("nullptr");
4033
4034 if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.getValue()))
4035 return Str + std::to_string(CI->getSExtValue());
4036
4037 return Str + std::string("unknown");
4038 }
4039
4040 void initialize(Attributor &A) override {
4041 if (DisableOpenMPOptFolding)
4042 indicatePessimisticFixpoint();
4043
4044 Function *Callee = getAssociatedFunction();
4045
4046 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4047 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4048 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&(static_cast <bool> (It != OMPInfoCache.RuntimeFunctionIDMap
.end() && "Expected a known OpenMP runtime function")
? void (0) : __assert_fail ("It != OMPInfoCache.RuntimeFunctionIDMap.end() && \"Expected a known OpenMP runtime function\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4049, __extension__ __PRETTY_FUNCTION__))
4049 "Expected a known OpenMP runtime function")(static_cast <bool> (It != OMPInfoCache.RuntimeFunctionIDMap
.end() && "Expected a known OpenMP runtime function")
? void (0) : __assert_fail ("It != OMPInfoCache.RuntimeFunctionIDMap.end() && \"Expected a known OpenMP runtime function\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4049, __extension__ __PRETTY_FUNCTION__))
;
4050
4051 RFKind = It->getSecond();
4052
4053 CallBase &CB = cast<CallBase>(getAssociatedValue());
4054 A.registerSimplificationCallback(
4055 IRPosition::callsite_returned(CB),
4056 [&](const IRPosition &IRP, const AbstractAttribute *AA,
4057 bool &UsedAssumedInformation) -> Optional<Value *> {
4058 assert((isValidState() || (SimplifiedValue.hasValue() &&(static_cast <bool> ((isValidState() || (SimplifiedValue
.hasValue() && SimplifiedValue.getValue() == nullptr)
) && "Unexpected invalid state!") ? void (0) : __assert_fail
("(isValidState() || (SimplifiedValue.hasValue() && SimplifiedValue.getValue() == nullptr)) && \"Unexpected invalid state!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4060, __extension__ __PRETTY_FUNCTION__))
4059 SimplifiedValue.getValue() == nullptr)) &&(static_cast <bool> ((isValidState() || (SimplifiedValue
.hasValue() && SimplifiedValue.getValue() == nullptr)
) && "Unexpected invalid state!") ? void (0) : __assert_fail
("(isValidState() || (SimplifiedValue.hasValue() && SimplifiedValue.getValue() == nullptr)) && \"Unexpected invalid state!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4060, __extension__ __PRETTY_FUNCTION__))
4060 "Unexpected invalid state!")(static_cast <bool> ((isValidState() || (SimplifiedValue
.hasValue() && SimplifiedValue.getValue() == nullptr)
) && "Unexpected invalid state!") ? void (0) : __assert_fail
("(isValidState() || (SimplifiedValue.hasValue() && SimplifiedValue.getValue() == nullptr)) && \"Unexpected invalid state!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4060, __extension__ __PRETTY_FUNCTION__))
;
4061
4062 if (!isAtFixpoint()) {
4063 UsedAssumedInformation = true;
4064 if (AA)
4065 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
4066 }
4067 return SimplifiedValue;
4068 });
4069 }
4070
4071 ChangeStatus updateImpl(Attributor &A) override {
4072 ChangeStatus Changed = ChangeStatus::UNCHANGED;
4073 switch (RFKind) {
4074 case OMPRTL___kmpc_is_spmd_exec_mode:
4075 Changed |= foldIsSPMDExecMode(A);
4076 break;
4077 case OMPRTL___kmpc_is_generic_main_thread_id:
4078 Changed |= foldIsGenericMainThread(A);
4079 break;
4080 case OMPRTL___kmpc_parallel_level:
4081 Changed |= foldParallelLevel(A);
4082 break;
4083 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4084 Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
4085 break;
4086 case OMPRTL___kmpc_get_hardware_num_blocks:
4087 Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
4088 break;
4089 default:
4090 llvm_unreachable("Unhandled OpenMP runtime function!")::llvm::llvm_unreachable_internal("Unhandled OpenMP runtime function!"
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4090)
;
4091 }
4092
4093 return Changed;
4094 }
4095
4096 ChangeStatus manifest(Attributor &A) override {
4097 ChangeStatus Changed = ChangeStatus::UNCHANGED;
4098
4099 if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) {
4100 Instruction &I = *getCtxI();
4101 A.changeValueAfterManifest(I, **SimplifiedValue);
4102 A.deleteAfterManifest(I);
4103
4104 CallBase *CB = dyn_cast<CallBase>(&I);
4105 auto Remark = [&](OptimizationRemark OR) {
4106 if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
4107 return OR << "Replacing OpenMP runtime call "
4108 << CB->getCalledFunction()->getName() << " with "
4109 << ore::NV("FoldedValue", C->getZExtValue()) << ".";
4110 return OR << "Replacing OpenMP runtime call "
4111 << CB->getCalledFunction()->getName() << ".";
4112 };
4113
4114 if (CB && EnableVerboseRemarks)
4115 A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark);
4116
4117 LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replacing runtime call: "
<< I << " with " << **SimplifiedValue <<
"\n"; } } while (false)
4118 << **SimplifiedValue << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Replacing runtime call: "
<< I << " with " << **SimplifiedValue <<
"\n"; } } while (false)
;
4119
4120 Changed = ChangeStatus::CHANGED;
4121 }
4122
4123 return Changed;
4124 }
4125
4126 ChangeStatus indicatePessimisticFixpoint() override {
4127 SimplifiedValue = nullptr;
4128 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
4129 }
4130
4131private:
4132 /// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
4133 ChangeStatus foldIsSPMDExecMode(Attributor &A) {
4134 Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4135
4136 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
4137 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
4138 auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4139 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4140
4141 if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4142 return indicatePessimisticFixpoint();
4143
4144 for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4145 auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
4146 DepClassTy::REQUIRED);
4147
4148 if (!AA.isValidState()) {
4149 SimplifiedValue = nullptr;
4150 return indicatePessimisticFixpoint();
4151 }
4152
4153 if (AA.SPMDCompatibilityTracker.isAssumed()) {
4154 if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4155 ++KnownSPMDCount;
4156 else
4157 ++AssumedSPMDCount;
4158 } else {
4159 if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4160 ++KnownNonSPMDCount;
4161 else
4162 ++AssumedNonSPMDCount;
4163 }
4164 }
4165
4166 if ((AssumedSPMDCount + KnownSPMDCount) &&
4167 (AssumedNonSPMDCount + KnownNonSPMDCount))
4168 return indicatePessimisticFixpoint();
4169
4170 auto &Ctx = getAnchorValue().getContext();
4171 if (KnownSPMDCount || AssumedSPMDCount) {
4172 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&(static_cast <bool> (KnownNonSPMDCount == 0 && AssumedNonSPMDCount
== 0 && "Expected only SPMD kernels!") ? void (0) : __assert_fail
("KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && \"Expected only SPMD kernels!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4173, __extension__ __PRETTY_FUNCTION__))
4173 "Expected only SPMD kernels!")(static_cast <bool> (KnownNonSPMDCount == 0 && AssumedNonSPMDCount
== 0 && "Expected only SPMD kernels!") ? void (0) : __assert_fail
("KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && \"Expected only SPMD kernels!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4173, __extension__ __PRETTY_FUNCTION__))
;
4174 // All reaching kernels are in SPMD mode. Update all function calls to
4175 // __kmpc_is_spmd_exec_mode to 1.
4176 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
4177 } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
4178 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&(static_cast <bool> (KnownSPMDCount == 0 && AssumedSPMDCount
== 0 && "Expected only non-SPMD kernels!") ? void (0
) : __assert_fail ("KnownSPMDCount == 0 && AssumedSPMDCount == 0 && \"Expected only non-SPMD kernels!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4179, __extension__ __PRETTY_FUNCTION__))
4179 "Expected only non-SPMD kernels!")(static_cast <bool> (KnownSPMDCount == 0 && AssumedSPMDCount
== 0 && "Expected only non-SPMD kernels!") ? void (0
) : __assert_fail ("KnownSPMDCount == 0 && AssumedSPMDCount == 0 && \"Expected only non-SPMD kernels!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4179, __extension__ __PRETTY_FUNCTION__))
;
4180 // All reaching kernels are in non-SPMD mode. Update all function
4181 // calls to __kmpc_is_spmd_exec_mode to 0.
4182 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
4183 } else {
4184 // We have empty reaching kernels, therefore we cannot tell if the
4185 // associated call site can be folded. At this moment, SimplifiedValue
4186 // must be none.
4187 assert(!SimplifiedValue.hasValue() && "SimplifiedValue should be none")(static_cast <bool> (!SimplifiedValue.hasValue() &&
"SimplifiedValue should be none") ? void (0) : __assert_fail
("!SimplifiedValue.hasValue() && \"SimplifiedValue should be none\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4187, __extension__ __PRETTY_FUNCTION__))
;
4188 }
4189
4190 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4191 : ChangeStatus::CHANGED;
4192 }
4193
4194 /// Fold __kmpc_is_generic_main_thread_id into a constant if possible.
4195 ChangeStatus foldIsGenericMainThread(Attributor &A) {
4196 Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4197
4198 CallBase &CB = cast<CallBase>(getAssociatedValue());
4199 Function *F = CB.getFunction();
4200 const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
4201 *this, IRPosition::function(*F), DepClassTy::REQUIRED);
4202
4203 if (!ExecutionDomainAA.isValidState())
4204 return indicatePessimisticFixpoint();
4205
4206 auto &Ctx = getAnchorValue().getContext();
4207 if (ExecutionDomainAA.isExecutedByInitialThreadOnly(CB))
4208 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
4209 else
4210 return indicatePessimisticFixpoint();
4211
4212 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4213 : ChangeStatus::CHANGED;
4214 }
4215
4216 /// Fold __kmpc_parallel_level into a constant if possible.
4217 ChangeStatus foldParallelLevel(Attributor &A) {
4218 Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4219
4220 auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4221 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4222
4223 if (!CallerKernelInfoAA.ParallelLevels.isValidState())
4224 return indicatePessimisticFixpoint();
4225
4226 if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4227 return indicatePessimisticFixpoint();
4228
4229 if (CallerKernelInfoAA.ReachingKernelEntries.empty()) {
4230 assert(!SimplifiedValue.hasValue() &&(static_cast <bool> (!SimplifiedValue.hasValue() &&
"SimplifiedValue should keep none at this point") ? void (0)
: __assert_fail ("!SimplifiedValue.hasValue() && \"SimplifiedValue should keep none at this point\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4231, __extension__ __PRETTY_FUNCTION__))
4231 "SimplifiedValue should keep none at this point")(static_cast <bool> (!SimplifiedValue.hasValue() &&
"SimplifiedValue should keep none at this point") ? void (0)
: __assert_fail ("!SimplifiedValue.hasValue() && \"SimplifiedValue should keep none at this point\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4231, __extension__ __PRETTY_FUNCTION__))
;
4232 return ChangeStatus::UNCHANGED;
4233 }
4234
4235 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
4236 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
4237 for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4238 auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
4239 DepClassTy::REQUIRED);
4240 if (!AA.SPMDCompatibilityTracker.isValidState())
4241 return indicatePessimisticFixpoint();
4242
4243 if (AA.SPMDCompatibilityTracker.isAssumed()) {
4244 if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4245 ++KnownSPMDCount;
4246 else
4247 ++AssumedSPMDCount;
4248 } else {
4249 if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4250 ++KnownNonSPMDCount;
4251 else
4252 ++AssumedNonSPMDCount;
4253 }
4254 }
4255
4256 if ((AssumedSPMDCount + KnownSPMDCount) &&
4257 (AssumedNonSPMDCount + KnownNonSPMDCount))
4258 return indicatePessimisticFixpoint();
4259
4260 auto &Ctx = getAnchorValue().getContext();
4261 // If the caller can only be reached by SPMD kernel entries, the parallel
4262 // level is 1. Similarly, if the caller can only be reached by non-SPMD
4263 // kernel entries, it is 0.
4264 if (AssumedSPMDCount || KnownSPMDCount) {
4265 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&(static_cast <bool> (KnownNonSPMDCount == 0 && AssumedNonSPMDCount
== 0 && "Expected only SPMD kernels!") ? void (0) : __assert_fail
("KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && \"Expected only SPMD kernels!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4266, __extension__ __PRETTY_FUNCTION__))
4266 "Expected only SPMD kernels!")(static_cast <bool> (KnownNonSPMDCount == 0 && AssumedNonSPMDCount
== 0 && "Expected only SPMD kernels!") ? void (0) : __assert_fail
("KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && \"Expected only SPMD kernels!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4266, __extension__ __PRETTY_FUNCTION__))
;
4267 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
4268 } else {
4269 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&(static_cast <bool> (KnownSPMDCount == 0 && AssumedSPMDCount
== 0 && "Expected only non-SPMD kernels!") ? void (0
) : __assert_fail ("KnownSPMDCount == 0 && AssumedSPMDCount == 0 && \"Expected only non-SPMD kernels!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4270, __extension__ __PRETTY_FUNCTION__))
4270 "Expected only non-SPMD kernels!")(static_cast <bool> (KnownSPMDCount == 0 && AssumedSPMDCount
== 0 && "Expected only non-SPMD kernels!") ? void (0
) : __assert_fail ("KnownSPMDCount == 0 && AssumedSPMDCount == 0 && \"Expected only non-SPMD kernels!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4270, __extension__ __PRETTY_FUNCTION__))
;
4271 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
4272 }
4273 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4274 : ChangeStatus::CHANGED;
4275 }
4276
4277 ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
4278 // Specialize only if all the calls agree with the attribute constant value
4279 int32_t CurrentAttrValue = -1;
4280 Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4281
4282 auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4283 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4284
4285 if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4286 return indicatePessimisticFixpoint();
4287
4288 // Iterate over the kernels that reach this function
4289 for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4290 int32_t NextAttrVal = -1;
4291 if (K->hasFnAttribute(Attr))
4292 NextAttrVal =
4293 std::stoi(K->getFnAttribute(Attr).getValueAsString().str());
4294
4295 if (NextAttrVal == -1 ||
4296 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
4297 return indicatePessimisticFixpoint();
4298 CurrentAttrValue = NextAttrVal;
4299 }
4300
4301 if (CurrentAttrValue != -1) {
4302 auto &Ctx = getAnchorValue().getContext();
4303 SimplifiedValue =
4304 ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
4305 }
4306 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4307 : ChangeStatus::CHANGED;
4308 }
4309
4310 /// An optional value the associated value is assumed to fold to. That is, we
4311 /// assume the associated value (which is a call) can be replaced by this
4312 /// simplified value.
4313 Optional<Value *> SimplifiedValue;
4314
4315 /// The runtime function kind of the callee of the associated call site.
4316 RuntimeFunction RFKind;
4317};
4318
4319} // namespace
4320
4321/// Register folding callsite
4322void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
4323 auto &RFI = OMPInfoCache.RFIs[RF];
4324 RFI.foreachUse(SCC, [&](Use &U, Function &F) {
4325 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
4326 if (!CI)
4327 return false;
4328 A.getOrCreateAAFor<AAFoldRuntimeCall>(
4329 IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
4330 DepClassTy::NONE, /* ForceUpdate */ false,
4331 /* UpdateAfterInit */ false);
4332 return false;
4333 });
4334}
4335
4336void OpenMPOpt::registerAAs(bool IsModulePass) {
4337 if (SCC.empty())
4338
4339 return;
4340 if (IsModulePass) {
4341 // Ensure we create the AAKernelInfo AAs first and without triggering an
4342 // update. This will make sure we register all value simplification
4343 // callbacks before any other AA has the chance to create an AAValueSimplify
4344 // or similar.
4345 for (Function *Kernel : OMPInfoCache.Kernels)
4346 A.getOrCreateAAFor<AAKernelInfo>(
4347 IRPosition::function(*Kernel), /* QueryingAA */ nullptr,
4348 DepClassTy::NONE, /* ForceUpdate */ false,
4349 /* UpdateAfterInit */ false);
4350
4351 registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id);
4352 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
4353 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
4354 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
4355 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
4356 }
4357
4358 // Create CallSite AA for all Getters.
4359 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
4360 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
4361
4362 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
4363
4364 auto CreateAA = [&](Use &U, Function &Caller) {
4365 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
4366 if (!CI)
4367 return false;
4368
4369 auto &CB = cast<CallBase>(*CI);
4370
4371 IRPosition CBPos = IRPosition::callsite_function(CB);
4372 A.getOrCreateAAFor<AAICVTracker>(CBPos);
4373 return false;
4374 };
4375
4376 GetterRFI.foreachUse(SCC, CreateAA);
4377 }
4378 auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4379 auto CreateAA = [&](Use &U, Function &F) {
4380 A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
4381 return false;
4382 };
4383 if (!DisableOpenMPOptDeglobalization)
4384 GlobalizationRFI.foreachUse(SCC, CreateAA);
4385
4386 // Create an ExecutionDomain AA for every function and a HeapToStack AA for
4387 // every function if there is a device kernel.
4388 if (!isOpenMPDevice(M))
4389 return;
4390
4391 for (auto *F : SCC) {
4392 if (F->isDeclaration())
4393 continue;
4394
4395 A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
4396 if (!DisableOpenMPOptDeglobalization)
4397 A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
4398
4399 for (auto &I : instructions(*F)) {
4400 if (auto *LI = dyn_cast<LoadInst>(&I)) {
4401 bool UsedAssumedInformation = false;
4402 A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
4403 UsedAssumedInformation);
4404 }
4405 }
4406 }
4407}
4408
// Out-of-line definitions of the static `ID` member of each abstract
// attribute class used in this file.
// NOTE(review): presumably the address of each ID (not its value) is what
// identifies the attribute kind — confirm against the Attributor.
4409const char AAICVTracker::ID = 0;
4410const char AAKernelInfo::ID = 0;
4411const char AAExecutionDomain::ID = 0;
4412const char AAHeapToShared::ID = 0;
4413const char AAFoldRuntimeCall::ID = 0;
4414
4415AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
4416 Attributor &A) {
4417 AAICVTracker *AA = nullptr;
4418 switch (IRP.getPositionKind()) {
4419 case IRPosition::IRP_INVALID:
4420 case IRPosition::IRP_FLOAT:
4421 case IRPosition::IRP_ARGUMENT:
4422 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4423 llvm_unreachable("ICVTracker can only be created for function position!")::llvm::llvm_unreachable_internal("ICVTracker can only be created for function position!"
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4423)
;
4424 case IRPosition::IRP_RETURNED:
4425 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
4426 break;
4427 case IRPosition::IRP_CALL_SITE_RETURNED:
4428 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
4429 break;
4430 case IRPosition::IRP_CALL_SITE:
4431 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
4432 break;
4433 case IRPosition::IRP_FUNCTION:
4434 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
4435 break;
4436 }
4437
4438 return *AA;
4439}
4440
4441AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
4442 Attributor &A) {
4443 AAExecutionDomainFunction *AA = nullptr;
4444 switch (IRP.getPositionKind()) {
4445 case IRPosition::IRP_INVALID:
4446 case IRPosition::IRP_FLOAT:
4447 case IRPosition::IRP_ARGUMENT:
4448 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4449 case IRPosition::IRP_RETURNED:
4450 case IRPosition::IRP_CALL_SITE_RETURNED:
4451 case IRPosition::IRP_CALL_SITE:
4452 llvm_unreachable(::llvm::llvm_unreachable_internal("AAExecutionDomain can only be created for function position!"
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4453)
4453 "AAExecutionDomain can only be created for function position!")::llvm::llvm_unreachable_internal("AAExecutionDomain can only be created for function position!"
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4453)
;
4454 case IRPosition::IRP_FUNCTION:
4455 AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
4456 break;
4457 }
4458
4459 return *AA;
4460}
4461
4462AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
4463 Attributor &A) {
4464 AAHeapToSharedFunction *AA = nullptr;
4465 switch (IRP.getPositionKind()) {
4466 case IRPosition::IRP_INVALID:
4467 case IRPosition::IRP_FLOAT:
4468 case IRPosition::IRP_ARGUMENT:
4469 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4470 case IRPosition::IRP_RETURNED:
4471 case IRPosition::IRP_CALL_SITE_RETURNED:
4472 case IRPosition::IRP_CALL_SITE:
4473 llvm_unreachable(::llvm::llvm_unreachable_internal("AAHeapToShared can only be created for function position!"
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4474)
4474 "AAHeapToShared can only be created for function position!")::llvm::llvm_unreachable_internal("AAHeapToShared can only be created for function position!"
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4474)
;
4475 case IRPosition::IRP_FUNCTION:
4476 AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
4477 break;
4478 }
4479
4480 return *AA;
4481}
4482
4483AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
4484 Attributor &A) {
4485 AAKernelInfo *AA = nullptr;
4486 switch (IRP.getPositionKind()) {
4487 case IRPosition::IRP_INVALID:
4488 case IRPosition::IRP_FLOAT:
4489 case IRPosition::IRP_ARGUMENT:
4490 case IRPosition::IRP_RETURNED:
4491 case IRPosition::IRP_CALL_SITE_RETURNED:
4492 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4493 llvm_unreachable("KernelInfo can only be created for function position!")::llvm::llvm_unreachable_internal("KernelInfo can only be created for function position!"
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4493)
;
4494 case IRPosition::IRP_CALL_SITE:
4495 AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
4496 break;
4497 case IRPosition::IRP_FUNCTION:
4498 AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
4499 break;
4500 }
4501
4502 return *AA;
4503}
4504
4505AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
4506 Attributor &A) {
4507 AAFoldRuntimeCall *AA = nullptr;
4508 switch (IRP.getPositionKind()) {
4509 case IRPosition::IRP_INVALID:
4510 case IRPosition::IRP_FLOAT:
4511 case IRPosition::IRP_ARGUMENT:
4512 case IRPosition::IRP_RETURNED:
4513 case IRPosition::IRP_FUNCTION:
4514 case IRPosition::IRP_CALL_SITE:
4515 case IRPosition::IRP_CALL_SITE_ARGUMENT:
4516 llvm_unreachable("KernelInfo can only be created for call site position!")::llvm::llvm_unreachable_internal("KernelInfo can only be created for call site position!"
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/lib/Transforms/IPO/OpenMPOpt.cpp"
, 4516)
;
4517 case IRPosition::IRP_CALL_SITE_RETURNED:
4518 AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
4519 break;
4520 }
4521
4522 return *AA;
4523}
4524
4525PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
4526 if (!containsOpenMP(M))
4527 return PreservedAnalyses::all();
4528 if (DisableOpenMPOptimizations)
4529 return PreservedAnalyses::all();
4530
4531 FunctionAnalysisManager &FAM =
4532 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4533 KernelSet Kernels = getDeviceKernels(M);
4534
4535 auto IsCalled = [&](Function &F) {
4536 if (Kernels.contains(&F))
4537 return true;
4538 for (const User *U : F.users())
4539 if (!isa<BlockAddress>(U))
4540 return true;
4541 return false;
4542 };
4543
4544 auto EmitRemark = [&](Function &F) {
4545 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
4546 ORE.emit([&]() {
4547 OptimizationRemarkAnalysis ORA(DEBUG_TYPE"openmp-opt", "OMP140", &F);
4548 return ORA << "Could not internalize function. "
4549 << "Some optimizations may not be possible. [OMP140]";
4550 });
4551 };
4552
4553 // Create internal copies of each function if this is a kernel Module. This
4554 // allows iterprocedural passes to see every call edge.
4555 DenseMap<Function *, Function *> InternalizedMap;
4556 if (isOpenMPDevice(M)) {
4557 SmallPtrSet<Function *, 16> InternalizeFns;
4558 for (Function &F : M)
4559 if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
4560 !DisableInternalization) {
4561 if (Attributor::isInternalizable(F)) {
4562 InternalizeFns.insert(&F);
4563 } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
4564 EmitRemark(F);
4565 }
4566 }
4567
4568 Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
4569 }
4570
4571 // Look at every function in the Module unless it was internalized.
4572 SmallVector<Function *, 16> SCC;
4573 for (Function &F : M)
4574 if (!F.isDeclaration() && !InternalizedMap.lookup(&F))
4575 SCC.push_back(&F);
4576
4577 if (SCC.empty())
4578 return PreservedAnalyses::all();
4579
4580 AnalysisGetter AG(FAM);
4581
4582 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
4583 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4584 };
4585
4586 BumpPtrAllocator Allocator;
4587 CallGraphUpdater CGUpdater;
4588
4589 SetVector<Function *> Functions(SCC.begin(), SCC.end());
4590 OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels);
4591
4592 unsigned MaxFixpointIterations =
4593 (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4594 Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false,
4595 MaxFixpointIterations, OREGetter, DEBUG_TYPE"openmp-opt");
4596
4597 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4598 bool Changed = OMPOpt.run(true);
4599
4600 // Optionally inline device functions for potentially better performance.
4601 if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))
4602 for (Function &F : M)
4603 if (!F.isDeclaration() && !Kernels.contains(&F) &&
4604 !F.hasFnAttribute(Attribute::NoInline))
4605 F.addFnAttr(Attribute::AlwaysInline);
4606
4607 if (PrintModuleAfterOptimizations)
4608 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n"
<< M; } } while (false)
;
4609
4610 if (Changed)
4611 return PreservedAnalyses::none();
4612
4613 return PreservedAnalyses::all();
4614}
4615
4616PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
4617 CGSCCAnalysisManager &AM,
4618 LazyCallGraph &CG,
4619 CGSCCUpdateResult &UR) {
4620 if (!containsOpenMP(*C.begin()->getFunction().getParent()))
4621 return PreservedAnalyses::all();
4622 if (DisableOpenMPOptimizations)
4623 return PreservedAnalyses::all();
4624
4625 SmallVector<Function *, 16> SCC;
4626 // If there are kernels in the module, we have to run on all SCC's.
4627 for (LazyCallGraph::Node &N : C) {
4628 Function *Fn = &N.getFunction();
4629 SCC.push_back(Fn);
4630 }
4631
4632 if (SCC.empty())
4633 return PreservedAnalyses::all();
4634
4635 Module &M = *C.begin()->getFunction().getParent();
4636
4637 KernelSet Kernels = getDeviceKernels(M);
4638
4639 FunctionAnalysisManager &FAM =
4640 AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
4641
4642 AnalysisGetter AG(FAM);
4643
4644 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
4645 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4646 };
4647
4648 BumpPtrAllocator Allocator;
4649 CallGraphUpdater CGUpdater;
4650 CGUpdater.initialize(CG, C, AM, UR);
4651
4652 SetVector<Function *> Functions(SCC.begin(), SCC.end());
4653 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
4654 /*CGSCC*/ Functions, Kernels);
4655
4656 unsigned MaxFixpointIterations =
4657 (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4658 Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
4659 MaxFixpointIterations, OREGetter, DEBUG_TYPE"openmp-opt");
4660
4661 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4662 bool Changed = OMPOpt.run(false);
4663
4664 if (PrintModuleAfterOptimizations)
4665 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n"
<< M; } } while (false)
;
4666
4667 if (Changed)
4668 return PreservedAnalyses::none();
4669
4670 return PreservedAnalyses::all();
4671}
4672
4673namespace {
4674
4675struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass {
4676 CallGraphUpdater CGUpdater;
4677 static char ID;
4678
4679 OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) {
4680 initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
4681 }
4682
4683 void getAnalysisUsage(AnalysisUsage &AU) const override {
4684 CallGraphSCCPass::getAnalysisUsage(AU);
4685 }
4686
4687 bool runOnSCC(CallGraphSCC &CGSCC) override {
4688 if (!containsOpenMP(CGSCC.getCallGraph().getModule()))
4689 return false;
4690 if (DisableOpenMPOptimizations || skipSCC(CGSCC))
4691 return false;
4692
4693 SmallVector<Function *, 16> SCC;
4694 // If there are kernels in the module, we have to run on all SCC's.
4695 for (CallGraphNode *CGN : CGSCC) {
4696 Function *Fn = CGN->getFunction();
4697 if (!Fn || Fn->isDeclaration())
4698 continue;
4699 SCC.push_back(Fn);
4700 }
4701
4702 if (SCC.empty())
4703 return false;
4704
4705 Module &M = CGSCC.getCallGraph().getModule();
4706 KernelSet Kernels = getDeviceKernels(M);
4707
4708 CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
4709 CGUpdater.initialize(CG, CGSCC);
4710
4711 // Maintain a map of functions to avoid rebuilding the ORE
4712 DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
4713 auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
4714 std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
4715 if (!ORE)
4716 ORE = std::make_unique<OptimizationRemarkEmitter>(F);
4717 return *ORE;
4718 };
4719
4720 AnalysisGetter AG;
4721 SetVector<Function *> Functions(SCC.begin(), SCC.end());
4722 BumpPtrAllocator Allocator;
4723 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG,
4724 Allocator,
4725 /*CGSCC*/ Functions, Kernels);
4726
4727 unsigned MaxFixpointIterations =
4728 (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4729 Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
4730 MaxFixpointIterations, OREGetter, DEBUG_TYPE"openmp-opt");
4731
4732 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4733 bool Result = OMPOpt.run(false);
4734
4735 if (PrintModuleAfterOptimizations)
4736 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("openmp-opt")) { dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n"
<< M; } } while (false)
;
4737
4738 return Result;
4739 }
4740
4741 bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
4742};
4743
4744} // end anonymous namespace
4745
4746KernelSet llvm::omp::getDeviceKernels(Module &M) {
4747 // TODO: Create a more cross-platform way of determining device kernels.
4748 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4749 KernelSet Kernels;
4750
4751 if (!MD)
4752 return Kernels;
4753
4754 for (auto *Op : MD->operands()) {
4755 if (Op->getNumOperands() < 2)
4756 continue;
4757 MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
4758 if (!KindID || KindID->getString() != "kernel")
4759 continue;
4760
4761 Function *KernelFn =
4762 mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
4763 if (!KernelFn)
4764 continue;
4765
4766 ++NumOpenMPTargetRegionKernels;
4767
4768 Kernels.insert(KernelFn);
4769 }
4770
4771 return Kernels;
4772}
4773
4774bool llvm::omp::containsOpenMP(Module &M) {
4775 Metadata *MD = M.getModuleFlag("openmp");
4776 if (!MD)
4777 return false;
4778
4779 return true;
4780}
4781
4782bool llvm::omp::isOpenMPDevice(Module &M) {
4783 Metadata *MD = M.getModuleFlag("openmp-device");
4784 if (!MD)
4785 return false;
4786
4787 return true;
4788}
4789
// Definition of the static ID member of the legacy CGSCC pass.
4790char OpenMPOptCGSCCLegacyPass::ID = 0;
4791
// Registration of the legacy pass under the name "openmp-opt-cgscc", with a
// declared dependency on CallGraphWrapperPass. The text following each macro
// invocation below is the macro expansion printed by the analyzer report.
4792INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",static void *initializeOpenMPOptCGSCCLegacyPassPassOnce(PassRegistry
&Registry) {
4793 "OpenMP specific optimizations", false, false)static void *initializeOpenMPOptCGSCCLegacyPassPassOnce(PassRegistry
&Registry) {
4794INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)initializeCallGraphWrapperPassPass(Registry);
4795INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",PassInfo *PI = new PassInfo( "OpenMP specific optimizations",
"openmp-opt-cgscc", &OpenMPOptCGSCCLegacyPass::ID, PassInfo
::NormalCtor_t(callDefaultCtor<OpenMPOptCGSCCLegacyPass>
), false, false); Registry.registerPass(*PI, true); return PI
; } static llvm::once_flag InitializeOpenMPOptCGSCCLegacyPassPassFlag
; void llvm::initializeOpenMPOptCGSCCLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeOpenMPOptCGSCCLegacyPassPassFlag
, initializeOpenMPOptCGSCCLegacyPassPassOnce, std::ref(Registry
)); }
4796 "OpenMP specific optimizations", false, false)PassInfo *PI = new PassInfo( "OpenMP specific optimizations",
"openmp-opt-cgscc", &OpenMPOptCGSCCLegacyPass::ID, PassInfo
::NormalCtor_t(callDefaultCtor<OpenMPOptCGSCCLegacyPass>
), false, false); Registry.registerPass(*PI, true); return PI
; } static llvm::once_flag InitializeOpenMPOptCGSCCLegacyPassPassFlag
; void llvm::initializeOpenMPOptCGSCCLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeOpenMPOptCGSCCLegacyPassPassFlag
, initializeOpenMPOptCGSCCLegacyPassPassOnce, std::ref(Registry
)); }
4797
4798Pass *llvm::createOpenMPOptCGSCCLegacyPass() {
4799 return new OpenMPOptCGSCCLegacyPass();
4800}

/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h

1//===- llvm/ADT/SmallPtrSet.h - 'Normally small' pointer set ----*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the SmallPtrSet class. See the doxygen comment for
10// SmallPtrSetImplBase for more details on the algorithm used.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_ADT_SMALLPTRSET_H
15#define LLVM_ADT_SMALLPTRSET_H
16
17#include "llvm/ADT/EpochTracker.h"
18#include "llvm/Support/Compiler.h"
19#include "llvm/Support/ReverseIteration.h"
20#include "llvm/Support/type_traits.h"
21#include <cassert>
22#include <cstddef>
23#include <cstdlib>
24#include <cstring>
25#include <initializer_list>
26#include <iterator>
27#include <utility>
28
29namespace llvm {
30
31/// SmallPtrSetImplBase - This is the common code shared among all the
32/// SmallPtrSet<>'s, which is almost everything. SmallPtrSet has two modes, one
33/// for small and one for large sets.
34///
35/// Small sets use an array of pointers allocated in the SmallPtrSet object,
36/// which is treated as a simple array of pointers. When a pointer is added to
37/// the set, the array is scanned to see if the element already exists, if not
38/// the element is 'pushed back' onto the array. If we run out of space in the
39/// array, we grow into the 'large set' case. SmallSet should be used when the
40/// sets are often small. In this case, no memory allocation is used, and only
41/// light-weight and cache-efficient scanning is used.
42///
43/// Large sets use a classic exponentially-probed hash table. Empty buckets are
44/// represented with an illegal pointer value (-1) to allow null pointers to be
45/// inserted. Tombstones are represented with another illegal pointer value
46/// (-2), to allow deletion. The hash table is resized when the table is 3/4
47/// full or more. When this happens, the table is doubled in size.
48///
49class SmallPtrSetImplBase : public DebugEpochBase {
50 friend class SmallPtrSetIteratorImpl;
51
52protected:
53 /// SmallArray - Points to a fixed size set of buckets, used in 'small mode'.
54 const void **SmallArray;
55 /// CurArray - This is the current set of buckets. If equal to SmallArray,
56 /// then the set is in 'small mode'.
57 const void **CurArray;
58 /// CurArraySize - The allocated size of CurArray, always a power of two.
59 unsigned CurArraySize;
60
61 /// Number of elements in CurArray that contain a value or are a tombstone.
62 /// If small, all these elements are at the beginning of CurArray and the rest
63 /// is uninitialized.
64 unsigned NumNonEmpty;
65 /// Number of tombstones in CurArray.
66 unsigned NumTombstones;
67
68 // Helpers to copy and move construct a SmallPtrSet.
69 SmallPtrSetImplBase(const void **SmallStorage,
70 const SmallPtrSetImplBase &that);
71 SmallPtrSetImplBase(const void **SmallStorage, unsigned SmallSize,
72 SmallPtrSetImplBase &&that);
73
74 explicit SmallPtrSetImplBase(const void **SmallStorage, unsigned SmallSize)
75 : SmallArray(SmallStorage), CurArray(SmallStorage),
76 CurArraySize(SmallSize), NumNonEmpty(0), NumTombstones(0) {
77 assert(SmallSize && (SmallSize & (SmallSize-1)) == 0 &&(static_cast <bool> (SmallSize && (SmallSize &
(SmallSize-1)) == 0 && "Initial size must be a power of two!"
) ? void (0) : __assert_fail ("SmallSize && (SmallSize & (SmallSize-1)) == 0 && \"Initial size must be a power of two!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 78, __extension__ __PRETTY_FUNCTION__))
78 "Initial size must be a power of two!")(static_cast <bool> (SmallSize && (SmallSize &
(SmallSize-1)) == 0 && "Initial size must be a power of two!"
) ? void (0) : __assert_fail ("SmallSize && (SmallSize & (SmallSize-1)) == 0 && \"Initial size must be a power of two!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 78, __extension__ __PRETTY_FUNCTION__))
;
79 }
80
81 ~SmallPtrSetImplBase() {
82 if (!isSmall())
83 free(CurArray);
84 }
85
86public:
87 using size_type = unsigned;
88
89 SmallPtrSetImplBase &operator=(const SmallPtrSetImplBase &) = delete;
90
91 LLVM_NODISCARD[[clang::warn_unused_result]] bool empty() const { return size() == 0; }
2
Assuming the condition is false
3
Returning zero, which participates in a condition later
92 size_type size() const { return NumNonEmpty - NumTombstones; }
93
94 void clear() {
95 incrementEpoch();
96 // If the capacity of the array is huge, and the # elements used is small,
97 // shrink the array.
98 if (!isSmall()) {
99 if (size() * 4 < CurArraySize && CurArraySize > 32)
100 return shrink_and_clear();
101 // Fill the array with empty markers.
102 memset(CurArray, -1, CurArraySize * sizeof(void *));
103 }
104
105 NumNonEmpty = 0;
106 NumTombstones = 0;
107 }
108
109protected:
110 static void *getTombstoneMarker() { return reinterpret_cast<void*>(-2); }
111
112 static void *getEmptyMarker() {
113 // Note that -1 is chosen to make clear() efficiently implementable with
114 // memset and because it's not a valid pointer value.
115 return reinterpret_cast<void*>(-1);
116 }
117
118 const void **EndPointer() const {
119 return isSmall() ? CurArray + NumNonEmpty : CurArray + CurArraySize;
120 }
121
122 /// insert_imp - This returns true if the pointer was new to the set, false if
123 /// it was already in the set. This is hidden from the client so that the
124 /// derived class can check that the right type of pointer is passed in.
125 std::pair<const void *const *, bool> insert_imp(const void *Ptr) {
126 if (isSmall()) {
127 // Check to see if it is already in the set.
128 const void **LastTombstone = nullptr;
129 for (const void **APtr = SmallArray, **E = SmallArray + NumNonEmpty;
130 APtr != E; ++APtr) {
131 const void *Value = *APtr;
132 if (Value == Ptr)
133 return std::make_pair(APtr, false);
134 if (Value == getTombstoneMarker())
135 LastTombstone = APtr;
136 }
137
138 // Did we find any tombstone marker?
139 if (LastTombstone != nullptr) {
140 *LastTombstone = Ptr;
141 --NumTombstones;
142 incrementEpoch();
143 return std::make_pair(LastTombstone, true);
144 }
145
146 // Nope, there isn't. If we stay small, just 'pushback' now.
147 if (NumNonEmpty < CurArraySize) {
148 SmallArray[NumNonEmpty++] = Ptr;
149 incrementEpoch();
150 return std::make_pair(SmallArray + (NumNonEmpty - 1), true);
151 }
152 // Otherwise, hit the big set case, which will call grow.
153 }
154 return insert_imp_big(Ptr);
155 }
156
157 /// erase_imp - If the set contains the specified pointer, remove it and
158 /// return true, otherwise return false. This is hidden from the client so
159 /// that the derived class can check that the right type of pointer is passed
160 /// in.
161 bool erase_imp(const void * Ptr) {
162 const void *const *P = find_imp(Ptr);
163 if (P == EndPointer())
164 return false;
165
166 const void **Loc = const_cast<const void **>(P);
167 assert(*Loc == Ptr && "broken find!")(static_cast <bool> (*Loc == Ptr && "broken find!"
) ? void (0) : __assert_fail ("*Loc == Ptr && \"broken find!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 167, __extension__ __PRETTY_FUNCTION__))
;
168 *Loc = getTombstoneMarker();
169 NumTombstones++;
170 return true;
171 }
172
173 /// Returns the raw pointer needed to construct an iterator. If element not
174 /// found, this will be EndPointer. Otherwise, it will be a pointer to the
175 /// slot which stores Ptr;
176 const void *const * find_imp(const void * Ptr) const {
177 if (isSmall()) {
178 // Linear search for the item.
179 for (const void *const *APtr = SmallArray,
180 *const *E = SmallArray + NumNonEmpty; APtr != E; ++APtr)
181 if (*APtr == Ptr)
182 return APtr;
183 return EndPointer();
184 }
185
186 // Big set case.
187 auto *Bucket = FindBucketFor(Ptr);
188 if (*Bucket == Ptr)
189 return Bucket;
190 return EndPointer();
191 }
192
193private:
194 bool isSmall() const { return CurArray == SmallArray; }
195
196 std::pair<const void *const *, bool> insert_imp_big(const void *Ptr);
197
198 const void * const *FindBucketFor(const void *Ptr) const;
199 void shrink_and_clear();
200
201 /// Grow - Allocate a larger backing store for the buckets and move it over.
202 void Grow(unsigned NewSize);
203
204protected:
205 /// swap - Swaps the elements of two sets.
206 /// Note: This method assumes that both sets have the same small size.
207 void swap(SmallPtrSetImplBase &RHS);
208
209 void CopyFrom(const SmallPtrSetImplBase &RHS);
210 void MoveFrom(unsigned SmallSize, SmallPtrSetImplBase &&RHS);
211
212private:
213 /// Code shared by MoveFrom() and move constructor.
214 void MoveHelper(unsigned SmallSize, SmallPtrSetImplBase &&RHS);
215 /// Code shared by CopyFrom() and copy constructor.
216 void CopyHelper(const SmallPtrSetImplBase &RHS);
217};
218
219/// SmallPtrSetIteratorImpl - This is the common base class shared between all
220/// instances of SmallPtrSetIterator.
221class SmallPtrSetIteratorImpl {
222protected:
223 const void *const *Bucket;
224 const void *const *End;
225
226public:
227 explicit SmallPtrSetIteratorImpl(const void *const *BP, const void*const *E)
228 : Bucket(BP), End(E) {
229 if (shouldReverseIterate()) {
230 RetreatIfNotValid();
231 return;
232 }
233 AdvanceIfNotValid();
234 }
235
236 bool operator==(const SmallPtrSetIteratorImpl &RHS) const {
237 return Bucket == RHS.Bucket;
238 }
239 bool operator!=(const SmallPtrSetIteratorImpl &RHS) const {
240 return Bucket != RHS.Bucket;
241 }
242
243protected:
244 /// AdvanceIfNotValid - If the current bucket isn't valid, advance to a bucket
245 /// that is. This is guaranteed to stop because the end() bucket is marked
246 /// valid.
247 void AdvanceIfNotValid() {
248 assert(Bucket <= End)(static_cast <bool> (Bucket <= End) ? void (0) : __assert_fail
("Bucket <= End", "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 248, __extension__ __PRETTY_FUNCTION__))
;
249 while (Bucket != End &&
250 (*Bucket == SmallPtrSetImplBase::getEmptyMarker() ||
251 *Bucket == SmallPtrSetImplBase::getTombstoneMarker()))
252 ++Bucket;
253 }
254 void RetreatIfNotValid() {
255 assert(Bucket >= End)(static_cast <bool> (Bucket >= End) ? void (0) : __assert_fail
("Bucket >= End", "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 255, __extension__ __PRETTY_FUNCTION__))
;
256 while (Bucket != End &&
257 (Bucket[-1] == SmallPtrSetImplBase::getEmptyMarker() ||
258 Bucket[-1] == SmallPtrSetImplBase::getTombstoneMarker())) {
259 --Bucket;
260 }
261 }
262};
263
264/// SmallPtrSetIterator - This implements a const_iterator for SmallPtrSet.
265template <typename PtrTy>
266class SmallPtrSetIterator : public SmallPtrSetIteratorImpl,
267 DebugEpochBase::HandleBase {
268 using PtrTraits = PointerLikeTypeTraits<PtrTy>;
269
270public:
271 using value_type = PtrTy;
272 using reference = PtrTy;
273 using pointer = PtrTy;
274 using difference_type = std::ptrdiff_t;
275 using iterator_category = std::forward_iterator_tag;
276
277 explicit SmallPtrSetIterator(const void *const *BP, const void *const *E,
278 const DebugEpochBase &Epoch)
279 : SmallPtrSetIteratorImpl(BP, E), DebugEpochBase::HandleBase(&Epoch) {}
280
281 // Most methods are provided by the base class.
282
283 const PtrTy operator*() const {
284 assert(isHandleInSync() && "invalid iterator access!")(static_cast <bool> (isHandleInSync() && "invalid iterator access!"
) ? void (0) : __assert_fail ("isHandleInSync() && \"invalid iterator access!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 284, __extension__ __PRETTY_FUNCTION__))
;
285 if (shouldReverseIterate()) {
286 assert(Bucket > End)(static_cast <bool> (Bucket > End) ? void (0) : __assert_fail
("Bucket > End", "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 286, __extension__ __PRETTY_FUNCTION__))
;
287 return PtrTraits::getFromVoidPointer(const_cast<void *>(Bucket[-1]));
288 }
289 assert(Bucket < End)(static_cast <bool> (Bucket < End) ? void (0) : __assert_fail
("Bucket < End", "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 289, __extension__ __PRETTY_FUNCTION__))
;
290 return PtrTraits::getFromVoidPointer(const_cast<void*>(*Bucket));
291 }
292
293 inline SmallPtrSetIterator& operator++() { // Preincrement
294 assert(isHandleInSync() && "invalid iterator access!")(static_cast <bool> (isHandleInSync() && "invalid iterator access!"
) ? void (0) : __assert_fail ("isHandleInSync() && \"invalid iterator access!\""
, "/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/ADT/SmallPtrSet.h"
, 294, __extension__ __PRETTY_FUNCTION__))
;
295 if (shouldReverseIterate()) {
296 --Bucket;
297 RetreatIfNotValid();
298 return *this;
299 }
300 ++Bucket;
301 AdvanceIfNotValid();
302 return *this;
303 }
304
305 SmallPtrSetIterator operator++(int) { // Postincrement
306 SmallPtrSetIterator tmp = *this;
307 ++*this;
308 return tmp;
309 }
310};
311
/// RoundUpToPowerOfTwo - This is a helper template that rounds N up to the next
/// power of two (which means N itself if N is already a power of two).
template<unsigned N>
struct RoundUpToPowerOfTwo;

/// RoundUpToPowerOfTwoH - If N is not a power of two, increase it.  This is a
/// helper template used to implement RoundUpToPowerOfTwo.
///
/// Primary template: selected when \p isPowerTwo is true, so N is the answer.
template<unsigned N, bool isPowerTwo>
struct RoundUpToPowerOfTwoH {
  enum { Val = N };
};

/// Specialization for values that are not yet a power of two: bump N and
/// recurse through RoundUpToPowerOfTwo.
template<unsigned N>
struct RoundUpToPowerOfTwoH<N, false> {
  enum {
    // We could just use NextVal = N+1, but this converges faster.  N|(N-1) sets
    // the right-most zero bits to one all at once, e.g. 0b0011000 -> 0b0011111.
    Val = RoundUpToPowerOfTwo<(N|(N-1)) + 1>::Val
  };
};

template<unsigned N>
struct RoundUpToPowerOfTwo {
  // (N & (N-1)) == 0 holds exactly when N has at most one bit set.
  enum { Val = RoundUpToPowerOfTwoH<N, (N&(N-1)) == 0>::Val };
};
336
337/// A templated base class for \c SmallPtrSet which provides the
338/// typesafe interface that is common across all small sizes.
339///
340/// This is particularly useful for passing around between interface boundaries
341/// to avoid encoding a particular small size in the interface boundary.
342template <typename PtrType>
343class SmallPtrSetImpl : public SmallPtrSetImplBase {
344 using ConstPtrType = typename add_const_past_pointer<PtrType>::type;
345 using PtrTraits = PointerLikeTypeTraits<PtrType>;
346 using ConstPtrTraits = PointerLikeTypeTraits<ConstPtrType>;
347
348protected:
349 // Forward constructors to the base.
350 using SmallPtrSetImplBase::SmallPtrSetImplBase;
351
352public:
353 using iterator = SmallPtrSetIterator<PtrType>;
354 using const_iterator = SmallPtrSetIterator<PtrType>;
355 using key_type = ConstPtrType;
356 using value_type = PtrType;
357
358 SmallPtrSetImpl(const SmallPtrSetImpl &) = delete;
359
360 /// Inserts Ptr if and only if there is no element in the container equal to
361 /// Ptr. The bool component of the returned pair is true if and only if the
362 /// insertion takes place, and the iterator component of the pair points to
363 /// the element equal to Ptr.
364 std::pair<iterator, bool> insert(PtrType Ptr) {
365 auto p = insert_imp(PtrTraits::getAsVoidPointer(Ptr));
366 return std::make_pair(makeIterator(p.first), p.second);
367 }
368
369 /// Insert the given pointer with an iterator hint that is ignored. This is
370 /// identical to calling insert(Ptr), but allows SmallPtrSet to be used by
371 /// std::insert_iterator and std::inserter().
372 iterator insert(iterator, PtrType Ptr) {
373 return insert(Ptr).first;
374 }
375
376 /// erase - If the set contains the specified pointer, remove it and return
377 /// true, otherwise return false.
378 bool erase(PtrType Ptr) {
379 return erase_imp(PtrTraits::getAsVoidPointer(Ptr));
380 }
381 /// count - Return 1 if the specified pointer is in the set, 0 otherwise.
382 size_type count(ConstPtrType Ptr) const {
383 return find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)) != EndPointer();
384 }
385 iterator find(ConstPtrType Ptr) const {
386 return makeIterator(find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)));
387 }
388 bool contains(ConstPtrType Ptr) const {
389 return find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)) != EndPointer();
390 }
391
392 template <typename IterT>
393 void insert(IterT I, IterT E) {
394 for (; I != E; ++I)
395 insert(*I);
396 }
397
398 void insert(std::initializer_list<PtrType> IL) {
399 insert(IL.begin(), IL.end());
400 }
401
402 iterator begin() const {
403 if (shouldReverseIterate())
404 return makeIterator(EndPointer() - 1);
405 return makeIterator(CurArray);
406 }
407 iterator end() const { return makeIterator(EndPointer()); }
408
409private:
410 /// Create an iterator that dereferences to same place as the given pointer.
411 iterator makeIterator(const void *const *P) const {
412 if (shouldReverseIterate())
413 return iterator(P == EndPointer() ? CurArray : P + 1, CurArray, *this);
414 return iterator(P, EndPointer(), *this);
415 }
416};
417
418/// Equality comparison for SmallPtrSet.
419///
420/// Iterates over elements of LHS confirming that each value from LHS is also in
421/// RHS, and that no additional values are in RHS.
422template <typename PtrType>
423bool operator==(const SmallPtrSetImpl<PtrType> &LHS,
424 const SmallPtrSetImpl<PtrType> &RHS) {
425 if (LHS.size() != RHS.size())
426 return false;
427
428 for (const auto *KV : LHS)
429 if (!RHS.count(KV))
430 return false;
431
432 return true;
433}
434
435/// Inequality comparison for SmallPtrSet.
436///
437/// Equivalent to !(LHS == RHS).
438template <typename PtrType>
439bool operator!=(const SmallPtrSetImpl<PtrType> &LHS,
440 const SmallPtrSetImpl<PtrType> &RHS) {
441 return !(LHS == RHS);
442}
443
444/// SmallPtrSet - This class implements a set which is optimized for holding
445/// SmallSize or less elements. This internally rounds up SmallSize to the next
446/// power of two if it is not already a power of two. See the comments above
447/// SmallPtrSetImplBase for details of the algorithm.
448template<class PtrType, unsigned SmallSize>
449class SmallPtrSet : public SmallPtrSetImpl<PtrType> {
450 // In small mode SmallPtrSet uses linear search for the elements, so it is
451 // not a good idea to choose this value too high. You may consider using a
452 // DenseSet<> instead if you expect many elements in the set.
453 static_assert(SmallSize <= 32, "SmallSize should be small");
454
455 using BaseT = SmallPtrSetImpl<PtrType>;
456
457 // Make sure that SmallSize is a power of two, round up if not.
458 enum { SmallSizePowTwo = RoundUpToPowerOfTwo<SmallSize>::Val };
459 /// SmallStorage - Fixed size storage used in 'small mode'.
460 const void *SmallStorage[SmallSizePowTwo];
461
462public:
463 SmallPtrSet() : BaseT(SmallStorage, SmallSizePowTwo) {}
464 SmallPtrSet(const SmallPtrSet &that) : BaseT(SmallStorage, that) {}
465 SmallPtrSet(SmallPtrSet &&that)
466 : BaseT(SmallStorage, SmallSizePowTwo, std::move(that)) {}
467
468 template<typename It>
469 SmallPtrSet(It I, It E) : BaseT(SmallStorage, SmallSizePowTwo) {
470 this->insert(I, E);
471 }
472
473 SmallPtrSet(std::initializer_list<PtrType> IL)
474 : BaseT(SmallStorage, SmallSizePowTwo) {
475 this->insert(IL.begin(), IL.end());
476 }
477
478 SmallPtrSet<PtrType, SmallSize> &
479 operator=(const SmallPtrSet<PtrType, SmallSize> &RHS) {
480 if (&RHS != this)
481 this->CopyFrom(RHS);
482 return *this;
483 }
484
485 SmallPtrSet<PtrType, SmallSize> &
486 operator=(SmallPtrSet<PtrType, SmallSize> &&RHS) {
487 if (&RHS != this)
488 this->MoveFrom(SmallSizePowTwo, std::move(RHS));
489 return *this;
490 }
491
492 SmallPtrSet<PtrType, SmallSize> &
493 operator=(std::initializer_list<PtrType> IL) {
494 this->clear();
495 this->insert(IL.begin(), IL.end());
496 return *this;
497 }
498
499 /// swap - Swaps the elements of two sets.
500 void swap(SmallPtrSet<PtrType, SmallSize> &RHS) {
501 SmallPtrSetImplBase::swap(RHS);
502 }
503};
504
505} // end namespace llvm
506
507namespace std {
508
509 /// Implement std::swap in terms of SmallPtrSet swap.
510 template<class T, unsigned N>
511 inline void swap(llvm::SmallPtrSet<T, N> &LHS, llvm::SmallPtrSet<T, N> &RHS) {
512 LHS.swap(RHS);
513 }
514
515} // end namespace std
516
517#endif // LLVM_ADT_SMALLPTRSET_H

/build/llvm-toolchain-snapshot-14~++20211016100712+8e1d532707fd/llvm/include/llvm/Transforms/IPO/Attributor.h

1//===- Attributor.h --- Module-wide attribute deduction ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Attributor: An inter procedural (abstract) "attribute" deduction framework.
10//
11// The Attributor framework is an inter procedural abstract analysis (fixpoint
12// iteration analysis). The goal is to allow easy deduction of new attributes as
13// well as information exchange between abstract attributes in-flight.
14//
15// The Attributor class is the driver and the link between the various abstract
16// attributes. The Attributor will iterate until a fixpoint state is reached by
17// all abstract attributes in-flight, or until it will enforce a pessimistic fix
18// point because an iteration limit is reached.
19//
20// Abstract attributes, derived from the AbstractAttribute class, actually
21// describe properties of the code. They can correspond to actual LLVM-IR
22// attributes, or they can be more general, ultimately unrelated to LLVM-IR
23// attributes. The latter is useful when an abstract attribute provides
24// information to other abstract attributes in-flight but we might not want to
25// manifest the information. The Attributor allows querying in-flight abstract
26// attributes through the `Attributor::getAAFor` method (see the method
27// description for an example). If the method is used by an abstract attribute
28// P, and it results in an abstract attribute Q, the Attributor will
29// automatically capture a potential dependence from Q to P. This dependence
30// will cause P to be reevaluated whenever Q changes in the future.
31//
32// The Attributor will only reevaluate abstract attributes that might have
33// changed since the last iteration. That means that the Attributor will not
34// revisit all instructions/blocks/functions in the module but only query
35// an update from a subset of the abstract attributes.
36//
37// The update method `AbstractAttribute::updateImpl` is implemented by the
38// specific "abstract attribute" subclasses. The method is invoked whenever the
39// currently assumed state (see the AbstractState class) might not be valid
40// anymore. This can, for example, happen if the state was dependent on another
41// abstract attribute that changed. In every invocation, the update method has
42// to adjust the internal state of an abstract attribute to a point that is
43// justifiable by the underlying IR and the current state of abstract attributes
44// in-flight. Since the IR is given and assumed to be valid, the information
45// derived from it can be assumed to hold. However, information derived from
46// other abstract attributes is conditional on various things. If the justifying
47// state changed, the `updateImpl` has to revisit the situation and potentially
48// find another justification or limit the optimistic assumptions made.
49//
50// Change is the key in this framework. Until a state of no-change, thus a
51// fixpoint, is reached, the Attributor will query the abstract attributes
52// in-flight to re-evaluate their state. If the (current) state is too
53// optimistic, hence it cannot be justified anymore through other abstract
54// attributes or the state of the IR, the state of the abstract attribute will
55// have to change. Generally, we assume abstract attribute state to be a finite
56// height lattice and the update function to be monotone. However, these
57// conditions are not enforced because the iteration limit will guarantee
58// termination. If an optimistic fixpoint is reached, or a pessimistic fix
59// point is enforced after a timeout, the abstract attributes are tasked to
60// manifest their result in the IR for passes to come.
61//
62// Attribute manifestation is not mandatory. If desired, there is support to
63// generate a single or multiple LLVM-IR attributes already in the helper struct
64// IRAttribute. In the simplest case, a subclass inherits from IRAttribute with
65// a proper Attribute::AttrKind as template parameter. The Attributor
66// manifestation framework will then create and place a new attribute if it is
67// allowed to do so (based on the abstract state). Other use cases can be
68// achieved by overloading AbstractAttribute or IRAttribute methods.
69//
70//
71// The "mechanics" of adding a new "abstract attribute":
72// - Define a class (transitively) inheriting from AbstractAttribute and one
73// (which could be the same) that (transitively) inherits from AbstractState.
74// For the latter, consider the already available BooleanState and
75// {Inc,Dec,Bit}IntegerState if they fit your needs, e.g., you require only a
76// number tracking or bit-encoding.
77// - Implement all pure methods. Also use overloading if the attribute is not
78// conforming with the "default" behavior: A (set of) LLVM-IR attribute(s) for
79// an argument, call site argument, function return value, or function. See
80// the class and method descriptions for more information on the two
81// "Abstract" classes and their respective methods.
82// - Register opportunities for the new abstract attribute in the
83// `Attributor::identifyDefaultAbstractAttributes` method if it should be
84// counted as a 'default' attribute.
85// - Add sufficient tests.
86// - Add a Statistics object for bookkeeping. If it is a simple (set of)
87// attribute(s) manifested through the Attributor manifestation framework, see
88// the bookkeeping function in Attributor.cpp.
89// - If instructions with a certain opcode are interesting to the attribute, add
90// that opcode to the switch in `Attributor::identifyAbstractAttributes`. This
91// will make it possible to query all those instructions through the
92// `InformationCache::getOpcodeInstMapForFunction` interface and eliminate the
93// need to traverse the IR repeatedly.
94//
95//===----------------------------------------------------------------------===//
96
97#ifndef LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H
98#define LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H
99
100#include "llvm/ADT/DenseSet.h"
101#include "llvm/ADT/GraphTraits.h"
102#include "llvm/ADT/MapVector.h"
103#include "llvm/ADT/STLExtras.h"
104#include "llvm/ADT/SetVector.h"
105#include "llvm/ADT/Triple.h"
106#include "llvm/ADT/iterator.h"
107#include "llvm/Analysis/AssumeBundleQueries.h"
108#include "llvm/Analysis/CFG.h"
109#include "llvm/Analysis/CGSCCPassManager.h"
110#include "llvm/Analysis/LazyCallGraph.h"
111#include "llvm/Analysis/LoopInfo.h"
112#include "llvm/Analysis/MustExecute.h"
113#include "llvm/Analysis/OptimizationRemarkEmitter.h"
114#include "llvm/Analysis/PostDominators.h"
115#include "llvm/Analysis/TargetLibraryInfo.h"
116#include "llvm/IR/AbstractCallSite.h"
117#include "llvm/IR/ConstantRange.h"
118#include "llvm/IR/PassManager.h"
119#include "llvm/Support/Allocator.h"
120#include "llvm/Support/Casting.h"
121#include "llvm/Support/GraphWriter.h"
122#include "llvm/Support/TimeProfiler.h"
123#include "llvm/Transforms/Utils/CallGraphUpdater.h"
124
125namespace llvm {
126
127struct AADepGraphNode;
128struct AADepGraph;
129struct Attributor;
130struct AbstractAttribute;
131struct InformationCache;
132struct AAIsDead;
133struct AttributorCallGraph;
134
135class AAManager;
136class AAResults;
137class Function;
138
139/// Abstract Attribute helper functions.
140namespace AA {
141
142/// Return true if \p V is dynamically unique, that is, there are no two
143/// "instances" of \p V at runtime with different values.
144bool isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA,
145 const Value &V);
146
147/// Return true if \p V is a valid value in \p Scope, that is a constant or an
148/// instruction/argument of \p Scope.
149bool isValidInScope(const Value &V, const Function *Scope);
150
151/// Return true if \p V is a valid value at position \p CtxI, that is a
152/// constant, an argument of the same function as \p CtxI, or an instruction in
153/// that function that dominates \p CtxI.
154bool isValidAtPosition(const Value &V, const Instruction &CtxI,
155 InformationCache &InfoCache);
156
157/// Try to convert \p V to type \p Ty without introducing new instructions. If
158/// this is not possible return `nullptr`. Note: this function basically knows
159/// how to cast various constants.
160Value *getWithType(Value &V, Type &Ty);
161
162/// Return the combination of \p A and \p B such that the result is a possible
163/// value of both. \p B is potentially casted to match the type \p Ty or the
164/// type of \p A if \p Ty is null.
165///
166/// Examples:
167/// X + none => X
168/// not_none + undef => not_none
169/// V1 + V2 => nullptr
170Optional<Value *>
171combineOptionalValuesInAAValueLatice(const Optional<Value *> &A,
172 const Optional<Value *> &B, Type *Ty);
173
174/// Return the initial value of \p Obj with type \p Ty if that is a constant.
175Constant *getInitialValueForObj(Value &Obj, Type &Ty);
176
177/// Collect all potential underlying objects of \p Ptr at position \p CtxI in
178/// \p Objects. Assumed information is used and dependences onto \p QueryingAA
179/// are added appropriately.
180///
181/// \returns True if \p Objects contains all assumed underlying objects, and
182/// false if something went wrong and the objects could not be
183/// determined.
184bool getAssumedUnderlyingObjects(Attributor &A, const Value &Ptr,
185 SmallVectorImpl<Value *> &Objects,
186 const AbstractAttribute &QueryingAA,
187 const Instruction *CtxI);
188
189/// Collect all potential values of the one stored by \p SI into
190/// \p PotentialCopies. That is, the only copies that were made via the
191/// store are assumed to be known and all in \p PotentialCopies. Dependences
192/// onto \p QueryingAA are properly tracked, \p UsedAssumedInformation will
193/// inform the caller if assumed information was used.
194///
195/// \returns True if the assumed potential copies are all in \p PotentialCopies,
196/// false if something went wrong and the copies could not be
197/// determined.
198bool getPotentialCopiesOfStoredValue(
199 Attributor &A, StoreInst &SI, SmallSetVector<Value *, 4> &PotentialCopies,
200 const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation);
201
202} // namespace AA
203
204/// The value passed to the command line option that defines the maximal initialization
205/// chain length.
206extern unsigned MaxInitializationChainLength;
207
///{
/// Result of an update step: whether anything was modified.
enum class ChangeStatus {
  CHANGED,   ///< Something was modified.
  UNCHANGED, ///< Nothing was modified.
};

// Combination operators for ChangeStatus. Only declared here; the definitions
// (and therefore the exact combination rules) live in another file.
ChangeStatus operator|(ChangeStatus l, ChangeStatus r);
ChangeStatus &operator|=(ChangeStatus &l, ChangeStatus r);
ChangeStatus operator&(ChangeStatus l, ChangeStatus r);
ChangeStatus &operator&=(ChangeStatus &l, ChangeStatus r);

/// Strength of a dependence between a source and a target.
enum class DepClassTy {
  REQUIRED, ///< The target cannot be valid if the source is not.
  OPTIONAL, ///< The target may be valid if the source is not.
  NONE,     ///< Do not track a dependence between source and target.
};
///}
225
226/// The data structure for the nodes of a dependency graph
227struct AADepGraphNode {
228public:
229 virtual ~AADepGraphNode(){};
230 using DepTy = PointerIntPair<AADepGraphNode *, 1>;
231
232protected:
233 /// Set of dependency graph nodes which should be updated if this one
234 /// is updated. The bit encodes if it is optional.
235 TinyPtrVector<DepTy> Deps;
236
237 static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); }
238 static AbstractAttribute *DepGetValAA(DepTy &DT) {
239 return cast<AbstractAttribute>(DT.getPointer());
240 }
241
242 operator AbstractAttribute *() { return cast<AbstractAttribute>(this); }
243
244public:
245 using iterator =
246 mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetVal)>;
247 using aaiterator =
248 mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetValAA)>;
249
250 aaiterator begin() { return aaiterator(Deps.begin(), &DepGetValAA); }
251 aaiterator end() { return aaiterator(Deps.end(), &DepGetValAA); }
252 iterator child_begin() { return iterator(Deps.begin(), &DepGetVal); }
253 iterator child_end() { return iterator(Deps.end(), &DepGetVal); }
254
255 virtual void print(raw_ostream &OS) const { OS << "AADepNode Impl\n"; }
256 TinyPtrVector<DepTy> &getDeps() { return Deps; }
257
258 friend struct Attributor;
259 friend struct AADepGraph;
260};
261
262/// The data structure for the dependency graph
263///
264/// Note that in this graph if there is an edge from A to B (A -> B),
265/// then it means that B depends on A, and when the state of A is
266/// updated, node B should also be updated
267struct AADepGraph {
268 AADepGraph() {}
269 ~AADepGraph() {}
270
271 using DepTy = AADepGraphNode::DepTy;
272 static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); }
273 using iterator =
274 mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetVal)>;
275
276 /// There is no root node for the dependency graph. But the SCCIterator
277 /// requires a single entry point, so we maintain a fake("synthetic") root
278 /// node that depends on every node.
279 AADepGraphNode SyntheticRoot;
280 AADepGraphNode *GetEntryNode() { return &SyntheticRoot; }
281
282 iterator begin() { return SyntheticRoot.child_begin(); }
283 iterator end() { return SyntheticRoot.child_end(); }
284
285 void viewGraph();
286
287 /// Dump graph to file
288 void dumpGraph();
289
290 /// Print dependency graph
291 void print();
292};
293
294/// Helper to describe and deal with positions in the LLVM-IR.
295///
296/// A position in the IR is described by an anchor value and an "offset" that
297/// could be the argument number, for call sites and arguments, or an indicator
298/// of the "position kind". The kinds, specified in the Kind enum below, include
299/// the locations in the attribute list, i.a., function scope and return value,
300/// as well as a distinction between call sites and functions. Finally, there
301/// are floating values that do not have a corresponding attribute list
302/// position.
303struct IRPosition {
304 // NOTE: In the future this definition can be changed to support recursive
305 // functions.
306 using CallBaseContext = CallBase;
307
308 /// The positions we distinguish in the IR.
309 enum Kind : char {
310 IRP_INVALID, ///< An invalid position.
311 IRP_FLOAT, ///< A position that is not associated with a spot suitable
312 ///< for attributes. This could be any value or instruction.
313 IRP_RETURNED, ///< An attribute for the function return value.
314 IRP_CALL_SITE_RETURNED, ///< An attribute for a call site return value.
315 IRP_FUNCTION, ///< An attribute for a function (scope).
316 IRP_CALL_SITE, ///< An attribute for a call site (function scope).
317 IRP_ARGUMENT, ///< An attribute for a function argument.
318 IRP_CALL_SITE_ARGUMENT, ///< An attribute for a call site argument.
319 };