1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well-known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
24
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
76#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <queue>
95#include <string>
96#include <tuple>
97#include <utility>
98#include <variant>
99#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "sroa"
104
105STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
106STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
107STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
108STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
109STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
110STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
111STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
112STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
113STATISTIC(NumLoadsPredicated,
114 "Number of loads rewritten into predicated loads to allow promotion");
115STATISTIC(
116 NumStoresPredicated,
117 "Number of stores rewritten into predicated stores to allow promotion");
118STATISTIC(NumDeleted, "Number of instructions deleted");
119STATISTIC(NumVectorized, "Number of vectorized aggregates");
120
121namespace llvm {
122/// Disable running mem2reg during SROA in order to test or debug SROA.
123static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
124 cl::Hidden);
126} // namespace llvm
127
128namespace {
129
130class AllocaSliceRewriter;
131class AllocaSlices;
132class Partition;
133
134class SelectHandSpeculativity {
135 unsigned char Storage = 0; // None are speculatable by default.
136 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
137 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
138public:
139 SelectHandSpeculativity() = default;
140 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
141 bool isSpeculatable(bool isTrueVal) const;
142 bool areAllSpeculatable() const;
143 bool areAnySpeculatable() const;
144 bool areNoneSpeculatable() const;
145 // For interop as int half of PointerIntPair.
146 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
147 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
148};
149static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
150
151using PossiblySpeculatableLoad =
152 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
153using UnspeculatableStore = StoreInst *;
154using RewriteableMemOp =
155 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
156using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
157
158/// An optimization pass providing Scalar Replacement of Aggregates.
159///
160/// This pass takes allocations which can be completely analyzed (that is, they
161/// don't escape) and tries to turn them into scalar SSA values. There are
162/// a few steps to this process.
163///
164/// 1) It takes allocations of aggregates and analyzes the ways in which they
165/// are used to try to split them into smaller allocations, ideally of
166/// a single scalar data type. It will split up memcpy and memset accesses
167/// as necessary and try to isolate individual scalar accesses.
168/// 2) It will transform accesses into forms which are suitable for SSA value
169/// promotion. This can be replacing a memset with a scalar store of an
170/// integer value, or it can involve speculating operations on a PHI or
171/// select to be a PHI or select of the results.
172/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
173/// onto insert and extract operations on a vector value, and convert them to
174/// this form. By doing so, it will enable promotion of vector aggregates to
175/// SSA vector values.
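///
/// As an illustrative sketch (not drawn from an actual test case), a
/// two-field struct alloca such as:
///   %agg = alloca { i32, i32 }
///   store i32 %a, ptr %agg
///   %f1 = getelementptr inbounds { i32, i32 }, ptr %agg, i32 0, i32 1
///   store i32 %b, ptr %f1
///   %v0 = load i32, ptr %agg
///   %v1 = load i32, ptr %f1
/// can be split into two independent i32 slices, after which the loads and
/// stores are rewritten against the new allocas and promoted to SSA values.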
176class SROA {
177 LLVMContext *const C;
178 DomTreeUpdater *const DTU;
179 AssumptionCache *const AC;
180 const bool PreserveCFG;
181
182 /// Worklist of alloca instructions to simplify.
183 ///
184 /// Each alloca in the function is added to this. Each new alloca formed gets
185 /// added to it as well to recursively simplify unless that alloca can be
186 /// directly promoted. Finally, each time we rewrite a use of an alloca other
187 /// than the one being actively rewritten, we add it back onto the list if not
188 /// already present to ensure it is re-visited.
189 SmallSetVector<AllocaInst *, 16> Worklist;
190
191 /// A collection of instructions to delete.
192 /// We try to batch deletions to simplify code and make things a bit more
193 /// efficient. We also make sure there are no dangling pointers.
194 SmallVector<WeakVH, 8> DeadInsts;
195
196 /// Post-promotion worklist.
197 ///
198 /// Sometimes we discover an alloca which has a high probability of becoming
199 /// viable for SROA after a round of promotion takes place. In those cases,
200 /// the alloca is enqueued here for re-processing.
201 ///
202 /// Note that we have to be very careful to clear allocas out of this list in
203 /// the event they are deleted.
204 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
205
206 /// A collection of alloca instructions we can directly promote.
207 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
208 SmallPtrSet<AllocaInst *, 16>, 16>
209 PromotableAllocas;
210
211 /// A worklist of PHIs to speculate prior to promoting allocas.
212 ///
213 /// All of these PHIs have been checked for the safety of speculation and by
214 /// being speculated will allow promoting allocas currently in the promotable
215 /// queue.
216 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
217
218 /// A worklist of select instructions to rewrite prior to promoting
219 /// allocas.
220 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
221
222 /// Select instructions that use an alloca and are subsequently loaded can be
223 /// rewritten to load both input pointers and then select between the result,
224 /// allowing the load of the alloca to be promoted.
225 /// From this:
226 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
227 /// %V = load <type>, ptr %P2
228 /// to:
229 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
230 /// %V2 = load <type>, ptr %Other
231 /// %V = select i1 %cond, <type> %V1, <type> %V2
232 ///
233 /// We can do this to a select if its only uses are loads
234 /// and if either operand of the select can be loaded unconditionally,
235 /// or if we are allowed to perform CFG modifications.
236 /// If an intervening bitcast with a single use sits between the select and
237 /// the load, the promotion is still allowed.
238 static std::optional<RewriteableMemOps>
239 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
240
241public:
242 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
243 SROAOptions PreserveCFG_)
244 : C(C), DTU(DTU), AC(AC),
245 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
246
247 /// Main run method used by both the SROAPass and by the legacy pass.
248 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
249
250private:
251 friend class AllocaSliceRewriter;
252
253 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
254 AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
255 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
256 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
257 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
258 void clobberUse(Use &U);
259 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
260 bool promoteAllocas();
261};
262
263} // end anonymous namespace
264
265/// Calculate the fragment of a variable to use when slicing a store
266/// based on the slice dimensions, existing fragment, and base storage
267/// fragment.
268/// Results:
269/// UseFrag - Use Target as the new fragment.
270/// UseNoFrag - The new slice already covers the whole variable.
271/// Skip - The new alloca slice doesn't include this variable.
272/// FIXME: Can we use calculateFragmentIntersect instead?
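/// Worked example (an illustrative sketch, not taken from a test): for a
/// 128-bit variable whose base storage fragment is (offset 64, size 64), a
/// new 32-bit slice at alloca offset 0 produces Target = (offset 64, size
/// 32); with no pre-existing fragment on the expression, the result is
/// UseFrag.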
273namespace {
274enum FragCalcResult { UseFrag, UseNoFrag, Skip };
275}
276static FragCalcResult
277calculateFragment(DILocalVariable *Variable,
278 uint64_t NewStorageSliceOffsetInBits,
279 uint64_t NewStorageSliceSizeInBits,
280 std::optional<DIExpression::FragmentInfo> StorageFragment,
281 std::optional<DIExpression::FragmentInfo> CurrentFragment,
282 DIExpression::FragmentInfo &Target) {
283 // If the base storage describes part of the variable apply the offset and
284 // the size constraint.
285 if (StorageFragment) {
286 Target.SizeInBits =
287 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
288 Target.OffsetInBits =
289 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
290 } else {
291 Target.SizeInBits = NewStorageSliceSizeInBits;
292 Target.OffsetInBits = NewStorageSliceOffsetInBits;
293 }
294
295 // If this slice extracts the entirety of an independent variable from a
296 // larger alloca, do not produce a fragment expression, as the variable is
297 // not fragmented.
298 if (!CurrentFragment) {
299 if (auto Size = Variable->getSizeInBits()) {
300 // Treat the current fragment as covering the whole variable.
301 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
302 if (Target == CurrentFragment)
303 return UseNoFrag;
304 }
305 }
306
307 // No additional work to do if there isn't a fragment already, or there is
308 // but it already exactly describes the new assignment.
309 if (!CurrentFragment || *CurrentFragment == Target)
310 return UseFrag;
311
312 // Reject the target fragment if it doesn't fit wholly within the current
313 // fragment. TODO: We could instead chop up the target to fit in the case of
314 // a partial overlap.
315 if (Target.startInBits() < CurrentFragment->startInBits() ||
316 Target.endInBits() > CurrentFragment->endInBits())
317 return Skip;
318
319 // Target fits within the current fragment, return it.
320 return UseFrag;
321}
322
323static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
324 return DebugVariable(DVR->getVariable(), std::nullopt,
325 DVR->getDebugLoc().getInlinedAt());
326}
327
328/// Find linked dbg.assign and generate a new one with the correct
329/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
330/// value component is copied from the old dbg.assign to the new.
331/// \param OldAlloca Alloca for the variable before splitting.
332/// \param IsSplit True if the store (not necessarily alloca)
333/// is being split.
334/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
335/// \param SliceSizeInBits New number of bits being written to.
336/// \param OldInst Instruction that is being split.
337/// \param Inst New instruction performing this part of the
338/// split store.
339/// \param Dest Store destination.
340/// \param Value Stored value.
341/// \param DL Datalayout.
342static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
343 uint64_t OldAllocaOffsetInBits,
344 uint64_t SliceSizeInBits, Instruction *OldInst,
345 Instruction *Inst, Value *Dest, Value *Value,
346 const DataLayout &DL) {
347 // If we want allocas to be migrated using this helper then we need to ensure
348 // that the BaseFragments map code still works. A simple solution would be
349 // to choose to always clone alloca dbg_assigns (rather than sometimes
350 // "stealing" them).
351 assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
352
353 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
354 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
355 if (DVRAssignMarkerRange.empty())
356 return;
357
358 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
359 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
360 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
361 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
362 << "\n");
363 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
364 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
365 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
366 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
367 if (Value)
368 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
369
370 /// Map of aggregate variables to their fragment associated with OldAlloca.
371 SmallDenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
372 BaseFragments;
373 for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
374 BaseFragments[getAggregateVariable(DVR)] =
375 DVR->getExpression()->getFragmentInfo();
376
377 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
378 // one). It shouldn't already have one: assert this assumption.
379 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
380 DIAssignID *NewID = nullptr;
381 auto &Ctx = Inst->getContext();
382 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
383 assert(OldAlloca->isStaticAlloca());
384
385 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
386 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
387 << "\n");
388 auto *Expr = DbgAssign->getExpression();
389 bool SetKillLocation = false;
390
391 if (IsSplit) {
392 std::optional<DIExpression::FragmentInfo> BaseFragment;
393 {
394 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
395 if (R == BaseFragments.end())
396 return;
397 BaseFragment = R->second;
398 }
399 std::optional<DIExpression::FragmentInfo> CurrentFragment =
400 Expr->getFragmentInfo();
401 DIExpression::FragmentInfo NewFragment;
402 FragCalcResult Result = calculateFragment(
403 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
404 BaseFragment, CurrentFragment, NewFragment);
405
406 if (Result == Skip)
407 return;
408 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
409 if (CurrentFragment) {
410 // Rewrite NewFragment to be relative to the existing one (this is
411 // what createFragmentExpression wants). CalculateFragment has
412 // already resolved the size for us. FIXME: Should it return the
413 // relative fragment too?
414 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
415 }
416 // Add the new fragment info to the existing expression if possible.
417 if (auto E = DIExpression::createFragmentExpression(
418 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
419 Expr = *E;
420 } else {
421 // Otherwise, add the new fragment info to an empty expression and
422 // discard the value component of this dbg.assign as the value cannot
423 // be computed with the new fragment.
424 Expr = *DIExpression::createFragmentExpression(
425 DIExpression::get(Expr->getContext(), {}),
426 NewFragment.OffsetInBits, NewFragment.SizeInBits);
427 SetKillLocation = true;
428 }
429 }
430 }
431
432 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
433 if (!NewID) {
434 NewID = DIAssignID::getDistinct(Ctx);
435 Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
436 }
437
438 DbgVariableRecord *NewAssign;
439 if (IsSplit) {
440 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
441 NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
442 DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
443 Dest, DIExpression::get(Expr->getContext(), {}),
444 DbgAssign->getDebugLoc())));
445 } else {
446 // The store is not split, simply steal the existing dbg_assign.
447 NewAssign = DbgAssign;
448 NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
449 NewAssign->setAddress(Dest);
450 if (Value)
451 NewAssign->replaceVariableLocationOp(0u, Value);
452 assert(Expr == NewAssign->getExpression());
453 }
454
455 // If we've updated the value but the original dbg.assign has an arglist
456 // then kill it now - we can't use the requested new value.
457 // We can't replace the DIArgList with the new value as it'd leave
458 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
459 // an arglist). And we can't keep the DIArgList in case the linked store
460 // is being split - in which case the DIArgList + expression may no longer
461 // be computing the correct value.
462 // This should be a very rare situation as it requires the value being
463 // stored to differ from the dbg.assign (i.e., the value has been
464 // represented differently in the debug intrinsic for some reason).
465 SetKillLocation |=
466 Value && (DbgAssign->hasArgList() ||
467 !DbgAssign->getExpression()->isSingleLocationExpression());
468 if (SetKillLocation)
469 NewAssign->setKillLocation();
470
471 // We could use more precision here at the cost of some additional (code)
472 // complexity - if the original dbg.assign was adjacent to its store, we
473 // could position this new dbg.assign adjacent to its store rather than the
474 // old dbg.assign. That would result in interleaved dbg.assigns rather than
475 // what we get now:
476 // split store !1
477 // split store !2
478 // dbg.assign !1
479 // dbg.assign !2
480 // This (current behaviour) results in debug assignments being
481 // noted as slightly offset (in code) from the store. In practice this
482 // should have little effect on the debugging experience due to the fact
483 // that all the split stores should get the same line number.
484 if (NewAssign != DbgAssign) {
485 NewAssign->moveBefore(DbgAssign->getIterator());
486 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
487 }
488 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
489 };
490
491 for_each(DVRAssignMarkerRange, MigrateDbgAssign);
492}
493
494namespace {
495
496/// A custom IRBuilder inserter which prefixes all names, but only in
497/// Assert builds.
498class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
499 std::string Prefix;
500
501 Twine getNameWithPrefix(const Twine &Name) const {
502 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
503 }
504
505public:
506 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
507
508 void InsertHelper(Instruction *I, const Twine &Name,
509 BasicBlock::iterator InsertPt) const override {
510 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
511 InsertPt);
512 }
513};
514
515/// Provide a type for IRBuilder that drops names in release builds.
516using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
517
518/// A used slice of an alloca.
519///
520/// This structure represents a slice of an alloca used by some instruction. It
521/// stores both the begin and end offsets of this use, a pointer to the use
522/// itself, and a flag indicating whether we can classify the use as splittable
523/// or not when forming partitions of the alloca.
524class Slice {
525 /// The beginning offset of the range.
526 uint64_t BeginOffset = 0;
527
528 /// The ending offset, not included in the range.
529 uint64_t EndOffset = 0;
530
531 /// Storage for both the use of this slice and whether it can be
532 /// split.
533 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
534
535public:
536 Slice() = default;
537
538 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
539 : BeginOffset(BeginOffset), EndOffset(EndOffset),
540 UseAndIsSplittable(U, IsSplittable) {}
541
542 uint64_t beginOffset() const { return BeginOffset; }
543 uint64_t endOffset() const { return EndOffset; }
544
545 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
546 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
547
548 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
549
550 bool isDead() const { return getUse() == nullptr; }
551 void kill() { UseAndIsSplittable.setPointer(nullptr); }
552
553 /// Support for ordering ranges.
554 ///
555 /// This provides an ordering over ranges such that start offsets are
556 /// always increasing, and within equal start offsets, the end offsets are
557 /// decreasing. Thus the spanning range comes first in a cluster with the
558 /// same start position.
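  ///
  /// For example (an illustrative sketch): with equal begin offsets, an
  /// unsplittable slice [0,8) orders before a splittable slice [0,16); among
  /// slices that are both splittable, [0,16) orders before [0,8).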
559 bool operator<(const Slice &RHS) const {
560 if (beginOffset() < RHS.beginOffset())
561 return true;
562 if (beginOffset() > RHS.beginOffset())
563 return false;
564 if (isSplittable() != RHS.isSplittable())
565 return !isSplittable();
566 if (endOffset() > RHS.endOffset())
567 return true;
568 return false;
569 }
570
571 /// Support comparison with a single offset to allow binary searches.
572 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
573 return LHS.beginOffset() < RHSOffset;
574 }
575 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
576 return LHSOffset < RHS.beginOffset();
577 }
578
579 bool operator==(const Slice &RHS) const {
580 return isSplittable() == RHS.isSplittable() &&
581 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
582 }
583 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
584};
585
586/// Representation of the alloca slices.
587///
588/// This class represents the slices of an alloca which are formed by its
589/// various uses. If a pointer escapes, we can't fully build a representation
590/// for the slices used and we reflect that in this structure. The uses are
591/// stored, sorted by increasing beginning offset and with unsplittable slices
592/// starting at a particular offset before splittable slices.
593class AllocaSlices {
594public:
595 /// Construct the slices of a particular alloca.
596 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
597
598 /// Test whether a pointer to the allocation escapes our analysis.
599 ///
600 /// If this is true, the slices are never fully built and should be
601 /// ignored.
602 bool isEscaped() const { return PointerEscapingInstr; }
603 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
604
605 /// Support for iterating over the slices.
606 /// @{
607 using iterator = SmallVectorImpl<Slice>::iterator;
608 using range = iterator_range<iterator>;
609
610 iterator begin() { return Slices.begin(); }
611 iterator end() { return Slices.end(); }
612
613 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
614 using const_range = iterator_range<const_iterator>;
615
616 const_iterator begin() const { return Slices.begin(); }
617 const_iterator end() const { return Slices.end(); }
618 /// @}
619
620 /// Erase a range of slices.
621 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
622
623 /// Insert new slices for this alloca.
624 ///
625 /// This moves the slices into the alloca's slices collection, and re-sorts
626 /// everything so that the usual ordering properties of the alloca's slices
627 /// hold.
628 void insert(ArrayRef<Slice> NewSlices) {
629 int OldSize = Slices.size();
630 Slices.append(NewSlices.begin(), NewSlices.end());
631 auto SliceI = Slices.begin() + OldSize;
632 std::stable_sort(SliceI, Slices.end());
633 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
634 }
635
636 // Forward declare the iterator and range accessor for walking the
637 // partitions.
638 class partition_iterator;
639 iterator_range<partition_iterator> partitions();
640
641 /// Access the dead users for this alloca.
642 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
643
644 /// Access Uses that should be dropped if the alloca is promotable.
645 ArrayRef<Use *> getDeadUsesIfPromotable() const {
646 return DeadUseIfPromotable;
647 }
648
649 /// Access the dead operands referring to this alloca.
650 ///
651 /// These are operands which cannot actually be used to refer to the
652 /// alloca as they are outside its range and the user doesn't correct for
653 /// that. These mostly consist of PHI node inputs and the like which we just
654 /// need to replace with poison.
655 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
656
657#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
658 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
659 void printSlice(raw_ostream &OS, const_iterator I,
660 StringRef Indent = " ") const;
661 void printUse(raw_ostream &OS, const_iterator I,
662 StringRef Indent = " ") const;
663 void print(raw_ostream &OS) const;
664 void dump(const_iterator I) const;
665 void dump() const;
666#endif
667
668private:
669 template <typename DerivedT, typename RetT = void> class BuilderBase;
670 class SliceBuilder;
671
672 friend class AllocaSlices::SliceBuilder;
673
674#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
675 /// Handle to alloca instruction to simplify method interfaces.
676 AllocaInst &AI;
677#endif
678
679 /// The instruction responsible for this alloca not having a known set
680 /// of slices.
681 ///
682 /// When an instruction (potentially) escapes the pointer to the alloca, we
683 /// store a pointer to that here and abort trying to form slices of the
684 /// alloca. This will be null if the alloca slices are analyzed successfully.
685 Instruction *PointerEscapingInstr;
686 Instruction *PointerEscapingInstrReadOnly;
687
688 /// The slices of the alloca.
689 ///
690 /// We store a vector of the slices formed by uses of the alloca here. This
691 /// vector is sorted by increasing begin offset, and then the unsplittable
692 /// slices before the splittable ones. See the Slice inner class for more
693 /// details.
694 SmallVector<Slice, 8> Slices;
695
696 /// Instructions which will become dead if we rewrite the alloca.
697 ///
698 /// Note that these are not separated by slice. This is because we expect an
699 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
700 /// all these instructions can simply be removed and replaced with poison as
701 /// they come from outside of the allocated space.
702 SmallVector<Instruction *, 8> DeadUsers;
703
704 /// Uses which will become dead if we can promote the alloca.
705 SmallVector<Use *, 8> DeadUseIfPromotable;
706
707 /// Operands which will become dead if we rewrite the alloca.
708 ///
709 /// These are operands that in their particular use can be replaced with
710 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
711 /// to PHI nodes and the like. They aren't entirely dead (there might be
712 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
713 /// want to swap this particular input for poison to simplify the use lists of
714 /// the alloca.
715 SmallVector<Use *, 8> DeadOperands;
716};
717
718/// A partition of the slices.
719///
720/// An ephemeral representation for a range of slices which can be viewed as
721/// a partition of the alloca. This range represents a span of the alloca's
722/// memory which cannot be split, and provides access to all of the slices
723/// overlapping some part of the partition.
724///
725/// Objects of this type are produced by traversing the alloca's slices, but
726/// are only ephemeral and not persistent.
727class Partition {
728private:
729 friend class AllocaSlices;
730 friend class AllocaSlices::partition_iterator;
731
732 using iterator = AllocaSlices::iterator;
733
734 /// The beginning and ending offsets of the alloca for this
735 /// partition.
736 uint64_t BeginOffset = 0, EndOffset = 0;
737
738 /// The start and end iterators of this partition.
739 iterator SI, SJ;
740
741 /// A collection of split slice tails overlapping the partition.
742 SmallVector<Slice *, 4> SplitTails;
743
744 /// Raw constructor builds an empty partition starting and ending at
745 /// the given iterator.
746 Partition(iterator SI) : SI(SI), SJ(SI) {}
747
748public:
749 /// The start offset of this partition.
750 ///
751 /// All of the contained slices start at or after this offset.
752 uint64_t beginOffset() const { return BeginOffset; }
753
754 /// The end offset of this partition.
755 ///
756 /// All of the contained slices end at or before this offset.
757 uint64_t endOffset() const { return EndOffset; }
758
759 /// The size of the partition.
760 ///
761 /// Note that this can never be zero.
762 uint64_t size() const {
763 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
764 return EndOffset - BeginOffset;
765 }
766
767 /// Test whether this partition contains no slices, and merely spans
768 /// a region occupied by split slices.
769 bool empty() const { return SI == SJ; }
770
771 /// \name Iterate slices that start within the partition.
772 /// These may be splittable or unsplittable. They have a begin offset >= the
773 /// partition begin offset.
774 /// @{
775 // FIXME: We should probably define a "concat_iterator" helper and use that
776 // to stitch together pointee_iterators over the split tails and the
777 // contiguous iterators of the partition. That would give a much nicer
778 // interface here. We could then additionally expose filtered iterators for
779 // split, unsplit, and unsplittable slices based on the usage patterns.
780 iterator begin() const { return SI; }
781 iterator end() const { return SJ; }
782 /// @}
783
784 /// Get the sequence of split slice tails.
785 ///
786 /// These tails are of slices which start before this partition but are
787 /// split and overlap into the partition. We accumulate these while forming
788 /// partitions.
789 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
790};
791
792} // end anonymous namespace
793
794/// An iterator over partitions of the alloca's slices.
795///
796/// This iterator implements the core algorithm for partitioning the alloca's
797/// slices. It is a forward iterator as we don't support backtracking for
798/// efficiency reasons, and re-use a single storage area to maintain the
799/// current set of split slices.
800///
801/// It is templated on the slice iterator type to use so that it can operate
802/// with either const or non-const slice iterators.
803class AllocaSlices::partition_iterator
804 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
805 Partition> {
806 friend class AllocaSlices;
807
808 /// Most of the state for walking the partitions is held in a class
809 /// with a nice interface for examining them.
810 Partition P;
811
812 /// We need to keep the end of the slices to know when to stop.
813 AllocaSlices::iterator SE;
814
815 /// We also need to keep track of the maximum split end offset seen.
816 /// FIXME: Do we really?
817 uint64_t MaxSplitSliceEndOffset = 0;
818
819 /// Sets the partition to be empty at given iterator, and sets the
820 /// end iterator.
821 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
822 : P(SI), SE(SE) {
823 // If not already at the end, advance our state to form the initial
824 // partition.
825 if (SI != SE)
826 advance();
827 }
828
829 /// Advance the iterator to the next partition.
830 ///
831 /// Requires that the iterator not be at the end of the slices.
832 void advance() {
833 assert((P.SI != SE || !P.SplitTails.empty()) &&
834 "Cannot advance past the end of the slices!");
835
836 // Clear out any split uses which have ended.
837 if (!P.SplitTails.empty()) {
838 if (P.EndOffset >= MaxSplitSliceEndOffset) {
839 // If we've finished all splits, this is easy.
840 P.SplitTails.clear();
841 MaxSplitSliceEndOffset = 0;
842 } else {
843 // Remove the uses which have ended in the prior partition. This
844 // cannot change the max split slice end because we just checked that
845 // the prior partition ended prior to that max.
846 llvm::erase_if(P.SplitTails,
847 [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
848 assert(llvm::any_of(P.SplitTails,
849 [&](Slice *S) {
850 return S->endOffset() == MaxSplitSliceEndOffset;
851 }) &&
852 "Could not find the current max split slice offset!");
853 assert(llvm::all_of(P.SplitTails,
854 [&](Slice *S) {
855 return S->endOffset() <= MaxSplitSliceEndOffset;
856 }) &&
857 "Max split slice end offset is not actually the max!");
858 }
859 }
860
861 // If P.SI is already at the end, then we've cleared the split tail and
862 // now have an end iterator.
863 if (P.SI == SE) {
864 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
865 return;
866 }
867
868 // If we had a non-empty partition previously, set up the state for
869 // subsequent partitions.
870 if (P.SI != P.SJ) {
871 // Accumulate all the splittable slices which started in the old
872 // partition into the split list.
873 for (Slice &S : P)
874 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
875 P.SplitTails.push_back(&S);
876 MaxSplitSliceEndOffset =
877 std::max(S.endOffset(), MaxSplitSliceEndOffset);
878 }
879
880 // Start from the end of the previous partition.
881 P.SI = P.SJ;
882
883 // If P.SI is now at the end, we at most have a tail of split slices.
884 if (P.SI == SE) {
885 P.BeginOffset = P.EndOffset;
886 P.EndOffset = MaxSplitSliceEndOffset;
887 return;
888 }
889
890 // If we have split slices and the next slice is after a gap and is
891 // not splittable, immediately form an empty partition for the split
892 // slices up until the next slice begins.
893 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
894 !P.SI->isSplittable()) {
895 P.BeginOffset = P.EndOffset;
896 P.EndOffset = P.SI->beginOffset();
897 return;
898 }
899 }
900
901 // OK, we need to consume new slices. Set the end offset based on the
902 // current slice, and step SJ past it. The beginning offset of the
903 // partition is the beginning offset of the next slice unless we have
904 // pre-existing split slices that are continuing, in which case we begin
905 // at the prior end offset.
906 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
907 P.EndOffset = P.SI->endOffset();
908 ++P.SJ;
909
910 // There are two strategies to form a partition based on whether the
911 // partition starts with an unsplittable slice or a splittable slice.
912 if (!P.SI->isSplittable()) {
913 // When we're forming an unsplittable region, it must always start at
914 // the first slice and will extend through its end.
915 assert(P.BeginOffset == P.SI->beginOffset());
916
917 // Form a partition including all of the overlapping slices with this
918 // unsplittable slice.
919 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
920 if (!P.SJ->isSplittable())
921 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
922 ++P.SJ;
923 }
924
925 // We have a partition across a set of overlapping unsplittable
926 // slices.
927 return;
928 }
929
930 // If we're starting with a splittable slice, then we need to form
931 // a synthetic partition spanning it and any other overlapping splittable
932 // slices.
933 assert(P.SI->isSplittable() && "Forming a splittable partition!");
934
935 // Collect all of the overlapping splittable slices.
936 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
937 P.SJ->isSplittable()) {
938 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
939 ++P.SJ;
940 }
941
942 // Back up P.EndOffset if we ended the span early when encountering an
943 // unsplittable slice. This synthesizes the early end offset of
944 // a partition spanning only splittable slices.
945 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
946 assert(!P.SJ->isSplittable());
947 P.EndOffset = P.SJ->beginOffset();
948 }
949 }
950
951public:
952 bool operator==(const partition_iterator &RHS) const {
953 assert(SE == RHS.SE &&
954 "End iterators don't match between compared partition iterators!");
955
956 // The observed positions of partitions are marked by the P.SI iterator and
957 // the emptiness of the split slices. The latter is only relevant when
958 // P.SI == SE, as the end iterator will additionally have an empty split
959 // slices list, but the prior may have the same P.SI and a tail of split
960 // slices.
961 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
962 assert(P.SJ == RHS.P.SJ &&
963 "Same set of slices formed two different sized partitions!");
964 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
965 "Same slice position with differently sized non-empty split "
966 "slice tails!");
967 return true;
968 }
969 return false;
970 }
971
972 partition_iterator &operator++() {
973 advance();
974 return *this;
975 }
976
977 Partition &operator*() { return P; }
978};
979
980/// A forward range over the partitions of the alloca's slices.
981///
982/// This accesses an iterator range over the partitions of the alloca's
983/// slices. It computes these partitions on the fly based on the overlapping
984/// offsets of the slices and the ability to split them. It will visit "empty"
985/// partitions to cover regions of the alloca only accessed via split
986/// slices.
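///
/// For example (an illustrative sketch): an alloca with a splittable slice
/// [0,24) and an unsplittable slice [8,16) yields the partitions [0,8) and
/// [8,16), followed by an "empty" partition [16,24) covered only by the tail
/// of the split slice.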
987iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
988 return make_range(partition_iterator(begin(), end()),
989 partition_iterator(end(), end()));
990}
991
992static Value *foldSelectInst(SelectInst &SI) {
993 // If the condition being selected on is a constant or the same value is
994 // being selected between, fold the select. Yes this does (rarely) happen
995 // early on.
996 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
997 return SI.getOperand(1 + CI->isZero());
998 if (SI.getOperand(1) == SI.getOperand(2))
999 return SI.getOperand(1);
1000
1001 return nullptr;
1002}
1003
1004/// A helper that folds a PHI node or a select.
1005static Value *foldPHINodeOrSelectInst(Instruction &I) {
1006 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
1007 // If PN merges together the same value, return that value.
1008 return PN->hasConstantValue();
1009 }
1010 return foldSelectInst(cast<SelectInst>(I));
1011}
1012
1013/// Builder for the alloca slices.
1014///
1015/// This class builds a set of alloca slices by recursively visiting the uses
1016/// of an alloca and making a slice for each load and store at each offset.
1017class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1018 friend class PtrUseVisitor<SliceBuilder>;
1019 friend class InstVisitor<SliceBuilder>;
1020
1021 using Base = PtrUseVisitor<SliceBuilder>;
1022
1023 const uint64_t AllocSize;
1024 AllocaSlices &AS;
1025
1026 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1027 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1028
1029 /// Set to de-duplicate dead instructions found in the use walk.
1030 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1031
1032public:
1033 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1034 : PtrUseVisitor<SliceBuilder>(DL),
1035 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
1036 AS(AS) {}
1037
1038private:
1039 void markAsDead(Instruction &I) {
1040 if (VisitedDeadInsts.insert(&I).second)
1041 AS.DeadUsers.push_back(&I);
1042 }
1043
1044 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1045 bool IsSplittable = false) {
1046 // Completely skip uses which have a zero size or start either before or
1047 // past the end of the allocation.
1048 if (Size == 0 || Offset.uge(AllocSize)) {
1049 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1050 << Offset
1051 << " which has zero size or starts outside of the "
1052 << AllocSize << " byte alloca:\n"
1053 << " alloca: " << AS.AI << "\n"
1054 << " use: " << I << "\n");
1055 return markAsDead(I);
1056 }
1057
1058 uint64_t BeginOffset = Offset.getZExtValue();
1059 uint64_t EndOffset = BeginOffset + Size;
1060
1061 // Clamp the end offset to the end of the allocation. Note that this is
1062 // formulated to handle even the case where "BeginOffset + Size" overflows.
1063 // This may appear superficially to be something we could ignore entirely,
1064 // but that is not so! There may be widened loads or PHI-node uses where
1065 // some instructions are dead but not others. We can't completely ignore
1066 // them, and so have to record at least the information here.
1067 assert(AllocSize >= BeginOffset); // Established above.
1068 if (Size > AllocSize - BeginOffset) {
1069 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1070 << Offset << " to remain within the " << AllocSize
1071 << " byte alloca:\n"
1072 << " alloca: " << AS.AI << "\n"
1073 << " use: " << I << "\n");
1074 EndOffset = AllocSize;
1075 }
1076
1077 AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
1078 }
1079
1080 void visitBitCastInst(BitCastInst &BC) {
1081 if (BC.use_empty())
1082 return markAsDead(BC);
1083
1084 return Base::visitBitCastInst(BC);
1085 }
1086
1087 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1088 if (ASC.use_empty())
1089 return markAsDead(ASC);
1090
1091 return Base::visitAddrSpaceCastInst(ASC);
1092 }
1093
1094 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1095 if (GEPI.use_empty())
1096 return markAsDead(GEPI);
1097
1098 return Base::visitGetElementPtrInst(GEPI);
1099 }
1100
1101 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1102 uint64_t Size, bool IsVolatile) {
1103 // We allow splitting of non-volatile loads and stores where the type is an
1104 // integer type. These may be used to implement 'memcpy' or other "transfer
1105 // of bits" patterns.
1106 bool IsSplittable =
1107 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1108
1109 insertUse(I, Offset, Size, IsSplittable);
1110 }
1111
1112 void visitLoadInst(LoadInst &LI) {
1113 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1114 "All simple FCA loads should have been pre-split");
1115
1116 // If there is a load with an unknown offset, we can still perform store
1117 // to load forwarding for other known-offset loads.
1118 if (!IsOffsetKnown)
1119 return PI.setEscapedReadOnly(&LI);
1120
1121 TypeSize Size = DL.getTypeStoreSize(LI.getType());
1122 if (Size.isScalable()) {
1123 unsigned VScale = LI.getFunction()->getVScaleValue();
1124 if (!VScale)
1125 return PI.setAborted(&LI);
1126
1127 Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
1128 }
1129
1130 return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1131 LI.isVolatile());
1132 }
1133
1134 void visitStoreInst(StoreInst &SI) {
1135 Value *ValOp = SI.getValueOperand();
1136 if (ValOp == *U)
1137 return PI.setEscapedAndAborted(&SI);
1138 if (!IsOffsetKnown)
1139 return PI.setAborted(&SI);
1140
1141 TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1142 if (StoreSize.isScalable()) {
1143 unsigned VScale = SI.getFunction()->getVScaleValue();
1144 if (!VScale)
1145 return PI.setAborted(&SI);
1146
1147 StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
1148 }
1149
1150 uint64_t Size = StoreSize.getFixedValue();
1151
1152 // If this memory access can be shown to *statically* extend outside the
1153 // bounds of the allocation, its behavior is undefined, so simply
1154 // ignore it. Note that this is more strict than the generic clamping
1155 // behavior of insertUse. We also try to handle cases which might run the
1156 // risk of overflow.
1157 // FIXME: We should instead consider the pointer to have escaped if this
1158 // function is being instrumented for addressing bugs or race conditions.
1159 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1160 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1161 << Offset << " which extends past the end of the "
1162 << AllocSize << " byte alloca:\n"
1163 << " alloca: " << AS.AI << "\n"
1164 << " use: " << SI << "\n");
1165 return markAsDead(SI);
1166 }
1167
1168 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1169 "All simple FCA stores should have been pre-split");
1170 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1171 }
1172
1173 void visitMemSetInst(MemSetInst &II) {
1174 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1175 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1176 if ((Length && Length->getValue() == 0) ||
1177 (IsOffsetKnown && Offset.uge(AllocSize)))
1178 // Zero-length mem transfer intrinsics can be ignored entirely.
1179 return markAsDead(II);
1180
1181 if (!IsOffsetKnown)
1182 return PI.setAborted(&II);
1183
1184 insertUse(II, Offset,
1185 Length ? Length->getLimitedValue()
1186 : AllocSize - Offset.getLimitedValue(),
1187 (bool)Length);
1188 }
1189
1190 void visitMemTransferInst(MemTransferInst &II) {
1191 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1192 if (Length && Length->getValue() == 0)
1193 // Zero-length mem transfer intrinsics can be ignored entirely.
1194 return markAsDead(II);
1195
1196 // Because we can visit these intrinsics twice, also check whether the
1197 // first visit marked this instruction as dead. If so, skip it.
1198 if (VisitedDeadInsts.count(&II))
1199 return;
1200
1201 if (!IsOffsetKnown)
1202 return PI.setAborted(&II);
1203
1204 // This side of the transfer is completely out-of-bounds, and so we can
1205 // nuke the entire transfer. However, we also need to nuke the other side
1206 // if already added to our partitions.
1207 // FIXME: Yet another place we really should bypass this when
1208 // instrumenting for ASan.
1209 if (Offset.uge(AllocSize)) {
1210 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1211 MemTransferSliceMap.find(&II);
1212 if (MTPI != MemTransferSliceMap.end())
1213 AS.Slices[MTPI->second].kill();
1214 return markAsDead(II);
1215 }
1216
1217 uint64_t RawOffset = Offset.getLimitedValue();
1218 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1219
1220 // Check for the special case where the same exact value is used for both
1221 // source and dest.
1222 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1223 // For non-volatile transfers this is a no-op.
1224 if (!II.isVolatile())
1225 return markAsDead(II);
1226
1227 return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1228 }
1229
1230 // If we have seen both source and destination for a mem transfer, then
1231 // they both point to the same alloca.
1232 bool Inserted;
1233 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1234 std::tie(MTPI, Inserted) =
1235 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1236 unsigned PrevIdx = MTPI->second;
1237 if (!Inserted) {
1238 Slice &PrevP = AS.Slices[PrevIdx];
1239
1240 // Check if the begin offsets match and this is a non-volatile transfer.
1241 // In that case, we can completely elide the transfer.
1242 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1243 PrevP.kill();
1244 return markAsDead(II);
1245 }
1246
1247 // Otherwise we have an offset transfer within the same alloca. We can't
1248 // split those.
1249 PrevP.makeUnsplittable();
1250 }
1251
1252 // Insert the use now that we've fixed up the splittable nature.
1253 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1254
1255 // Check that we ended up with a valid index in the map.
1256 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1257 "Map index doesn't point back to a slice with this user.");
1258 }
1259
1260 // Disable SROA for any intrinsics except for lifetime markers.
1261 // FIXME: What about debug intrinsics? This matches old behavior, but
1262 // doesn't make sense.
1263 void visitIntrinsicInst(IntrinsicInst &II) {
1264 if (II.isDroppable()) {
1265 AS.DeadUseIfPromotable.push_back(U);
1266 return;
1267 }
1268
1269 if (!IsOffsetKnown)
1270 return PI.setAborted(&II);
1271
1272 if (II.isLifetimeStartOrEnd()) {
1273 insertUse(II, Offset, AllocSize, true);
1274 return;
1275 }
1276
1277 Base::visitIntrinsicInst(II);
1278 }
1279
1280 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1281 // We consider any PHI or select that results in a direct load or store of
1282 // the same offset to be a viable use for slicing purposes. These uses
1283 // are considered unsplittable and the size is the maximum loaded or stored
1284 // size.
1285 SmallPtrSet<Instruction *, 4> Visited;
1286 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1287 Visited.insert(Root);
1288 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1289 const DataLayout &DL = Root->getDataLayout();
1290 // If there are no loads or stores, the access is dead. We mark that as
1291 // a size zero access.
1292 Size = 0;
1293 do {
1294 Instruction *I, *UsedI;
1295 std::tie(UsedI, I) = Uses.pop_back_val();
1296
1297 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1298 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1299 if (LoadSize.isScalable()) {
1300 PI.setAborted(LI);
1301 return nullptr;
1302 }
1303 Size = std::max(Size, LoadSize.getFixedValue());
1304 continue;
1305 }
1306 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1307 Value *Op = SI->getOperand(0);
1308 if (Op == UsedI)
1309 return SI;
1310 TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1311 if (StoreSize.isScalable()) {
1312 PI.setAborted(SI);
1313 return nullptr;
1314 }
1315 Size = std::max(Size, StoreSize.getFixedValue());
1316 continue;
1317 }
1318
1319 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1320 if (!GEP->hasAllZeroIndices())
1321 return GEP;
1322 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1323 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1324 return I;
1325 }
1326
1327 for (User *U : I->users())
1328 if (Visited.insert(cast<Instruction>(U)).second)
1329 Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1330 } while (!Uses.empty());
1331
1332 return nullptr;
1333 }
1334
1335 void visitPHINodeOrSelectInst(Instruction &I) {
1336 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1337 if (I.use_empty())
1338 return markAsDead(I);
1339
1340 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1341 // instructions in this BB, which may be required during rewriting. Bail out
1342 // on these cases.
1343 if (isa<PHINode>(I) &&
1344 I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1345 return PI.setAborted(&I);
1346
1347 // TODO: We could use simplifyInstruction here to fold PHINodes and
1348 // SelectInsts. However, doing so requires to change the current
1349 // dead-operand-tracking mechanism. For instance, suppose neither loading
1350 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1351 // trap either. However, if we simply replace %U with undef using the
1352 // current dead-operand-tracking mechanism, "load (select undef, undef,
1353 // %other)" may trap because the select may return the first operand
1354 // "undef".
1355 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1356 if (Result == *U)
1357 // If the result of the constant fold will be the pointer, recurse
1358 // through the PHI/select as if we had RAUW'ed it.
1359 enqueueUsers(I);
1360 else
1361 // Otherwise the operand to the PHI/select is dead, and we can replace
1362 // it with poison.
1363 AS.DeadOperands.push_back(U);
1364
1365 return;
1366 }
1367
1368 if (!IsOffsetKnown)
1369 return PI.setAborted(&I);
1370
1371 // See if we already have computed info on this node.
1372 uint64_t &Size = PHIOrSelectSizes[&I];
1373 if (!Size) {
1374 // This is a new PHI/Select, check for an unsafe use of it.
1375 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1376 return PI.setAborted(UnsafeI);
1377 }
1378
1379 // For PHI and select operands outside the alloca, we can't nuke the entire
1380 // phi or select -- the other side might still be relevant, so we special
1381 // case them here and use a separate structure to track the operands
1382 // themselves which should be replaced with poison.
1383 // FIXME: This should instead be escaped in the event we're instrumenting
1384 // for address sanitization.
1385 if (Offset.uge(AllocSize)) {
1386 AS.DeadOperands.push_back(U);
1387 return;
1388 }
1389
1390 insertUse(I, Offset, Size);
1391 }
1392
1393 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1394
1395 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1396
1397 /// Disable SROA entirely if there are unhandled users of the alloca.
1398 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1399
1400 void visitCallBase(CallBase &CB) {
1401 // If the call operand is read-only and only does a read-only or address
1402 // capture, then we mark it as EscapedReadOnly.
1403 if (CB.isDataOperand(U) &&
1404 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1405 CB.onlyReadsMemory(U->getOperandNo())) {
1406 PI.setEscapedReadOnly(&CB);
1407 return;
1408 }
1409
1410 Base::visitCallBase(CB);
1411 }
1412};
1413
1414AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1415 :
1416#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1417 AI(AI),
1418#endif
1419 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1420 SliceBuilder PB(DL, AI, *this);
1421 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1422 if (PtrI.isEscaped() || PtrI.isAborted()) {
1423 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1424 // possibly by just storing the PtrInfo in the AllocaSlices.
1425 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1426 : PtrI.getAbortingInst();
1427 assert(PointerEscapingInstr && "Did not track a bad instruction");
1428 return;
1429 }
1430 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1431
1432 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1433
1434 // Sort the uses. This arranges for the offsets to be in ascending order,
1435 // and the sizes to be in descending order.
1436 llvm::stable_sort(Slices);
1437}
1438
1439#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1440
1441void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1442 StringRef Indent) const {
1443 printSlice(OS, I, Indent);
1444 OS << "\n";
1445 printUse(OS, I, Indent);
1446}
1447
1448void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1449 StringRef Indent) const {
1450 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1451 << " slice #" << (I - begin())
1452 << (I->isSplittable() ? " (splittable)" : "");
1453}
1454
1455void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1456 StringRef Indent) const {
1457 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1458}
1459
1460void AllocaSlices::print(raw_ostream &OS) const {
1461 if (PointerEscapingInstr) {
1462 OS << "Can't analyze slices for alloca: " << AI << "\n"
1463 << " A pointer to this alloca escaped by:\n"
1464 << " " << *PointerEscapingInstr << "\n";
1465 return;
1466 }
1467
1468 if (PointerEscapingInstrReadOnly)
1469 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1470
1471 OS << "Slices of alloca: " << AI << "\n";
1472 for (const_iterator I = begin(), E = end(); I != E; ++I)
1473 print(OS, I);
1474}
1475
1476LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1477 print(dbgs(), I);
1478}
1479LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1480
1481#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1482
1483/// Walk the range of a partitioning looking for a common type to cover this
1484/// sequence of slices.
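/// For example (an illustrative sketch): if one slice loads the partition as
/// i64 and another loads it as double, there is no single common type, but
/// i64 is still returned as a candidate integer type for rewriting the whole
/// partition.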
1485static std::pair<Type *, IntegerType *>
1486findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1487 uint64_t EndOffset) {
1488 Type *Ty = nullptr;
1489 bool TyIsCommon = true;
1490 IntegerType *ITy = nullptr;
1491
1492 // Note that we need to look at *every* alloca slice's Use to ensure we
1493 // always get consistent results regardless of the order of slices.
1494 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1495 Use *U = I->getUse();
1496 if (isa<IntrinsicInst>(*U->getUser()))
1497 continue;
1498 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1499 continue;
1500
1501 Type *UserTy = nullptr;
1502 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1503 UserTy = LI->getType();
1504 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1505 UserTy = SI->getValueOperand()->getType();
1506 }
1507
1508 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1509 // If the type is larger than the partition, skip it. We only encounter
1510 // this for split integer operations where we want to use the type of the
1511 // entity causing the split. Also skip if the type is not a byte width
1512 // multiple.
1513 if (UserITy->getBitWidth() % 8 != 0 ||
1514 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1515 continue;
1516
1517 // Track the largest bitwidth integer type used in this way in case there
1518 // is no common type.
1519 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1520 ITy = UserITy;
1521 }
1522
1523 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1524 // depend on types skipped above.
1525 if (!UserTy || (Ty && Ty != UserTy))
1526 TyIsCommon = false; // Give up on anything but an iN type.
1527 else
1528 Ty = UserTy;
1529 }
1530
1531 return {TyIsCommon ? Ty : nullptr, ITy};
1532}
1533
1534/// PHI instructions that use an alloca and are subsequently loaded can be
1535/// rewritten to load both input pointers in the pred blocks and then PHI the
1536/// results, allowing the load of the alloca to be promoted.
1537/// From this:
1538/// %P2 = phi [i32* %Alloca, i32* %Other]
1539/// %V = load i32* %P2
1540/// to:
1541/// %V1 = load i32* %Alloca -> will be mem2reg'd
1542/// ...
1543/// %V2 = load i32* %Other
1544/// ...
1545/// %V = phi [i32 %V1, i32 %V2]
1546///
1547/// We can do this to a select if its only uses are loads and if the operands
1548/// to the select can be loaded unconditionally.
1549///
1550/// FIXME: This should be hoisted into a generic utility, likely in
1551/// Transforms/Util/Local.h
1552 static bool isSafePHIToSpeculate(PHINode &PN) {
1553 const DataLayout &DL = PN.getDataLayout();
1554
1555 // For now, we can only do this promotion if the load is in the same block
1556 // as the PHI, and if there are no stores between the phi and load.
1557 // TODO: Allow recursive phi users.
1558 // TODO: Allow stores.
1559 BasicBlock *BB = PN.getParent();
1560 Align MaxAlign;
1561 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1562 Type *LoadType = nullptr;
1563 for (User *U : PN.users()) {
1564 LoadInst *LI = dyn_cast<LoadInst>(U);
1565 if (!LI || !LI->isSimple())
1566 return false;
1567
1568 // For now we only allow loads in the same block as the PHI. This is
1569 // a common case that happens when instcombine merges two loads through
1570 // a PHI.
1571 if (LI->getParent() != BB)
1572 return false;
1573
1574 if (LoadType) {
1575 if (LoadType != LI->getType())
1576 return false;
1577 } else {
1578 LoadType = LI->getType();
1579 }
1580
1581 // Ensure that there are no instructions between the PHI and the load that
1582 // could store.
1583 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1584 if (BBI->mayWriteToMemory())
1585 return false;
1586
1587 MaxAlign = std::max(MaxAlign, LI->getAlign());
1588 }
1589
1590 if (!LoadType)
1591 return false;
1592
1593 APInt LoadSize =
1594 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1595
1596 // We can only transform this if it is safe to push the loads into the
1597 // predecessor blocks. The only thing to watch out for is that we can't put
1598 // a possibly trapping load in the predecessor if it is a critical edge.
1599 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1600 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1601 Value *InVal = PN.getIncomingValue(Idx);
1602
1603 // If the value is produced by the terminator of the predecessor (an
1604 // invoke) or it has side-effects, there is no valid place to put a load
1605 // in the predecessor.
1606 if (TI == InVal || TI->mayHaveSideEffects())
1607 return false;
1608
1609 // If the predecessor has a single successor, then the edge isn't
1610 // critical.
1611 if (TI->getNumSuccessors() == 1)
1612 continue;
1613
1614 // If this pointer is always safe to load, or if we can prove that there
1615 // is already a load in the block, then we can move the load to the pred
1616 // block.
1617 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1618 continue;
1619
1620 return false;
1621 }
1622
1623 return true;
1624}
1625
1626static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1627 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1628
1629 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1630 Type *LoadTy = SomeLoad->getType();
1631 IRB.SetInsertPoint(&PN);
1632 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1633 PN.getName() + ".sroa.speculated");
1634
1635 // Get the AA tags and alignment to use from one of the loads. It does not
1636 // matter which one we get and if any differ.
1637 AAMDNodes AATags = SomeLoad->getAAMetadata();
1638 Align Alignment = SomeLoad->getAlign();
1639
1640 // Rewrite all loads of the PN to use the new PHI.
1641 while (!PN.use_empty()) {
1642 LoadInst *LI = cast<LoadInst>(PN.user_back());
1643 LI->replaceAllUsesWith(NewPN);
1644 LI->eraseFromParent();
1645 }
1646
1647 // Inject loads into all of the pred blocks.
1648 DenseMap<BasicBlock *, Value *> InjectedLoads;
1649 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1650 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1651 Value *InVal = PN.getIncomingValue(Idx);
1652
1653 // A PHI node is allowed to have multiple (duplicated) entries for the same
1654 // basic block, as long as the value is the same. So if we already injected
1655 // a load in the predecessor, then we should reuse the same load for all
1656 // duplicated entries.
1657 if (Value *V = InjectedLoads.lookup(Pred)) {
1658 NewPN->addIncoming(V, Pred);
1659 continue;
1660 }
1661
1662 Instruction *TI = Pred->getTerminator();
1663 IRB.SetInsertPoint(TI);
1664
1665 LoadInst *Load = IRB.CreateAlignedLoad(
1666 LoadTy, InVal, Alignment,
1667 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1668 ++NumLoadsSpeculated;
1669 if (AATags)
1670 Load->setAAMetadata(AATags);
1671 NewPN->addIncoming(Load, Pred);
1672 InjectedLoads[Pred] = Load;
1673 }
1674
1675 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1676 PN.eraseFromParent();
1677}
1678
1679SelectHandSpeculativity &
1680SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1681 if (isTrueVal)
1682 Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1683 else
1684 Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1685 return *this;
1686}
1687
1688bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1689 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1690 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1691}
1692
1693bool SelectHandSpeculativity::areAllSpeculatable() const {
1694 return isSpeculatable(/*isTrueVal=*/true) &&
1695 isSpeculatable(/*isTrueVal=*/false);
1696}
1697
1698bool SelectHandSpeculativity::areAnySpeculatable() const {
1699 return isSpeculatable(/*isTrueVal=*/true) ||
1700 isSpeculatable(/*isTrueVal=*/false);
1701}
1702bool SelectHandSpeculativity::areNoneSpeculatable() const {
1703 return !areAnySpeculatable();
1704}
1705
1706static SelectHandSpeculativity
1707 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1708 assert(LI.isSimple() && "Only for simple loads");
1709 SelectHandSpeculativity Spec;
1710
1711 const DataLayout &DL = SI.getDataLayout();
1712 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1713 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1714 &LI))
1715 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1716 else if (PreserveCFG)
1717 return Spec;
1718
1719 return Spec;
1720}
1721
1722std::optional<RewriteableMemOps>
1723SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1724 RewriteableMemOps Ops;
1725
1726 for (User *U : SI.users()) {
1727 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1728 U = *BC->user_begin();
1729
1730 if (auto *Store = dyn_cast<StoreInst>(U)) {
1731 // Note that atomic stores can be transformed; atomic semantics do not
1732 // have any meaning for a local alloca. Stores are not speculatable,
1733 // however, so if we can't turn it into a predicated store, we are done.
1734 if (Store->isVolatile() || PreserveCFG)
1735 return {}; // Give up on this `select`.
1736 Ops.emplace_back(Store);
1737 continue;
1738 }
1739
1740 auto *LI = dyn_cast<LoadInst>(U);
1741
1742 // Note that atomic loads can be transformed;
1743 // atomic semantics do not have any meaning for a local alloca.
1744 if (!LI || LI->isVolatile())
1745 return {}; // Give up on this `select`.
1746
1747 PossiblySpeculatableLoad Load(LI);
1748 if (!LI->isSimple()) {
1749 // If the `load` is not simple, we can't speculatively execute it,
1750 // but we could handle this via a CFG modification. But can we?
1751 if (PreserveCFG)
1752 return {}; // Give up on this `select`.
1753 Ops.emplace_back(Load);
1754 continue;
1755 }
1756
1757 SelectHandSpeculativity Spec =
1758 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1759 if (PreserveCFG && !Spec.areAllSpeculatable())
1760 return {}; // Give up on this `select`.
1761
1762 Load.setInt(Spec);
1763 Ops.emplace_back(Load);
1764 }
1765
1766 return Ops;
1767}
1768
1769 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1770 IRBuilderTy &IRB) {
1771 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1772
1773 Value *TV = SI.getTrueValue();
1774 Value *FV = SI.getFalseValue();
1775 // Replace the given load of the select with a select of two loads.
1776
1777 assert(LI.isSimple() && "We only speculate simple loads");
1778
1779 IRB.SetInsertPoint(&LI);
1780
1781 LoadInst *TL =
1782 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1783 LI.getName() + ".sroa.speculate.load.true");
1784 LoadInst *FL =
1785 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1786 LI.getName() + ".sroa.speculate.load.false");
1787 NumLoadsSpeculated += 2;
1788
1789 // Transfer alignment and AA info if present.
1790 TL->setAlignment(LI.getAlign());
1791 FL->setAlignment(LI.getAlign());
1792
1793 AAMDNodes Tags = LI.getAAMetadata();
1794 if (Tags) {
1795 TL->setAAMetadata(Tags);
1796 FL->setAAMetadata(Tags);
1797 }
1798
1799 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1800 LI.getName() + ".sroa.speculated",
1801 ProfcheckDisableMetadataFixes ? nullptr : &SI);
1802
1803 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1804 LI.replaceAllUsesWith(V);
1805}
1806
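// When one or both hands of the select are not safe to speculate, the memory
// operation is predicated instead: the block is split on the select's
// condition, a clone of the load/store is placed on the path(s) that need it
// with the pointer rewritten to the corresponding select operand, and (for
// loads) the results are joined back together with a PHI in the tail block.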
1807template <typename T>
1808 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1809 SelectHandSpeculativity Spec,
1810 DomTreeUpdater &DTU) {
1811 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1812 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1813 BasicBlock *Head = I.getParent();
1814 Instruction *ThenTerm = nullptr;
1815 Instruction *ElseTerm = nullptr;
1816 if (Spec.areNoneSpeculatable())
1817 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1818 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1819 else {
1820 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1821 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1822 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1823 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1824 cast<BranchInst>(Head->getTerminator())->swapSuccessors();
1825 }
1826 auto *HeadBI = cast<BranchInst>(Head->getTerminator());
1827 Spec = {}; // Do not use `Spec` beyond this point.
1828 BasicBlock *Tail = I.getParent();
1829 Tail->setName(Head->getName() + ".cont");
1830 PHINode *PN;
1831 if (isa<LoadInst>(I))
1832 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1833 for (BasicBlock *SuccBB : successors(Head)) {
1834 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1835 int SuccIdx = IsThen ? 0 : 1;
1836 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1837 auto &CondMemOp = cast<T>(*I.clone());
1838 if (NewMemOpBB != Head) {
1839 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1840 if (isa<LoadInst>(I))
1841 ++NumLoadsPredicated;
1842 else
1843 ++NumStoresPredicated;
1844 } else {
1845 CondMemOp.dropUBImplyingAttrsAndMetadata();
1846 ++NumLoadsSpeculated;
1847 }
1848 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1849 Value *Ptr = SI.getOperand(1 + SuccIdx);
1850 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1851 if (isa<LoadInst>(I)) {
1852 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1853 PN->addIncoming(&CondMemOp, NewMemOpBB);
1854 } else
1855 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1856 }
1857 if (isa<LoadInst>(I)) {
1858 PN->takeName(&I);
1859 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1860 I.replaceAllUsesWith(PN);
1861 }
1862}
1863
1864 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1865 SelectHandSpeculativity Spec,
1866 DomTreeUpdater &DTU) {
1867 if (auto *LI = dyn_cast<LoadInst>(&I))
1868 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1869 else if (auto *SI = dyn_cast<StoreInst>(&I))
1870 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1871 else
1872 llvm_unreachable_internal("Only for load and store.");
1873}
1874
1875 static bool rewriteSelectInstMemOps(SelectInst &SI,
1876 const RewriteableMemOps &Ops,
1877 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1878 bool CFGChanged = false;
1879 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1880
1881 for (const RewriteableMemOp &Op : Ops) {
1882 SelectHandSpeculativity Spec;
1883 Instruction *I;
1884 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1885 I = *US;
1886 } else {
1887 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1888 I = PSL.getPointer();
1889 Spec = PSL.getInt();
1890 }
1891 if (Spec.areAllSpeculatable()) {
1892 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1893 } else {
1894 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1895 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1896 CFGChanged = true;
1897 }
1898 I->eraseFromParent();
1899 }
1900
1901 for (User *U : make_early_inc_range(SI.users()))
1902 cast<BitCastInst>(U)->eraseFromParent();
1903 SI.eraseFromParent();
1904 return CFGChanged;
1905}
1906
1907/// Compute an adjusted pointer from Ptr by Offset bytes where the
1908/// resulting pointer has PointerTy.
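/// For example, with Offset == 4 this emits a single inbounds byte-offset
/// ptradd of 4, followed by a pointer cast only if PointerTy differs from the
/// type of the adjusted pointer.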
1909static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1910 APInt Offset, Type *PointerTy,
1911 const Twine &NamePrefix) {
1912 if (Offset != 0)
1913 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1914 NamePrefix + "sroa_idx");
1915 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1916 NamePrefix + "sroa_cast");
1917}
1918
1919/// Compute the adjusted alignment for a load or store from an offset.
1920 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1921 return commonAlignment(getLoadStoreAlignment(I), Offset);
1922 }
1923 
1924/// Test whether we can convert a value from the old to the new type.
1925///
1926/// This predicate should be used to guard calls to convertValue in order to
1927/// ensure that we only try to convert viable values. The strategy is that we
1928/// will peel off single element struct and array wrappings to get to an
1929/// underlying value, and convert that value.
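/// For example, an i64 can be converted to a <2 x i32> or to a pointer in an
/// integral address space of matching width, but not to an i32 (the sizes
/// differ) and not to a pointer in a non-integral address space.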
1930static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1931 unsigned VScale = 0) {
1932 if (OldTy == NewTy)
1933 return true;
1934
1935 // For integer types, we can't handle any bit-width differences. This would
1936 // break both vector conversions with extension and introduce endianness
1937 // issues when in conjunction with loads and stores.
1938 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1939 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1940 cast<IntegerType>(NewTy)->getBitWidth() &&
1941 "We can't have the same bitwidth for different int types");
1942 return false;
1943 }
1944
1945 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1946 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1947
1948 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1949 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1950 // Conversion is only possible when the size of scalable vectors is known.
1951 if (!VScale)
1952 return false;
1953
1954 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1955 // a single domain (either fixed or scalable). Any additional conversion
1956 // between fixed and scalable types is handled through integer types.
1957 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1958 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1959
1960 if (isa<ScalableVectorType>(NewTy)) {
1961 if (!VectorType::getWithSizeAndScalar(cast<VectorType>(NewVTy), OldVTy))
1962 return false;
1963
1964 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
1965 } else {
1966 if (!VectorType::getWithSizeAndScalar(cast<VectorType>(OldVTy), NewVTy))
1967 return false;
1968
1969 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
1970 }
1971 }
1972
1973 if (NewSize != OldSize)
1974 return false;
1975 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1976 return false;
1977
1978 // We can convert pointers to integers and vice-versa. Same for vectors
1979 // of pointers and integers.
1980 OldTy = OldTy->getScalarType();
1981 NewTy = NewTy->getScalarType();
1982 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1983 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1984 unsigned OldAS = OldTy->getPointerAddressSpace();
1985 unsigned NewAS = NewTy->getPointerAddressSpace();
1986 // Convert pointers if they are pointers from the same address space or
1987 // different integral (not non-integral) address spaces with the same
1988 // pointer size.
1989 return OldAS == NewAS ||
1990 (!DL.isNonIntegralAddressSpace(OldAS) &&
1991 !DL.isNonIntegralAddressSpace(NewAS) &&
1992 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1993 }
1994
1995 // We can convert integers to integral pointers, but not to non-integral
1996 // pointers.
1997 if (OldTy->isIntegerTy())
1998 return !DL.isNonIntegralPointerType(NewTy);
1999
2000 // We can convert integral pointers to integers, but non-integral pointers
2001 // need to remain pointers.
2002 if (!DL.isNonIntegralPointerType(OldTy))
2003 return NewTy->isIntegerTy();
2004
2005 return false;
2006 }
2007
2008 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2009 return false;
2010
2011 return true;
2012}
2013
2014/// Generic routine to convert an SSA value to a value of a different
2015/// type.
2016///
2017/// This will try various different casting techniques, such as bitcasts,
2018/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
2019/// two types for viability with this routine.
2020static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2021 Type *NewTy) {
2022 Type *OldTy = V->getType();
2023
2024#ifndef NDEBUG
2025 BasicBlock *BB = IRB.GetInsertBlock();
2026 assert(BB && BB->getParent() && "VScale unknown!");
2027 unsigned VScale = BB->getParent()->getVScaleValue();
2028 assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
2029 "Value not convertable to type");
2030#endif
2031
2032 if (OldTy == NewTy)
2033 return V;
2034
2035 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2036 "Integer types must be the exact same to convert.");
2037
2038 // A variant of bitcast that supports a mixture of fixed and scalable types
2039 // that are known to have the same size.
2040 auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
2041 Type *InTy = In->getType();
2042 if (InTy == Ty)
2043 return In;
2044 
2045 if (isa<FixedVectorType>(InTy) && isa<ScalableVectorType>(Ty)) {
2046 // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
2047 // <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
2048 auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(Ty), InTy);
2049 return IRB.CreateBitCast(IRB.CreateInsertVector(VTy,
2050 PoisonValue::get(VTy), In,
2051 IRB.getInt64(0)),
2052 Ty);
2053 }
2054 
2055 if (isa<ScalableVectorType>(InTy) && isa<FixedVectorType>(Ty)) {
2056 // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
2057 // <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
2058 auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(InTy), Ty);
2059 return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy),
2060 IRB.getInt64(0));
2061 }
2062
2063 return IRB.CreateBitCast(In, Ty);
2064 };
2065
2066 // See if we need inttoptr for this type pair. May require additional bitcast.
2067 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2068 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2069 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2070 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2071 // Directly handle i64 to i8*
2072 return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
2073 NewTy);
2074 }
2075
2076 // See if we need ptrtoint for this type pair. May require additional bitcast.
2077 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2078 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2079 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2080 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2081 // Expand i8* to i64 --> i8* to i64 to i64
2082 return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2083 NewTy);
2084 }
2085
2086 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2087 unsigned OldAS = OldTy->getPointerAddressSpace();
2088 unsigned NewAS = NewTy->getPointerAddressSpace();
2089 // To convert pointers between different address spaces (they have already
2090 // been checked to be convertible, i.e. they have the same pointer size), we
2091 // cannot use `bitcast` (which requires both pointers to be in the same
2092 // address space) or `addrspacecast` (which is not always a no-op cast).
2093 // Instead, use a pair of no-op `ptrtoint`/`inttoptr` casts through an
2094 // integer with the same bit size.
2095 if (OldAS != NewAS) {
2096 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2097 return IRB.CreateIntToPtr(
2098 CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2099 DL.getIntPtrType(NewTy)),
2100 NewTy);
2101 }
2102 }
2103
2104 return CreateBitCastLike(V, NewTy);
2105}
2106
2107/// Test whether the given slice use can be promoted to a vector.
2108///
2109/// This function is called to test each entry in a partition which is slated
2110/// for a single slice.
2111static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2112 VectorType *Ty,
2113 uint64_t ElementSize,
2114 const DataLayout &DL,
2115 unsigned VScale) {
2116 // First validate the slice offsets.
2117 uint64_t BeginOffset =
2118 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2119 uint64_t BeginIndex = BeginOffset / ElementSize;
2120 if (BeginIndex * ElementSize != BeginOffset ||
2121 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2122 return false;
2123 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2124 uint64_t EndIndex = EndOffset / ElementSize;
2125 if (EndIndex * ElementSize != EndOffset ||
2126 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2127 return false;
2128
2129 assert(EndIndex > BeginIndex && "Empty vector!");
2130 uint64_t NumElements = EndIndex - BeginIndex;
2131 Type *SliceTy = (NumElements == 1)
2132 ? Ty->getElementType()
2133 : FixedVectorType::get(Ty->getElementType(), NumElements);
2134
2135 Type *SplitIntTy =
2136 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2137
2138 Use *U = S.getUse();
2139
2140 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2141 if (MI->isVolatile())
2142 return false;
2143 if (!S.isSplittable())
2144 return false; // Skip any unsplittable intrinsics.
2145 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2146 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2147 return false;
2148 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2149 if (LI->isVolatile())
2150 return false;
2151 Type *LTy = LI->getType();
2152 // Disable vector promotion when there are loads or stores of an FCA.
2153 if (LTy->isStructTy())
2154 return false;
2155 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2156 assert(LTy->isIntegerTy());
2157 LTy = SplitIntTy;
2158 }
2159 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2160 return false;
2161 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2162 if (SI->isVolatile())
2163 return false;
2164 Type *STy = SI->getValueOperand()->getType();
2165 // Disable vector promotion when there are loads or stores of an FCA.
2166 if (STy->isStructTy())
2167 return false;
2168 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2169 assert(STy->isIntegerTy());
2170 STy = SplitIntTy;
2171 }
2172 if (!canConvertValue(DL, STy, SliceTy, VScale))
2173 return false;
2174 } else {
2175 return false;
2176 }
2177
2178 return true;
2179}
2180
2181/// Test whether a vector type is viable for promotion.
2182///
2183/// This implements the necessary checking for \c checkVectorTypesForPromotion
2184/// (and thus isVectorPromotionViable) over all slices of the alloca for the
2185/// given VectorType.
2186static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2187 const DataLayout &DL, unsigned VScale) {
2188 uint64_t ElementSize =
2189 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2190
2191 // While the definition of LLVM vectors is bitpacked, we don't support sizes
2192 // that aren't byte sized.
2193 if (ElementSize % 8)
2194 return false;
2195 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2196 "vector size not a multiple of element size?");
2197 ElementSize /= 8;
2198
2199 for (const Slice &S : P)
2200 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2201 return false;
2202
2203 for (const Slice *S : P.splitSliceTails())
2204 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2205 return false;
2206
2207 return true;
2208}
2209
2210/// Test whether any vector type in \p CandidateTys is viable for promotion.
2211///
2212/// This implements the necessary checking for \c isVectorPromotionViable over
2213/// all slices of the alloca for the given VectorType.
2214static VectorType *
2215 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2216 SmallVectorImpl<VectorType *> &CandidateTys,
2217 bool HaveCommonEltTy, Type *CommonEltTy,
2218 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2219 VectorType *CommonVecPtrTy, unsigned VScale) {
2220 // If we didn't find a vector type, nothing to do here.
2221 if (CandidateTys.empty())
2222 return nullptr;
2223
2224 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2225 // then we should choose it, not some other alternative.
2226 // But, we can't perform a no-op pointer address space change via bitcast,
2227 // so if we didn't have a common pointer element type, bail.
2228 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2229 return nullptr;
2230
2231 // Try to pick the "best" element type out of the choices.
2232 if (!HaveCommonEltTy && HaveVecPtrTy) {
2233 // If there was a pointer element type, there's really only one choice.
2234 CandidateTys.clear();
2235 CandidateTys.push_back(CommonVecPtrTy);
2236 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2237 // Integer-ify vector types.
2238 for (VectorType *&VTy : CandidateTys) {
2239 if (!VTy->getElementType()->isIntegerTy())
2240 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2241 VTy->getContext(), VTy->getScalarSizeInBits())));
2242 }
2243
2244 // Rank the remaining candidate vector types. This is easy because we know
2245 // they're all integer vectors. We sort by ascending number of elements.
2246 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2247 (void)DL;
2248 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2249 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2250 "Cannot have vector types of different sizes!");
2251 assert(RHSTy->getElementType()->isIntegerTy() &&
2252 "All non-integer types eliminated!");
2253 assert(LHSTy->getElementType()->isIntegerTy() &&
2254 "All non-integer types eliminated!");
2255 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2256 cast<FixedVectorType>(LHSTy)->getNumElements();
2257 };
2258 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2259 (void)DL;
2260 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2261 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2262 "Cannot have vector types of different sizes!");
2263 assert(RHSTy->getElementType()->isIntegerTy() &&
2264 "All non-integer types eliminated!");
2265 assert(LHSTy->getElementType()->isIntegerTy() &&
2266 "All non-integer types eliminated!");
2267 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2268 cast<FixedVectorType>(LHSTy)->getNumElements();
2269 };
2270 llvm::sort(CandidateTys, RankVectorTypesComp);
2271 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2272 CandidateTys.end());
2273 } else {
2274 // The only way to have the same element type in every vector type is to
2275 // have the same vector type. Check that and remove all but one.
2276#ifndef NDEBUG
2277 for (VectorType *VTy : CandidateTys) {
2278 assert(VTy->getElementType() == CommonEltTy &&
2279 "Unaccounted for element type!");
2280 assert(VTy == CandidateTys[0] &&
2281 "Different vector types with the same element type!");
2282 }
2283#endif
2284 CandidateTys.resize(1);
2285 }
2286
2287 // FIXME: hack. Do we have a named constant for this?
2288 // SDAG SDNode can't have more than 65535 operands.
2289 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2290 return cast<FixedVectorType>(VTy)->getNumElements() >
2291 std::numeric_limits<unsigned short>::max();
2292 });
2293
2294 for (VectorType *VTy : CandidateTys)
2295 if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
2296 return VTy;
2297
2298 return nullptr;
2299}
2300
2301 static VectorType *createAndCheckVectorTypesForPromotion(
2302 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2303 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2304 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2305 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2306 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2307 [[maybe_unused]] VectorType *OriginalElt =
2308 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2309 // Consider additional vector types where the element type size is a
2310 // multiple of load/store element size.
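// For example, if <4 x i32> is already a candidate and an i16 load or store
// is seen, the same-sized <8 x i16> is also considered as a candidate.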
2311 for (Type *Ty : OtherTys) {
2312 if (!VectorType::isValidElementType(Ty))
2313 continue;
2314 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2315 // Make a copy of CandidateTys and iterate through it, because we
2316 // might append to CandidateTys in the loop.
2317 for (VectorType *const VTy : CandidateTysCopy) {
2318 // The elements in the copy should remain invariant throughout the loop
2319 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2320 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2321 unsigned ElementSize =
2322 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2323 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2324 VectorSize % TypeSize == 0) {
2325 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2326 CheckCandidateType(NewVTy);
2327 }
2328 }
2329 }
2330
2331 return checkVectorTypesForPromotion(
2332 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2333 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2334}
2335
2336/// Test whether the given alloca partitioning and range of slices can be
2337/// promoted to a vector.
2338///
2339/// This is a quick test to check whether we can rewrite a particular alloca
2340/// partition (and its newly formed alloca) into a vector alloca with only
2341/// whole-vector loads and stores such that it could be promoted to a vector
2342/// SSA value. We only can ensure this for a limited set of operations, and we
2343/// don't want to do the rewrites unless we are confident that the result will
2344/// be promotable, so we have an early test here.
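/// For example, a 16-byte alloca accessed only by whole <4 x i32> loads and
/// stores, or by i32 accesses at element-aligned offsets, can usually be
/// rewritten as a <4 x i32> alloca and promoted.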
2345 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2346 unsigned VScale) {
2347 // Collect the candidate types for vector-based promotion. Also track whether
2348 // we have different element types.
2349 SmallVector<VectorType *, 4> CandidateTys;
2350 SetVector<Type *> LoadStoreTys;
2351 SetVector<Type *> DeferredTys;
2352 Type *CommonEltTy = nullptr;
2353 VectorType *CommonVecPtrTy = nullptr;
2354 bool HaveVecPtrTy = false;
2355 bool HaveCommonEltTy = true;
2356 bool HaveCommonVecPtrTy = true;
2357 auto CheckCandidateType = [&](Type *Ty) {
2358 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2359 // Bail out if this type's total size in bits differs from the existing candidates.
2360 if (!CandidateTys.empty()) {
2361 VectorType *V = CandidateTys[0];
2362 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2363 DL.getTypeSizeInBits(V).getFixedValue()) {
2364 CandidateTys.clear();
2365 return;
2366 }
2367 }
2368 CandidateTys.push_back(VTy);
2369 Type *EltTy = VTy->getElementType();
2370
2371 if (!CommonEltTy)
2372 CommonEltTy = EltTy;
2373 else if (CommonEltTy != EltTy)
2374 HaveCommonEltTy = false;
2375
2376 if (EltTy->isPointerTy()) {
2377 HaveVecPtrTy = true;
2378 if (!CommonVecPtrTy)
2379 CommonVecPtrTy = VTy;
2380 else if (CommonVecPtrTy != VTy)
2381 HaveCommonVecPtrTy = false;
2382 }
2383 }
2384 };
2385
2386 // Put load and store types into a set for de-duplication.
2387 for (const Slice &S : P) {
2388 Type *Ty;
2389 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2390 Ty = LI->getType();
2391 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2392 Ty = SI->getValueOperand()->getType();
2393 else
2394 continue;
2395
2396 auto CandTy = Ty->getScalarType();
2397 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2398 S.endOffset() != P.endOffset())) {
2399 DeferredTys.insert(Ty);
2400 continue;
2401 }
2402
2403 LoadStoreTys.insert(Ty);
2404 // Consider any loads or stores that are the exact size of the slice.
2405 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2406 CheckCandidateType(Ty);
2407 }
2408
2409 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2410 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2411 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2412 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2413 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2414 return VTy;
2415
2416 CandidateTys.clear();
2417 return createAndCheckVectorTypesForPromotion(
2418 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2419 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2420 CommonVecPtrTy, VScale);
2421}
2422
2423/// Test whether a slice of an alloca is valid for integer widening.
2424///
2425/// This implements the necessary checking for the \c isIntegerWideningViable
2426/// test below on a single slice of the alloca.
2427static bool isIntegerWideningViableForSlice(const Slice &S,
2428 uint64_t AllocBeginOffset,
2429 Type *AllocaTy,
2430 const DataLayout &DL,
2431 bool &WholeAllocaOp) {
2432 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2433
2434 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2435 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2436
2437 Use *U = S.getUse();
2438
2439 // Lifetime intrinsics operate over the whole alloca, whose size is usually
2440 // larger than that of other load/store slices (RelEnd > Size). But lifetime
2441 // intrinsics are always promotable and should not impact the promotability
2442 // of the partition's other slices.
2443 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2444 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2445 return true;
2446 }
2447
2448 // We can't reasonably handle cases where the load or store extends past
2449 // the end of the alloca's type and into its padding.
2450 if (RelEnd > Size)
2451 return false;
2452
2453 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2454 if (LI->isVolatile())
2455 return false;
2456 // We can't handle loads that extend past the allocated memory.
2457 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2458 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2459 return false;
2460 // So far, AllocaSliceRewriter does not support widening split slice tails
2461 // in rewriteIntegerLoad.
2462 if (S.beginOffset() < AllocBeginOffset)
2463 return false;
2464 // Note that we don't count vector loads or stores as whole-alloca
2465 // operations which enable integer widening because we would prefer to use
2466 // vector widening instead.
2467 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2468 WholeAllocaOp = true;
2469 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2470 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2471 return false;
2472 } else if (RelBegin != 0 || RelEnd != Size ||
2473 !canConvertValue(DL, AllocaTy, LI->getType())) {
2474 // Non-integer loads need to be convertible from the alloca type so that
2475 // they are promotable.
2476 return false;
2477 }
2478 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2479 Type *ValueTy = SI->getValueOperand()->getType();
2480 if (SI->isVolatile())
2481 return false;
2482 // We can't handle stores that extend past the allocated memory.
2483 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2484 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2485 return false;
2486 // So far, AllocaSliceRewriter does not support widening split slice tails
2487 // in rewriteIntegerStore.
2488 if (S.beginOffset() < AllocBeginOffset)
2489 return false;
2490 // Note that we don't count vector loads or stores as whole-alloca
2491 // operations which enable integer widening because we would prefer to use
2492 // vector widening instead.
2493 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2494 WholeAllocaOp = true;
2495 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2496 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2497 return false;
2498 } else if (RelBegin != 0 || RelEnd != Size ||
2499 !canConvertValue(DL, ValueTy, AllocaTy)) {
2500 // Non-integer stores need to be convertible to the alloca type so that
2501 // they are promotable.
2502 return false;
2503 }
2504 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2505 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2506 return false;
2507 if (!S.isSplittable())
2508 return false; // Skip any unsplittable intrinsics.
2509 } else {
2510 return false;
2511 }
2512
2513 return true;
2514}
2515
2516/// Test whether the given alloca partition's integer operations can be
2517/// widened to promotable ones.
2518///
2519/// This is a quick test to check whether we can rewrite the integer loads and
2520/// stores to a particular alloca into wider loads and stores and be able to
2521/// promote the resulting alloca.
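/// For example, an 8-byte alloca written by two i32 stores at offsets 0 and 4
/// and read back by one covering i64 load can be widened: the partial stores
/// become shift-and-mask insertions into a single i64 value.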
2522static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2523 const DataLayout &DL) {
2524 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2525 // Don't create integer types larger than the maximum bitwidth.
2526 if (SizeInBits > IntegerType::MAX_INT_BITS)
2527 return false;
2528
2529 // Don't try to handle allocas with bit-padding.
2530 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2531 return false;
2532
2533 // We need to ensure that an integer type with the appropriate bitwidth can
2534 // be converted to the alloca type, whatever that is. We don't want to force
2535 // the alloca itself to have an integer type if there is a more suitable one.
2536 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2537 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2538 !canConvertValue(DL, IntTy, AllocaTy))
2539 return false;
2540
2541 // While examining uses, we ensure that the alloca has a covering load or
2542 // store. We don't want to widen the integer operations only to fail to
2543 // promote due to some other unsplittable entry (which we may make splittable
2544 // later). However, if there are only splittable uses, go ahead and assume
2545 // that we cover the alloca.
2546 // FIXME: We shouldn't consider split slices that happen to start in the
2547 // partition here...
2548 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2549
2550 for (const Slice &S : P)
2551 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2552 WholeAllocaOp))
2553 return false;
2554
2555 for (const Slice *S : P.splitSliceTails())
2556 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2557 WholeAllocaOp))
2558 return false;
2559
2560 return WholeAllocaOp;
2561}
2562
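// For example, extracting an i16 at byte offset 2 from an i64 shifts the value
// right by 16 bits on little-endian targets (by 32 bits on big-endian ones)
// and then truncates to i16.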
2563static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2564 IntegerType *Ty, uint64_t Offset,
2565 const Twine &Name) {
2566 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2567 IntegerType *IntTy = cast<IntegerType>(V->getType());
2568 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2569 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2570 "Element extends past full value");
2571 uint64_t ShAmt = 8 * Offset;
2572 if (DL.isBigEndian())
2573 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2574 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2575 if (ShAmt) {
2576 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2577 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2578 }
2579 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2580 "Cannot extract to a larger integer!");
2581 if (Ty != IntTy) {
2582 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2583 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2584 }
2585 return V;
2586}
2587
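// For example, inserting an i16 into an i64 at byte offset 2 zero-extends the
// value, shifts it left by 16 bits on little-endian targets, clears the
// corresponding bits of the old value with a mask, and ORs the two together.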
2588static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2589 Value *V, uint64_t Offset, const Twine &Name) {
2590 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2591 IntegerType *Ty = cast<IntegerType>(V->getType());
2592 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2593 "Cannot insert a larger integer!");
2594 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2595 if (Ty != IntTy) {
2596 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2597 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2598 }
2599 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2600 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2601 "Element store outside of alloca store");
2602 uint64_t ShAmt = 8 * Offset;
2603 if (DL.isBigEndian())
2604 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2605 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2606 if (ShAmt) {
2607 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2608 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2609 }
2610
2611 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2612 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2613 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2614 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2615 V = IRB.CreateOr(Old, V, Name + ".insert");
2616 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2617 }
2618 return V;
2619}
2620
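// For example, extracting elements [2, 4) from a <8 x i16> value emits a
// two-element shufflevector; extracting a single element uses extractelement,
// and extracting the full range returns the value unchanged.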
2621static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2622 unsigned EndIndex, const Twine &Name) {
2623 auto *VecTy = cast<FixedVectorType>(V->getType());
2624 unsigned NumElements = EndIndex - BeginIndex;
2625 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2626
2627 if (NumElements == VecTy->getNumElements())
2628 return V;
2629
2630 if (NumElements == 1) {
2631 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2632 Name + ".extract");
2633 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2634 return V;
2635 }
2636
2637 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2638 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2639 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2640 return V;
2641}
2642
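// For example, inserting a <2 x i32> at index 1 of a <4 x i32> first widens
// the small vector to <poison, v0, v1, poison> with a shuffle and then blends
// it with the old value, taking lanes 1 and 2 from the widened vector.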
2643static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2644 unsigned BeginIndex, const Twine &Name) {
2645 VectorType *VecTy = cast<VectorType>(Old->getType());
2646 assert(VecTy && "Can only insert a vector into a vector");
2647
2648 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2649 if (!Ty) {
2650 // Single element to insert.
2651 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2652 Name + ".insert");
2653 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2654 return V;
2655 }
2656
2657 assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2658 cast<FixedVectorType>(VecTy)->getNumElements() &&
2659 "Too many elements!");
2660 if (cast<FixedVectorType>(Ty)->getNumElements() ==
2661 cast<FixedVectorType>(VecTy)->getNumElements()) {
2662 assert(V->getType() == VecTy && "Vector type mismatch");
2663 return V;
2664 }
2665 unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2666
2667 // When inserting a smaller vector into the larger to store, we first
2668 // use a shuffle vector to widen it with undef elements, and then
2669 // a second shuffle vector to select between the loaded vector and the
2670 // incoming vector.
2671 SmallVector<int, 8> Mask;
2672 Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2673 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2674 if (i >= BeginIndex && i < EndIndex)
2675 Mask.push_back(i - BeginIndex);
2676 else
2677 Mask.push_back(-1);
2678 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2679 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2680 
2681 SmallVector<Constant *, 8> Mask2;
2682 Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2683 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2684 Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2685
2686 // No profiling support for vector selects.
2687 V = IRB.CreateSelectWithUnknownProfile(ConstantVector::get(Mask2), V, Old,
2688 DEBUG_TYPE, Name + "blend");
2689
2690 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2691 return V;
2692}
2693
2694/// This function takes two vector values and combines them into a single vector
2695/// by concatenating their elements. The function handles:
2696///
2697/// 1. Element type mismatch: If either vector's element type differs from
2698/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2699/// preserving the total bit width (adjusting the number of elements
2700/// accordingly).
2701///
2702/// 2. Size mismatch: After transforming the vectors to have the desired element
2703/// type, if the two vectors have different numbers of elements, the smaller
2704/// vector is extended with poison values to match the size of the larger
2705/// vector before concatenation.
2706///
2707/// 3. Concatenation: The vectors are merged using a shuffle operation that
2708/// places all elements of V0 first, followed by all elements of V1.
2709///
2710/// \param V0 The first vector to merge (must be a vector type)
2711/// \param V1 The second vector to merge (must be a vector type)
2712/// \param DL The data layout for size calculations
2713/// \param NewAIEltTy The desired element type for the result vector
2714/// \param Builder IRBuilder for creating new instructions
2715/// \return A new vector containing all elements from V0 followed by all
2716/// elements from V1
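/// For example, merging a <2 x float> and a <4 x float> with NewAIEltTy ==
/// float first widens the <2 x float> with poison elements to <4 x float>,
/// then emits a shufflevector that takes the two original elements of V0
/// followed by the four elements of V1, producing a <6 x float>.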
2717 static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
2718 Type *NewAIEltTy, IRBuilder<> &Builder) {
2719 // V0 and V1 are vectors
2720 // Create a new vector type with combined elements
2721 // Use ShuffleVector to concatenate the vectors
2722 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2723 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2724
2725 // If V0/V1 element types are different from NewAllocaElementType,
2726 // we need to introduce bitcasts before merging them
2727 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2728 const char *DebugName) {
2729 Type *EltType = VecType->getElementType();
2730 if (EltType != NewAIEltTy) {
2731 // Calculate new number of elements to maintain same bit width
2732 unsigned TotalBits =
2733 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2734 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2735
2736 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2737 V = Builder.CreateBitCast(V, NewVecType);
2738 VecType = NewVecType;
2739 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2740 }
2741 };
2742
2743 BitcastIfNeeded(V0, VecType0, "V0");
2744 BitcastIfNeeded(V1, VecType1, "V1");
2745
2746 unsigned NumElts0 = VecType0->getNumElements();
2747 unsigned NumElts1 = VecType1->getNumElements();
2748
2749 SmallVector<int, 16> ShuffleMask;
2750
2751 if (NumElts0 == NumElts1) {
2752 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2753 ShuffleMask.push_back(i);
2754 } else {
2755 // If two vectors have different sizes, we need to extend
2756 // the smaller vector to the size of the larger vector.
2757 unsigned SmallSize = std::min(NumElts0, NumElts1);
2758 unsigned LargeSize = std::max(NumElts0, NumElts1);
2759 bool IsV0Smaller = NumElts0 < NumElts1;
2760 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2761 SmallVector<int, 16> ExtendMask;
2762 for (unsigned i = 0; i < SmallSize; ++i)
2763 ExtendMask.push_back(i);
2764 for (unsigned i = SmallSize; i < LargeSize; ++i)
2765 ExtendMask.push_back(PoisonMaskElem);
2766 ExtendedVec = Builder.CreateShuffleVector(
2767 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2768 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2769 for (unsigned i = 0; i < NumElts0; ++i)
2770 ShuffleMask.push_back(i);
2771 for (unsigned i = 0; i < NumElts1; ++i)
2772 ShuffleMask.push_back(LargeSize + i);
2773 }
2774
2775 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2776}
2777
2778namespace {
2779
2780 /// Visitor to rewrite instructions using a particular slice of an alloca
2781/// to use a new alloca.
2782///
2783/// Also implements the rewriting to vector-based accesses when the partition
2784/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2785/// lives here.
2786class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2787 // Befriend the base class so it can delegate to private visit methods.
2788 friend class InstVisitor<AllocaSliceRewriter, bool>;
2789
2790 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2791
2792 const DataLayout &DL;
2793 AllocaSlices &AS;
2794 SROA &Pass;
2795 AllocaInst &OldAI, &NewAI;
2796 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2797 Type *NewAllocaTy;
2798
2799 // This is a convenience and flag variable that will be null unless the new
2800 // alloca's integer operations should be widened to this integer type due to
2801 // passing isIntegerWideningViable above. If it is non-null, the desired
2802 // integer type will be stored here for easy access during rewriting.
2803 IntegerType *IntTy;
2804
2805 // If we are rewriting an alloca partition which can be written as pure
2806 // vector operations, we stash extra information here. When VecTy is
2807 // non-null, we have some strict guarantees about the rewritten alloca:
2808 // - The new alloca is exactly the size of the vector type here.
2809 // - The accesses all either map to the entire vector or to a single
2810 // element.
2811 // - The set of accessing instructions is only one of those handled above
2812 // in isVectorPromotionViable. Generally these are the same access kinds
2813 // which are promotable via mem2reg.
2814 VectorType *VecTy;
2815 Type *ElementTy;
2816 uint64_t ElementSize;
2817
2818 // The original offset of the slice currently being rewritten relative to
2819 // the original alloca.
2820 uint64_t BeginOffset = 0;
2821 uint64_t EndOffset = 0;
2822
2823 // The new offsets of the slice currently being rewritten relative to the
2824 // original alloca.
2825 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2826
2827 uint64_t SliceSize = 0;
2828 bool IsSplittable = false;
2829 bool IsSplit = false;
2830 Use *OldUse = nullptr;
2831 Instruction *OldPtr = nullptr;
2832
2833 // Track post-rewrite users which are PHI nodes and Selects.
2834 SmallSetVector<PHINode *, 8> &PHIUsers;
2835 SmallSetVector<SelectInst *, 8> &SelectUsers;
2836
2837 // Utility IR builder, whose name prefix is set up for each visited use, and
2838 // the insertion point is set to point to the user.
2839 IRBuilderTy IRB;
2840
2841 // Return the new alloca, addrspacecasted if required to avoid changing the
2842 // addrspace of a volatile access.
2843 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2844 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2845 return &NewAI;
2846
2847 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2848 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2849 }
2850
2851public:
2852 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2853 AllocaInst &OldAI, AllocaInst &NewAI,
2854 uint64_t NewAllocaBeginOffset,
2855 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2856 VectorType *PromotableVecTy,
2857 SmallSetVector<PHINode *, 8> &PHIUsers,
2858 SmallSetVector<SelectInst *, 8> &SelectUsers)
2859 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2860 NewAllocaBeginOffset(NewAllocaBeginOffset),
2861 NewAllocaEndOffset(NewAllocaEndOffset),
2862 NewAllocaTy(NewAI.getAllocatedType()),
2863 IntTy(
2864 IsIntegerPromotable
2865 ? Type::getIntNTy(NewAI.getContext(),
2866 DL.getTypeSizeInBits(NewAI.getAllocatedType())
2867 .getFixedValue())
2868 : nullptr),
2869 VecTy(PromotableVecTy),
2870 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2871 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2872 : 0),
2873 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2874 IRB(NewAI.getContext(), ConstantFolder()) {
2875 if (VecTy) {
2876 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2877 "Only multiple-of-8 sized vector elements are viable");
2878 ++NumVectorized;
2879 }
2880 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2881 }
2882
2883 bool visit(AllocaSlices::const_iterator I) {
2884 bool CanSROA = true;
2885 BeginOffset = I->beginOffset();
2886 EndOffset = I->endOffset();
2887 IsSplittable = I->isSplittable();
2888 IsSplit =
2889 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2890 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2891 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2892 LLVM_DEBUG(dbgs() << "\n");
2893
2894 // Compute the intersecting offset range.
2895 assert(BeginOffset < NewAllocaEndOffset);
2896 assert(EndOffset > NewAllocaBeginOffset);
2897 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2898 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2899
2900 SliceSize = NewEndOffset - NewBeginOffset;
2901 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2902 << ") NewBegin:(" << NewBeginOffset << ", "
2903 << NewEndOffset << ") NewAllocaBegin:("
2904 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2905 << ")\n");
2906 assert(IsSplit || NewBeginOffset == BeginOffset);
2907 OldUse = I->getUse();
2908 OldPtr = cast<Instruction>(OldUse->get());
2909
2910 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2911 IRB.SetInsertPoint(OldUserI);
2912 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2913 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2914 Twine(BeginOffset) + ".");
2915
2916 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2917 if (VecTy || IntTy)
2918 assert(CanSROA);
2919 return CanSROA;
2920 }
2921
2922 /// Attempts to rewrite a partition using tree-structured merge optimization.
2923 ///
2924 /// This function analyzes a partition to determine if it can be optimized
2925 /// using a tree-structured merge pattern, where multiple non-overlapping
2926 /// stores completely fill an alloca. And there is no load from the alloca in
2927 /// the middle of the stores. Such patterns can be optimized by eliminating
2928 /// the intermediate stores and directly constructing the final vector by
2929 /// using shufflevectors.
2930 ///
2931 /// Example transformation:
2932 /// Before: (stores do not have to be in order)
2933 /// %alloca = alloca <8 x float>
2934 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2935 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2936 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2937 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2938 ///
2939 /// After:
2940 /// %alloca = alloca <8 x float>
2941 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2942 /// i32 3>
2943 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2944 /// i32 3>
2945 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2946 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2947 /// store %shuffle2, ptr %alloca
2948 ///
2949 /// The optimization looks for partitions that:
2950 /// 1. Have no overlapping split slice tails
2951 /// 2. Contain non-overlapping stores that cover the entire alloca
2952 /// 3. Have exactly one load that reads the complete alloca structure and does
2953 /// not sit in the middle of the stores (TODO: maybe we can relax the constraint
2954 /// about reading the entire alloca structure)
2955 ///
2956 /// \param P The partition to analyze and potentially rewrite
2957 /// \return An optional vector of values that were deleted during the rewrite
2958 /// process, or std::nullopt if the partition cannot be optimized
2959 /// using tree-structured merge
2960 std::optional<SmallVector<Value *, 4>>
2961 rewriteTreeStructuredMerge(Partition &P) {
2962 // No tail slices that overlap with the partition
2963 if (P.splitSliceTails().size() > 0)
2964 return std::nullopt;
2965
2966 SmallVector<Value *, 4> DeletedValues;
2967 LoadInst *TheLoad = nullptr;
2968
2969 // Structure to hold store information
2970 struct StoreInfo {
2971 StoreInst *Store;
2972 uint64_t BeginOffset;
2973 uint64_t EndOffset;
2974 Value *StoredValue;
2975 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2976 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2977 };
2978
2979 SmallVector<StoreInfo, 4> StoreInfos;
2980
2981 // If the new alloca is a fixed vector type, we use its element type as the
2982 // allocated element type, otherwise we use i8 as the allocated element type
2983 Type *AllocatedEltTy =
2984 isa<FixedVectorType>(NewAI.getAllocatedType())
2985 ? cast<FixedVectorType>(NewAI.getAllocatedType())->getElementType()
2986 : Type::getInt8Ty(NewAI.getContext());
2987 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);
2988
2989 // Helper to check if a type is
2990 // 1. A fixed vector type
2991 // 2. The element type is not a pointer
2992 // 3. The element type size is byte-aligned
2993 // We only handle loads/stores that meet these conditions
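// Editorial illustration (not part of the pass itself): under these rules a
// <4 x float> or <2 x i64> access qualifies, while <4 x ptr> (pointer
// elements), <8 x i1> (1-bit elements, not byte-sized) and any scalable
// vector (not a FixedVectorType) are rejected.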
2994 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2995 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
2996 return FixedVecTy &&
2997 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
2998 !FixedVecTy->getElementType()->isPointerTy();
2999 };
3000
3001 for (Slice &S : P) {
3002 auto *User = cast<Instruction>(S.getUse()->getUser());
3003 if (auto *LI = dyn_cast<LoadInst>(User)) {
3004 // Do not handle the case if
3005 // 1. There is more than one load
3006 // 2. The load is volatile
3007 // 3. The load does not read the entire alloca structure
3008 // 4. The load does not meet the conditions in the helper function
3009 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
3010 S.beginOffset() != NewAllocaBeginOffset ||
3011 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
3012 return std::nullopt;
3013 TheLoad = LI;
3014 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
3015 // Do not handle the case if
3016 // 1. The store does not meet the conditions in the helper function
3017 // 2. The store is volatile
3018 // 3. The total store size is not a multiple of the allocated element
3019 // type size
3020 if (!IsTypeValidForTreeStructuredMerge(
3021 SI->getValueOperand()->getType()) ||
3022 SI->isVolatile())
3023 return std::nullopt;
3024 auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
3025 unsigned NumElts = VecTy->getNumElements();
3026 unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
3027 if (NumElts * EltSize % AllocatedEltTySize != 0)
3028 return std::nullopt;
3029 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
3030 SI->getValueOperand());
3031 } else {
3032 // If we have instructions other than load and store, we cannot do the
3033 // tree structured merge
3034 return std::nullopt;
3035 }
3036 }
3037 // If we do not have any load, we cannot do the tree structured merge
3038 if (!TheLoad)
3039 return std::nullopt;
3040
3041 // If we do not have multiple stores, we cannot do the tree structured merge
3042 if (StoreInfos.size() < 2)
3043 return std::nullopt;
3044
3045 // Stores should not overlap and should cover the whole alloca
3046 // Sort by begin offset
3047 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
3048 return A.BeginOffset < B.BeginOffset;
3049 });
3050
3051 // Check for overlaps and coverage
3052 uint64_t ExpectedStart = NewAllocaBeginOffset;
3053 for (auto &StoreInfo : StoreInfos) {
3054 uint64_t BeginOff = StoreInfo.BeginOffset;
3055 uint64_t EndOff = StoreInfo.EndOffset;
3056
3057 // Check for gap or overlap
3058 if (BeginOff != ExpectedStart)
3059 return std::nullopt;
3060
3061 ExpectedStart = EndOff;
3062 }
3063 // Check that stores cover the entire alloca
3064 if (ExpectedStart != NewAllocaEndOffset)
3065 return std::nullopt;
3066
3067 // Stores should be in the same basic block
3068 // The load should not be in the middle of the stores
3069 // Note:
3070 // If the load is in a different basic block from the stores, we can still
3071 // do the tree structured merge. This is because we do not have the
3072 // store->load forwarding here. The merged vector will be stored back to
3073 // NewAI and the new load will load from NewAI. The forwarding will be
3074 // handled later when we try to promote NewAI.
3075 BasicBlock *LoadBB = TheLoad->getParent();
3076 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
3077
3078 for (auto &StoreInfo : StoreInfos) {
3079 if (StoreInfo.Store->getParent() != StoreBB)
3080 return std::nullopt;
3081 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
3082 return std::nullopt;
3083 }
3084
3085 // If we reach here, the partition can be merged with a tree structured
3086 // merge
3087 LLVM_DEBUG({
3088 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
3089 << "\n Ordered stores:\n";
3090 for (auto [i, Info] : enumerate(StoreInfos))
3091 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
3092 << Info.EndOffset << ") \tStore: " << *Info.Store
3093 << "\tValue: " << *Info.StoredValue << "\n";
3094 });
3095
3096 // Instead of having these stores, we merge all the stored values into a
3097 // vector and store the merged value into the alloca
3098 std::queue<Value *> VecElements;
3099 IRBuilder<> Builder(StoreInfos.back().Store);
3100 for (const auto &Info : StoreInfos) {
3101 DeletedValues.push_back(Info.Store);
3102 VecElements.push(Info.StoredValue);
3103 }
3104
3105 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
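// Editorial sketch of how the queue below evolves, assuming five stored
// values v0..v4: each pass merges floor(N/2) adjacent pairs and, when N is
// odd, rotates the leftover element to the back of the queue:
//   [v0 v1 v2 v3 v4] -> [m01 m23 v4]
//   [m01 m23 v4]     -> [m0123 v4]
//   [m0123 v4]       -> [m0123v4]   (single merged vector remains)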
3106 while (VecElements.size() > 1) {
3107 const auto NumElts = VecElements.size();
3108 for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
3109 Value *V0 = VecElements.front();
3110 VecElements.pop();
3111 Value *V1 = VecElements.front();
3112 VecElements.pop();
3113 Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
3114 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3115 VecElements.push(Merged);
3116 }
3117 if (NumElts % 2 == 1) {
3118 Value *V = VecElements.front();
3119 VecElements.pop();
3120 VecElements.push(V);
3121 }
3122 }
3123
3124 // Store the merged value into the alloca
3125 Value *MergedValue = VecElements.front();
3126 Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
3127
3128 IRBuilder<> LoadBuilder(TheLoad);
3129 TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
3130 TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
3131 TheLoad->getName() + ".sroa.new.load"));
3132 DeletedValues.push_back(TheLoad);
3133
3134 return DeletedValues;
3135 }
3136
3137private:
3138 // Make sure the other visit overloads are visible.
3139 using Base::visit;
3140
3141 // Every instruction which can end up as a user must have a rewrite rule.
3142 bool visitInstruction(Instruction &I) {
3143 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3144 llvm_unreachable("No rewrite rule for this instruction!");
3145 }
3146
3147 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3148 // Note that the offset computation can use BeginOffset or NewBeginOffset
3149 // interchangeably for unsplit slices.
3150 assert(IsSplit || BeginOffset == NewBeginOffset);
3151 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3152
3153#ifndef NDEBUG
3154 StringRef OldName = OldPtr->getName();
3155 // Skip through the last '.sroa.' component of the name.
3156 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3157 if (LastSROAPrefix != StringRef::npos) {
3158 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3159 // Look for an SROA slice index.
3160 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3161 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3162 // Strip the index and look for the offset.
3163 OldName = OldName.substr(IndexEnd + 1);
3164 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3165 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3166 // Strip the offset.
3167 OldName = OldName.substr(OffsetEnd + 1);
3168 }
3169 }
3170 // Strip any SROA suffixes as well.
3171 OldName = OldName.substr(0, OldName.find(".sroa_"));
3172#endif
3173
3174 return getAdjustedPtr(IRB, DL, &NewAI,
3175 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3176 PointerTy,
3177#ifndef NDEBUG
3178 Twine(OldName) + "."
3179#else
3180 Twine()
3181#endif
3182 );
3183 }
3184
3185 /// Compute suitable alignment to access this slice of the *new*
3186 /// alloca.
3187 ///
3188 /// The result is the common alignment of the new alloca's alignment and the
3189 /// slice's offset within it.
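/// For instance (an editorial example, not normative), if the new alloca is
/// 16-byte aligned and this slice begins 4 bytes into it, the resulting
/// alignment is 4; a slice beginning 32 bytes in keeps the full 16-byte
/// alignment.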
3190 Align getSliceAlign() {
3191 return commonAlignment(NewAI.getAlign(),
3192 NewBeginOffset - NewAllocaBeginOffset);
3193 }
3194
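// Editorial note: getIndex translates a byte offset into a vector element
// index. For example, with ElementSize == 4 (a <N x float> slice) and
// NewAllocaBeginOffset == 0, an Offset of 8 yields index 2; offsets that are
// not element-aligned trip the assertion below.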
3195 unsigned getIndex(uint64_t Offset) {
3196 assert(VecTy && "Can only call getIndex when rewriting a vector");
3197 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3198 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3199 uint32_t Index = RelOffset / ElementSize;
3200 assert(Index * ElementSize == RelOffset);
3201 return Index;
3202 }
3203
3204 void deleteIfTriviallyDead(Value *V) {
3205 Instruction *I = cast<Instruction>(V);
3206 if (isInstructionTriviallyDead(I))
3207 Pass.DeadInsts.push_back(I);
3208 }
3209
3210 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3211 unsigned BeginIndex = getIndex(NewBeginOffset);
3212 unsigned EndIndex = getIndex(NewEndOffset);
3213 assert(EndIndex > BeginIndex && "Empty vector!");
3214
3215 LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3216 NewAI.getAlign(), "load");
3217
3218 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3219 LLVMContext::MD_access_group});
3220 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3221 }
3222
3223 Value *rewriteIntegerLoad(LoadInst &LI) {
3224 assert(IntTy && "We cannot insert an integer to the alloca");
3225 assert(!LI.isVolatile());
3226 Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3227 NewAI.getAlign(), "load");
3228 V = convertValue(DL, IRB, V, IntTy);
3229 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3230 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3231 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3232 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3233 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3234 }
3235 // It is possible that the extracted type is not the load type. This
3236 // happens if there is a load past the end of the alloca, and as
3237 // a consequence the slice is narrower but still a candidate for integer
3238 // lowering. To handle this case, we just zero extend the extracted
3239 // integer.
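// For example (illustrative only): if the slice is 2 bytes wide but the
// original load was an i32, we extract an i16 above and zero-extend it to
// i32 below.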
3240 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3241 "Can only handle an extract for an overly wide load");
3242 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3243 V = IRB.CreateZExt(V, LI.getType());
3244 return V;
3245 }
3246
3247 bool visitLoadInst(LoadInst &LI) {
3248 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3249 Value *OldOp = LI.getOperand(0);
3250 assert(OldOp == OldPtr);
3251
3252 AAMDNodes AATags = LI.getAAMetadata();
3253
3254 unsigned AS = LI.getPointerAddressSpace();
3255
3256 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3257 : LI.getType();
3258 bool IsPtrAdjusted = false;
3259 Value *V;
3260 if (VecTy) {
3261 V = rewriteVectorizedLoadInst(LI);
3262 } else if (IntTy && LI.getType()->isIntegerTy()) {
3263 V = rewriteIntegerLoad(LI);
3264 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3265 NewEndOffset == NewAllocaEndOffset &&
3266 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3267 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3268 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3269 !LI.isVolatile()))) {
3270 Value *NewPtr =
3271 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3272 LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
3273 NewAI.getAlign(), LI.isVolatile(),
3274 LI.getName());
3275 if (LI.isVolatile())
3276 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3277 if (NewLI->isAtomic())
3278 NewLI->setAlignment(LI.getAlign());
3279
3280 // Copy any metadata that is valid for the new load. This may require
3281 // conversion to a different kind of metadata, e.g. !nonnull might change
3282 // to !range or vice versa.
3283 copyMetadataForLoad(*NewLI, LI);
3284
3285 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3286 if (AATags)
3287 NewLI->setAAMetadata(AATags.adjustForAccess(
3288 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3289
3290 // Try to preserve nonnull metadata
3291 V = NewLI;
3292
3293 // If this is an integer load past the end of the slice (which means the
3294 // bytes outside the slice are undef or this load is dead) just forcibly
3295 // fix the integer size with correct handling of endianness.
3296 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3297 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3298 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3299 V = IRB.CreateZExt(V, TITy, "load.ext");
3300 if (DL.isBigEndian())
3301 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3302 "endian_shift");
3303 }
3304 } else {
3305 Type *LTy = IRB.getPtrTy(AS);
3306 LoadInst *NewLI =
3307 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3308 getSliceAlign(), LI.isVolatile(), LI.getName());
3309
3310 if (AATags)
3311 NewLI->setAAMetadata(AATags.adjustForAccess(
3312 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3313
3314 if (LI.isVolatile())
3315 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3316 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3317 LLVMContext::MD_access_group});
3318
3319 V = NewLI;
3320 IsPtrAdjusted = true;
3321 }
3322 V = convertValue(DL, IRB, V, TargetTy);
3323
3324 if (IsSplit) {
3325 assert(!LI.isVolatile());
3326 assert(LI.getType()->isIntegerTy() &&
3327 "Only integer type loads and stores are split");
3328 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3329 "Split load isn't smaller than original load");
3330 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3331 "Non-byte-multiple bit width");
3332 // Move the insertion point just past the load so that we can refer to it.
3333 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3334 // Ensure the insertion point comes before any debug-info immediately
3335 // after the load, so that variable values referring to the load are
3336 // dominated by it.
3337 LIIt.setHeadBit(true);
3338 IRB.SetInsertPoint(LI.getParent(), LIIt);
3339 // Create a placeholder value with the same type as LI to use as the
3340 // basis for the new value. This allows us to replace the uses of LI with
3341 // the computed value, and then replace the placeholder with LI, leaving
3342 // LI only used for this computation.
3343 Value *Placeholder =
3344 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3345 false, Align(1));
3346 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3347 "insert");
3348 LI.replaceAllUsesWith(V);
3349 Placeholder->replaceAllUsesWith(&LI);
3350 Placeholder->deleteValue();
3351 } else {
3352 LI.replaceAllUsesWith(V);
3353 }
3354
3355 Pass.DeadInsts.push_back(&LI);
3356 deleteIfTriviallyDead(OldOp);
3357 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3358 return !LI.isVolatile() && !IsPtrAdjusted;
3359 }
3360
3361 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3362 AAMDNodes AATags) {
3363 // Capture V for the purpose of debug-info accounting once it's converted
3364 // to a vector store.
3365 Value *OrigV = V;
3366 if (V->getType() != VecTy) {
3367 unsigned BeginIndex = getIndex(NewBeginOffset);
3368 unsigned EndIndex = getIndex(NewEndOffset);
3369 assert(EndIndex > BeginIndex && "Empty vector!");
3370 unsigned NumElements = EndIndex - BeginIndex;
3371 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3372 "Too many elements!");
3373 Type *SliceTy = (NumElements == 1)
3374 ? ElementTy
3375 : FixedVectorType::get(ElementTy, NumElements);
3376 if (V->getType() != SliceTy)
3377 V = convertValue(DL, IRB, V, SliceTy);
3378
3379 // Mix in the existing elements.
3380 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3381 NewAI.getAlign(), "load");
3382 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3383 }
3384 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3385 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3386 LLVMContext::MD_access_group});
3387 if (AATags)
3388 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3389 V->getType(), DL));
3390 Pass.DeadInsts.push_back(&SI);
3391
3392 // NOTE: Careful to use OrigV rather than V.
3393 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3394 Store, Store->getPointerOperand(), OrigV, DL);
3395 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3396 return true;
3397 }
3398
3399 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3400 assert(IntTy && "We cannot extract an integer from the alloca");
3401 assert(!SI.isVolatile());
3402 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3403 IntTy->getBitWidth()) {
3404 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3405 NewAI.getAlign(), "oldload");
3406 Old = convertValue(DL, IRB, Old, IntTy);
3407 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3408 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3409 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3410 }
3411 V = convertValue(DL, IRB, V, NewAllocaTy);
3412 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3413 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3414 LLVMContext::MD_access_group});
3415 if (AATags)
3416 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3417 V->getType(), DL));
3418
3419 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3420 Store, Store->getPointerOperand(),
3421 Store->getValueOperand(), DL);
3422
3423 Pass.DeadInsts.push_back(&SI);
3424 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3425 return true;
3426 }
3427
3428 bool visitStoreInst(StoreInst &SI) {
3429 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3430 Value *OldOp = SI.getOperand(1);
3431 assert(OldOp == OldPtr);
3432
3433 AAMDNodes AATags = SI.getAAMetadata();
3434 Value *V = SI.getValueOperand();
3435
3436 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3437 // alloca that should be re-examined after promoting this alloca.
3438 if (V->getType()->isPointerTy())
3439 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3440 Pass.PostPromotionWorklist.insert(AI);
3441
3442 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3443 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3444 assert(!SI.isVolatile());
3445 assert(V->getType()->isIntegerTy() &&
3446 "Only integer type loads and stores are split");
3447 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3448 "Non-byte-multiple bit width");
3449 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3450 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3451 "extract");
3452 }
3453
3454 if (VecTy)
3455 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3456 if (IntTy && V->getType()->isIntegerTy())
3457 return rewriteIntegerStore(V, SI, AATags);
3458
3459 StoreInst *NewSI;
3460 if (NewBeginOffset == NewAllocaBeginOffset &&
3461 NewEndOffset == NewAllocaEndOffset &&
3462 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3463 V = convertValue(DL, IRB, V, NewAllocaTy);
3464 Value *NewPtr =
3465 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3466
3467 NewSI =
3468 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3469 } else {
3470 unsigned AS = SI.getPointerAddressSpace();
3471 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3472 NewSI =
3473 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3474 }
3475 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3476 LLVMContext::MD_access_group});
3477 if (AATags)
3478 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3479 V->getType(), DL));
3480 if (SI.isVolatile())
3481 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3482 if (NewSI->isAtomic())
3483 NewSI->setAlignment(SI.getAlign());
3484
3485 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3486 NewSI, NewSI->getPointerOperand(),
3487 NewSI->getValueOperand(), DL);
3488
3489 Pass.DeadInsts.push_back(&SI);
3490 deleteIfTriviallyDead(OldOp);
3491
3492 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3493 return NewSI->getPointerOperand() == &NewAI &&
3494 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3495 !SI.isVolatile();
3496 }
3497
3498 /// Compute an integer value from splatting an i8 across the given
3499 /// number of bytes.
3500 ///
3501 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3502 /// call this routine.
3503 /// FIXME: Heed the advice above.
3504 ///
3505 /// \param V The i8 value to splat.
3506 /// \param Size The number of bytes in the output (assuming i8 is one byte)
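/// For example (an editorial illustration), splatting V == 0xAB to Size == 4
/// zero-extends V to i32 and multiplies it by 0xFFFFFFFF / 0xFF == 0x01010101,
/// producing 0xABABABAB.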
3507 Value *getIntegerSplat(Value *V, unsigned Size) {
3508 assert(Size > 0 && "Expected a positive number of bytes.");
3509 IntegerType *VTy = cast<IntegerType>(V->getType());
3510 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3511 if (Size == 1)
3512 return V;
3513
3514 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3515 V = IRB.CreateMul(
3516 IRB.CreateZExt(V, SplatIntTy, "zext"),
3517 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3518 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3519 SplatIntTy)),
3520 "isplat");
3521 return V;
3522 }
3523
3524 /// Compute a vector splat for a given element value.
3525 Value *getVectorSplat(Value *V, unsigned NumElements) {
3526 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3527 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3528 return V;
3529 }
3530
3531 bool visitMemSetInst(MemSetInst &II) {
3532 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3533 assert(II.getRawDest() == OldPtr);
3534
3535 AAMDNodes AATags = II.getAAMetadata();
3536
3537 // If the memset has a variable size, it cannot be split, just adjust the
3538 // pointer to the new alloca.
3539 if (!isa<ConstantInt>(II.getLength())) {
3540 assert(!IsSplit);
3541 assert(NewBeginOffset == BeginOffset);
3542 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3543 II.setDestAlignment(getSliceAlign());
3544 // In theory we should call migrateDebugInfo here. However, we do not
3545 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3546 // constant geps, or storing a variable number of bytes.
3548 "AT: Unexpected link to non-const GEP");
3549 deleteIfTriviallyDead(OldPtr);
3550 return false;
3551 }
3552
3553 // Record this instruction for deletion.
3554 Pass.DeadInsts.push_back(&II);
3555
3556 Type *AllocaTy = NewAI.getAllocatedType();
3557 Type *ScalarTy = AllocaTy->getScalarType();
3558
3559 const bool CanContinue = [&]() {
3560 if (VecTy || IntTy)
3561 return true;
3562 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3563 return false;
3564 // Length must be in range for FixedVectorType.
3565 auto *C = cast<ConstantInt>(II.getLength());
3566 const uint64_t Len = C->getLimitedValue();
3567 if (Len > std::numeric_limits<unsigned>::max())
3568 return false;
3569 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3570 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3571 return canConvertValue(DL, SrcTy, AllocaTy) &&
3572 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3573 }();
3574
3575 // If this doesn't map cleanly onto the alloca type, and that type isn't
3576 // a single value type, just emit a memset.
3577 if (!CanContinue) {
3578 Type *SizeTy = II.getLength()->getType();
3579 unsigned Sz = NewEndOffset - NewBeginOffset;
3580 Constant *Size = ConstantInt::get(SizeTy, Sz);
3581 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3582 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3583 MaybeAlign(getSliceAlign()), II.isVolatile()));
3584 if (AATags)
3585 New->setAAMetadata(
3586 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3587
3588 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3589 New, New->getRawDest(), nullptr, DL);
3590
3591 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3592 return false;
3593 }
3594
3595 // If we can represent this as a simple value, we have to build the actual
3596 // value to store, which requires expanding the byte present in memset to
3597 // a sensible representation for the alloca type. This is essentially
3598 // splatting the byte to a sufficiently wide integer, splatting it across
3599 // any desired vector width, and bitcasting to the final type.
3600 Value *V;
3601
3602 if (VecTy) {
3603 // If this is a memset of a vectorized alloca, insert it.
3604 assert(ElementTy == ScalarTy);
3605
3606 unsigned BeginIndex = getIndex(NewBeginOffset);
3607 unsigned EndIndex = getIndex(NewEndOffset);
3608 assert(EndIndex > BeginIndex && "Empty vector!");
3609 unsigned NumElements = EndIndex - BeginIndex;
3610 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3611 "Too many elements!");
3612
3613 Value *Splat = getIntegerSplat(
3614 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3615 Splat = convertValue(DL, IRB, Splat, ElementTy);
3616 if (NumElements > 1)
3617 Splat = getVectorSplat(Splat, NumElements);
3618
3619 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3620 NewAI.getAlign(), "oldload");
3621 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3622 } else if (IntTy) {
3623 // If this is a memset on an alloca where we can widen stores, insert the
3624 // set integer.
3625 assert(!II.isVolatile());
3626
3627 uint64_t Size = NewEndOffset - NewBeginOffset;
3628 V = getIntegerSplat(II.getValue(), Size);
3629
3630 if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3631 EndOffset != NewAllocaEndOffset)) {
3632 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3633 NewAI.getAlign(), "oldload");
3634 Old = convertValue(DL, IRB, Old, IntTy);
3635 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3636 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3637 } else {
3638 assert(V->getType() == IntTy &&
3639 "Wrong type for an alloca wide integer!");
3640 }
3641 V = convertValue(DL, IRB, V, AllocaTy);
3642 } else {
3643 // Established these invariants above.
3644 assert(NewBeginOffset == NewAllocaBeginOffset);
3645 assert(NewEndOffset == NewAllocaEndOffset);
3646
3647 V = getIntegerSplat(II.getValue(),
3648 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3649 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
3650 V = getVectorSplat(
3651 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3652
3653 V = convertValue(DL, IRB, V, AllocaTy);
3654 }
3655
3656 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3657 StoreInst *New =
3658 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3659 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3660 LLVMContext::MD_access_group});
3661 if (AATags)
3662 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3663 V->getType(), DL));
3664
3665 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3666 New, New->getPointerOperand(), V, DL);
3667
3668 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3669 return !II.isVolatile();
3670 }
3671
3672 bool visitMemTransferInst(MemTransferInst &II) {
3673 // Rewriting of memory transfer instructions can be a bit tricky. We break
3674 // them into two categories: split intrinsics and unsplit intrinsics.
3675
3676 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3677
3678 AAMDNodes AATags = II.getAAMetadata();
3679
3680 bool IsDest = &II.getRawDestUse() == OldUse;
3681 assert((IsDest && II.getRawDest() == OldPtr) ||
3682 (!IsDest && II.getRawSource() == OldPtr));
3683
3684 Align SliceAlign = getSliceAlign();
3685 // For unsplit intrinsics, we simply modify the source and destination
3686 // pointers in place. This isn't just an optimization, it is a matter of
3687 // correctness. With unsplit intrinsics we may be dealing with transfers
3688 // within a single alloca before SROA ran, or with transfers that have
3689 // a variable length. We may also be dealing with memmove instead of
3690 // memcpy, and so simply updating the pointers is necessary for us to
3691 // update both source and dest of a single call.
3692 if (!IsSplittable) {
3693 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3694 if (IsDest) {
3695 // Update the address component of linked dbg.assigns.
3696 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3697 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3698 DbgAssign->getAddress() == II.getDest())
3699 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3700 }
3701 II.setDest(AdjustedPtr);
3702 II.setDestAlignment(SliceAlign);
3703 } else {
3704 II.setSource(AdjustedPtr);
3705 II.setSourceAlignment(SliceAlign);
3706 }
3707
3708 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3709 deleteIfTriviallyDead(OldPtr);
3710 return false;
3711 }
3712 // For split transfer intrinsics we have an incredibly useful assurance:
3713 // the source and destination do not reside within the same alloca, and at
3714 // least one of them does not escape. This means that we can replace
3715 // memmove with memcpy, and we don't need to worry about all manner of
3716 // downsides to splitting and transforming the operations.
3717
3718 // If this doesn't map cleanly onto the alloca type, and that type isn't
3719 // a single value type, just emit a memcpy.
3720 bool EmitMemCpy =
3721 !VecTy && !IntTy &&
3722 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3723 SliceSize !=
3724 DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
3725 !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
3726 !NewAI.getAllocatedType()->isSingleValueType());
3727
3728 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3729 // size hasn't been shrunk based on analysis of the viable range, this is
3730 // a no-op.
3731 if (EmitMemCpy && &OldAI == &NewAI) {
3732 // Ensure the start lines up.
3733 assert(NewBeginOffset == BeginOffset);
3734
3735 // Rewrite the size as needed.
3736 if (NewEndOffset != EndOffset)
3737 II.setLength(NewEndOffset - NewBeginOffset);
3738 return false;
3739 }
3740 // Record this instruction for deletion.
3741 Pass.DeadInsts.push_back(&II);
3742
3743 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3744 // alloca that should be re-examined after rewriting this instruction.
3745 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3746 if (AllocaInst *AI =
3747 dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3748 assert(AI != &OldAI && AI != &NewAI &&
3749 "Splittable transfers cannot reach the same alloca on both ends.");
3750 Pass.Worklist.insert(AI);
3751 }
3752
3753 Type *OtherPtrTy = OtherPtr->getType();
3754 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3755
3756 // Compute the relative offset for the other pointer within the transfer.
3757 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3758 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3759 Align OtherAlign =
3760 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3761 OtherAlign =
3762 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3763
3764 if (EmitMemCpy) {
3765 // Compute the other pointer, folding as much as possible to produce
3766 // a single, simple GEP in most cases.
3767 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3768 OtherPtr->getName() + ".");
3769
3770 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3771 Type *SizeTy = II.getLength()->getType();
3772 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3773
3774 Value *DestPtr, *SrcPtr;
3775 MaybeAlign DestAlign, SrcAlign;
3776 // Note: IsDest is true iff we're copying into the new alloca slice
3777 if (IsDest) {
3778 DestPtr = OurPtr;
3779 DestAlign = SliceAlign;
3780 SrcPtr = OtherPtr;
3781 SrcAlign = OtherAlign;
3782 } else {
3783 DestPtr = OtherPtr;
3784 DestAlign = OtherAlign;
3785 SrcPtr = OurPtr;
3786 SrcAlign = SliceAlign;
3787 }
3788 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3789 Size, II.isVolatile());
3790 if (AATags)
3791 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3792
3793 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3794 if (IsDest) {
3795 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3796 &II, New, DestPtr, nullptr, DL);
3797 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3798 DestPtr->stripAndAccumulateConstantOffsets(
3799 DL, Offset, /*AllowNonInbounds*/ true))) {
3800 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3801 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3802 }
3803 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3804 return false;
3805 }
3806
3807 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3808 NewEndOffset == NewAllocaEndOffset;
3809 uint64_t Size = NewEndOffset - NewBeginOffset;
3810 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3811 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3812 unsigned NumElements = EndIndex - BeginIndex;
3813 IntegerType *SubIntTy =
3814 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3815
3816 // Reset the other pointer type to match the register type we're going to
3817 // use, but using the address space of the original other pointer.
3818 Type *OtherTy;
3819 if (VecTy && !IsWholeAlloca) {
3820 if (NumElements == 1)
3821 OtherTy = VecTy->getElementType();
3822 else
3823 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3824 } else if (IntTy && !IsWholeAlloca) {
3825 OtherTy = SubIntTy;
3826 } else {
3827 OtherTy = NewAllocaTy;
3828 }
3829
3830 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3831 OtherPtr->getName() + ".");
3832 MaybeAlign SrcAlign = OtherAlign;
3833 MaybeAlign DstAlign = SliceAlign;
3834 if (!IsDest)
3835 std::swap(SrcAlign, DstAlign);
3836
3837 Value *SrcPtr;
3838 Value *DstPtr;
3839
3840 if (IsDest) {
3841 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3842 SrcPtr = AdjPtr;
3843 } else {
3844 DstPtr = AdjPtr;
3845 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3846 }
3847
3848 Value *Src;
3849 if (VecTy && !IsWholeAlloca && !IsDest) {
3850 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3851 NewAI.getAlign(), "load");
3852 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3853 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3854 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3855 NewAI.getAlign(), "load");
3856 Src = convertValue(DL, IRB, Src, IntTy);
3857 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3858 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3859 } else {
3860 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3861 II.isVolatile(), "copyload");
3862 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3863 LLVMContext::MD_access_group});
3864 if (AATags)
3865 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3866 Load->getType(), DL));
3867 Src = Load;
3868 }
3869
3870 if (VecTy && !IsWholeAlloca && IsDest) {
3871 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3872 NewAI.getAlign(), "oldload");
3873 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3874 } else if (IntTy && !IsWholeAlloca && IsDest) {
3875 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3876 NewAI.getAlign(), "oldload");
3877 Old = convertValue(DL, IRB, Old, IntTy);
3878 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3879 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3880 Src = convertValue(DL, IRB, Src, NewAllocaTy);
3881 }
3882
3883 StoreInst *Store = cast<StoreInst>(
3884 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3885 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3886 LLVMContext::MD_access_group});
3887 if (AATags)
3888 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3889 Src->getType(), DL));
3890
3891 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3892 if (IsDest) {
3893
3894 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3895 Store, DstPtr, Src, DL);
3896 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3897 DstPtr->stripAndAccumulateConstantOffsets(
3898 DL, Offset, /*AllowNonInbounds*/ true))) {
3899 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3900 &II, Store, DstPtr, Src, DL);
3901 }
3902
3903 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3904 return !II.isVolatile();
3905 }
3906
3907 bool visitIntrinsicInst(IntrinsicInst &II) {
3908 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3909 "Unexpected intrinsic!");
3910 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3911
3912 // Record this instruction for deletion.
3913 Pass.DeadInsts.push_back(&II);
3914
3915 if (II.isDroppable()) {
3916 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3917 // TODO For now we forget assumed information, this can be improved.
3918 OldPtr->dropDroppableUsesIn(II);
3919 return true;
3920 }
3921
3922 assert(II.getArgOperand(0) == OldPtr);
3923 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3924 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3925 Value *New;
3926 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3927 New = IRB.CreateLifetimeStart(Ptr);
3928 else
3929 New = IRB.CreateLifetimeEnd(Ptr);
3930
3931 (void)New;
3932 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3933
3934 return true;
3935 }
3936
3937 void fixLoadStoreAlign(Instruction &Root) {
3938 // This algorithm implements the same visitor loop as
3939 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3940 // or store found.
3941 SmallPtrSet<Instruction *, 4> Visited;
3942 SmallVector<Instruction *, 4> Uses;
3943 Visited.insert(&Root);
3944 Uses.push_back(&Root);
3945 do {
3946 Instruction *I = Uses.pop_back_val();
3947
3948 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3949 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3950 continue;
3951 }
3952 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3953 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3954 continue;
3955 }
3956
3957 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3958 isa<PHINode>(I) || isa<SelectInst>(I) ||
3959 isa<GetElementPtrInst>(I));
3960 for (User *U : I->users())
3961 if (Visited.insert(cast<Instruction>(U)).second)
3962 Uses.push_back(cast<Instruction>(U));
3963 } while (!Uses.empty());
3964 }
3965
3966 bool visitPHINode(PHINode &PN) {
3967 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3968 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3969 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3970
3971 // We would like to compute a new pointer in only one place, but have it be
3972 // as local as possible to the PHI. To do that, we re-use the location of
3973 // the old pointer, which necessarily must be in the right position to
3974 // dominate the PHI.
3975 IRBuilderBase::InsertPointGuard Guard(IRB);
3976 if (isa<PHINode>(OldPtr))
3977 IRB.SetInsertPoint(OldPtr->getParent(),
3978 OldPtr->getParent()->getFirstInsertionPt());
3979 else
3980 IRB.SetInsertPoint(OldPtr);
3981 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3982
3983 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3984 // Replace the operands which were using the old pointer.
3985 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3986
3987 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3988 deleteIfTriviallyDead(OldPtr);
3989
3990 // Fix the alignment of any loads or stores using this PHI node.
3991 fixLoadStoreAlign(PN);
3992
3993 // PHIs can't be promoted on their own, but often can be speculated. We
3994 // check the speculation outside of the rewriter so that we see the
3995 // fully-rewritten alloca.
3996 PHIUsers.insert(&PN);
3997 return true;
3998 }
3999
4000 bool visitSelectInst(SelectInst &SI) {
4001 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4002 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
4003 "Pointer isn't an operand!");
4004 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
4005 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
4006
4007 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
4008 // Replace the operands which were using the old pointer.
4009 if (SI.getOperand(1) == OldPtr)
4010 SI.setOperand(1, NewPtr);
4011 if (SI.getOperand(2) == OldPtr)
4012 SI.setOperand(2, NewPtr);
4013
4014 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
4015 deleteIfTriviallyDead(OldPtr);
4016
4017 // Fix the alignment of any loads or stores using this select.
4018 fixLoadStoreAlign(SI);
4019
4020 // Selects can't be promoted on their own, but often can be speculated. We
4021 // check the speculation outside of the rewriter so that we see the
4022 // fully-rewritten alloca.
4023 SelectUsers.insert(&SI);
4024 return true;
4025 }
4026};
4027
4028/// Visitor to rewrite aggregate loads and stores as scalar.
4029///
4030/// This pass aggressively rewrites all aggregate loads and stores on
4031/// a particular pointer (or any pointer derived from it which we can identify)
4032/// with scalar loads and stores.
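/// For example (an editorial sketch, not normative output), a load such as
///   %agg = load { i32, float }, ptr %p
/// is rewritten roughly into
///   %gep0 = getelementptr inbounds { i32, float }, ptr %p, i32 0, i32 0
///   %elt0 = load i32, ptr %gep0
///   %gep1 = getelementptr inbounds { i32, float }, ptr %p, i32 0, i32 1
///   %elt1 = load float, ptr %gep1
///   %ins0 = insertvalue { i32, float } poison, i32 %elt0, 0
///   %ins1 = insertvalue { i32, float } %ins0, float %elt1, 1
/// with %ins1 replacing all uses of %agg (value names are hypothetical).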
4033class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
4034 // Befriend the base class so it can delegate to private visit methods.
4035 friend class InstVisitor<AggLoadStoreRewriter, bool>;
4036
4037 /// Queue of pointer uses to analyze and potentially rewrite.
4038 SmallVector<Use *, 8> Queue;
4039
4040 /// Set to prevent us from cycling with phi nodes and loops.
4041 SmallPtrSet<User *, 8> Visited;
4042
4043 /// The current pointer use being rewritten. This is used to dig up the used
4044 /// value (as opposed to the user).
4045 Use *U = nullptr;
4046
4047 /// Used to calculate offsets, and hence alignment, of subobjects.
4048 const DataLayout &DL;
4049
4050 IRBuilderTy &IRB;
4051
4052public:
4053 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
4054 : DL(DL), IRB(IRB) {}
4055
4056 /// Rewrite loads and stores through a pointer and all pointers derived from
4057 /// it.
4058 bool rewrite(Instruction &I) {
4059 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
4060 enqueueUsers(I);
4061 bool Changed = false;
4062 while (!Queue.empty()) {
4063 U = Queue.pop_back_val();
4064 Changed |= visit(cast<Instruction>(U->getUser()));
4065 }
4066 return Changed;
4067 }
4068
4069private:
4070 /// Enqueue all the users of the given instruction for further processing.
4071 /// This uses a set to de-duplicate users.
4072 void enqueueUsers(Instruction &I) {
4073 for (Use &U : I.uses())
4074 if (Visited.insert(U.getUser()).second)
4075 Queue.push_back(&U);
4076 }
4077
4078 // Conservative default is to not rewrite anything.
4079 bool visitInstruction(Instruction &I) { return false; }
4080
4081 /// Generic recursive split emission class.
4082 template <typename Derived> class OpSplitter {
4083 protected:
4084 /// The builder used to form new instructions.
4085 IRBuilderTy &IRB;
4086
4087 /// The indices which to be used with insert- or extractvalue to select the
4088 /// appropriate value within the aggregate.
4089 SmallVector<unsigned, 4> Indices;
4090
4091 /// The indices to a GEP instruction which will move Ptr to the correct slot
4092 /// within the aggregate.
4093 SmallVector<Value *, 4> GEPIndices;
4094
4095 /// The base pointer of the original op, used as a base for GEPing the
4096 /// split operations.
4097 Value *Ptr;
4098
4099 /// The base pointee type being GEPed into.
4100 Type *BaseTy;
4101
4102 /// Known alignment of the base pointer.
4103 Align BaseAlign;
4104
4105 /// To calculate offset of each component so we can correctly deduce
4106 /// alignments.
4107 const DataLayout &DL;
4108
4109 /// Initialize the splitter with an insertion point, Ptr and start with a
4110 /// single zero GEP index.
4111 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4112 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4113 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4114 BaseAlign(BaseAlign), DL(DL) {
4115 IRB.SetInsertPoint(InsertionPoint);
4116 }
4117
4118 public:
4119 /// Generic recursive split emission routine.
4120 ///
4121 /// This method recursively splits an aggregate op (load or store) into
4122 /// scalar or vector ops. It splits recursively until it hits a single value
4123 /// and emits that single value operation via the template argument.
4124 ///
4125 /// The logic of this routine relies on GEPs and insertvalue and
4126 /// extractvalue all operating with the same fundamental index list, merely
4127 /// formatted differently (GEPs need actual values).
4128 ///
4129 /// \param Ty The type being split recursively into smaller ops.
4130 /// \param Agg The aggregate value being built up or stored, depending on
4131 /// whether this is splitting a load or a store respectively.
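  /// For example (editorial illustration), when splitting an op on
  /// { i32, [2 x float] }, the leaf at Indices = {1, 0} is addressed with
  /// GEPIndices = {i32 0, i32 1, i32 0}: the leading zero dereferences the
  /// base pointer and the remaining indices mirror the insert/extractvalue
  /// index list.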
4132 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4133 if (Ty->isSingleValueType()) {
4134 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4135 return static_cast<Derived *>(this)->emitFunc(
4136 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4137 }
4138
4139 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4140 unsigned OldSize = Indices.size();
4141 (void)OldSize;
4142 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4143 ++Idx) {
4144 assert(Indices.size() == OldSize && "Did not return to the old size");
4145 Indices.push_back(Idx);
4146 GEPIndices.push_back(IRB.getInt32(Idx));
4147 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4148 GEPIndices.pop_back();
4149 Indices.pop_back();
4150 }
4151 return;
4152 }
4153
4154 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4155 unsigned OldSize = Indices.size();
4156 (void)OldSize;
4157 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4158 ++Idx) {
4159 assert(Indices.size() == OldSize && "Did not return to the old size");
4160 Indices.push_back(Idx);
4161 GEPIndices.push_back(IRB.getInt32(Idx));
4162 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4163 GEPIndices.pop_back();
4164 Indices.pop_back();
4165 }
4166 return;
4167 }
4168
4169 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4170 }
4171 };
4172
4173 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4174 AAMDNodes AATags;
4175 // A vector to hold the split components that we want to emit
4176 // separate fake uses for.
4177 SmallVector<Value *, 4> Components;
4178 // A vector to hold all the fake uses of the struct that we are splitting.
4179 // Usually there should only be one, but we are handling the general case.
4180 SmallVector<Instruction *, 4> FakeUses;
4181
4182 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4183 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4184 IRBuilderTy &IRB)
4185 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4186 IRB),
4187 AATags(AATags) {}
4188
4189 /// Emit a leaf load of a single value. This is called at the leaves of the
4190 /// recursive emission to actually load values.
4191 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4192 assert(Ty->isSingleValueType());
4193 // Load the single value and insert it using the indices.
4194 Value *GEP =
4195 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4196 LoadInst *Load =
4197 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4198
4199 APInt Offset(
4200 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4201 if (AATags &&
4202 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4203 Load->setAAMetadata(
4204 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4205 // Record the load so we can generate a fake use for this aggregate
4206 // component.
4207 Components.push_back(Load);
4208
4209 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4210 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4211 }
4212
4213 // Stash the fake uses that use the value generated by this instruction.
4214 void recordFakeUses(LoadInst &LI) {
4215 for (Use &U : LI.uses())
4216 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4217 if (II->getIntrinsicID() == Intrinsic::fake_use)
4218 FakeUses.push_back(II);
4219 }
4220
4221 // Replace all fake uses of the aggregate with a series of fake uses, one
4222 // for each split component.
4223 void emitFakeUses() {
4224 for (Instruction *I : FakeUses) {
4225 IRB.SetInsertPoint(I);
4226 for (auto *V : Components)
4227 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4228 I->eraseFromParent();
4229 }
4230 }
4231 };
4232
4233 bool visitLoadInst(LoadInst &LI) {
4234 assert(LI.getPointerOperand() == *U);
4235 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4236 return false;
4237
4238 // We have an aggregate being loaded, split it apart.
4239 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4240 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4241 getAdjustedAlignment(&LI, 0), DL, IRB);
4242 Splitter.recordFakeUses(LI);
4243 Value *V = PoisonValue::get(LI.getType());
4244 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4245 Splitter.emitFakeUses();
4246 Visited.erase(&LI);
4247 LI.replaceAllUsesWith(V);
4248 LI.eraseFromParent();
4249 return true;
4250 }
4251
4252 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4253 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4254 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4255 const DataLayout &DL, IRBuilderTy &IRB)
4256 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4257 DL, IRB),
4258 AATags(AATags), AggStore(AggStore) {}
4259 AAMDNodes AATags;
4260 StoreInst *AggStore;
4261 /// Emit a leaf store of a single value. This is called at the leaves of the
4262 /// recursive emission to actually produce stores.
4263 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4264 assert(Ty->isSingleValueType());
4265 // Extract the single value and store it using the indices.
4266 //
4267 // The gep and extractvalue values are factored out of the CreateStore
4268 // call to make the output independent of the argument evaluation order.
4269 Value *ExtractValue =
4270 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4271 Value *InBoundsGEP =
4272 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4273 StoreInst *Store =
4274 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4275
4276 APInt Offset(
4277 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4278 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4279 if (AATags) {
4280 Store->setAAMetadata(AATags.adjustForAccess(
4281 Offset.getZExtValue(), ExtractValue->getType(), DL));
4282 }
4283
4284 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4285 // If we cannot (because there's an intervening non-const or unbounded
4286 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4287 // this instruction.
4288 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4289 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4290 uint64_t SizeInBits =
4291 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4292 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4293 SizeInBits, AggStore, Store,
4294 Store->getPointerOperand(), Store->getValueOperand(),
4295 DL);
4296 } else {
4298 "AT: unexpected debug.assign linked to store through "
4299 "unbounded GEP");
4300 }
4301 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4302 }
4303 };
4304
4305 bool visitStoreInst(StoreInst &SI) {
4306 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4307 return false;
4308 Value *V = SI.getValueOperand();
4309 if (V->getType()->isSingleValueType())
4310 return false;
4311
4312 // We have an aggregate being stored, split it apart.
4313 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4314 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4315 getAdjustedAlignment(&SI, 0), DL, IRB);
4316 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4317 Visited.erase(&SI);
4318 // The stores replacing SI each have markers describing fragments of the
4319 // assignment so delete the assignment markers linked to SI.
4320 at::deleteAssignmentMarkers(&SI);
4321 SI.eraseFromParent();
4322 return true;
4323 }
4324
4325 bool visitBitCastInst(BitCastInst &BC) {
4326 enqueueUsers(BC);
4327 return false;
4328 }
4329
4330 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4331 enqueueUsers(ASC);
4332 return false;
4333 }
4334
4335 // Unfold gep (select cond, ptr1, ptr2), idx
4336 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4337 // and gep ptr, (select cond, idx1, idx2)
4338 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4339 // We also allow for i1 zext indices, which are equivalent to selects.
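  // For example (illustrative only):
  //   %sel = select i1 %c, i64 1, i64 2
  //   %gep = getelementptr inbounds [8 x i32], ptr %a, i64 0, i64 %sel
  // becomes
  //   %gep.t = getelementptr inbounds [8 x i32], ptr %a, i64 0, i64 1
  //   %gep.f = getelementptr inbounds [8 x i32], ptr %a, i64 0, i64 2
  //   %sel.new = select i1 %c, ptr %gep.t, ptr %gep.f
  // (value names here are hypothetical).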
4340 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4341 // Check whether the GEP has exactly one select operand and all indices
4342 // will become constant after the transform.
4343 Instruction *Sel = nullptr;
4344 for (Value *Op : GEPI.indices()) {
4345 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4346 if (Sel)
4347 return false;
4348
4349 Sel = SI;
4350 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4351 !isa<ConstantInt>(SI->getFalseValue()))
4352 return false;
4353 continue;
4354 }
4355 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4356 if (Sel)
4357 return false;
4358 Sel = ZI;
4359 if (!ZI->getSrcTy()->isIntegerTy(1))
4360 return false;
4361 continue;
4362 }
4363
4364 if (!isa<ConstantInt>(Op))
4365 return false;
4366 }
4367
4368 if (!Sel)
4369 return false;
4370
4371 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4372 dbgs() << " original: " << *Sel << "\n";
4373 dbgs() << " " << GEPI << "\n";);
4374
4375 auto GetNewOps = [&](Value *SelOp) {
4376 SmallVector<Value *> NewOps;
4377 for (Value *Op : GEPI.operands())
4378 if (Op == Sel)
4379 NewOps.push_back(SelOp);
4380 else
4381 NewOps.push_back(Op);
4382 return NewOps;
4383 };
4384
4385 Value *Cond, *True, *False;
4386 Instruction *MDFrom = nullptr;
4387 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4388 Cond = SI->getCondition();
4389 True = SI->getTrueValue();
4390 False = SI->getFalseValue();
4391 if (!ProfcheckDisableMetadataFixes)
4392 MDFrom = SI;
4393 } else {
4394 Cond = Sel->getOperand(0);
4395 True = ConstantInt::get(Sel->getType(), 1);
4396 False = ConstantInt::get(Sel->getType(), 0);
4397 }
4398 SmallVector<Value *> TrueOps = GetNewOps(True);
4399 SmallVector<Value *> FalseOps = GetNewOps(False);
4400
4401 IRB.SetInsertPoint(&GEPI);
4402 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4403
4404 Type *Ty = GEPI.getSourceElementType();
4405 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4406 True->getName() + ".sroa.gep", NW);
4407
4408 Value *NFalse =
4409 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4410 False->getName() + ".sroa.gep", NW);
4411
4412 Value *NSel = MDFrom
4413 ? IRB.CreateSelect(Cond, NTrue, NFalse,
4414 Sel->getName() + ".sroa.sel", MDFrom)
4415 : IRB.CreateSelectWithUnknownProfile(
4416 Cond, NTrue, NFalse, DEBUG_TYPE,
4417 Sel->getName() + ".sroa.sel");
4418 Visited.erase(&GEPI);
4419 GEPI.replaceAllUsesWith(NSel);
4420 GEPI.eraseFromParent();
4421 Instruction *NSelI = cast<Instruction>(NSel);
4422 Visited.insert(NSelI);
4423 enqueueUsers(*NSelI);
4424
4425 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4426 dbgs() << " " << *NFalse << "\n";
4427 dbgs() << " " << *NSel << "\n";);
4428
4429 return true;
4430 }
4431
4432 // Unfold gep (phi ptr1, ptr2), idx
4433 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4434 // and gep ptr, (phi idx1, idx2)
4435 // => phi ((gep ptr, idx1), (gep ptr, idx2))
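  // For example (illustrative only):
  //   %idx = phi i64 [ 0, %bb0 ], [ 1, %bb1 ]
  //   %gep = getelementptr inbounds [8 x i32], ptr %a, i64 0, i64 %idx
  // becomes, with the GEPs emitted at the end of the entry block,
  //   %gep.0 = getelementptr inbounds [8 x i32], ptr %a, i64 0, i64 0
  //   %gep.1 = getelementptr inbounds [8 x i32], ptr %a, i64 0, i64 1
  //   %phi.new = phi ptr [ %gep.0, %bb0 ], [ %gep.1, %bb1 ]
  // (value names here are hypothetical).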
4436 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4437 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4438 // operand (looking through the phi if it is the phi we want to unfold) is
4439 // an instruction besides a static alloca.
4440 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4441 auto IsInvalidPointerOperand = [](Value *V) {
4442 if (!isa<Instruction>(V))
4443 return false;
4444 if (auto *AI = dyn_cast<AllocaInst>(V))
4445 return !AI->isStaticAlloca();
4446 return true;
4447 };
4448 if (Phi) {
4449 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4450 return false;
4451 } else {
4452 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4453 return false;
4454 }
4455 // Check whether the GEP has exactly one phi operand (including the pointer
4456 // operand) and all indices will become constant after the transform.
4457 for (Value *Op : GEPI.indices()) {
4458 if (auto *SI = dyn_cast<PHINode>(Op)) {
4459 if (Phi)
4460 return false;
4461
4462 Phi = SI;
4463 if (!all_of(Phi->incoming_values(),
4464 [](Value *V) { return isa<ConstantInt>(V); }))
4465 return false;
4466 continue;
4467 }
4468
4469 if (!isa<ConstantInt>(Op))
4470 return false;
4471 }
4472
4473 if (!Phi)
4474 return false;
4475
4476 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4477 dbgs() << " original: " << *Phi << "\n";
4478 dbgs() << " " << GEPI << "\n";);
4479
4480 auto GetNewOps = [&](Value *PhiOp) {
4481 SmallVector<Value *> NewOps;
4482 for (Value *Op : GEPI.operands())
4483 if (Op == Phi)
4484 NewOps.push_back(PhiOp);
4485 else
4486 NewOps.push_back(Op);
4487 return NewOps;
4488 };
4489
4490 IRB.SetInsertPoint(Phi);
4491 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4492 Phi->getName() + ".sroa.phi");
4493
4494 Type *SourceTy = GEPI.getSourceElementType();
4495 // We only handle arguments, constants, and static allocas here, so we can
4496 // insert GEPs at the end of the entry block.
4497 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4498 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4499 Value *Op = Phi->getIncomingValue(I);
4500 BasicBlock *BB = Phi->getIncomingBlock(I);
4501 Value *NewGEP;
4502 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4503 NewGEP = NewPhi->getIncomingValue(NI);
4504 } else {
4505 SmallVector<Value *> NewOps = GetNewOps(Op);
4506 NewGEP =
4507 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4508 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4509 }
4510 NewPhi->addIncoming(NewGEP, BB);
4511 }
4512
4513 Visited.erase(&GEPI);
4514 GEPI.replaceAllUsesWith(NewPhi);
4515 GEPI.eraseFromParent();
4516 Visited.insert(NewPhi);
4517 enqueueUsers(*NewPhi);
4518
4519 LLVM_DEBUG(dbgs() << " to: ";
4520 for (Value *In
4521 : NewPhi->incoming_values()) dbgs()
4522 << "\n " << *In;
4523 dbgs() << "\n " << *NewPhi << '\n');
4524
4525 return true;
4526 }
4527
4528 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4529 if (unfoldGEPSelect(GEPI))
4530 return true;
4531
4532 if (unfoldGEPPhi(GEPI))
4533 return true;
4534
4535 enqueueUsers(GEPI);
4536 return false;
4537 }
4538
4539 bool visitPHINode(PHINode &PN) {
4540 enqueueUsers(PN);
4541 return false;
4542 }
4543
4544 bool visitSelectInst(SelectInst &SI) {
4545 enqueueUsers(SI);
4546 return false;
4547 }
4548};
4549
4550} // end anonymous namespace
4551
4552/// Strip aggregate type wrapping.
4553///
4554/// This removes no-op aggregate types wrapping an underlying type. It will
4555/// strip as many layers of types as it can without changing either the type
4556/// size or the allocated size.
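///
/// For example (illustrative, not from the original comment): wrappers such as
/// { [1 x float] } or [1 x { float }] strip down to the inner float, while
/// { float, float } is returned unchanged because removing the outer struct
/// would change the allocated size.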
4557static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4558 if (Ty->isSingleValueType())
4559 return Ty;
4560
4561 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4562 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4563
4564 Type *InnerTy;
4565 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4566 InnerTy = ArrTy->getElementType();
4567 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4568 const StructLayout *SL = DL.getStructLayout(STy);
4569 unsigned Index = SL->getElementContainingOffset(0);
4570 InnerTy = STy->getElementType(Index);
4571 } else {
4572 return Ty;
4573 }
4574
4575 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4576 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4577 return Ty;
4578
4579 return stripAggregateTypeWrapping(DL, InnerTy);
4580}
4581
4582/// Try to find a partition of the aggregate type passed in for a given
4583/// offset and size.
4584///
4585/// This recurses through the aggregate type and tries to compute a subtype
4586/// based on the offset and size. When the offset and size span a sub-section
4587/// of an array, it will even compute a new array type for that sub-section,
4588/// and the same for structs.
4589///
4590/// Note that this routine is very strict and tries to find a partition of the
4591/// type which produces the *exact* right offset and size. It is not forgiving
4592/// when the size or offset causes either end of a type-based partition to be off.
4593/// Also, this is a best-effort routine. It is reasonable to give up and not
4594/// return a type if necessary.
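///
/// A few illustrative cases (assuming a typical data layout, not taken from
/// the original comment): for { i32, [8 x i8], float }, offset 4 with size 8
/// yields [8 x i8]; offset 4 with size 4 yields the sub-array [4 x i8];
/// offset 0 with size 12 yields the sub-struct { i32, [8 x i8] }; and
/// offset 2 with size 4 straddles two elements and yields nullptr.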
4595static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4596 uint64_t Size) {
4597 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4598 return stripAggregateTypeWrapping(DL, Ty);
4599 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4600 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4601 return nullptr;
4602
4603 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4604 Type *ElementTy;
4605 uint64_t TyNumElements;
4606 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4607 ElementTy = AT->getElementType();
4608 TyNumElements = AT->getNumElements();
4609 } else {
4610 // FIXME: This isn't right for vectors with non-byte-sized or
4611 // non-power-of-two sized elements.
4612 auto *VT = cast<FixedVectorType>(Ty);
4613 ElementTy = VT->getElementType();
4614 TyNumElements = VT->getNumElements();
4615 }
4616 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4617 uint64_t NumSkippedElements = Offset / ElementSize;
4618 if (NumSkippedElements >= TyNumElements)
4619 return nullptr;
4620 Offset -= NumSkippedElements * ElementSize;
4621
4622 // First check if we need to recurse.
4623 if (Offset > 0 || Size < ElementSize) {
4624 // Bail if the partition ends in a different array element.
4625 if ((Offset + Size) > ElementSize)
4626 return nullptr;
4627 // Recurse through the element type trying to peel off offset bytes.
4628 return getTypePartition(DL, ElementTy, Offset, Size);
4629 }
4630 assert(Offset == 0);
4631
4632 if (Size == ElementSize)
4633 return stripAggregateTypeWrapping(DL, ElementTy);
4634 assert(Size > ElementSize);
4635 uint64_t NumElements = Size / ElementSize;
4636 if (NumElements * ElementSize != Size)
4637 return nullptr;
4638 return ArrayType::get(ElementTy, NumElements);
4639 }
4640
4641 StructType *STy = dyn_cast<StructType>(Ty);
4642 if (!STy)
4643 return nullptr;
4644
4645 const StructLayout *SL = DL.getStructLayout(STy);
4646
4647 if (SL->getSizeInBits().isScalable())
4648 return nullptr;
4649
4650 if (Offset >= SL->getSizeInBytes())
4651 return nullptr;
4652 uint64_t EndOffset = Offset + Size;
4653 if (EndOffset > SL->getSizeInBytes())
4654 return nullptr;
4655
4656 unsigned Index = SL->getElementContainingOffset(Offset);
4657 Offset -= SL->getElementOffset(Index);
4658
4659 Type *ElementTy = STy->getElementType(Index);
4660 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4661 if (Offset >= ElementSize)
4662 return nullptr; // The offset points into alignment padding.
4663
4664 // See if any partition must be contained by the element.
4665 if (Offset > 0 || Size < ElementSize) {
4666 if ((Offset + Size) > ElementSize)
4667 return nullptr;
4668 return getTypePartition(DL, ElementTy, Offset, Size);
4669 }
4670 assert(Offset == 0);
4671
4672 if (Size == ElementSize)
4673 return stripAggregateTypeWrapping(DL, ElementTy);
4674
4675 StructType::element_iterator EI = STy->element_begin() + Index,
4676 EE = STy->element_end();
4677 if (EndOffset < SL->getSizeInBytes()) {
4678 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4679 if (Index == EndIndex)
4680 return nullptr; // Within a single element and its padding.
4681
4682 // Don't try to form "natural" types if the elements don't line up with the
4683 // expected size.
4684 // FIXME: We could potentially recurse down through the last element in the
4685 // sub-struct to find a natural end point.
4686 if (SL->getElementOffset(EndIndex) != EndOffset)
4687 return nullptr;
4688
4689 assert(Index < EndIndex);
4690 EE = STy->element_begin() + EndIndex;
4691 }
4692
4693 // Try to build up a sub-structure.
4694 StructType *SubTy =
4695 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4696 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4697 if (Size != SubSL->getSizeInBytes())
4698 return nullptr; // The sub-struct doesn't have quite the size needed.
4699
4700 return SubTy;
4701}
4702
4703/// Pre-split loads and stores to simplify rewriting.
4704///
4705/// We want to break up the splittable load+store pairs as much as
4706/// possible. This is important to do as a preprocessing step, as once we
4707/// start rewriting the accesses to partitions of the alloca we lose the
4708/// necessary information to correctly split apart paired loads and stores
4709/// which both point into this alloca. The case to consider is something like
4710/// the following:
4711///
4712/// %a = alloca [12 x i8]
4713/// %gep1 = getelementptr i8, ptr %a, i32 0
4714/// %gep2 = getelementptr i8, ptr %a, i32 4
4715/// %gep3 = getelementptr i8, ptr %a, i32 8
4716/// store float 0.0, ptr %gep1
4717/// store float 1.0, ptr %gep2
4718/// %v = load i64, ptr %gep1
4719/// store i64 %v, ptr %gep2
4720/// %f1 = load float, ptr %gep2
4721/// %f2 = load float, ptr %gep3
4722///
4723/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4724/// promote everything so we recover the 2 SSA values that should have been
4725/// there all along.
4726///
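/// As a rough sketch of the intended result (not verbatim output), the i64
/// load/store pair above would be pre-split into two 4-byte halves, roughly:
///
///   %v.0 = load i32, ptr %gep1
///   %v.1 = load i32, ptr %gep2
///   store i32 %v.0, ptr %gep2
///   store i32 %v.1, ptr %gep3
///
/// after which every access lines up with one of the three 4-byte partitions.
///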
4727/// \returns true if any changes are made.
4728bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4729 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4730
4731 // Track the loads and stores which are candidates for pre-splitting here, in
4732 // the order they first appear during the partition scan. These give stable
4733 // iteration order and a basis for tracking which loads and stores we
4734 // actually split.
4735 SmallVector<LoadInst *, 4> Loads;
4736 SmallVector<StoreInst *, 4> Stores;
4737
4738 // We need to accumulate the splits required of each load or store where we
4739 // can find them via a direct lookup. This is important to cross-check loads
4740 // and stores against each other. We also track the slice so that we can kill
4741 // all the slices that end up split.
4742 struct SplitOffsets {
4743 Slice *S;
4744 std::vector<uint64_t> Splits;
4745 };
4746 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4747
4748 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4749 // This is important as we also cannot pre-split stores of those loads!
4750 // FIXME: This is all pretty gross. It means that we can be more aggressive
4751 // in pre-splitting when the load feeding the store happens to come from
4752 // a separate alloca. Put another way, the effectiveness of SROA would be
4753 // decreased by a frontend which just concatenated all of its local allocas
4754 // into one big flat alloca. But defeating such patterns is exactly the job
4755 // SROA is tasked with! Sadly, to not have this discrepancy we would have to
4756 // change store pre-splitting to actually force pre-splitting of the load
4757 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4758 // maybe it would make it more principled?
4759 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4760
4761 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4762 for (auto &P : AS.partitions()) {
4763 for (Slice &S : P) {
4764 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4765 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4766 // If this is a load we have to track that it can't participate in any
4767 // pre-splitting. If this is a store of a load we have to track that
4768 // that load also can't participate in any pre-splitting.
4769 if (auto *LI = dyn_cast<LoadInst>(I))
4770 UnsplittableLoads.insert(LI);
4771 else if (auto *SI = dyn_cast<StoreInst>(I))
4772 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4773 UnsplittableLoads.insert(LI);
4774 continue;
4775 }
4776 assert(P.endOffset() > S.beginOffset() &&
4777 "Empty or backwards partition!");
4778
4779 // Determine if this is a pre-splittable slice.
4780 if (auto *LI = dyn_cast<LoadInst>(I)) {
4781 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4782
4783 // The load must be used exclusively to store into other pointers for
4784 // us to be able to arbitrarily pre-split it. The stores must also be
4785 // simple to avoid changing semantics.
4786 auto IsLoadSimplyStored = [](LoadInst *LI) {
4787 for (User *LU : LI->users()) {
4788 auto *SI = dyn_cast<StoreInst>(LU);
4789 if (!SI || !SI->isSimple())
4790 return false;
4791 }
4792 return true;
4793 };
4794 if (!IsLoadSimplyStored(LI)) {
4795 UnsplittableLoads.insert(LI);
4796 continue;
4797 }
4798
4799 Loads.push_back(LI);
4800 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4801 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4802 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4803 continue;
4804 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4805 if (!StoredLoad || !StoredLoad->isSimple())
4806 continue;
4807 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4808
4809 Stores.push_back(SI);
4810 } else {
4811 // Other uses cannot be pre-split.
4812 continue;
4813 }
4814
4815 // Record the initial split.
4816 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4817 auto &Offsets = SplitOffsetsMap[I];
4818 assert(Offsets.Splits.empty() &&
4819 "Should not have splits the first time we see an instruction!");
4820 Offsets.S = &S;
4821 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4822 }
4823
4824 // Now scan the already split slices, and add a split for any of them which
4825 // we're going to pre-split.
4826 for (Slice *S : P.splitSliceTails()) {
4827 auto SplitOffsetsMapI =
4828 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4829 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4830 continue;
4831 auto &Offsets = SplitOffsetsMapI->second;
4832
4833 assert(Offsets.S == S && "Found a mismatched slice!");
4834 assert(!Offsets.Splits.empty() &&
4835 "Cannot have an empty set of splits on the second partition!");
4836 assert(Offsets.Splits.back() ==
4837 P.beginOffset() - Offsets.S->beginOffset() &&
4838 "Previous split does not end where this one begins!");
4839
4840 // Record each split. The last partition's end isn't needed as the size
4841 // of the slice dictates that.
4842 if (S->endOffset() > P.endOffset())
4843 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4844 }
4845 }
4846
4847 // We may have split loads where some of their stores are split stores. For
4848 // such loads and stores, we can only pre-split them if their splits exactly
4849 // match relative to their starting offset. We have to verify this prior to
4850 // any rewriting.
4851 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4852 // Lookup the load we are storing in our map of split
4853 // offsets.
4854 auto *LI = cast<LoadInst>(SI->getValueOperand());
4855 // If it was completely unsplittable, then we're done,
4856 // and this store can't be pre-split.
4857 if (UnsplittableLoads.count(LI))
4858 return true;
4859
4860 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4861 if (LoadOffsetsI == SplitOffsetsMap.end())
4862 return false; // Unrelated loads are definitely safe.
4863 auto &LoadOffsets = LoadOffsetsI->second;
4864
4865 // Now lookup the store's offsets.
4866 auto &StoreOffsets = SplitOffsetsMap[SI];
4867
4868 // If the relative offsets of each split in the load and
4869 // store match exactly, then we can split them and we
4870 // don't need to remove them here.
4871 if (LoadOffsets.Splits == StoreOffsets.Splits)
4872 return false;
4873
4874 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4875 << " " << *LI << "\n"
4876 << " " << *SI << "\n");
4877
4878 // We've found a store and load that we need to split
4879 // with mismatched relative splits. Just give up on them
4880 // and remove both instructions from our list of
4881 // candidates.
4882 UnsplittableLoads.insert(LI);
4883 return true;
4884 });
4885 // Now we have to go *back* through all the stores, because a later store may
4886 // have caused an earlier store's load to become unsplittable and if it is
4887 // unsplittable for the later store, then we can't rely on it being split in
4888 // the earlier store either.
4889 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4890 auto *LI = cast<LoadInst>(SI->getValueOperand());
4891 return UnsplittableLoads.count(LI);
4892 });
4893 // Once we've established all the loads that can't be split for some reason,
4894 // filter out any that made it into our list.
4895 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4896 return UnsplittableLoads.count(LI);
4897 });
4898
4899 // If no loads or stores are left, there is no pre-splitting to be done for
4900 // this alloca.
4901 if (Loads.empty() && Stores.empty())
4902 return false;
4903
4904 // From here on, we can't fail and will be building new accesses, so rig up
4905 // an IR builder.
4906 IRBuilderTy IRB(&AI);
4907
4908 // Collect the new slices which we will merge into the alloca slices.
4909 SmallVector<Slice, 4> NewSlices;
4910
4911 // Track any allocas we end up splitting loads and stores for so we iterate
4912 // on them.
4913 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4914
4915 // At this point, we have collected all of the loads and stores we can
4916 // pre-split, and the specific splits needed for them. We actually do the
4917 // splitting in a specific order so that we can handle the case where one of
4918 // the loads is the value operand of one of the stores.
4919 //
4920 // First, we rewrite all of the split loads, and just accumulate each split
4921 // load in a parallel structure. We also build the slices for them and append
4922 // them to the alloca slices.
4923 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4924 std::vector<LoadInst *> SplitLoads;
4925 const DataLayout &DL = AI.getDataLayout();
4926 for (LoadInst *LI : Loads) {
4927 SplitLoads.clear();
4928
4929 auto &Offsets = SplitOffsetsMap[LI];
4930 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4931 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4932 "Load must have type size equal to store size");
4933 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4934 "Load must be >= slice size");
4935
4936 uint64_t BaseOffset = Offsets.S->beginOffset();
4937 assert(BaseOffset + SliceSize > BaseOffset &&
4938 "Cannot represent alloca access size using 64-bit integers!");
4939
4940 Value *BasePtr = LI->getPointerOperand();
4941 IRB.SetInsertPoint(LI);
4942
4943 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4944
4945 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4946 int Idx = 0, Size = Offsets.Splits.size();
4947 for (;;) {
4948 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4949 auto AS = LI->getPointerAddressSpace();
4950 auto *PartPtrTy = LI->getPointerOperandType();
4951 LoadInst *PLoad = IRB.CreateAlignedLoad(
4952 PartTy,
4953 getAdjustedPtr(IRB, DL, BasePtr,
4954 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4955 PartPtrTy, BasePtr->getName() + "."),
4956 getAdjustedAlignment(LI, PartOffset),
4957 /*IsVolatile*/ false, LI->getName());
4958 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4959 LLVMContext::MD_access_group});
4960
4961 // Append this load onto the list of split loads so we can find it later
4962 // to rewrite the stores.
4963 SplitLoads.push_back(PLoad);
4964
4965 // Now build a new slice for the alloca.
4966 NewSlices.push_back(
4967 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4968 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4969 /*IsSplittable*/ false));
4970 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4971 << ", " << NewSlices.back().endOffset()
4972 << "): " << *PLoad << "\n");
4973
4974 // See if we've handled all the splits.
4975 if (Idx >= Size)
4976 break;
4977
4978 // Setup the next partition.
4979 PartOffset = Offsets.Splits[Idx];
4980 ++Idx;
4981 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4982 }
4983
4984 // Now that we have the split loads, do the slow walk over all uses of the
4985 // load and rewrite them as split stores, or save the split loads to use
4986 // below if the store is going to be split there anyway.
4987 bool DeferredStores = false;
4988 for (User *LU : LI->users()) {
4989 StoreInst *SI = cast<StoreInst>(LU);
4990 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4991 DeferredStores = true;
4992 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4993 << "\n");
4994 continue;
4995 }
4996
4997 Value *StoreBasePtr = SI->getPointerOperand();
4998 IRB.SetInsertPoint(SI);
4999 AAMDNodes AATags = SI->getAAMetadata();
5000
5001 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
5002
5003 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
5004 LoadInst *PLoad = SplitLoads[Idx];
5005 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
5006 auto *PartPtrTy = SI->getPointerOperandType();
5007
5008 auto AS = SI->getPointerAddressSpace();
5009 StoreInst *PStore = IRB.CreateAlignedStore(
5010 PLoad,
5011 getAdjustedPtr(IRB, DL, StoreBasePtr,
5012 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5013 PartPtrTy, StoreBasePtr->getName() + "."),
5014 getAdjustedAlignment(SI, PartOffset),
5015 /*IsVolatile*/ false);
5016 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5017 LLVMContext::MD_access_group,
5018 LLVMContext::MD_DIAssignID});
5019
5020 if (AATags)
5021 PStore->setAAMetadata(
5022 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
5023 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
5024 }
5025
5026 // We want to immediately iterate on any allocas impacted by splitting
5027 // this store, and we have to track any promotable alloca (indicated by
5028 // a direct store) as needing to be resplit because it is no longer
5029 // promotable.
5030 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
5031 ResplitPromotableAllocas.insert(OtherAI);
5032 Worklist.insert(OtherAI);
5033 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5034 StoreBasePtr->stripInBoundsOffsets())) {
5035 Worklist.insert(OtherAI);
5036 }
5037
5038 // Mark the original store as dead.
5039 DeadInsts.push_back(SI);
5040 }
5041
5042 // Save the split loads if there are deferred stores among the users.
5043 if (DeferredStores)
5044 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
5045
5046 // Mark the original load as dead and kill the original slice.
5047 DeadInsts.push_back(LI);
5048 Offsets.S->kill();
5049 }
5050
5051 // Second, we rewrite all of the split stores. At this point, we know that
5052 // all loads from this alloca have been split already. For stores of such
5053 // loads, we can simply look up the pre-existing split loads. For stores of
5054 // other loads, we split those loads first and then write split stores of
5055 // them.
5056 for (StoreInst *SI : Stores) {
5057 auto *LI = cast<LoadInst>(SI->getValueOperand());
5058 IntegerType *Ty = cast<IntegerType>(LI->getType());
5059 assert(Ty->getBitWidth() % 8 == 0);
5060 uint64_t StoreSize = Ty->getBitWidth() / 8;
5061 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
5062
5063 auto &Offsets = SplitOffsetsMap[SI];
5064 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
5065 "Slice size should always match load size exactly!");
5066 uint64_t BaseOffset = Offsets.S->beginOffset();
5067 assert(BaseOffset + StoreSize > BaseOffset &&
5068 "Cannot represent alloca access size using 64-bit integers!");
5069
5070 Value *LoadBasePtr = LI->getPointerOperand();
5071 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
5072
5073 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
5074
5075 // Check whether we have an already split load.
5076 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
5077 std::vector<LoadInst *> *SplitLoads = nullptr;
5078 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5079 SplitLoads = &SplitLoadsMapI->second;
5080 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5081 "Too few split loads for the number of splits in the store!");
5082 } else {
5083 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5084 }
5085
5086 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5087 int Idx = 0, Size = Offsets.Splits.size();
5088 for (;;) {
5089 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
5090 auto *LoadPartPtrTy = LI->getPointerOperandType();
5091 auto *StorePartPtrTy = SI->getPointerOperandType();
5092
5093 // Either lookup a split load or create one.
5094 LoadInst *PLoad;
5095 if (SplitLoads) {
5096 PLoad = (*SplitLoads)[Idx];
5097 } else {
5098 IRB.SetInsertPoint(LI);
5099 auto AS = LI->getPointerAddressSpace();
5100 PLoad = IRB.CreateAlignedLoad(
5101 PartTy,
5102 getAdjustedPtr(IRB, DL, LoadBasePtr,
5103 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5104 LoadPartPtrTy, LoadBasePtr->getName() + "."),
5105 getAdjustedAlignment(LI, PartOffset),
5106 /*IsVolatile*/ false, LI->getName());
5107 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5108 LLVMContext::MD_access_group});
5109 }
5110
5111 // And store this partition.
5112 IRB.SetInsertPoint(SI);
5113 auto AS = SI->getPointerAddressSpace();
5114 StoreInst *PStore = IRB.CreateAlignedStore(
5115 PLoad,
5116 getAdjustedPtr(IRB, DL, StoreBasePtr,
5117 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5118 StorePartPtrTy, StoreBasePtr->getName() + "."),
5119 getAdjustedAlignment(SI, PartOffset),
5120 /*IsVolatile*/ false);
5121 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5122 LLVMContext::MD_access_group});
5123
5124 // Now build a new slice for the alloca.
5125 NewSlices.push_back(
5126 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5127 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5128 /*IsSplittable*/ false));
5129 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5130 << ", " << NewSlices.back().endOffset()
5131 << "): " << *PStore << "\n");
5132 if (!SplitLoads) {
5133 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5134 }
5135
5136 // See if we've finished all the splits.
5137 if (Idx >= Size)
5138 break;
5139
5140 // Setup the next partition.
5141 PartOffset = Offsets.Splits[Idx];
5142 ++Idx;
5143 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5144 }
5145
5146 // We want to immediately iterate on any allocas impacted by splitting
5147 // this load, which is only relevant if it isn't a load of this alloca and
5148 // thus we didn't already split the loads above. We also have to keep track
5149 // of any promotable allocas we split loads on as they can no longer be
5150 // promoted.
5151 if (!SplitLoads) {
5152 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5153 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5154 ResplitPromotableAllocas.insert(OtherAI);
5155 Worklist.insert(OtherAI);
5156 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5157 LoadBasePtr->stripInBoundsOffsets())) {
5158 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5159 Worklist.insert(OtherAI);
5160 }
5161 }
5162
5163 // Mark the original store as dead now that we've split it up and kill its
5164 // slice. Note that we leave the original load in place unless this store
5165 // was its only use. It may in turn be split up if it is an alloca load
5166 // for some other alloca, but it may be a normal load. This may introduce
5167 // redundant loads, but where those can be merged the rest of the optimizer
5168 // should handle the merging, and this uncovers SSA splits which is more
5169 // important. In practice, the original loads will almost always be fully
5170 // split and removed eventually, and the splits will be merged by any
5171 // trivial CSE, including instcombine.
5172 if (LI->hasOneUse()) {
5173 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5174 DeadInsts.push_back(LI);
5175 }
5176 DeadInsts.push_back(SI);
5177 Offsets.S->kill();
5178 }
5179
5180 // Remove the killed slices that have been pre-split.
5181 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5182
5183 // Insert our new slices. This will sort and merge them into the sorted
5184 // sequence.
5185 AS.insert(NewSlices);
5186
5187 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5188#ifndef NDEBUG
5189 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5190 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5191#endif
5192
5193 // Finally, don't try to promote any allocas that now require re-splitting.
5194 // They have already been added to the worklist above.
5195 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5196
5197 return true;
5198}
5199
5200/// Rewrite an alloca partition's users.
5201///
5202/// This routine drives both of the rewriting goals of the SROA pass. It tries
5203/// to rewrite uses of an alloca partition to be conducive for SSA value
5204/// promotion. If the partition needs a new, more refined alloca, this will
5205/// build that new alloca, preserving as much type information as possible, and
5206/// rewrite the uses of the old alloca to point at the new one and have the
5207/// appropriate new offsets. It also evaluates how successful the rewrite was
5208/// at enabling promotion and if it was successful queues the alloca to be
5209/// promoted.
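///
/// As an illustrative example (not from the original comment): for
/// %a = alloca { i64, float }, a partition that covers only the float member
/// and is accessed purely as float is rewritten onto a new "alloca float",
/// which then becomes a candidate for promotion to an SSA value.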
5210AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
5211 Partition &P) {
5212 // Try to compute a friendly type for this partition of the alloca. This
5213 // won't always succeed, in which case we fall back to a legal integer type
5214 // or an i8 array of an appropriate size.
5215 Type *SliceTy = nullptr;
5216 VectorType *SliceVecTy = nullptr;
5217 const DataLayout &DL = AI.getDataLayout();
5218 unsigned VScale = AI.getFunction()->getVScaleValue();
5219
5220 std::pair<Type *, IntegerType *> CommonUseTy =
5221 findCommonType(P.begin(), P.end(), P.endOffset());
5222 // Do all uses operate on the same type?
5223 if (CommonUseTy.first) {
5224 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
5225 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5226 SliceTy = CommonUseTy.first;
5227 SliceVecTy = dyn_cast<VectorType>(SliceTy);
5228 }
5229 }
5230 // If not, can we find an appropriate subtype in the original allocated type?
5231 if (!SliceTy)
5232 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5233 P.beginOffset(), P.size()))
5234 SliceTy = TypePartitionTy;
5235
5236 // If still not, can we use the largest bitwidth integer type used?
5237 if (!SliceTy && CommonUseTy.second)
5238 if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
5239 SliceTy = CommonUseTy.second;
5240 SliceVecTy = dyn_cast<VectorType>(SliceTy);
5241 }
5242 if ((!SliceTy || (SliceTy->isArrayTy() &&
5243 SliceTy->getArrayElementType()->isIntegerTy())) &&
5244 DL.isLegalInteger(P.size() * 8)) {
5245 SliceTy = Type::getIntNTy(*C, P.size() * 8);
5246 }
5247
5248 // If the common use types are not viable for promotion then attempt to find
5249 // another type that is viable.
5250 if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
5251 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5252 P.beginOffset(), P.size())) {
5253 VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
5254 if (TypePartitionVecTy &&
5255 checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
5256 SliceTy = TypePartitionTy;
5257 }
5258
5259 if (!SliceTy)
5260 SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
5261 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
5262
5263 bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
5264
5265 VectorType *VecTy =
5266 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
5267 if (VecTy)
5268 SliceTy = VecTy;
5269
5270 // Check for the case where we're going to rewrite to a new alloca of the
5271 // exact same type as the original, and with the same access offsets. In that
5272 // case, re-use the existing alloca, but still run through the rewriter to
5273 // perform phi and select speculation.
5274 // P.beginOffset() can be non-zero even with the same type in a case with
5275 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5276 AllocaInst *NewAI;
5277 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5278 NewAI = &AI;
5279 // FIXME: We should be able to bail at this point with "nothing changed".
5280 // FIXME: We might want to defer PHI speculation until after here.
5281 // FIXME: return nullptr;
5282 } else {
5283 // Make sure the alignment is compatible with P.beginOffset().
5284 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5285 // If we will get at least this much alignment from the type alone, leave
5286 // the alloca's alignment unconstrained.
5287 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
5288 NewAI = new AllocaInst(
5289 SliceTy, AI.getAddressSpace(), nullptr,
5290 IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
5291 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5292 AI.getIterator());
5293 // Copy the old AI debug location over to the new one.
5294 NewAI->setDebugLoc(AI.getDebugLoc());
5295 ++NumNewAllocas;
5296 }
5297
5298 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5299 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5300
5301 // Track the high watermark on the worklist as it is only relevant for
5302 // promoted allocas. We will reset it to this point if the alloca is not in
5303 // fact scheduled for promotion.
5304 unsigned PPWOldSize = PostPromotionWorklist.size();
5305 unsigned NumUses = 0;
5306 SmallSetVector<PHINode *, 8> PHIUsers;
5307 SmallSetVector<SelectInst *, 8> SelectUsers;
5308
5309 AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
5310 P.endOffset(), IsIntegerPromotable, VecTy,
5311 PHIUsers, SelectUsers);
5312 bool Promotable = true;
5313 // Check whether we can have tree-structured merge.
5314 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5315 NumUses += DeletedValues->size() + 1;
5316 for (Value *V : *DeletedValues)
5317 DeadInsts.push_back(V);
5318 } else {
5319 for (Slice *S : P.splitSliceTails()) {
5320 Promotable &= Rewriter.visit(S);
5321 ++NumUses;
5322 }
5323 for (Slice &S : P) {
5324 Promotable &= Rewriter.visit(&S);
5325 ++NumUses;
5326 }
5327 }
5328
5329 NumAllocaPartitionUses += NumUses;
5330 MaxUsesPerAllocaPartition.updateMax(NumUses);
5331
5332 // Now that we've processed all the slices in the new partition, check if any
5333 // PHIs or Selects would block promotion.
5334 for (PHINode *PHI : PHIUsers)
5335 if (!isSafePHIToSpeculate(*PHI)) {
5336 Promotable = false;
5337 PHIUsers.clear();
5338 SelectUsers.clear();
5339 break;
5340 }
5341
5342 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5343 NewSelectsToRewrite;
5344 NewSelectsToRewrite.reserve(SelectUsers.size());
5345 for (SelectInst *Sel : SelectUsers) {
5346 std::optional<RewriteableMemOps> Ops =
5347 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5348 if (!Ops) {
5349 Promotable = false;
5350 PHIUsers.clear();
5351 SelectUsers.clear();
5352 NewSelectsToRewrite.clear();
5353 break;
5354 }
5355 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5356 }
5357
5358 if (Promotable) {
5359 for (Use *U : AS.getDeadUsesIfPromotable()) {
5360 auto *OldInst = dyn_cast<Instruction>(U->get());
5361 Value::dropDroppableUse(*U);
5362 if (OldInst)
5363 if (isInstructionTriviallyDead(OldInst))
5364 DeadInsts.push_back(OldInst);
5365 }
5366 if (PHIUsers.empty() && SelectUsers.empty()) {
5367 // Promote the alloca.
5368 PromotableAllocas.insert(NewAI);
5369 } else {
5370 // If we have either PHIs or Selects to speculate, add them to those
5371 // worklists and re-queue the new alloca so that we promote it on the
5372 // next iteration.
5373 SpeculatablePHIs.insert_range(PHIUsers);
5374 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5375 NewSelectsToRewrite.size());
5376 for (auto &&KV : llvm::make_range(
5377 std::make_move_iterator(NewSelectsToRewrite.begin()),
5378 std::make_move_iterator(NewSelectsToRewrite.end())))
5379 SelectsToRewrite.insert(std::move(KV));
5380 Worklist.insert(NewAI);
5381 }
5382 } else {
5383 // Drop any post-promotion work items if promotion didn't happen.
5384 while (PostPromotionWorklist.size() > PPWOldSize)
5385 PostPromotionWorklist.pop_back();
5386
5387 // We couldn't promote and we didn't create a new partition, so nothing
5388 // happened.
5389 if (NewAI == &AI)
5390 return nullptr;
5391
5392 // If we can't promote the alloca, iterate on it to check for new
5393 // refinements exposed by splitting the current alloca. Don't iterate on an
5394 // alloca which didn't actually change and didn't get promoted.
5395 Worklist.insert(NewAI);
5396 }
5397
5398 return NewAI;
5399}
5400
5401// There isn't a shared interface to get the "address" parts out of a
5402// dbg.declare and dbg.assign, so provide some wrappers.
5403static bool isKillAddress(const DbgVariableRecord *DVR) {
5404 if (DVR->isDbgAssign())
5405 return DVR->isKillAddress();
5406 return DVR->isKillLocation();
5407}
5408
5409static const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5410 if (DVR->isDbgAssign())
5411 return DVR->getAddressExpression();
5412 return DVR->getExpression();
5413}
5414
5415/// Create or replace an existing fragment in a DIExpression with \p Frag.
5416/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5417/// operation, add \p BitExtractOffset to the offset part.
5418///
5419/// Returns the new expression, or nullptr if this fails (see details below).
5420///
5421/// This function is similar to DIExpression::createFragmentExpression except
5422/// for 3 important distinctions:
5423/// 1. The new fragment isn't relative to an existing fragment.
5424/// 2. It assumes the computed location is a memory location. This means we
5425/// don't need to perform checks that creating the fragment preserves the
5426/// expression semantics.
5427/// 3. Existing extract_bits are modified independently of fragment changes
5428/// using \p BitExtractOffset. A change to the fragment offset or size
5429/// may affect a bit extract. But a bit extract offset can change
5430/// independently of the fragment dimensions.
5431///
5432/// Returns the new expression, or nullptr if one couldn't be created.
5433/// Ideally this is only used to signal that a bit-extract has become
5434/// zero-sized (and thus the new debug record has no size and can be
5435/// dropped), however, it fails for other reasons too - see the FIXME below.
5436///
5437/// FIXME: To keep the change that introduces this function NFC it bails
5438/// in some situations unnecessarily, e.g. when fragment and bit extract
5439/// sizes differ.
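///
/// Two small illustrative cases (not from the original comment): for an
/// expression with no fragment or bit-extract and Frag = {Offset: 32,
/// Size: 32}, the ops DW_OP_LLVM_fragment, 32, 32 are appended; for an
/// expression containing DW_OP_LLVM_extract_bits_zext, 8, 8 with
/// BitExtractOffset = -8 (and a Frag at least as large as the extract), the
/// extract is rewritten in place to DW_OP_LLVM_extract_bits_zext, 0, 8 and no
/// fragment op is added.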
5440static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5441 DIExpression::FragmentInfo Frag,
5442 int64_t BitExtractOffset) {
5443 SmallVector<uint64_t, 8> Ops;
5444 bool HasFragment = false;
5445 bool HasBitExtract = false;
5446
5447 for (auto &Op : Expr->expr_ops()) {
5448 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5449 HasFragment = true;
5450 continue;
5451 }
5452 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5453 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5454 HasBitExtract = true;
5455 int64_t ExtractOffsetInBits = Op.getArg(0);
5456 int64_t ExtractSizeInBits = Op.getArg(1);
5457
5458 // DIExpression::createFragmentExpression doesn't know how to handle
5459 // a fragment that is smaller than the extract. Copy the behaviour
5460 // (bail) to avoid non-NFC changes.
5461 // FIXME: Don't do this.
5462 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5463 return nullptr;
5464
5465 assert(BitExtractOffset <= 0);
5466 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5467
5468 // DIExpression::createFragmentExpression doesn't know what to do
5469 // if the new extract starts "outside" the existing one. Copy the
5470 // behaviour (bail) to avoid non-NFC changes.
5471 // FIXME: Don't do this.
5472 if (AdjustedOffset < 0)
5473 return nullptr;
5474
5475 Ops.push_back(Op.getOp());
5476 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5477 Ops.push_back(ExtractSizeInBits);
5478 continue;
5479 }
5480 Op.appendToVector(Ops);
5481 }
5482
5483 // Unsupported by createFragmentExpression, so don't support it here yet to
5484 // preserve NFC-ness.
5485 if (HasFragment && HasBitExtract)
5486 return nullptr;
5487
5488 if (!HasBitExtract) {
5489 Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5490 Ops.push_back(Frag.OffsetInBits);
5491 Ops.push_back(Frag.SizeInBits);
5492 }
5493 return DIExpression::get(Expr->getContext(), Ops);
5494}
5495
5496/// Insert a new DbgRecord.
5497/// \p Orig Original to copy record type, debug loc and variable from, and
5498/// additionally value and value expression for dbg_assign records.
5499/// \p NewAddr Location's new base address.
5500/// \p NewAddrExpr New expression to apply to address.
5501/// \p BeforeInst Insert position.
5502/// \p NewFragment New fragment (absolute, non-relative).
5503/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5504 static void
5505 insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5506 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5507 std::optional<DIExpression::FragmentInfo> NewFragment,
5508 int64_t BitExtractAdjustment) {
5509 (void)DIB;
5510
5511 // A dbg_assign puts fragment info in the value expression only. The address
5512 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5513 // new fragment info into NewAddrExpr (as it only has one expression).
5514 DIExpression *NewFragmentExpr =
5515 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5516 if (NewFragment)
5517 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5518 BitExtractAdjustment);
5519 if (!NewFragmentExpr)
5520 return;
5521
5522 if (Orig->isDbgDeclare()) {
5524 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5525 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5526 BeforeInst->getIterator());
5527 return;
5528 }
5529
5530 if (Orig->isDbgValue()) {
5532 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5533 // Drop debug information if the expression doesn't start with a
5534 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5535 // describes the address of alloca rather than the value inside the alloca.
5536 if (!NewFragmentExpr->startsWithDeref())
5537 DVR->setKillAddress();
5538 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5539 BeforeInst->getIterator());
5540 return;
5541 }
5542
5543 // Apply a DIAssignID to the store if it doesn't already have it.
5544 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5545 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5546 DIAssignID::getDistinct(NewAddr->getContext()));
5547 }
5548
5550 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5551 NewAddrExpr, Orig->getDebugLoc());
5552 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5553 (void)NewAssign;
5554}
5555
5556/// Walks the slices of an alloca and forms partitions based on them,
5557/// rewriting each of their uses.
5558bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5559 if (AS.begin() == AS.end())
5560 return false;
5561
5562 unsigned NumPartitions = 0;
5563 bool Changed = false;
5564 const DataLayout &DL = AI.getModule()->getDataLayout();
5565
5566 // First try to pre-split loads and stores.
5567 Changed |= presplitLoadsAndStores(AI, AS);
5568
5569 // Now that we have identified any pre-splitting opportunities,
5570 // mark loads and stores unsplittable except for the following case.
5571 // We leave a slice splittable if all other slices are disjoint or fully
5572 // included in the slice, such as whole-alloca loads and stores.
5573 // If we fail to split these during pre-splitting, we want to force them
5574 // to be rewritten into a partition.
5575 bool IsSorted = true;
5576
5577 uint64_t AllocaSize =
5578 DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
5579 const uint64_t MaxBitVectorSize = 1024;
5580 if (AllocaSize <= MaxBitVectorSize) {
5581 // If a byte boundary is included in any load or store, a slice starting or
5582 // ending at the boundary is not splittable.
5583 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5584 for (Slice &S : AS)
5585 for (unsigned O = S.beginOffset() + 1;
5586 O < S.endOffset() && O < AllocaSize; O++)
5587 SplittableOffset.reset(O);
5588
5589 for (Slice &S : AS) {
5590 if (!S.isSplittable())
5591 continue;
5592
5593 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5594 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5595 continue;
5596
5597 if (isa<LoadInst>(S.getUse()->getUser()) ||
5598 isa<StoreInst>(S.getUse()->getUser())) {
5599 S.makeUnsplittable();
5600 IsSorted = false;
5601 }
5602 }
5603 } else {
5604 // We only allow whole-alloca splittable loads and stores
5605 // for a large alloca to avoid creating a too-large BitVector.
5606 for (Slice &S : AS) {
5607 if (!S.isSplittable())
5608 continue;
5609
5610 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5611 continue;
5612
5613 if (isa<LoadInst>(S.getUse()->getUser()) ||
5614 isa<StoreInst>(S.getUse()->getUser())) {
5615 S.makeUnsplittable();
5616 IsSorted = false;
5617 }
5618 }
5619 }
5620
5621 if (!IsSorted)
5622 llvm::stable_sort(AS);
5623
5624 /// Describes the allocas introduced by rewritePartition in order to migrate
5625 /// the debug info.
5626 struct Fragment {
5627 AllocaInst *Alloca;
5628 uint64_t Offset;
5629 uint64_t Size;
5630 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5631 : Alloca(AI), Offset(O), Size(S) {}
5632 };
5633 SmallVector<Fragment, 4> Fragments;
5634
5635 // Rewrite each partition.
5636 for (auto &P : AS.partitions()) {
5637 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5638 Changed = true;
5639 if (NewAI != &AI) {
5640 uint64_t SizeOfByte = 8;
5641 uint64_t AllocaSize =
5642 DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
5643 // Don't include any padding.
5644 uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
5645 Fragments.push_back(
5646 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5647 }
5648 }
5649 ++NumPartitions;
5650 }
5651
5652 NumAllocaPartitions += NumPartitions;
5653 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5654
5655 // Migrate debug information from the old alloca to the new alloca(s)
5656 // and the individual partitions.
5657 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5658 // Can't overlap with undef memory.
5659 if (isKillAddress(DbgVariable))
5660 return;
5661
5662 const Value *DbgPtr = DbgVariable->getAddress();
5663 DIExpression::FragmentInfo VarFrag =
5664 DbgVariable->getFragmentOrEntireVariable();
5665 // Get the address expression constant offset if one exists and the ops
5666 // that come after it.
5667 int64_t CurrentExprOffsetInBytes = 0;
5668 SmallVector<uint64_t> PostOffsetOps;
5669 if (!getAddressExpression(DbgVariable)
5670 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5671 return; // Couldn't interpret this DIExpression - drop the var.
5672
5673 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5674 int64_t ExtractOffsetInBits = 0;
5675 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5676 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5677 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5678 ExtractOffsetInBits = Op.getArg(0);
5679 break;
5680 }
5681 }
5682
5683 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5684 for (auto Fragment : Fragments) {
5685 int64_t OffsetFromLocationInBits;
5686 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5687 // Find the variable fragment that the new alloca slice covers.
5688 // Drop debug info for this variable fragment if we can't compute an
5689 // intersect between it and the alloca slice.
5691 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5692 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5693 NewDbgFragment, OffsetFromLocationInBits))
5694 continue; // Do not migrate this fragment to this slice.
5695
5696 // A zero-sized fragment indicates there's no intersect between the variable
5697 // fragment and the alloca slice. Skip this slice for this variable
5698 // fragment.
5699 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5700 continue; // Do not migrate this fragment to this slice.
5701
5702 // No fragment indicates DbgVariable's variable or fragment exactly
5703 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5704 if (!NewDbgFragment)
5705 NewDbgFragment = DbgVariable->getFragment();
5706
5707 // Reduce the new expression offset by the bit-extract offset since
5708 // we'll be keeping that.
5709 int64_t OffsetFromNewAllocaInBits =
5710 OffsetFromLocationInBits - ExtractOffsetInBits;
5711 // We need to adjust an existing bit extract if the offset expression
5712 // can't eat the slack (i.e., if the new offset would be negative).
5713 int64_t BitExtractOffset =
5714 std::min<int64_t>(0, OffsetFromNewAllocaInBits);
5715 // The magnitude of a negative value indicates the number of bits into
5716 // the existing variable fragment that the memory region begins. The new
5717 // variable fragment already excludes those bits - the new DbgPtr offset
5718 // only needs to be applied if it's positive.
5719 OffsetFromNewAllocaInBits =
5720 std::max(int64_t(0), OffsetFromNewAllocaInBits);
5721
5722 // Rebuild the expression:
5723 // {Offset(OffsetFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5724 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5725 // address expression but the value expression instead.
5726 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5727 if (OffsetFromNewAllocaInBits > 0) {
5728 int64_t OffsetInBytes = (OffsetFromNewAllocaInBits + 7) / 8;
5729 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5730 }
5731
5732 // Remove any existing intrinsics on the new alloca describing
5733 // the variable fragment.
5734 auto RemoveOne = [DbgVariable](auto *OldDII) {
5735 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5736 return LHS->getVariable() == RHS->getVariable() &&
5737 LHS->getDebugLoc()->getInlinedAt() ==
5738 RHS->getDebugLoc()->getInlinedAt();
5739 };
5740 if (SameVariableFragment(OldDII, DbgVariable))
5741 OldDII->eraseFromParent();
5742 };
5743 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5744 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5745 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5746 NewDbgFragment, BitExtractOffset);
5747 }
5748 };
5749
5750 // Migrate debug information from the old alloca to the new alloca(s)
5751 // and the individual partitions.
5752 for_each(findDVRDeclares(&AI), MigrateOne);
5753 for_each(findDVRValues(&AI), MigrateOne);
5754 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5755
5756 return Changed;
5757}
5758
5759/// Clobber a use with poison, deleting the used value if it becomes dead.
5760void SROA::clobberUse(Use &U) {
5761 Value *OldV = U;
5762 // Replace the use with a poison value.
5763 U = PoisonValue::get(OldV->getType());
5764
5765 // Check for this making an instruction dead. We have to garbage collect
5766 // all the dead instructions to ensure the uses of any alloca end up being
5767 // minimal.
5768 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5769 if (isInstructionTriviallyDead(OldI)) {
5770 DeadInsts.push_back(OldI);
5771 }
5772}
5773
5774/// A basic LoadAndStorePromoter that does not remove store nodes.
5775class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5776public:
5777 BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5778 Type *ZeroType)
5779 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5780 bool shouldDelete(Instruction *I) const override {
5781 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5782 }
5783
5785 return UndefValue::get(ZeroType);
5786 }
5787
5788private:
5789 Type *ZeroType;
5790};
5791
5792bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5793 // Look through each "partition", looking for slices with the same start/end
5794 // that do not overlap with any before them. The slices are sorted by
5795 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5796 // sophisticated algorithm that takes splittable slices into account.
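  // For illustration (hypothetical offsets): with slices [0,8), [0,8), [8,12)
  // the two [0,8) slices form one group that is promoted together and [8,12)
  // starts a new group, while an overlapping slice such as [4,12) would mark
  // the current group invalid so no promotion happens for it.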
5797 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5798 bool AllSameAndValid = true;
5799 Type *PartitionType = nullptr;
5800 SmallVector<Instruction *, 4> Insts;
5801 uint64_t BeginOffset = 0;
5802 uint64_t EndOffset = 0;
5803
5804 auto Flush = [&]() {
5805 if (AllSameAndValid && !Insts.empty()) {
5806 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5807 << EndOffset << ")\n");
5808 SmallVector<PHINode *, 4> NewPHIs;
5809 SSAUpdater SSA(&NewPHIs);
5810 Insts.push_back(&AI);
5811 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5812 Promoter.run(Insts);
5813 }
5814 AllSameAndValid = true;
5815 PartitionType = nullptr;
5816 Insts.clear();
5817 };
5818
5819 for (Slice &S : AS) {
5820 auto *User = cast<Instruction>(S.getUse()->getUser());
5821 if (isAssumeLikeIntrinsic(User)) {
5822 LLVM_DEBUG({
5823 dbgs() << "Ignoring slice: ";
5824 AS.print(dbgs(), &S);
5825 });
5826 continue;
5827 }
5828 if (S.beginOffset() >= EndOffset) {
5829 Flush();
5830 BeginOffset = S.beginOffset();
5831 EndOffset = S.endOffset();
5832 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5833 if (AllSameAndValid) {
5834 LLVM_DEBUG({
5835 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5836 << EndOffset << ")";
5837 AS.print(dbgs(), &S);
5838 });
5839 AllSameAndValid = false;
5840 }
5841 EndOffset = std::max(EndOffset, S.endOffset());
5842 continue;
5843 }
5844
5845 if (auto *LI = dyn_cast<LoadInst>(User)) {
5846 Type *UserTy = LI->getType();
5847 // LoadAndStorePromoter requires all the types to be the same.
5848 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5849 AllSameAndValid = false;
5850 PartitionType = UserTy;
5851 Insts.push_back(User);
5852 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5853 Type *UserTy = SI->getValueOperand()->getType();
5854 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5855 AllSameAndValid = false;
5856 PartitionType = UserTy;
5857 Insts.push_back(User);
5858 } else {
5859 AllSameAndValid = false;
5860 }
5861 }
5862
5863 Flush();
5864 return true;
5865}
5866
5867/// Analyze an alloca for SROA.
5868///
5869/// This analyzes the alloca to ensure we can reason about it, builds
5870/// the slices of the alloca, and then hands it off to be split and
5871/// rewritten as needed.
5872std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5873SROA::runOnAlloca(AllocaInst &AI) {
5874 bool Changed = false;
5875 bool CFGChanged = false;
5876
5877 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5878 ++NumAllocasAnalyzed;
5879
5880 // Special case dead allocas, as they're trivial.
5881 if (AI.use_empty()) {
5882 AI.eraseFromParent();
5883 Changed = true;
5884 return {Changed, CFGChanged};
5885 }
5886 const DataLayout &DL = AI.getDataLayout();
5887
5888 // Skip alloca forms that this analysis can't handle.
5889 auto *AT = AI.getAllocatedType();
5890 TypeSize Size = DL.getTypeAllocSize(AT);
5891 if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
5892 Size.getFixedValue() == 0)
5893 return {Changed, CFGChanged};
5894
5895 // First, split any FCA loads and stores touching this alloca to promote
5896 // better splitting and promotion opportunities.
5897 IRBuilderTy IRB(&AI);
5898 AggLoadStoreRewriter AggRewriter(DL, IRB);
5899 Changed |= AggRewriter.rewrite(AI);
5900
5901 // Build the slices using a recursive instruction-visiting builder.
5902 AllocaSlices AS(DL, AI);
5903 LLVM_DEBUG(AS.print(dbgs()));
5904 if (AS.isEscaped())
5905 return {Changed, CFGChanged};
5906
5907 if (AS.isEscapedReadOnly()) {
5908 Changed |= propagateStoredValuesToLoads(AI, AS);
5909 return {Changed, CFGChanged};
5910 }
5911
5912 // Delete all the dead users of this alloca before splitting and rewriting it.
5913 for (Instruction *DeadUser : AS.getDeadUsers()) {
5914 // Free up everything used by this instruction.
5915 for (Use &DeadOp : DeadUser->operands())
5916 clobberUse(DeadOp);
5917
5918 // Now replace the uses of this instruction.
5919 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5920
5921 // And mark it for deletion.
5922 DeadInsts.push_back(DeadUser);
5923 Changed = true;
5924 }
5925 for (Use *DeadOp : AS.getDeadOperands()) {
5926 clobberUse(*DeadOp);
5927 Changed = true;
5928 }
5929
5930 // No slices to split. Leave the dead alloca for a later pass to clean up.
5931 if (AS.begin() == AS.end())
5932 return {Changed, CFGChanged};
5933
5934 Changed |= splitAlloca(AI, AS);
5935
5936 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5937 while (!SpeculatablePHIs.empty())
5938 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5939
5940 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5941 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5942 while (!RemainingSelectsToRewrite.empty()) {
5943 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5944 CFGChanged |=
5945 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5946 }
5947
5948 return {Changed, CFGChanged};
5949}
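// --- Illustrative sketch (editorial addition, not from the LLVM sources) ---
// Rough picture of what one runOnAlloca iteration sets up for a two-field
// aggregate (hypothetical IR; the field types are chosen arbitrarily):
//
//   %pair = alloca { i32, i64 }      ; original alloca
//   ; after splitting, its accesses are rewritten onto one alloca per field:
//   %pair.0 = alloca i32
//   %pair.1 = alloca i64
//
// The split allocas are queued for promotion, and the mem2reg step in
// promoteAllocas below then turns their loads and stores into SSA values.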
5950
5951/// Delete the dead instructions accumulated in this run.
5952///
5953/// Recursively deletes the dead instructions we've accumulated. This is done
5954/// at the very end to maximize locality of the recursive delete and to
5955/// minimize the problems of invalidated instruction pointers as such pointers
5956/// are used heavily in the intermediate stages of the algorithm.
5957///
5958/// We also record the alloca instructions deleted here so that they aren't
5959/// subsequently handed to mem2reg to promote.
5960bool SROA::deleteDeadInstructions(
5961 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5962 bool Changed = false;
5963 while (!DeadInsts.empty()) {
5964 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5965 if (!I)
5966 continue;
5967 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5968
5969 // If the instruction is an alloca, find the possible dbg.declare connected
5970 // to it, and remove it too. We must do this before calling RAUW or we will
5971 // not be able to find it.
5972 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5973 DeletedAllocas.insert(AI);
5974 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5975 OldDII->eraseFromParent();
5976 }
5977
5978    at::deleteAssignmentMarkers(I);
5979 I->replaceAllUsesWith(UndefValue::get(I->getType()));
5980
5981 for (Use &Operand : I->operands())
5982 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5983 // Zero out the operand and see if it becomes trivially dead.
5984        Operand = nullptr;
5985        if (isInstructionTriviallyDead(U))
5986 DeadInsts.push_back(U);
5987 }
5988
5989 ++NumDeleted;
5990 I->eraseFromParent();
5991 Changed = true;
5992 }
5993 return Changed;
5994}
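// --- Illustrative sketch (editorial addition, not from the LLVM sources) ---
// The operand-nulling loop above lets dead chains collapse without explicit
// recursion. For a hypothetical dead load fed by a single-use GEP:
//
//   %g = getelementptr i8, ptr %base, i64 4
//   %v = load i32, ptr %g   ; popped from DeadInsts and erased first; zeroing
//                           ; its pointer operand leaves %g with no users, so
//                           ; %g is pushed onto DeadInsts and erased next.
//
// Instructions erased earlier show up as null handles and are skipped by the
// dyn_cast_or_null check at the top of the loop.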
5995/// Promote the allocas, using the best available technique.
5996///
5997/// This attempts to promote whatever allocas have been identified as viable in
5998/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5999/// This function returns whether any promotion occurred.
6000bool SROA::promoteAllocas() {
6001 if (PromotableAllocas.empty())
6002 return false;
6003
6004 if (SROASkipMem2Reg) {
6005 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
6006 } else {
6007 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
6008 NumPromoted += PromotableAllocas.size();
6009 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
6010 }
6011
6012 PromotableAllocas.clear();
6013 return true;
6014}
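// --- Usage sketch (editorial addition, not from the LLVM sources) ----------
// promoteAllocas defers to the same mem2reg utility that can be called
// directly. A minimal standalone sketch, assuming a Function F and a
// DominatorTree DT are already available:
//
//   #include "llvm/Transforms/Utils/PromoteMemToReg.h"
//
//   SmallVector<AllocaInst *, 8> Allocas;
//   for (Instruction &I : F.getEntryBlock())
//     if (auto *AI = dyn_cast<AllocaInst>(&I))
//       if (isAllocaPromotable(AI))
//         Allocas.push_back(AI);
//   if (!Allocas.empty())
//     PromoteMemToReg(Allocas, DT);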
6015
6016std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
6017 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
6018
6019 const DataLayout &DL = F.getDataLayout();
6020 BasicBlock &EntryBB = F.getEntryBlock();
6021 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
6022 I != E; ++I) {
6023 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
6024      if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
6025          isAllocaPromotable(AI))
6026 PromotableAllocas.insert(AI);
6027 else
6028 Worklist.insert(AI);
6029 }
6030 }
6031
6032 bool Changed = false;
6033 bool CFGChanged = false;
6034 // A set of deleted alloca instruction pointers which should be removed from
6035 // the list of promotable allocas.
6036 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
6037
6038 do {
6039 while (!Worklist.empty()) {
6040 auto [IterationChanged, IterationCFGChanged] =
6041 runOnAlloca(*Worklist.pop_back_val());
6042 Changed |= IterationChanged;
6043 CFGChanged |= IterationCFGChanged;
6044
6045 Changed |= deleteDeadInstructions(DeletedAllocas);
6046
6047 // Remove the deleted allocas from various lists so that we don't try to
6048 // continue processing them.
6049 if (!DeletedAllocas.empty()) {
6050 Worklist.set_subtract(DeletedAllocas);
6051 PostPromotionWorklist.set_subtract(DeletedAllocas);
6052 PromotableAllocas.set_subtract(DeletedAllocas);
6053 DeletedAllocas.clear();
6054 }
6055 }
6056
6057 Changed |= promoteAllocas();
6058
6059 Worklist = PostPromotionWorklist;
6060 PostPromotionWorklist.clear();
6061 } while (!Worklist.empty());
6062
6063 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6064 assert((!CFGChanged || !PreserveCFG) &&
6065 "Should not have modified the CFG when told to preserve it.");
6066
6067 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
6068    for (auto &BB : F) {
6069      RemoveRedundantDbgInstrs(&BB);
6070 }
6071 }
6072
6073 return {Changed, CFGChanged};
6074}
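// --- Usage sketch (editorial addition, not from the LLVM sources) ----------
// Typical way to run this pass under the new pass manager, assuming the
// usual PassBuilder/analysis-manager setup exists elsewhere:
//
//   FunctionPassManager FPM;
//   FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); // or SROAOptions::ModifyCFG
//   FPM.run(F, FAM);   // F: the Function, FAM: a populated FunctionAnalysisManager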
6075
6076PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6077  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
6078  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
6079 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6080 auto [Changed, CFGChanged] =
6081 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6082 if (!Changed)
6083 return PreservedAnalyses::all();
6084  PreservedAnalyses PA;
6085  if (!CFGChanged)
6086    PA.preserveSet<CFGAnalyses>();
6087  PA.preserve<DominatorTreeAnalysis>();
6088 return PA;
6089}
6090
6091void SROAPass::printPipeline(
6092 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6093 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6094 OS, MapClassName2PassName);
6095 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6096 : "<modify-cfg>");
6097}
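// --- Editorial note (not from the LLVM sources) -----------------------------
// The string printed above round-trips through the textual pipeline parser,
// so the two configurations can be requested from opt directly, e.g.:
//
//   opt -passes='sroa<preserve-cfg>' -S input.ll
//   opt -passes='sroa<modify-cfg>'   -S input.ll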
6098
6099SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6100
6101namespace {
6102
6103/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6104class SROALegacyPass : public FunctionPass {
6105  SROAOptions PreserveCFG;
6106
6107public:
6108 static char ID;
6109
6110  SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6111      : FunctionPass(ID), PreserveCFG(PreserveCFG) {
6112    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6113 }
6114
6115 bool runOnFunction(Function &F) override {
6116 if (skipFunction(F))
6117 return false;
6118
6119 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6120 AssumptionCache &AC =
6121 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6122 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6123 auto [Changed, _] =
6124 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6125 return Changed;
6126 }
6127
6128 void getAnalysisUsage(AnalysisUsage &AU) const override {
6129 AU.addRequired<AssumptionCacheTracker>();
6130 AU.addRequired<DominatorTreeWrapperPass>();
6131 AU.addPreserved<GlobalsAAWrapperPass>();
6132 AU.addPreserved<DominatorTreeWrapperPass>();
6133 }
6134
6135 StringRef getPassName() const override { return "SROA"; }
6136};
6137
6138} // end anonymous namespace
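// --- Usage sketch (editorial addition, not from the LLVM sources) ----------
// The legacy wrapper is normally constructed through createSROAPass (defined
// below) and driven by the legacy pass manager, e.g.:
//
//   legacy::PassManager PM;
//   PM.add(createSROAPass(/*PreserveCFG=*/true));
//   PM.run(M);                                   // M: the Module to optimize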
6139
6140char SROALegacyPass::ID = 0;
6141
6142FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6143  return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6144                                        : SROAOptions::ModifyCFG);
6145}
6146
6147INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6148                      "Scalar Replacement Of Aggregates", false, false)
6149INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6150INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6151INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6152                    false, false)