1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well-known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
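///
/// As a purely illustrative example (invented IR, not from any test), an
/// aggregate alloca such as
///   %agg = alloca { i32, i32 }
///   %f1 = getelementptr inbounds { i32, i32 }, ptr %agg, i32 0, i32 1
///   store i32 7, ptr %f1
///   %v = load i32, ptr %f1
/// can have its second element replaced by a plain SSA value, so the load
/// through %f1 simply becomes the constant 7 once promotion has run.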
22///
23//===----------------------------------------------------------------------===//
24
25#include "llvm/Transforms/Scalar/SROA.h"
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
76#include "llvm/Pass.h"
80#include "llvm/Support/Debug.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <queue>
95#include <string>
96#include <tuple>
97#include <utility>
98#include <variant>
99#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "sroa"
104
105STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
106STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
107STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
108STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
109STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
110STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
111STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
112STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
113STATISTIC(NumLoadsPredicated,
114 "Number of loads rewritten into predicated loads to allow promotion");
115STATISTIC(
116 NumStoresPredicated,
117 "Number of stores rewritten into predicated stores to allow promotion");
118STATISTIC(NumDeleted, "Number of instructions deleted");
119STATISTIC(NumVectorized, "Number of vectorized aggregates");
120
121/// Disable running mem2reg during SROA in order to test or debug SROA.
122static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
123 cl::Hidden);
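// For example (assuming a standard `opt` binary), this flag can be exercised
// with something like:
//   opt -passes=sroa -sroa-skip-mem2reg -S input.ll
// which performs the slicing and rewriting below but leaves the resulting
// allocas unpromoted.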
124namespace {
125
126class AllocaSliceRewriter;
127class AllocaSlices;
128class Partition;
129
130class SelectHandSpeculativity {
131 unsigned char Storage = 0; // None are speculatable by default.
132 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
133 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
134public:
135 SelectHandSpeculativity() = default;
136 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
137 bool isSpeculatable(bool isTrueVal) const;
138 bool areAllSpeculatable() const;
139 bool areAnySpeculatable() const;
140 bool areNoneSpeculatable() const;
141 // For interop as int half of PointerIntPair.
142 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
143 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
144};
145static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
146
147using PossiblySpeculatableLoad =
148 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
149using UnspeculatableStore = StoreInst *;
150using RewriteableMemOp =
151 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
152using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
153
154/// An optimization pass providing Scalar Replacement of Aggregates.
155///
156/// This pass takes allocations which can be completely analyzed (that is, they
157/// don't escape) and tries to turn them into scalar SSA values. There are
158/// a few steps to this process.
159///
160/// 1) It takes allocations of aggregates and analyzes the ways in which they
161/// are used to try to split them into smaller allocations, ideally of
162/// a single scalar data type. It will split up memcpy and memset accesses
163/// as necessary and try to isolate individual scalar accesses.
164/// 2) It will transform accesses into forms which are suitable for SSA value
165/// promotion. This can be replacing a memset with a scalar store of an
166/// integer value, or it can involve speculating operations on a PHI or
167/// select to be a PHI or select of the results.
168/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
169/// onto insert and extract operations on a vector value, and convert them to
170/// this form. By doing so, it will enable promotion of vector aggregates to
171/// SSA vector values.
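///
/// As a small, purely illustrative sketch of step 2 (invented IR): a memset
/// covering exactly one i32 element of an alloca, such as
///   call void @llvm.memset.p0.i64(ptr %elt, i8 0, i64 4, i1 false)
/// can be rewritten into the equivalent scalar store
///   store i32 0, ptr %elt
/// which mem2reg can subsequently promote along with the other accesses.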
172class SROA {
173 LLVMContext *const C;
174 DomTreeUpdater *const DTU;
175 AssumptionCache *const AC;
176 const bool PreserveCFG;
177
178 /// Worklist of alloca instructions to simplify.
179 ///
180 /// Each alloca in the function is added to this. Each new alloca formed gets
181 /// added to it as well to recursively simplify unless that alloca can be
182 /// directly promoted. Finally, each time we rewrite a use of an alloca other
183 /// than the one being actively rewritten, we add it back onto the list if not
184 /// already present to ensure it is re-visited.
185 SmallSetVector<AllocaInst *, 16> Worklist;
186
187 /// A collection of instructions to delete.
188 /// We try to batch deletions to simplify code and make things a bit more
189 /// efficient. We also make sure there are no dangling pointers.
190 SmallVector<WeakVH, 8> DeadInsts;
191
192 /// Post-promotion worklist.
193 ///
194 /// Sometimes we discover an alloca which has a high probability of becoming
195 /// viable for SROA after a round of promotion takes place. In those cases,
196 /// the alloca is enqueued here for re-processing.
197 ///
198 /// Note that we have to be very careful to clear allocas out of this list in
199 /// the event they are deleted.
200 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
201
202 /// A collection of alloca instructions we can directly promote.
203 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
204 SmallPtrSet<AllocaInst *, 16>, 16>
205 PromotableAllocas;
206
207 /// A worklist of PHIs to speculate prior to promoting allocas.
208 ///
209 /// All of these PHIs have been checked for the safety of speculation and by
210 /// being speculated will allow promoting allocas currently in the promotable
211 /// queue.
212 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
213
214 /// A worklist of select instructions to rewrite prior to promoting
215 /// allocas.
216 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
217
218 /// Select instructions that use an alloca and are subsequently loaded can be
219 /// rewritten to load both input pointers and then select between the results,
220 /// allowing the load of the alloca to be promoted.
221 /// From this:
222 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
223 /// %V = load <type>, ptr %P2
224 /// to:
225 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
226 /// %V2 = load <type>, ptr %Other
227 /// %V = select i1 %cond, <type> %V1, <type> %V2
228 ///
229 /// We can do this to a select if its only uses are loads
230 /// and if either operand of the select can be loaded unconditionally,
231 /// or if we are allowed to perform CFG modifications.
232 /// If an intervening bitcast with a single use of the load is found,
233 /// allow the promotion.
234 static std::optional<RewriteableMemOps>
235 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
236
237public:
238 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
239 SROAOptions PreserveCFG_)
240 : C(C), DTU(DTU), AC(AC),
241 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
242
243 /// Main run method used by both the SROAPass and by the legacy pass.
244 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
245
246private:
247 friend class AllocaSliceRewriter;
248
249 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
250 AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
251 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
252 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
253 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
254 void clobberUse(Use &U);
255 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
256 bool promoteAllocas();
257};
258
259} // end anonymous namespace
260
261/// Calculate the fragment of a variable to use when slicing a store
262/// based on the slice dimensions, existing fragment, and base storage
263/// fragment.
264/// Results:
265/// UseFrag - Use Target as the new fragment.
266/// UseNoFrag - The new slice already covers the whole variable.
267/// Skip - The new alloca slice doesn't include this variable.
268/// FIXME: Can we use calculateFragmentIntersect instead?
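/// A hypothetical worked example (numbers invented for illustration): for a
/// 64-bit variable whose base StorageFragment covers bits [0, 32), a new
/// storage slice of 16 bits at bit offset 8 produces
///   Target.OffsetInBits = 8 + 0 = 8, Target.SizeInBits = min(16, 32) = 16,
/// and the result is UseFrag as long as Target fits wholly inside any
/// pre-existing CurrentFragment of the expression.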
269namespace {
270enum FragCalcResult { UseFrag, UseNoFrag, Skip };
271}
272static FragCalcResult
273calculateFragment(DILocalVariable *Variable,
274 uint64_t NewStorageSliceOffsetInBits,
275 uint64_t NewStorageSliceSizeInBits,
276 std::optional<DIExpression::FragmentInfo> StorageFragment,
277 std::optional<DIExpression::FragmentInfo> CurrentFragment,
278 DIExpression::FragmentInfo &Target) {
279 // If the base storage describes part of the variable apply the offset and
280 // the size constraint.
281 if (StorageFragment) {
282 Target.SizeInBits =
283 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
284 Target.OffsetInBits =
285 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
286 } else {
287 Target.SizeInBits = NewStorageSliceSizeInBits;
288 Target.OffsetInBits = NewStorageSliceOffsetInBits;
289 }
290
291 // If this slice extracts the entirety of an independent variable from a
292 // larger alloca, do not produce a fragment expression, as the variable is
293 // not fragmented.
294 if (!CurrentFragment) {
295 if (auto Size = Variable->getSizeInBits()) {
296 // Treat the current fragment as covering the whole variable.
297 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
298 if (Target == CurrentFragment)
299 return UseNoFrag;
300 }
301 }
302
303 // No additional work to do if there isn't a fragment already, or there is
304 // but it already exactly describes the new assignment.
305 if (!CurrentFragment || *CurrentFragment == Target)
306 return UseFrag;
307
308 // Reject the target fragment if it doesn't fit wholly within the current
309 // fragment. TODO: We could instead chop up the target to fit in the case of
310 // a partial overlap.
311 if (Target.startInBits() < CurrentFragment->startInBits() ||
312 Target.endInBits() > CurrentFragment->endInBits())
313 return Skip;
314
315 // Target fits within the current fragment, return it.
316 return UseFrag;
317}
318
319static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
320 return DebugVariable(DVR->getVariable(), std::nullopt,
321 DVR->getDebugLoc().getInlinedAt());
322}
323
324/// Find linked dbg.assign and generate a new one with the correct
325/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
326/// value component is copied from the old dbg.assign to the new.
327/// \param OldAlloca Alloca for the variable before splitting.
328/// \param IsSplit True if the store (not necessarily alloca)
329/// is being split.
330/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
331/// \param SliceSizeInBits New number of bits being written to.
332/// \param OldInst Instruction that is being split.
333/// \param Inst New instruction performing this part of the
334/// split store.
335/// \param Dest Store destination.
336/// \param Value Stored value.
337/// \param DL Datalayout.
338static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
339 uint64_t OldAllocaOffsetInBits,
340 uint64_t SliceSizeInBits, Instruction *OldInst,
341 Instruction *Inst, Value *Dest, Value *Value,
342 const DataLayout &DL) {
343 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
344 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
345 if (DVRAssignMarkerRange.empty())
346 return;
347
348 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
349 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
350 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
351 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
352 << "\n");
353 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
354 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
355 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
356 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
357 if (Value)
358 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
359
360 /// Map of aggregate variables to their fragment associated with OldAlloca.
361 SmallDenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
362 BaseFragments;
363 for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
364 BaseFragments[getAggregateVariable(DVR)] =
365 DVR->getExpression()->getFragmentInfo();
366
367 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
368 // one). It shouldn't already have one: assert this assumption.
369 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
370 DIAssignID *NewID = nullptr;
371 auto &Ctx = Inst->getContext();
372 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
373 assert(OldAlloca->isStaticAlloca());
374
375 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
376 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
377 << "\n");
378 auto *Expr = DbgAssign->getExpression();
379 bool SetKillLocation = false;
380
381 if (IsSplit) {
382 std::optional<DIExpression::FragmentInfo> BaseFragment;
383 {
384 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
385 if (R == BaseFragments.end())
386 return;
387 BaseFragment = R->second;
388 }
389 std::optional<DIExpression::FragmentInfo> CurrentFragment =
390 Expr->getFragmentInfo();
391 DIExpression::FragmentInfo NewFragment;
392 FragCalcResult Result = calculateFragment(
393 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
394 BaseFragment, CurrentFragment, NewFragment);
395
396 if (Result == Skip)
397 return;
398 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
399 if (CurrentFragment) {
400 // Rewrite NewFragment to be relative to the existing one (this is
401 // what createFragmentExpression wants). CalculateFragment has
402 // already resolved the size for us. FIXME: Should it return the
403 // relative fragment too?
404 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
405 }
406 // Add the new fragment info to the existing expression if possible.
407 if (auto E = DIExpression::createFragmentExpression(
408 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
409 Expr = *E;
410 } else {
411 // Otherwise, add the new fragment info to an empty expression and
412 // discard the value component of this dbg.assign as the value cannot
413 // be computed with the new fragment.
414 Expr = *DIExpression::createFragmentExpression(
415 DIExpression::get(Expr->getContext(), {}),
416 NewFragment.OffsetInBits, NewFragment.SizeInBits);
417 SetKillLocation = true;
418 }
419 }
420 }
421
422 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
423 if (!NewID) {
424 NewID = DIAssignID::getDistinct(Ctx);
425 Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
426 }
427
428 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
429 DbgVariableRecord *NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
430 DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
431 Dest, DIExpression::get(Expr->getContext(), {}),
432 DbgAssign->getDebugLoc())));
433
434 // If we've updated the value but the original dbg.assign has an arglist
435 // then kill it now - we can't use the requested new value.
436 // We can't replace the DIArgList with the new value as it'd leave
437 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
438 // an arglist). And we can't keep the DIArgList in case the linked store
439 // is being split - in which case the DIArgList + expression may no longer
440 // be computing the correct value.
441 // This should be a very rare situation as it requires the value being
442 // stored to differ from the dbg.assign (i.e., the value has been
443 // represented differently in the debug intrinsic for some reason).
444 SetKillLocation |=
445 Value && (DbgAssign->hasArgList() ||
446 !DbgAssign->getExpression()->isSingleLocationExpression());
447 if (SetKillLocation)
448 NewAssign->setKillLocation();
449
450 // We could use more precision here at the cost of some additional (code)
451 // complexity - if the original dbg.assign was adjacent to its store, we
452 // could position this new dbg.assign adjacent to its store rather than the
453 // old dbg.assign. That would result in interleaved dbg.assigns rather than
454 // what we get now:
455 // split store !1
456 // split store !2
457 // dbg.assign !1
458 // dbg.assign !2
459 // This (current behaviour) results in debug assignments being
460 // noted as slightly offset (in code) from the store. In practice this
461 // should have little effect on the debugging experience due to the fact
462 // that all the split stores should get the same line number.
463 NewAssign->moveBefore(DbgAssign->getIterator());
464
465 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
466 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
467 };
468
469 for_each(DVRAssignMarkerRange, MigrateDbgAssign);
470}
471
472namespace {
473
474/// A custom IRBuilder inserter which prefixes all names, but only in
475/// Assert builds.
476class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
477 std::string Prefix;
478
479 Twine getNameWithPrefix(const Twine &Name) const {
480 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
481 }
482
483public:
484 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
485
486 void InsertHelper(Instruction *I, const Twine &Name,
487 BasicBlock::iterator InsertPt) const override {
488 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
489 InsertPt);
490 }
491};
492
493/// Provide a type for IRBuilder that drops names in release builds.
494using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
495
496/// A used slice of an alloca.
497///
498/// This structure represents a slice of an alloca used by some instruction. It
499/// stores both the begin and end offsets of this use, a pointer to the use
500/// itself, and a flag indicating whether we can classify the use as splittable
501/// or not when forming partitions of the alloca.
502class Slice {
503 /// The beginning offset of the range.
504 uint64_t BeginOffset = 0;
505
506 /// The ending offset, not included in the range.
507 uint64_t EndOffset = 0;
508
509 /// Storage for both the use of this slice and whether it can be
510 /// split.
511 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
512
513public:
514 Slice() = default;
515
516 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
517 : BeginOffset(BeginOffset), EndOffset(EndOffset),
518 UseAndIsSplittable(U, IsSplittable) {}
519
520 uint64_t beginOffset() const { return BeginOffset; }
521 uint64_t endOffset() const { return EndOffset; }
522
523 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
524 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
525
526 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
527
528 bool isDead() const { return getUse() == nullptr; }
529 void kill() { UseAndIsSplittable.setPointer(nullptr); }
530
531 /// Support for ordering ranges.
532 ///
533 /// This provides an ordering over ranges such that start offsets are
534 /// always increasing, and within equal start offsets, the end offsets are
535 /// decreasing. Thus the spanning range comes first in a cluster with the
536 /// same start position.
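/// For example, slices beginning at offsets 0, 0, and 4 with end offsets
/// 16, 8, and 8 respectively sort as [0, 16), [0, 8), [4, 8): the spanning
/// slice leads its cluster. At equal start offsets an unsplittable slice
/// additionally sorts before a splittable one (see the comparison below).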
537 bool operator<(const Slice &RHS) const {
538 if (beginOffset() < RHS.beginOffset())
539 return true;
540 if (beginOffset() > RHS.beginOffset())
541 return false;
542 if (isSplittable() != RHS.isSplittable())
543 return !isSplittable();
544 if (endOffset() > RHS.endOffset())
545 return true;
546 return false;
547 }
548
549 /// Support comparison with a single offset to allow binary searches.
550 friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
551 uint64_t RHSOffset) {
552 return LHS.beginOffset() < RHSOffset;
553 }
554 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
555 const Slice &RHS) {
556 return LHSOffset < RHS.beginOffset();
557 }
558
559 bool operator==(const Slice &RHS) const {
560 return isSplittable() == RHS.isSplittable() &&
561 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
562 }
563 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
564};
565
566/// Representation of the alloca slices.
567///
568/// This class represents the slices of an alloca which are formed by its
569/// various uses. If a pointer escapes, we can't fully build a representation
570/// for the slices used and we reflect that in this structure. The uses are
571/// stored, sorted by increasing beginning offset and with unsplittable slices
572/// starting at a particular offset before splittable slices.
573class AllocaSlices {
574public:
575 /// Construct the slices of a particular alloca.
576 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
577
578 /// Test whether a pointer to the allocation escapes our analysis.
579 ///
580 /// If this is true, the slices are never fully built and should be
581 /// ignored.
582 bool isEscaped() const { return PointerEscapingInstr; }
583 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
584
585 /// Support for iterating over the slices.
586 /// @{
587 using iterator = SmallVectorImpl<Slice>::iterator;
588 using range = iterator_range<iterator>;
589
590 iterator begin() { return Slices.begin(); }
591 iterator end() { return Slices.end(); }
592
593 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
594 using const_range = iterator_range<const_iterator>;
595
596 const_iterator begin() const { return Slices.begin(); }
597 const_iterator end() const { return Slices.end(); }
598 /// @}
599
600 /// Erase a range of slices.
601 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
602
603 /// Insert new slices for this alloca.
604 ///
605 /// This moves the slices into the alloca's slices collection, and re-sorts
606 /// everything so that the usual ordering properties of the alloca's slices
607 /// hold.
608 void insert(ArrayRef<Slice> NewSlices) {
609 int OldSize = Slices.size();
610 Slices.append(NewSlices.begin(), NewSlices.end());
611 auto SliceI = Slices.begin() + OldSize;
612 std::stable_sort(SliceI, Slices.end());
613 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
614 }
615
616 // Forward declare the iterator and range accessor for walking the
617 // partitions.
618 class partition_iterator;
619 iterator_range<partition_iterator> partitions();
620
621 /// Access the dead users for this alloca.
622 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
623
624 /// Access Uses that should be dropped if the alloca is promotable.
625 ArrayRef<Use *> getDeadUsesIfPromotable() const {
626 return DeadUseIfPromotable;
627 }
628
629 /// Access the dead operands referring to this alloca.
630 ///
631 /// These are operands which cannot actually be used to refer to the
632 /// alloca as they are outside its range and the user doesn't correct for
633 /// that. These mostly consist of PHI node inputs and the like which we just
634 /// need to replace with poison.
635 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
636
637#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
638 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
639 void printSlice(raw_ostream &OS, const_iterator I,
640 StringRef Indent = " ") const;
641 void printUse(raw_ostream &OS, const_iterator I,
642 StringRef Indent = " ") const;
643 void print(raw_ostream &OS) const;
644 void dump(const_iterator I) const;
645 void dump() const;
646#endif
647
648private:
649 template <typename DerivedT, typename RetT = void> class BuilderBase;
650 class SliceBuilder;
651
652 friend class AllocaSlices::SliceBuilder;
653
654#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
655 /// Handle to alloca instruction to simplify method interfaces.
656 AllocaInst &AI;
657#endif
658
659 /// The instruction responsible for this alloca not having a known set
660 /// of slices.
661 ///
662 /// When an instruction (potentially) escapes the pointer to the alloca, we
663 /// store a pointer to that here and abort trying to form slices of the
664 /// alloca. This will be null if the alloca slices are analyzed successfully.
665 Instruction *PointerEscapingInstr;
666 Instruction *PointerEscapingInstrReadOnly;
667
668 /// The slices of the alloca.
669 ///
670 /// We store a vector of the slices formed by uses of the alloca here. This
671 /// vector is sorted by increasing begin offset, and then the unsplittable
672 /// slices before the splittable ones. See the Slice inner class for more
673 /// details.
674 SmallVector<Slice, 8> Slices;
675
676 /// Instructions which will become dead if we rewrite the alloca.
677 ///
678 /// Note that these are not separated by slice. This is because we expect an
679 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
680 /// all these instructions can simply be removed and replaced with poison as
681 /// they come from outside of the allocated space.
682 SmallVector<Instruction *, 8> DeadUsers;
683
684 /// Uses which will become dead if we can promote the alloca.
685 SmallVector<Use *, 8> DeadUseIfPromotable;
686
687 /// Operands which will become dead if we rewrite the alloca.
688 ///
689 /// These are operands that in their particular use can be replaced with
690 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
691 /// to PHI nodes and the like. They aren't entirely dead (there might be
692 /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
693 /// want to swap this particular input for poison to simplify the use lists of
694 /// the alloca.
695 SmallVector<Use *, 8> DeadOperands;
696};
697
698/// A partition of the slices.
699///
700/// An ephemeral representation for a range of slices which can be viewed as
701/// a partition of the alloca. This range represents a span of the alloca's
702/// memory which cannot be split, and provides access to all of the slices
703/// overlapping some part of the partition.
704///
705/// Objects of this type are produced by traversing the alloca's slices, but
706/// are only ephemeral and not persistent.
707class Partition {
708private:
709 friend class AllocaSlices;
710 friend class AllocaSlices::partition_iterator;
711
712 using iterator = AllocaSlices::iterator;
713
714 /// The beginning and ending offsets of the alloca for this
715 /// partition.
716 uint64_t BeginOffset = 0, EndOffset = 0;
717
718 /// The start and end iterators of this partition.
719 iterator SI, SJ;
720
721 /// A collection of split slice tails overlapping the partition.
722 SmallVector<Slice *, 4> SplitTails;
723
724 /// Raw constructor builds an empty partition starting and ending at
725 /// the given iterator.
726 Partition(iterator SI) : SI(SI), SJ(SI) {}
727
728public:
729 /// The start offset of this partition.
730 ///
731 /// All of the contained slices start at or after this offset.
732 uint64_t beginOffset() const { return BeginOffset; }
733
734 /// The end offset of this partition.
735 ///
736 /// All of the contained slices end at or before this offset.
737 uint64_t endOffset() const { return EndOffset; }
738
739 /// The size of the partition.
740 ///
741 /// Note that this can never be zero.
742 uint64_t size() const {
743 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
744 return EndOffset - BeginOffset;
745 }
746
747 /// Test whether this partition contains no slices, and merely spans
748 /// a region occupied by split slices.
749 bool empty() const { return SI == SJ; }
750
751 /// \name Iterate slices that start within the partition.
752 /// These may be splittable or unsplittable. They have a begin offset >= the
753 /// partition begin offset.
754 /// @{
755 // FIXME: We should probably define a "concat_iterator" helper and use that
756 // to stitch together pointee_iterators over the split tails and the
757 // contiguous iterators of the partition. That would give a much nicer
758 // interface here. We could then additionally expose filtered iterators for
759 // split, unsplit, and unsplittable slices based on the usage patterns.
760 iterator begin() const { return SI; }
761 iterator end() const { return SJ; }
762 /// @}
763
764 /// Get the sequence of split slice tails.
765 ///
766 /// These tails are of slices which start before this partition but are
767 /// split and overlap into the partition. We accumulate these while forming
768 /// partitions.
769 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
770};
771
772} // end anonymous namespace
773
774/// An iterator over partitions of the alloca's slices.
775///
776/// This iterator implements the core algorithm for partitioning the alloca's
777/// slices. It is a forward iterator as we don't support backtracking for
778/// efficiency reasons, and re-use a single storage area to maintain the
779/// current set of split slices.
780///
781/// It is templated on the slice iterator type to use so that it can operate
782/// with either const or non-const slice iterators.
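/// As a sketch of the traversal on invented input: given slices [0, 4)
/// unsplittable, [0, 16) splittable, and [8, 12) unsplittable, the iterator
/// yields the partitions [0, 4), then [4, 8) (empty, covered only by the
/// split tail of [0, 16)), then [8, 12), and finally [12, 16) (empty).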
783class AllocaSlices::partition_iterator
784 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
785 Partition> {
786 friend class AllocaSlices;
787
788 /// Most of the state for walking the partitions is held in a class
789 /// with a nice interface for examining them.
790 Partition P;
791
792 /// We need to keep the end of the slices to know when to stop.
793 AllocaSlices::iterator SE;
794
795 /// We also need to keep track of the maximum split end offset seen.
796 /// FIXME: Do we really?
797 uint64_t MaxSplitSliceEndOffset = 0;
798
799 /// Sets the partition to be empty at the given iterator, and sets the
800 /// end iterator.
801 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
802 : P(SI), SE(SE) {
803 // If not already at the end, advance our state to form the initial
804 // partition.
805 if (SI != SE)
806 advance();
807 }
808
809 /// Advance the iterator to the next partition.
810 ///
811 /// Requires that the iterator not be at the end of the slices.
812 void advance() {
813 assert((P.SI != SE || !P.SplitTails.empty()) &&
814 "Cannot advance past the end of the slices!");
815
816 // Clear out any split uses which have ended.
817 if (!P.SplitTails.empty()) {
818 if (P.EndOffset >= MaxSplitSliceEndOffset) {
819 // If we've finished all splits, this is easy.
820 P.SplitTails.clear();
821 MaxSplitSliceEndOffset = 0;
822 } else {
823 // Remove the uses which have ended in the prior partition. This
824 // cannot change the max split slice end because we just checked that
825 // the prior partition ended prior to that max.
826 llvm::erase_if(P.SplitTails,
827 [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
828 assert(llvm::any_of(P.SplitTails,
829 [&](Slice *S) {
830 return S->endOffset() == MaxSplitSliceEndOffset;
831 }) &&
832 "Could not find the current max split slice offset!");
833 assert(llvm::all_of(P.SplitTails,
834 [&](Slice *S) {
835 return S->endOffset() <= MaxSplitSliceEndOffset;
836 }) &&
837 "Max split slice end offset is not actually the max!");
838 }
839 }
840
841 // If P.SI is already at the end, then we've cleared the split tail and
842 // now have an end iterator.
843 if (P.SI == SE) {
844 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
845 return;
846 }
847
848 // If we had a non-empty partition previously, set up the state for
849 // subsequent partitions.
850 if (P.SI != P.SJ) {
851 // Accumulate all the splittable slices which started in the old
852 // partition into the split list.
853 for (Slice &S : P)
854 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
855 P.SplitTails.push_back(&S);
856 MaxSplitSliceEndOffset =
857 std::max(S.endOffset(), MaxSplitSliceEndOffset);
858 }
859
860 // Start from the end of the previous partition.
861 P.SI = P.SJ;
862
863 // If P.SI is now at the end, we at most have a tail of split slices.
864 if (P.SI == SE) {
865 P.BeginOffset = P.EndOffset;
866 P.EndOffset = MaxSplitSliceEndOffset;
867 return;
868 }
869
870 // If we have split slices and the next slice is after a gap and is
871 // not splittable, immediately form an empty partition for the split
872 // slices up until the next slice begins.
873 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
874 !P.SI->isSplittable()) {
875 P.BeginOffset = P.EndOffset;
876 P.EndOffset = P.SI->beginOffset();
877 return;
878 }
879 }
880
881 // OK, we need to consume new slices. Set the end offset based on the
882 // current slice, and step SJ past it. The beginning offset of the
883 // partition is the beginning offset of the next slice unless we have
884 // pre-existing split slices that are continuing, in which case we begin
885 // at the prior end offset.
886 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
887 P.EndOffset = P.SI->endOffset();
888 ++P.SJ;
889
890 // There are two strategies to form a partition based on whether the
891 // partition starts with an unsplittable slice or a splittable slice.
892 if (!P.SI->isSplittable()) {
893 // When we're forming an unsplittable region, it must always start at
894 // the first slice and will extend through its end.
895 assert(P.BeginOffset == P.SI->beginOffset());
896
897 // Form a partition including all of the overlapping slices with this
898 // unsplittable slice.
899 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
900 if (!P.SJ->isSplittable())
901 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
902 ++P.SJ;
903 }
904
905 // We have a partition across a set of overlapping unsplittable
906 // slices.
907 return;
908 }
909
910 // If we're starting with a splittable slice, then we need to form
911 // a synthetic partition spanning it and any other overlapping splittable
912 // slices.
913 assert(P.SI->isSplittable() && "Forming a splittable partition!");
914
915 // Collect all of the overlapping splittable slices.
916 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
917 P.SJ->isSplittable()) {
918 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
919 ++P.SJ;
920 }
921
922 // Back up P.EndOffset if we ended the span early when encountering an
923 // unsplittable slice. This synthesizes the early end offset of
924 // a partition spanning only splittable slices.
925 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
926 assert(!P.SJ->isSplittable());
927 P.EndOffset = P.SJ->beginOffset();
928 }
929 }
930
931public:
932 bool operator==(const partition_iterator &RHS) const {
933 assert(SE == RHS.SE &&
934 "End iterators don't match between compared partition iterators!");
935
936 // The observed positions of partitions are marked by the P.SI iterator and
937 // the emptiness of the split slices. The latter is only relevant when
938 // P.SI == SE, as the end iterator will additionally have an empty split
939 // slices list, but the prior may have the same P.SI and a tail of split
940 // slices.
941 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
942 assert(P.SJ == RHS.P.SJ &&
943 "Same set of slices formed two different sized partitions!");
944 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
945 "Same slice position with differently sized non-empty split "
946 "slice tails!");
947 return true;
948 }
949 return false;
950 }
951
952 partition_iterator &operator++() {
953 advance();
954 return *this;
955 }
956
957 Partition &operator*() { return P; }
958};
959
960/// A forward range over the partitions of the alloca's slices.
961///
962/// This accesses an iterator range over the partitions of the alloca's
963/// slices. It computes these partitions on the fly based on the overlapping
964/// offsets of the slices and the ability to split them. It will visit "empty"
965/// partitions to cover regions of the alloca only accessed via split
966/// slices.
967iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
968 return make_range(partition_iterator(begin(), end()),
969 partition_iterator(end(), end()));
970}
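// Illustrative only: callers in this file walk the partitions roughly like
// the sketch below, where AS names an AllocaSlices built for some alloca.
//   for (Partition &P : AS.partitions())
//     dbgs() << "partition [" << P.beginOffset() << ", " << P.endOffset()
//            << ") with " << (P.end() - P.begin()) << " slices\n";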
971
972static Value *foldSelectInst(SelectInst &SI) {
973 // If the condition being selected on is a constant or the same value is
974 // being selected between, fold the select. Yes this does (rarely) happen
975 // early on.
976 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
977 return SI.getOperand(1 + CI->isZero());
978 if (SI.getOperand(1) == SI.getOperand(2))
979 return SI.getOperand(1);
980
981 return nullptr;
982}
983
984/// A helper that folds a PHI node or a select.
985static Value *foldPHINodeOrSelectInst(Instruction &I) {
986 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
987 // If PN merges together the same value, return that value.
988 return PN->hasConstantValue();
989 }
990 return foldSelectInst(cast<SelectInst>(I));
991}
992
993/// Builder for the alloca slices.
994///
995/// This class builds a set of alloca slices by recursively visiting the uses
996/// of an alloca and making a slice for each load and store at each offset.
997class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
998 friend class PtrUseVisitor<SliceBuilder>;
999 friend class InstVisitor<SliceBuilder>;
1000
1001 using Base = PtrUseVisitor<SliceBuilder>;
1002
1003 const uint64_t AllocSize;
1004 AllocaSlices &AS;
1005
1006 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1007 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1008
1009 /// Set to de-duplicate dead instructions found in the use walk.
1010 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1011
1012public:
1013 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1014 : PtrUseVisitor<SliceBuilder>(DL),
1015 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
1016 AS(AS) {}
1017
1018private:
1019 void markAsDead(Instruction &I) {
1020 if (VisitedDeadInsts.insert(&I).second)
1021 AS.DeadUsers.push_back(&I);
1022 }
1023
1024 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1025 bool IsSplittable = false) {
1026 // Completely skip uses which have a zero size or start either before or
1027 // past the end of the allocation.
1028 if (Size == 0 || Offset.uge(AllocSize)) {
1029 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1030 << Offset
1031 << " which has zero size or starts outside of the "
1032 << AllocSize << " byte alloca:\n"
1033 << " alloca: " << AS.AI << "\n"
1034 << " use: " << I << "\n");
1035 return markAsDead(I);
1036 }
1037
1038 uint64_t BeginOffset = Offset.getZExtValue();
1039 uint64_t EndOffset = BeginOffset + Size;
1040
1041 // Clamp the end offset to the end of the allocation. Note that this is
1042 // formulated to handle even the case where "BeginOffset + Size" overflows.
1043 // This may appear superficially to be something we could ignore entirely,
1044 // but that is not so! There may be widened loads or PHI-node uses where
1045 // some instructions are dead but not others. We can't completely ignore
1046 // them, and so have to record at least the information here.
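    // As a concrete illustration with hypothetical numbers: for a 16 byte
    // alloca, a use at offset 12 with size 8 is recorded as the clamped
    // slice [12, 16) rather than being discarded.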
1047 assert(AllocSize >= BeginOffset); // Established above.
1048 if (Size > AllocSize - BeginOffset) {
1049 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1050 << Offset << " to remain within the " << AllocSize
1051 << " byte alloca:\n"
1052 << " alloca: " << AS.AI << "\n"
1053 << " use: " << I << "\n");
1054 EndOffset = AllocSize;
1055 }
1056
1057 AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
1058 }
1059
1060 void visitBitCastInst(BitCastInst &BC) {
1061 if (BC.use_empty())
1062 return markAsDead(BC);
1063
1064 return Base::visitBitCastInst(BC);
1065 }
1066
1067 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1068 if (ASC.use_empty())
1069 return markAsDead(ASC);
1070
1071 return Base::visitAddrSpaceCastInst(ASC);
1072 }
1073
1074 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1075 if (GEPI.use_empty())
1076 return markAsDead(GEPI);
1077
1078 return Base::visitGetElementPtrInst(GEPI);
1079 }
1080
1081 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1082 uint64_t Size, bool IsVolatile) {
1083 // We allow splitting of non-volatile loads and stores where the type is an
1084 // integer type. These may be used to implement 'memcpy' or other "transfer
1085 // of bits" patterns.
1086 bool IsSplittable =
1087 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1088
1089 insertUse(I, Offset, Size, IsSplittable);
1090 }
1091
1092 void visitLoadInst(LoadInst &LI) {
1093 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1094 "All simple FCA loads should have been pre-split");
1095
1096 // If there is a load with an unknown offset, we can still perform store
1097 // to load forwarding for other known-offset loads.
1098 if (!IsOffsetKnown)
1099 return PI.setEscapedReadOnly(&LI);
1100
1101 TypeSize Size = DL.getTypeStoreSize(LI.getType());
1102 if (Size.isScalable()) {
1103 unsigned VScale = LI.getFunction()->getVScaleValue();
1104 if (!VScale)
1105 return PI.setAborted(&LI);
1106
1107 Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
1108 }
1109
1110 return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1111 LI.isVolatile());
1112 }
1113
1114 void visitStoreInst(StoreInst &SI) {
1115 Value *ValOp = SI.getValueOperand();
1116 if (ValOp == *U)
1117 return PI.setEscapedAndAborted(&SI);
1118 if (!IsOffsetKnown)
1119 return PI.setAborted(&SI);
1120
1121 TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1122 if (StoreSize.isScalable()) {
1123 unsigned VScale = SI.getFunction()->getVScaleValue();
1124 if (!VScale)
1125 return PI.setAborted(&SI);
1126
1127 StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
1128 }
1129
1130 uint64_t Size = StoreSize.getFixedValue();
1131
1132 // If this memory access can be shown to *statically* extend outside the
1133 // bounds of the allocation, its behavior is undefined, so simply
1134 // ignore it. Note that this is more strict than the generic clamping
1135 // behavior of insertUse. We also try to handle cases which might run the
1136 // risk of overflow.
1137 // FIXME: We should instead consider the pointer to have escaped if this
1138 // function is being instrumented for addressing bugs or race conditions.
1139 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1140 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1141 << Offset << " which extends past the end of the "
1142 << AllocSize << " byte alloca:\n"
1143 << " alloca: " << AS.AI << "\n"
1144 << " use: " << SI << "\n");
1145 return markAsDead(SI);
1146 }
1147
1148 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1149 "All simple FCA stores should have been pre-split");
1150 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1151 }
1152
1153 void visitMemSetInst(MemSetInst &II) {
1154 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1155 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1156 if ((Length && Length->getValue() == 0) ||
1157 (IsOffsetKnown && Offset.uge(AllocSize)))
1158 // Zero-length mem transfer intrinsics can be ignored entirely.
1159 return markAsDead(II);
1160
1161 if (!IsOffsetKnown)
1162 return PI.setAborted(&II);
1163
1164 insertUse(II, Offset,
1165 Length ? Length->getLimitedValue()
1166 : AllocSize - Offset.getLimitedValue(),
1167 (bool)Length);
1168 }
1169
1170 void visitMemTransferInst(MemTransferInst &II) {
1171 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1172 if (Length && Length->getValue() == 0)
1173 // Zero-length mem transfer intrinsics can be ignored entirely.
1174 return markAsDead(II);
1175
1176 // Because we can visit these intrinsics twice, also check whether the
1177 // first visit marked this instruction as dead. If so, skip it.
1178 if (VisitedDeadInsts.count(&II))
1179 return;
1180
1181 if (!IsOffsetKnown)
1182 return PI.setAborted(&II);
1183
1184 // This side of the transfer is completely out-of-bounds, and so we can
1185 // nuke the entire transfer. However, we also need to nuke the other side
1186 // if already added to our partitions.
1187 // FIXME: Yet another place we really should bypass this when
1188 // instrumenting for ASan.
1189 if (Offset.uge(AllocSize)) {
1190 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1191 MemTransferSliceMap.find(&II);
1192 if (MTPI != MemTransferSliceMap.end())
1193 AS.Slices[MTPI->second].kill();
1194 return markAsDead(II);
1195 }
1196
1197 uint64_t RawOffset = Offset.getLimitedValue();
1198 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1199
1200 // Check for the special case where the same exact value is used for both
1201 // source and dest.
1202 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1203 // For non-volatile transfers this is a no-op.
1204 if (!II.isVolatile())
1205 return markAsDead(II);
1206
1207 return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1208 }
1209
1210 // If we have seen both source and destination for a mem transfer, then
1211 // they both point to the same alloca.
1212 bool Inserted;
1213 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1214 std::tie(MTPI, Inserted) =
1215 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1216 unsigned PrevIdx = MTPI->second;
1217 if (!Inserted) {
1218 Slice &PrevP = AS.Slices[PrevIdx];
1219
1220 // Check if the begin offsets match and this is a non-volatile transfer.
1221 // In that case, we can completely elide the transfer.
1222 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1223 PrevP.kill();
1224 return markAsDead(II);
1225 }
1226
1227 // Otherwise we have an offset transfer within the same alloca. We can't
1228 // split those.
1229 PrevP.makeUnsplittable();
1230 }
1231
1232 // Insert the use now that we've fixed up the splittable nature.
1233 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1234
1235 // Check that we ended up with a valid index in the map.
1236 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1237 "Map index doesn't point back to a slice with this user.");
1238 }
1239
1240 // Disable SROA for any intrinsics except for lifetime invariants.
1241 // FIXME: What about debug intrinsics? This matches old behavior, but
1242 // doesn't make sense.
1243 void visitIntrinsicInst(IntrinsicInst &II) {
1244 if (II.isDroppable()) {
1245 AS.DeadUseIfPromotable.push_back(U);
1246 return;
1247 }
1248
1249 if (!IsOffsetKnown)
1250 return PI.setAborted(&II);
1251
1252 if (II.isLifetimeStartOrEnd()) {
1253 insertUse(II, Offset, AllocSize, true);
1254 return;
1255 }
1256
1257 Base::visitIntrinsicInst(II);
1258 }
1259
1260 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1261 // We consider any PHI or select that results in a direct load or store of
1262 // the same offset to be a viable use for slicing purposes. These uses
1263 // are considered unsplittable and the size is the maximum loaded or stored
1264 // size.
1265 SmallPtrSet<Instruction *, 4> Visited;
1266 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1267 Visited.insert(Root);
1268 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1269 const DataLayout &DL = Root->getDataLayout();
1270 // If there are no loads or stores, the access is dead. We mark that as
1271 // a size zero access.
1272 Size = 0;
1273 do {
1274 Instruction *I, *UsedI;
1275 std::tie(UsedI, I) = Uses.pop_back_val();
1276
1277 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1278 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1279 if (LoadSize.isScalable()) {
1280 PI.setAborted(LI);
1281 return nullptr;
1282 }
1283 Size = std::max(Size, LoadSize.getFixedValue());
1284 continue;
1285 }
1286 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1287 Value *Op = SI->getOperand(0);
1288 if (Op == UsedI)
1289 return SI;
1290 TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1291 if (StoreSize.isScalable()) {
1292 PI.setAborted(SI);
1293 return nullptr;
1294 }
1295 Size = std::max(Size, StoreSize.getFixedValue());
1296 continue;
1297 }
1298
1299 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1300 if (!GEP->hasAllZeroIndices())
1301 return GEP;
1302 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1303 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1304 return I;
1305 }
1306
1307 for (User *U : I->users())
1308 if (Visited.insert(cast<Instruction>(U)).second)
1309 Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1310 } while (!Uses.empty());
1311
1312 return nullptr;
1313 }
1314
1315 void visitPHINodeOrSelectInst(Instruction &I) {
1316 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1317 if (I.use_empty())
1318 return markAsDead(I);
1319
1320 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1321 // instructions in this BB, which may be required during rewriting. Bail out
1322 // on these cases.
1323 if (isa<PHINode>(I) &&
1324 I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1325 return PI.setAborted(&I);
1326
1327 // TODO: We could use simplifyInstruction here to fold PHINodes and
1328 // SelectInsts. However, doing so requires changing the current
1329 // dead-operand-tracking mechanism. For instance, suppose neither loading
1330 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1331 // trap either. However, if we simply replace %U with undef using the
1332 // current dead-operand-tracking mechanism, "load (select undef, undef,
1333 // %other)" may trap because the select may return the first operand
1334 // "undef".
1335 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1336 if (Result == *U)
1337 // If the result of the constant fold will be the pointer, recurse
1338 // through the PHI/select as if we had RAUW'ed it.
1339 enqueueUsers(I);
1340 else
1341 // Otherwise the operand to the PHI/select is dead, and we can replace
1342 // it with poison.
1343 AS.DeadOperands.push_back(U);
1344
1345 return;
1346 }
1347
1348 if (!IsOffsetKnown)
1349 return PI.setAborted(&I);
1350
1351 // See if we already have computed info on this node.
1352 uint64_t &Size = PHIOrSelectSizes[&I];
1353 if (!Size) {
1354 // This is a new PHI/Select, check for an unsafe use of it.
1355 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1356 return PI.setAborted(UnsafeI);
1357 }
1358
1359 // For PHI and select operands outside the alloca, we can't nuke the entire
1360 // phi or select -- the other side might still be relevant, so we special
1361 // case them here and use a separate structure to track the operands
1362 // themselves which should be replaced with poison.
1363 // FIXME: This should instead be escaped in the event we're instrumenting
1364 // for address sanitization.
1365 if (Offset.uge(AllocSize)) {
1366 AS.DeadOperands.push_back(U);
1367 return;
1368 }
1369
1370 insertUse(I, Offset, Size);
1371 }
1372
1373 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1374
1375 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1376
1377 /// Disable SROA entirely if there are unhandled users of the alloca.
1378 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1379
1380 void visitCallBase(CallBase &CB) {
1381 // If the call operand is read-only and only does a read-only or address
1382 // capture, then we mark it as EscapedReadOnly.
1383 if (CB.isDataOperand(U) &&
1384 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1385 CB.onlyReadsMemory(U->getOperandNo())) {
1386 PI.setEscapedReadOnly(&CB);
1387 return;
1388 }
1389
1390 Base::visitCallBase(CB);
1391 }
1392};
1393
1394AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1395 :
1396#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1397 AI(AI),
1398#endif
1399 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1400 SliceBuilder PB(DL, AI, *this);
1401 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1402 if (PtrI.isEscaped() || PtrI.isAborted()) {
1403 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1404 // possibly by just storing the PtrInfo in the AllocaSlices.
1405 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1406 : PtrI.getAbortingInst();
1407 assert(PointerEscapingInstr && "Did not track a bad instruction");
1408 return;
1409 }
1410 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1411
1412 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1413
1414 // Sort the uses. This arranges for the offsets to be in ascending order,
1415 // and the sizes to be in descending order.
1416 llvm::stable_sort(Slices);
1417}
1418
1419#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1420
1421void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1422 StringRef Indent) const {
1423 printSlice(OS, I, Indent);
1424 OS << "\n";
1425 printUse(OS, I, Indent);
1426}
1427
1428void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1429 StringRef Indent) const {
1430 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1431 << " slice #" << (I - begin())
1432 << (I->isSplittable() ? " (splittable)" : "");
1433}
1434
1435void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1436 StringRef Indent) const {
1437 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1438}
1439
1440void AllocaSlices::print(raw_ostream &OS) const {
1441 if (PointerEscapingInstr) {
1442 OS << "Can't analyze slices for alloca: " << AI << "\n"
1443 << " A pointer to this alloca escaped by:\n"
1444 << " " << *PointerEscapingInstr << "\n";
1445 return;
1446 }
1447
1448 if (PointerEscapingInstrReadOnly)
1449 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1450
1451 OS << "Slices of alloca: " << AI << "\n";
1452 for (const_iterator I = begin(), E = end(); I != E; ++I)
1453 print(OS, I);
1454}
1455
1456LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1457 print(dbgs(), I);
1458}
1459LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1460
1461#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1462
1463/// Walk the range of a partitioning looking for a common type to cover this
1464/// sequence of slices.
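/// For example (illustrative): if every use spanning the whole partition
/// loads or stores i32, the first element of the returned pair is i32. If
/// the covering uses mix i32 and float, there is no common type, so the
/// first element is null and the second is the widest byte-width integer
/// type seen among the covering uses (here i32).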
1465static std::pair<Type *, IntegerType *>
1466findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1467 uint64_t EndOffset) {
1468 Type *Ty = nullptr;
1469 bool TyIsCommon = true;
1470 IntegerType *ITy = nullptr;
1471
1472 // Note that we need to look at *every* alloca slice's Use to ensure we
1473 // always get consistent results regardless of the order of slices.
1474 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1475 Use *U = I->getUse();
1476 if (isa<IntrinsicInst>(*U->getUser()))
1477 continue;
1478 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1479 continue;
1480
1481 Type *UserTy = nullptr;
1482 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1483 UserTy = LI->getType();
1484 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1485 UserTy = SI->getValueOperand()->getType();
1486 }
1487
1488 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1489 // If the type is larger than the partition, skip it. We only encounter
1490 // this for split integer operations where we want to use the type of the
1491 // entity causing the split. Also skip if the type is not a byte width
1492 // multiple.
1493 if (UserITy->getBitWidth() % 8 != 0 ||
1494 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1495 continue;
1496
1497 // Track the largest bitwidth integer type used in this way in case there
1498 // is no common type.
1499 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1500 ITy = UserITy;
1501 }
1502
1503 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1504 // depend on types skipped above.
1505 if (!UserTy || (Ty && Ty != UserTy))
1506 TyIsCommon = false; // Give up on anything but an iN type.
1507 else
1508 Ty = UserTy;
1509 }
1510
1511 return {TyIsCommon ? Ty : nullptr, ITy};
1512}
1513
1514/// PHI instructions that use an alloca and are subsequently loaded can be
1515/// rewritten to load both input pointers in the pred blocks and then PHI the
1516/// results, allowing the load of the alloca to be promoted.
1517/// From this:
1518/// %P2 = phi [i32* %Alloca, i32* %Other]
1519/// %V = load i32* %P2
1520/// to:
1521/// %V1 = load i32* %Alloca -> will be mem2reg'd
1522/// ...
1523/// %V2 = load i32* %Other
1524/// ...
1525/// %V = phi [i32 %V1, i32 %V2]
1526///
1527/// We can do this to a select if its only uses are loads and if the operands
1528/// to the select can be loaded unconditionally.
1529///
1530/// FIXME: This should be hoisted into a generic utility, likely in
1531/// Transforms/Util/Local.h
1532 static bool isSafePHIToSpeculate(PHINode &PN) {
1533   const DataLayout &DL = PN.getDataLayout();
1534
1535 // For now, we can only do this promotion if the load is in the same block
1536 // as the PHI, and if there are no stores between the phi and load.
1537 // TODO: Allow recursive phi users.
1538 // TODO: Allow stores.
1539 BasicBlock *BB = PN.getParent();
1540 Align MaxAlign;
1541 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1542 Type *LoadType = nullptr;
1543 for (User *U : PN.users()) {
1544     LoadInst *LI = dyn_cast<LoadInst>(U);
1545     if (!LI || !LI->isSimple())
1546 return false;
1547
1548 // For now we only allow loads in the same block as the PHI. This is
1549 // a common case that happens when instcombine merges two loads through
1550 // a PHI.
1551 if (LI->getParent() != BB)
1552 return false;
1553
1554 if (LoadType) {
1555 if (LoadType != LI->getType())
1556 return false;
1557 } else {
1558 LoadType = LI->getType();
1559 }
1560
1561 // Ensure that there are no instructions between the PHI and the load that
1562 // could store.
1563 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1564 if (BBI->mayWriteToMemory())
1565 return false;
1566
1567 MaxAlign = std::max(MaxAlign, LI->getAlign());
1568 }
1569
1570 if (!LoadType)
1571 return false;
1572
1573 APInt LoadSize =
1574 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1575
1576 // We can only transform this if it is safe to push the loads into the
1577 // predecessor blocks. The only thing to watch out for is that we can't put
1578 // a possibly trapping load in the predecessor if it is a critical edge.
1579 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1580     Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1581     Value *InVal = PN.getIncomingValue(Idx);
1582
1583 // If the value is produced by the terminator of the predecessor (an
1584 // invoke) or it has side-effects, there is no valid place to put a load
1585 // in the predecessor.
1586 if (TI == InVal || TI->mayHaveSideEffects())
1587 return false;
1588
1589 // If the predecessor has a single successor, then the edge isn't
1590 // critical.
1591 if (TI->getNumSuccessors() == 1)
1592 continue;
1593
1594 // If this pointer is always safe to load, or if we can prove that there
1595 // is already a load in the block, then we can move the load to the pred
1596 // block.
1597 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1598 continue;
1599
1600 return false;
1601 }
1602
1603 return true;
1604}
1605
1606static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1607 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1608
1609 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1610 Type *LoadTy = SomeLoad->getType();
1611 IRB.SetInsertPoint(&PN);
1612 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1613 PN.getName() + ".sroa.speculated");
1614
1615   // Get the AA tags and alignment to use from one of the loads. It does not
1616   // matter which one we get or whether any differ.
1617 AAMDNodes AATags = SomeLoad->getAAMetadata();
1618 Align Alignment = SomeLoad->getAlign();
1619
1620 // Rewrite all loads of the PN to use the new PHI.
1621 while (!PN.use_empty()) {
1622 LoadInst *LI = cast<LoadInst>(PN.user_back());
1623 LI->replaceAllUsesWith(NewPN);
1624 LI->eraseFromParent();
1625 }
1626
1627 // Inject loads into all of the pred blocks.
1628 DenseMap<BasicBlock *, Value *> InjectedLoads;
1629 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1630 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1631 Value *InVal = PN.getIncomingValue(Idx);
1632
1633 // A PHI node is allowed to have multiple (duplicated) entries for the same
1634 // basic block, as long as the value is the same. So if we already injected
1635 // a load in the predecessor, then we should reuse the same load for all
1636 // duplicated entries.
1637 if (Value *V = InjectedLoads.lookup(Pred)) {
1638 NewPN->addIncoming(V, Pred);
1639 continue;
1640 }
1641
1642 Instruction *TI = Pred->getTerminator();
1643 IRB.SetInsertPoint(TI);
1644
1645 LoadInst *Load = IRB.CreateAlignedLoad(
1646 LoadTy, InVal, Alignment,
1647 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1648 ++NumLoadsSpeculated;
1649 if (AATags)
1650 Load->setAAMetadata(AATags);
1651 NewPN->addIncoming(Load, Pred);
1652 InjectedLoads[Pred] = Load;
1653 }
1654
1655 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1656 PN.eraseFromParent();
1657}
1658
1659SelectHandSpeculativity &
1660SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1661 if (isTrueVal)
1662     Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1663   else
1664     Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1665   return *this;
1666}
1667
1668bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1669 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1670 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1671}
1672
1673bool SelectHandSpeculativity::areAllSpeculatable() const {
1674 return isSpeculatable(/*isTrueVal=*/true) &&
1675 isSpeculatable(/*isTrueVal=*/false);
1676}
1677
1678bool SelectHandSpeculativity::areAnySpeculatable() const {
1679 return isSpeculatable(/*isTrueVal=*/true) ||
1680 isSpeculatable(/*isTrueVal=*/false);
1681}
1682bool SelectHandSpeculativity::areNoneSpeculatable() const {
1683 return !areAnySpeculatable();
1684}
1685
1686static SelectHandSpeculativity
1687 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1688   assert(LI.isSimple() && "Only for simple loads");
1689 SelectHandSpeculativity Spec;
1690
1691 const DataLayout &DL = SI.getDataLayout();
1692 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1693     if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1694                                     &LI))
1695 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1696 else if (PreserveCFG)
1697 return Spec;
1698
1699 return Spec;
1700}
1701
1702std::optional<RewriteableMemOps>
1703SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1704 RewriteableMemOps Ops;
1705
1706 for (User *U : SI.users()) {
1707 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1708 U = *BC->user_begin();
1709
1710 if (auto *Store = dyn_cast<StoreInst>(U)) {
1711 // Note that atomic stores can be transformed; atomic semantics do not
1712 // have any meaning for a local alloca. Stores are not speculatable,
1713 // however, so if we can't turn it into a predicated store, we are done.
1714 if (Store->isVolatile() || PreserveCFG)
1715 return {}; // Give up on this `select`.
1716 Ops.emplace_back(Store);
1717 continue;
1718 }
1719
1720 auto *LI = dyn_cast<LoadInst>(U);
1721
1722 // Note that atomic loads can be transformed;
1723 // atomic semantics do not have any meaning for a local alloca.
1724 if (!LI || LI->isVolatile())
1725 return {}; // Give up on this `select`.
1726
1727 PossiblySpeculatableLoad Load(LI);
1728 if (!LI->isSimple()) {
1729 // If the `load` is not simple, we can't speculatively execute it,
1730 // but we could handle this via a CFG modification. But can we?
1731 if (PreserveCFG)
1732 return {}; // Give up on this `select`.
1733 Ops.emplace_back(Load);
1734 continue;
1735 }
1736
1737 SelectHandSpeculativity Spec =
1738         isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1739     if (PreserveCFG && !Spec.areAllSpeculatable())
1740 return {}; // Give up on this `select`.
1741
1742 Load.setInt(Spec);
1743 Ops.emplace_back(Load);
1744 }
1745
1746 return Ops;
1747}
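// Illustrative sketch (not from the original source, value names are made
// up): speculating the load of a select rewrites
//   %p = select i1 %c, ptr %a, ptr %b
//   %v = load i32, ptr %p
// into
//   %v.true = load i32, ptr %a
//   %v.false = load i32, ptr %b
//   %v = select i1 %c, i32 %v.true, i32 %v.false
// assuming both pointers are unconditionally safe to load from.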
1748
1749 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1750                                      IRBuilderTy &IRB) {
1751 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1752
1753 Value *TV = SI.getTrueValue();
1754 Value *FV = SI.getFalseValue();
1755 // Replace the given load of the select with a select of two loads.
1756
1757 assert(LI.isSimple() && "We only speculate simple loads");
1758
1759 IRB.SetInsertPoint(&LI);
1760
1761 LoadInst *TL =
1762 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1763 LI.getName() + ".sroa.speculate.load.true");
1764 LoadInst *FL =
1765 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1766 LI.getName() + ".sroa.speculate.load.false");
1767 NumLoadsSpeculated += 2;
1768
1769 // Transfer alignment and AA info if present.
1770 TL->setAlignment(LI.getAlign());
1771 FL->setAlignment(LI.getAlign());
1772
1773 AAMDNodes Tags = LI.getAAMetadata();
1774 if (Tags) {
1775 TL->setAAMetadata(Tags);
1776 FL->setAAMetadata(Tags);
1777 }
1778
1779 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1780 LI.getName() + ".sroa.speculated");
1781
1782 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1783 LI.replaceAllUsesWith(V);
1784}
1785
1786template <typename T>
1787 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1788                                  SelectHandSpeculativity Spec,
1789 DomTreeUpdater &DTU) {
1790 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1791 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1792 BasicBlock *Head = I.getParent();
1793 Instruction *ThenTerm = nullptr;
1794 Instruction *ElseTerm = nullptr;
1795 if (Spec.areNoneSpeculatable())
1796 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1797 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1798 else {
1799 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1800 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1801 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1802 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1803 cast<BranchInst>(Head->getTerminator())->swapSuccessors();
1804 }
1805 auto *HeadBI = cast<BranchInst>(Head->getTerminator());
1806 Spec = {}; // Do not use `Spec` beyond this point.
1807 BasicBlock *Tail = I.getParent();
1808 Tail->setName(Head->getName() + ".cont");
1809 PHINode *PN;
1810 if (isa<LoadInst>(I))
1811 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1812 for (BasicBlock *SuccBB : successors(Head)) {
1813 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1814 int SuccIdx = IsThen ? 0 : 1;
1815 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1816 auto &CondMemOp = cast<T>(*I.clone());
1817 if (NewMemOpBB != Head) {
1818 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1819 if (isa<LoadInst>(I))
1820 ++NumLoadsPredicated;
1821 else
1822 ++NumStoresPredicated;
1823 } else {
1824 CondMemOp.dropUBImplyingAttrsAndMetadata();
1825 ++NumLoadsSpeculated;
1826 }
1827 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1828 Value *Ptr = SI.getOperand(1 + SuccIdx);
1829 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1830 if (isa<LoadInst>(I)) {
1831 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1832 PN->addIncoming(&CondMemOp, NewMemOpBB);
1833 } else
1834 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1835 }
1836 if (isa<LoadInst>(I)) {
1837 PN->takeName(&I);
1838 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1839 I.replaceAllUsesWith(PN);
1840 }
1841}
1842
1843 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1844                                  SelectHandSpeculativity Spec,
1845 DomTreeUpdater &DTU) {
1846 if (auto *LI = dyn_cast<LoadInst>(&I))
1847 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1848 else if (auto *SI = dyn_cast<StoreInst>(&I))
1849 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1850 else
1851 llvm_unreachable_internal("Only for load and store.");
1852}
1853
1854 static bool rewriteSelectInstMemOps(SelectInst &SI,
1855                                     const RewriteableMemOps &Ops,
1856 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1857 bool CFGChanged = false;
1858 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1859
1860 for (const RewriteableMemOp &Op : Ops) {
1861 SelectHandSpeculativity Spec;
1862 Instruction *I;
1863 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1864 I = *US;
1865 } else {
1866 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1867 I = PSL.getPointer();
1868 Spec = PSL.getInt();
1869 }
1870 if (Spec.areAllSpeculatable()) {
1871       speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1872     } else {
1873 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1874 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1875 CFGChanged = true;
1876 }
1877 I->eraseFromParent();
1878 }
1879
1880 for (User *U : make_early_inc_range(SI.users()))
1881 cast<BitCastInst>(U)->eraseFromParent();
1882 SI.eraseFromParent();
1883 return CFGChanged;
1884}
1885
1886/// Compute an adjusted pointer from Ptr by Offset bytes where the
1887/// resulting pointer has PointerTy.
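/// As a hedged sketch of the emitted IR (the pointer name %p is illustrative):
/// for a non-zero Offset of 8 this produces roughly
///   %p.sroa_idx = getelementptr inbounds i8, ptr %p, i64 8
/// followed, only when the destination pointer type actually differs (e.g. a
/// different address space), by a cast whose name ends in "sroa_cast".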
1888static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1889                              APInt Offset, Type *PointerTy,
1890                              const Twine &NamePrefix) {
1891 if (Offset != 0)
1892 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1893 NamePrefix + "sroa_idx");
1894 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1895 NamePrefix + "sroa_cast");
1896}
1897
1898/// Compute the adjusted alignment for a load or store from an offset.
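/// For example (illustrative): an access that was 16-byte aligned but is
/// re-based at byte offset 4 can only be assumed 4-byte aligned, because 4 is
/// the largest power of two dividing both 16 and 4.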
1899 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1900   return commonAlignment(getLoadStoreAlignment(I), Offset);
1901 }
1902
1903/// Test whether we can convert a value from the old to the new type.
1904///
1905/// This predicate should be used to guard calls to convertValue in order to
1906/// ensure that we only try to convert viable values. The strategy is that we
1907/// will peel off single element struct and array wrappings to get to an
1908/// underlying value, and convert that value.
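/// A few hedged examples of the intent (the checks below are authoritative):
/// i64 <-> ptr conversions are allowed for integral address spaces,
/// <2 x i32> <-> i64 is allowed because the sizes match and both are single
/// value types, while i32 <-> i64 is rejected because the integer widths
/// differ.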
1909static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1910 unsigned VScale = 0) {
1911 if (OldTy == NewTy)
1912 return true;
1913
1914   // For integer types, we can't handle any bit-width differences. This would
1915   // break both vector conversions with extension and introduce endianness
1916   // issues when used in conjunction with loads and stores.
1917 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1918     assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1919                cast<IntegerType>(NewTy)->getBitWidth() &&
1920 "We can't have the same bitwidth for different int types");
1921 return false;
1922 }
1923
1924 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1925 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1926
1927 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1928 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1929 // Conversion is only possible when the size of scalable vectors is known.
1930 if (!VScale)
1931 return false;
1932
1933 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1934 // a single domain (either fixed or scalable). Any additional conversion
1935 // between fixed and scalable types is handled through integer types.
1936 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1937 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1938
1939 if (isa<ScalableVectorType>(NewTy)) {
1941 return false;
1942
1943 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
1944 } else {
1946 return false;
1947
1948 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
1949 }
1950 }
1951
1952 if (NewSize != OldSize)
1953 return false;
1954 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1955 return false;
1956
1957 // We can convert pointers to integers and vice-versa. Same for vectors
1958 // of pointers and integers.
1959 OldTy = OldTy->getScalarType();
1960 NewTy = NewTy->getScalarType();
1961 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1962 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1963 unsigned OldAS = OldTy->getPointerAddressSpace();
1964 unsigned NewAS = NewTy->getPointerAddressSpace();
1965 // Convert pointers if they are pointers from the same address space or
1966 // different integral (not non-integral) address spaces with the same
1967 // pointer size.
1968 return OldAS == NewAS ||
1969 (!DL.isNonIntegralAddressSpace(OldAS) &&
1970 !DL.isNonIntegralAddressSpace(NewAS) &&
1971 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1972 }
1973
1974 // We can convert integers to integral pointers, but not to non-integral
1975 // pointers.
1976 if (OldTy->isIntegerTy())
1977 return !DL.isNonIntegralPointerType(NewTy);
1978
1979 // We can convert integral pointers to integers, but non-integral pointers
1980 // need to remain pointers.
1981 if (!DL.isNonIntegralPointerType(OldTy))
1982 return NewTy->isIntegerTy();
1983
1984 return false;
1985 }
1986
1987 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
1988 return false;
1989
1990 return true;
1991}
1992
1993/// Generic routine to convert an SSA value to a value of a different
1994/// type.
1995///
1996/// This will try various different casting techniques, such as bitcasts,
1997/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
1998/// two types for viability with this routine.
1999static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2000 Type *NewTy) {
2001 Type *OldTy = V->getType();
2002
2003#ifndef NDEBUG
2004 BasicBlock *BB = IRB.GetInsertBlock();
2005 assert(BB && BB->getParent() && "VScale unknown!");
2006 unsigned VScale = BB->getParent()->getVScaleValue();
2007 assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
2008 "Value not convertable to type");
2009#endif
2010
2011 if (OldTy == NewTy)
2012 return V;
2013
2014 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2015 "Integer types must be the exact same to convert.");
2016
2017 // A variant of bitcast that supports a mixture of fixed and scalable types
2018   // that are known to have the same size.
2019 auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
2020 Type *InTy = In->getType();
2021 if (InTy == Ty)
2022 return In;
2023
2025 // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
2026 // <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
2028 return IRB.CreateBitCast(IRB.CreateInsertVector(VTy,
2029 PoisonValue::get(VTy), In,
2030 IRB.getInt64(0)),
2031 Ty);
2032 }
2033
2035 // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
2036 // <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
2038 return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy),
2039 IRB.getInt64(0));
2040 }
2041
2042 return IRB.CreateBitCast(In, Ty);
2043 };
2044
2045 // See if we need inttoptr for this type pair. May require additional bitcast.
2046 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2047 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2048 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2049 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2050 // Directly handle i64 to i8*
2051 return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
2052 NewTy);
2053 }
2054
2055 // See if we need ptrtoint for this type pair. May require additional bitcast.
2056 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2057 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2058 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2059 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2060 // Expand i8* to i64 --> i8* to i64 to i64
2061 return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2062 NewTy);
2063 }
2064
2065 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2066 unsigned OldAS = OldTy->getPointerAddressSpace();
2067 unsigned NewAS = NewTy->getPointerAddressSpace();
2068     // To convert pointers with different address spaces (they have already
2069     // been checked to be convertible, i.e. they have the same pointer size),
2070     // so far we cannot use `bitcast` (which requires the same address space)
2071     // or `addrspacecast` (which is not always a no-op cast). Instead, use a
2072     // pair of no-op `ptrtoint`/`inttoptr` casts through an integer with the
2073     // same bit size.
2074 if (OldAS != NewAS) {
2075 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2076 return IRB.CreateIntToPtr(
2077 CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2078 DL.getIntPtrType(NewTy)),
2079 NewTy);
2080 }
2081 }
2082
2083 return CreateBitCastLike(V, NewTy);
2084}
2085
2086/// Test whether the given slice use can be promoted to a vector.
2087///
2088/// This function is called to test each entry in a partition which is slated
2089/// for a single slice.
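/// As a worked example (illustrative only): for a candidate type <4 x i32>
/// (ElementSize == 4 bytes) and a slice covering bytes [4, 12) of the
/// partition, BeginIndex == 1 and EndIndex == 3, so the slice maps to the
/// two-element subvector <2 x i32> spanning elements 1 and 2. A slice whose
/// offsets are not multiples of 4 fails the index checks below.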
2090static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2091 VectorType *Ty,
2092 uint64_t ElementSize,
2093 const DataLayout &DL,
2094 unsigned VScale) {
2095 // First validate the slice offsets.
2096 uint64_t BeginOffset =
2097 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2098 uint64_t BeginIndex = BeginOffset / ElementSize;
2099 if (BeginIndex * ElementSize != BeginOffset ||
2100 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2101 return false;
2102 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2103 uint64_t EndIndex = EndOffset / ElementSize;
2104 if (EndIndex * ElementSize != EndOffset ||
2105 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2106 return false;
2107
2108 assert(EndIndex > BeginIndex && "Empty vector!");
2109 uint64_t NumElements = EndIndex - BeginIndex;
2110 Type *SliceTy = (NumElements == 1)
2111 ? Ty->getElementType()
2112 : FixedVectorType::get(Ty->getElementType(), NumElements);
2113
2114 Type *SplitIntTy =
2115 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2116
2117 Use *U = S.getUse();
2118
2119 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2120 if (MI->isVolatile())
2121 return false;
2122 if (!S.isSplittable())
2123 return false; // Skip any unsplittable intrinsics.
2124 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2125 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2126 return false;
2127 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2128 if (LI->isVolatile())
2129 return false;
2130 Type *LTy = LI->getType();
2131 // Disable vector promotion when there are loads or stores of an FCA.
2132 if (LTy->isStructTy())
2133 return false;
2134 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2135 assert(LTy->isIntegerTy());
2136 LTy = SplitIntTy;
2137 }
2138 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2139 return false;
2140 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2141 if (SI->isVolatile())
2142 return false;
2143 Type *STy = SI->getValueOperand()->getType();
2144 // Disable vector promotion when there are loads or stores of an FCA.
2145 if (STy->isStructTy())
2146 return false;
2147 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2148 assert(STy->isIntegerTy());
2149 STy = SplitIntTy;
2150 }
2151 if (!canConvertValue(DL, STy, SliceTy, VScale))
2152 return false;
2153 } else {
2154 return false;
2155 }
2156
2157 return true;
2158}
2159
2160/// Test whether a vector type is viable for promotion.
2161///
2162/// This implements the necessary checking for \c checkVectorTypesForPromotion
2163/// (and thus isVectorPromotionViable) over all slices of the alloca for the
2164/// given VectorType.
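/// For instance (illustrative): <8 x i16> has byte-sized (2-byte) elements and
/// is then checked slice by slice, whereas a type such as <16 x i1> is
/// rejected up front because its element size is not a multiple of 8 bits.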
2165static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2166 const DataLayout &DL, unsigned VScale) {
2167 uint64_t ElementSize =
2168 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2169
2170 // While the definition of LLVM vectors is bitpacked, we don't support sizes
2171 // that aren't byte sized.
2172 if (ElementSize % 8)
2173 return false;
2174 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2175 "vector size not a multiple of element size?");
2176 ElementSize /= 8;
2177
2178 for (const Slice &S : P)
2179 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2180 return false;
2181
2182 for (const Slice *S : P.splitSliceTails())
2183 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2184 return false;
2185
2186 return true;
2187}
2188
2189/// Test whether any vector type in \p CandidateTys is viable for promotion.
2190///
2191/// This implements the necessary checking for \c isVectorPromotionViable over
2192/// all slices of the alloca for the given VectorType.
2193static VectorType *
2195 SmallVectorImpl<VectorType *> &CandidateTys,
2196 bool HaveCommonEltTy, Type *CommonEltTy,
2197 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2198 VectorType *CommonVecPtrTy, unsigned VScale) {
2199 // If we didn't find a vector type, nothing to do here.
2200 if (CandidateTys.empty())
2201 return nullptr;
2202
2203 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2204 // then we should choose it, not some other alternative.
2205 // But, we can't perform a no-op pointer address space change via bitcast,
2206 // so if we didn't have a common pointer element type, bail.
2207 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2208 return nullptr;
2209
2210 // Try to pick the "best" element type out of the choices.
2211 if (!HaveCommonEltTy && HaveVecPtrTy) {
2212 // If there was a pointer element type, there's really only one choice.
2213 CandidateTys.clear();
2214 CandidateTys.push_back(CommonVecPtrTy);
2215 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2216 // Integer-ify vector types.
2217 for (VectorType *&VTy : CandidateTys) {
2218 if (!VTy->getElementType()->isIntegerTy())
2219 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2220 VTy->getContext(), VTy->getScalarSizeInBits())));
2221 }
2222
2223 // Rank the remaining candidate vector types. This is easy because we know
2224 // they're all integer vectors. We sort by ascending number of elements.
2225 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2226 (void)DL;
2227 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2228 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2229 "Cannot have vector types of different sizes!");
2230 assert(RHSTy->getElementType()->isIntegerTy() &&
2231 "All non-integer types eliminated!");
2232 assert(LHSTy->getElementType()->isIntegerTy() &&
2233 "All non-integer types eliminated!");
2234 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2235 cast<FixedVectorType>(LHSTy)->getNumElements();
2236 };
2237 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2238 (void)DL;
2239 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2240 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2241 "Cannot have vector types of different sizes!");
2242 assert(RHSTy->getElementType()->isIntegerTy() &&
2243 "All non-integer types eliminated!");
2244 assert(LHSTy->getElementType()->isIntegerTy() &&
2245 "All non-integer types eliminated!");
2246 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2247 cast<FixedVectorType>(LHSTy)->getNumElements();
2248 };
2249 llvm::sort(CandidateTys, RankVectorTypesComp);
2250 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2251 CandidateTys.end());
2252 } else {
2253// The only way to have the same element type in every vector type is to
2254// have the same vector type. Check that and remove all but one.
2255#ifndef NDEBUG
2256 for (VectorType *VTy : CandidateTys) {
2257 assert(VTy->getElementType() == CommonEltTy &&
2258 "Unaccounted for element type!");
2259 assert(VTy == CandidateTys[0] &&
2260 "Different vector types with the same element type!");
2261 }
2262#endif
2263 CandidateTys.resize(1);
2264 }
2265
2266 // FIXME: hack. Do we have a named constant for this?
2267 // SDAG SDNode can't have more than 65535 operands.
2268 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2269 return cast<FixedVectorType>(VTy)->getNumElements() >
2270 std::numeric_limits<unsigned short>::max();
2271 });
2272
2273 for (VectorType *VTy : CandidateTys)
2274 if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
2275 return VTy;
2276
2277 return nullptr;
2278}
2279
2280 static VectorType *createAndCheckVectorTypesForPromotion(
2281     SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2282 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2283 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2284 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2285 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2286 [[maybe_unused]] VectorType *OriginalElt =
2287 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2288 // Consider additional vector types where the element type size is a
2289 // multiple of load/store element size.
2290 for (Type *Ty : OtherTys) {
2291     if (!VectorType::isValidElementType(Ty))
2292       continue;
2293 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2294 // Make a copy of CandidateTys and iterate through it, because we
2295 // might append to CandidateTys in the loop.
2296 for (VectorType *const VTy : CandidateTysCopy) {
2297 // The elements in the copy should remain invariant throughout the loop
2298 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2299 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2300 unsigned ElementSize =
2301 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2302 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2303 VectorSize % TypeSize == 0) {
2304 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2305 CheckCandidateType(NewVTy);
2306 }
2307 }
2308 }
2309
2310   return checkVectorTypesForPromotion(
2311       P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2312 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2313}
2314
2315/// Test whether the given alloca partitioning and range of slices can be
2316/// promoted to a vector.
2317///
2318/// This is a quick test to check whether we can rewrite a particular alloca
2319/// partition (and its newly formed alloca) into a vector alloca with only
2320/// whole-vector loads and stores such that it could be promoted to a vector
2321/// SSA value. We only can ensure this for a limited set of operations, and we
2322/// don't want to do the rewrites unless we are confident that the result will
2323/// be promotable, so we have an early test here.
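/// As an illustrative sketch: if one covering store writes a <4 x i32> and a
/// covering load reads a <2 x i64>, both 128-bit types become candidates;
/// since their element types differ and neither is a vector of pointers, the
/// candidates are integer-ified, ranked, and the first one that passes the
/// per-slice checks is chosen.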
2324 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2325                                            unsigned VScale) {
2326 // Collect the candidate types for vector-based promotion. Also track whether
2327 // we have different element types.
2328 SmallVector<VectorType *, 4> CandidateTys;
2329 SetVector<Type *> LoadStoreTys;
2330 SetVector<Type *> DeferredTys;
2331 Type *CommonEltTy = nullptr;
2332 VectorType *CommonVecPtrTy = nullptr;
2333 bool HaveVecPtrTy = false;
2334 bool HaveCommonEltTy = true;
2335 bool HaveCommonVecPtrTy = true;
2336 auto CheckCandidateType = [&](Type *Ty) {
2337 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2338       // Bail out if this candidate's total size in bits differs from the existing candidates'.
2339 if (!CandidateTys.empty()) {
2340 VectorType *V = CandidateTys[0];
2341 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2342 DL.getTypeSizeInBits(V).getFixedValue()) {
2343 CandidateTys.clear();
2344 return;
2345 }
2346 }
2347 CandidateTys.push_back(VTy);
2348 Type *EltTy = VTy->getElementType();
2349
2350 if (!CommonEltTy)
2351 CommonEltTy = EltTy;
2352 else if (CommonEltTy != EltTy)
2353 HaveCommonEltTy = false;
2354
2355 if (EltTy->isPointerTy()) {
2356 HaveVecPtrTy = true;
2357 if (!CommonVecPtrTy)
2358 CommonVecPtrTy = VTy;
2359 else if (CommonVecPtrTy != VTy)
2360 HaveCommonVecPtrTy = false;
2361 }
2362 }
2363 };
2364
2365 // Put load and store types into a set for de-duplication.
2366 for (const Slice &S : P) {
2367 Type *Ty;
2368 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2369 Ty = LI->getType();
2370 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2371 Ty = SI->getValueOperand()->getType();
2372 else
2373 continue;
2374
2375 auto CandTy = Ty->getScalarType();
2376 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2377 S.endOffset() != P.endOffset())) {
2378 DeferredTys.insert(Ty);
2379 continue;
2380 }
2381
2382 LoadStoreTys.insert(Ty);
2383 // Consider any loads or stores that are the exact size of the slice.
2384 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2385 CheckCandidateType(Ty);
2386 }
2387
2388 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2389   if (auto *VTy = createAndCheckVectorTypesForPromotion(
2390           LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2391 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2392 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2393 return VTy;
2394
2395 CandidateTys.clear();
2396   return createAndCheckVectorTypesForPromotion(
2397       DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2398 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2399 CommonVecPtrTy, VScale);
2400}
2401
2402/// Test whether a slice of an alloca is valid for integer widening.
2403///
2404/// This implements the necessary checking for the \c isIntegerWideningViable
2405/// test below on a single slice of the alloca.
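/// A hedged example of the pattern this enables: for an alloca covered by an
/// i64 load, two adjacent i32 stores into it can be widened so that each store
/// becomes a load of the whole i64, an insertion of the stored i32 into the
/// proper bits (see insertInteger below), and a store of the full i64, after
/// which mem2reg can promote the alloca.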
2406static bool isIntegerWideningViableForSlice(const Slice &S,
2407 uint64_t AllocBeginOffset,
2408 Type *AllocaTy,
2409 const DataLayout &DL,
2410 bool &WholeAllocaOp) {
2411 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2412
2413 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2414 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2415
2416 Use *U = S.getUse();
2417
2418   // Lifetime intrinsics operate over the whole alloca, whose size is usually
2419   // larger than other load/store slices (RelEnd > Size). But lifetime
2420   // intrinsics are always promotable and should not impact the promotability
2421   // of the partition's other slices.
2422 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2423 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2424 return true;
2425 }
2426
2427 // We can't reasonably handle cases where the load or store extends past
2428 // the end of the alloca's type and into its padding.
2429 if (RelEnd > Size)
2430 return false;
2431
2432 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2433 if (LI->isVolatile())
2434 return false;
2435 // We can't handle loads that extend past the allocated memory.
2436 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2437 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2438 return false;
2439 // So far, AllocaSliceRewriter does not support widening split slice tails
2440 // in rewriteIntegerLoad.
2441 if (S.beginOffset() < AllocBeginOffset)
2442 return false;
2443 // Note that we don't count vector loads or stores as whole-alloca
2444 // operations which enable integer widening because we would prefer to use
2445 // vector widening instead.
2446 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2447 WholeAllocaOp = true;
2448 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2449 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2450 return false;
2451 } else if (RelBegin != 0 || RelEnd != Size ||
2452 !canConvertValue(DL, AllocaTy, LI->getType())) {
2453 // Non-integer loads need to be convertible from the alloca type so that
2454 // they are promotable.
2455 return false;
2456 }
2457 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2458 Type *ValueTy = SI->getValueOperand()->getType();
2459 if (SI->isVolatile())
2460 return false;
2461 // We can't handle stores that extend past the allocated memory.
2462 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2463 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2464 return false;
2465 // So far, AllocaSliceRewriter does not support widening split slice tails
2466 // in rewriteIntegerStore.
2467 if (S.beginOffset() < AllocBeginOffset)
2468 return false;
2469 // Note that we don't count vector loads or stores as whole-alloca
2470 // operations which enable integer widening because we would prefer to use
2471 // vector widening instead.
2472 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2473 WholeAllocaOp = true;
2474 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2475 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2476 return false;
2477 } else if (RelBegin != 0 || RelEnd != Size ||
2478 !canConvertValue(DL, ValueTy, AllocaTy)) {
2479 // Non-integer stores need to be convertible to the alloca type so that
2480 // they are promotable.
2481 return false;
2482 }
2483 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2484 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2485 return false;
2486 if (!S.isSplittable())
2487 return false; // Skip any unsplittable intrinsics.
2488 } else {
2489 return false;
2490 }
2491
2492 return true;
2493}
2494
2495/// Test whether the given alloca partition's integer operations can be
2496/// widened to promotable ones.
2497///
2498/// This is a quick test to check whether we can rewrite the integer loads and
2499/// stores to a particular alloca into wider loads and stores and be able to
2500/// promote the resulting alloca.
2501static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2502 const DataLayout &DL) {
2503 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2504 // Don't create integer types larger than the maximum bitwidth.
2505 if (SizeInBits > IntegerType::MAX_INT_BITS)
2506 return false;
2507
2508 // Don't try to handle allocas with bit-padding.
2509 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2510 return false;
2511
2512 // We need to ensure that an integer type with the appropriate bitwidth can
2513 // be converted to the alloca type, whatever that is. We don't want to force
2514 // the alloca itself to have an integer type if there is a more suitable one.
2515 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2516 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2517 !canConvertValue(DL, IntTy, AllocaTy))
2518 return false;
2519
2520 // While examining uses, we ensure that the alloca has a covering load or
2521 // store. We don't want to widen the integer operations only to fail to
2522 // promote due to some other unsplittable entry (which we may make splittable
2523 // later). However, if there are only splittable uses, go ahead and assume
2524 // that we cover the alloca.
2525 // FIXME: We shouldn't consider split slices that happen to start in the
2526 // partition here...
2527 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2528
2529 for (const Slice &S : P)
2530 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2531 WholeAllocaOp))
2532 return false;
2533
2534 for (const Slice *S : P.splitSliceTails())
2535 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2536 WholeAllocaOp))
2537 return false;
2538
2539 return WholeAllocaOp;
2540}
2541
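// Worked example for the shift amounts below (illustrative): extracting an i16
// at byte Offset 2 from an i64 uses ShAmt = 8 * 2 = 16 on little-endian
// targets but ShAmt = 8 * (8 - 2 - 2) = 32 on big-endian targets, because the
// element's bytes sit at the opposite end of the wide integer.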
2542static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2543                              IntegerType *Ty, uint64_t Offset,
2544                              const Twine &Name) {
2545 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2546 IntegerType *IntTy = cast<IntegerType>(V->getType());
2547 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2548 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2549 "Element extends past full value");
2550 uint64_t ShAmt = 8 * Offset;
2551 if (DL.isBigEndian())
2552 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2553 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2554 if (ShAmt) {
2555 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2556 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2557 }
2558 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2559 "Cannot extract to a larger integer!");
2560 if (Ty != IntTy) {
2561 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2562 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2563 }
2564 return V;
2565}
2566
2567static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2568 Value *V, uint64_t Offset, const Twine &Name) {
2569 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2570 IntegerType *Ty = cast<IntegerType>(V->getType());
2571 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2572 "Cannot insert a larger integer!");
2573 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2574 if (Ty != IntTy) {
2575 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2576 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2577 }
2578 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2579 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2580 "Element store outside of alloca store");
2581 uint64_t ShAmt = 8 * Offset;
2582 if (DL.isBigEndian())
2583 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2584 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2585 if (ShAmt) {
2586 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2587 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2588 }
2589
2590 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2591 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2592 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2593 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2594 V = IRB.CreateOr(Old, V, Name + ".insert");
2595 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2596 }
2597 return V;
2598}
2599
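// Illustrative sketch of extractVector below (names are made up): pulling
// elements [1, 3) out of a <4 x i32> %v emits roughly
//   %v.extract = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
// while a single-element range degenerates to an extractelement.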
2600static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2601 unsigned EndIndex, const Twine &Name) {
2602 auto *VecTy = cast<FixedVectorType>(V->getType());
2603 unsigned NumElements = EndIndex - BeginIndex;
2604 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2605
2606 if (NumElements == VecTy->getNumElements())
2607 return V;
2608
2609 if (NumElements == 1) {
2610 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2611 Name + ".extract");
2612 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2613 return V;
2614 }
2615
2616 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2617 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2618 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2619 return V;
2620}
2621
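// Illustrative sketch of insertVector below (names are made up): storing a
// <2 x i32> %in into elements [1, 3) of a <4 x i32> %old first widens %in with
// a shufflevector whose mask is <poison, 0, 1, poison>, then selects lane-wise
// between the widened value and %old using the constant mask
// <false, true, true, false>.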
2622static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2623 unsigned BeginIndex, const Twine &Name) {
2624 VectorType *VecTy = cast<VectorType>(Old->getType());
2625 assert(VecTy && "Can only insert a vector into a vector");
2626
2627 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2628 if (!Ty) {
2629 // Single element to insert.
2630 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2631 Name + ".insert");
2632 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2633 return V;
2634 }
2635
2636   assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2637              cast<FixedVectorType>(VecTy)->getNumElements() &&
2638          "Too many elements!");
2639   if (cast<FixedVectorType>(Ty)->getNumElements() ==
2640       cast<FixedVectorType>(VecTy)->getNumElements()) {
2641     assert(V->getType() == VecTy && "Vector type mismatch");
2642 return V;
2643 }
2644 unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2645
2646 // When inserting a smaller vector into the larger to store, we first
2647 // use a shuffle vector to widen it with undef elements, and then
2648 // a second shuffle vector to select between the loaded vector and the
2649 // incoming vector.
2650   SmallVector<int, 8> Mask;
2651   Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2652 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2653 if (i >= BeginIndex && i < EndIndex)
2654 Mask.push_back(i - BeginIndex);
2655 else
2656 Mask.push_back(-1);
2657 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2658 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2659
2660   SmallVector<Constant *, 8> Mask2;
2661   Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2662   for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2663 Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2664
2665 V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2666
2667 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2668 return V;
2669}
2670
2671/// This function takes two vector values and combines them into a single vector
2672/// by concatenating their elements. The function handles:
2673///
2674/// 1. Element type mismatch: If either vector's element type differs from
2675/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2676/// preserving the total bit width (adjusting the number of elements
2677/// accordingly).
2678///
2679/// 2. Size mismatch: After transforming the vectors to have the desired element
2680/// type, if the two vectors have different numbers of elements, the smaller
2681/// vector is extended with poison values to match the size of the larger
2682/// vector before concatenation.
2683///
2684/// 3. Concatenation: The vectors are merged using a shuffle operation that
2685/// places all elements of V0 first, followed by all elements of V1.
2686///
2687/// \param V0 The first vector to merge (must be a vector type)
2688/// \param V1 The second vector to merge (must be a vector type)
2689/// \param DL The data layout for size calculations
2690/// \param NewAIEltTy The desired element type for the result vector
2691/// \param Builder IRBuilder for creating new instructions
2692/// \return A new vector containing all elements from V0 followed by all
2693/// elements from V1
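/// A hedged example of the overall effect (values are illustrative): merging a
/// <2 x float> %v0 with a <4 x i32> %v1 when the desired element type is float
/// first bitcasts %v1 to <4 x float>, then widens %v0 to four lanes with
/// poison padding, and finally concatenates with a shufflevector mask
/// <0, 1, 4, 5, 6, 7>, yielding a <6 x float> result.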
2695 Type *NewAIEltTy, IRBuilder<> &Builder) {
2696 // V0 and V1 are vectors
2697 // Create a new vector type with combined elements
2698 // Use ShuffleVector to concatenate the vectors
2699 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2700 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2701
2702 // If V0/V1 element types are different from NewAllocaElementType,
2703 // we need to introduce bitcasts before merging them
2704 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2705 const char *DebugName) {
2706 Type *EltType = VecType->getElementType();
2707 if (EltType != NewAIEltTy) {
2708 // Calculate new number of elements to maintain same bit width
2709 unsigned TotalBits =
2710 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2711 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2712
2713 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2714 V = Builder.CreateBitCast(V, NewVecType);
2715 VecType = NewVecType;
2716 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2717 }
2718 };
2719
2720 BitcastIfNeeded(V0, VecType0, "V0");
2721 BitcastIfNeeded(V1, VecType1, "V1");
2722
2723 unsigned NumElts0 = VecType0->getNumElements();
2724 unsigned NumElts1 = VecType1->getNumElements();
2725
2726 SmallVector<int, 16> ShuffleMask;
2727
2728 if (NumElts0 == NumElts1) {
2729 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2730 ShuffleMask.push_back(i);
2731 } else {
2732 // If two vectors have different sizes, we need to extend
2733 // the smaller vector to the size of the larger vector.
2734 unsigned SmallSize = std::min(NumElts0, NumElts1);
2735 unsigned LargeSize = std::max(NumElts0, NumElts1);
2736 bool IsV0Smaller = NumElts0 < NumElts1;
2737 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2738 SmallVector<int, 16> ExtendMask;
2739 for (unsigned i = 0; i < SmallSize; ++i)
2740 ExtendMask.push_back(i);
2741 for (unsigned i = SmallSize; i < LargeSize; ++i)
2742 ExtendMask.push_back(PoisonMaskElem);
2743 ExtendedVec = Builder.CreateShuffleVector(
2744 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2745 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2746 for (unsigned i = 0; i < NumElts0; ++i)
2747 ShuffleMask.push_back(i);
2748 for (unsigned i = 0; i < NumElts1; ++i)
2749 ShuffleMask.push_back(LargeSize + i);
2750 }
2751
2752 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2753}
2754
2755namespace {
2756
2757 /// Visitor to rewrite instructions using a particular slice of an alloca
2758/// to use a new alloca.
2759///
2760/// Also implements the rewriting to vector-based accesses when the partition
2761/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2762/// lives here.
2763class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2764 // Befriend the base class so it can delegate to private visit methods.
2765 friend class InstVisitor<AllocaSliceRewriter, bool>;
2766
2767 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2768
2769 const DataLayout &DL;
2770 AllocaSlices &AS;
2771 SROA &Pass;
2772 AllocaInst &OldAI, &NewAI;
2773 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2774 Type *NewAllocaTy;
2775
2776 // This is a convenience and flag variable that will be null unless the new
2777 // alloca's integer operations should be widened to this integer type due to
2778 // passing isIntegerWideningViable above. If it is non-null, the desired
2779 // integer type will be stored here for easy access during rewriting.
2780 IntegerType *IntTy;
2781
2782 // If we are rewriting an alloca partition which can be written as pure
2783 // vector operations, we stash extra information here. When VecTy is
2784 // non-null, we have some strict guarantees about the rewritten alloca:
2785 // - The new alloca is exactly the size of the vector type here.
2786 // - The accesses all either map to the entire vector or to a single
2787 // element.
2788 // - The set of accessing instructions is only one of those handled above
2789 // in isVectorPromotionViable. Generally these are the same access kinds
2790 // which are promotable via mem2reg.
2791 VectorType *VecTy;
2792 Type *ElementTy;
2793 uint64_t ElementSize;
2794
2795 // The original offset of the slice currently being rewritten relative to
2796 // the original alloca.
2797 uint64_t BeginOffset = 0;
2798 uint64_t EndOffset = 0;
2799
2800 // The new offsets of the slice currently being rewritten relative to the
2801 // original alloca.
2802 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2803
2804 uint64_t SliceSize = 0;
2805 bool IsSplittable = false;
2806 bool IsSplit = false;
2807 Use *OldUse = nullptr;
2808 Instruction *OldPtr = nullptr;
2809
2810 // Track post-rewrite users which are PHI nodes and Selects.
2811 SmallSetVector<PHINode *, 8> &PHIUsers;
2812 SmallSetVector<SelectInst *, 8> &SelectUsers;
2813
2814   // Utility IR builder, whose name prefix is set up for each visited use, and
2815   // whose insertion point is set to point to the user.
2816 IRBuilderTy IRB;
2817
2818 // Return the new alloca, addrspacecasted if required to avoid changing the
2819 // addrspace of a volatile access.
2820 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2821 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2822 return &NewAI;
2823
2824 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2825 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2826 }
2827
2828public:
2829 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2830 AllocaInst &OldAI, AllocaInst &NewAI,
2831 uint64_t NewAllocaBeginOffset,
2832 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2833 VectorType *PromotableVecTy,
2834 SmallSetVector<PHINode *, 8> &PHIUsers,
2835 SmallSetVector<SelectInst *, 8> &SelectUsers)
2836 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2837 NewAllocaBeginOffset(NewAllocaBeginOffset),
2838 NewAllocaEndOffset(NewAllocaEndOffset),
2839 NewAllocaTy(NewAI.getAllocatedType()),
2840 IntTy(
2841 IsIntegerPromotable
2842 ? Type::getIntNTy(NewAI.getContext(),
2843 DL.getTypeSizeInBits(NewAI.getAllocatedType())
2844 .getFixedValue())
2845 : nullptr),
2846 VecTy(PromotableVecTy),
2847 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2848 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2849 : 0),
2850 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2851 IRB(NewAI.getContext(), ConstantFolder()) {
2852 if (VecTy) {
2853 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2854 "Only multiple-of-8 sized vector elements are viable");
2855 ++NumVectorized;
2856 }
2857 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2858 }
2859
2860 bool visit(AllocaSlices::const_iterator I) {
2861 bool CanSROA = true;
2862 BeginOffset = I->beginOffset();
2863 EndOffset = I->endOffset();
2864 IsSplittable = I->isSplittable();
2865 IsSplit =
2866 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2867 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2868 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2869 LLVM_DEBUG(dbgs() << "\n");
2870
2871 // Compute the intersecting offset range.
2872 assert(BeginOffset < NewAllocaEndOffset);
2873 assert(EndOffset > NewAllocaBeginOffset);
2874 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2875 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2876
2877 SliceSize = NewEndOffset - NewBeginOffset;
2878 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2879 << ") NewBegin:(" << NewBeginOffset << ", "
2880 << NewEndOffset << ") NewAllocaBegin:("
2881 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2882 << ")\n");
2883 assert(IsSplit || NewBeginOffset == BeginOffset);
2884 OldUse = I->getUse();
2885 OldPtr = cast<Instruction>(OldUse->get());
2886
2887 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2888 IRB.SetInsertPoint(OldUserI);
2889 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2890 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2891 Twine(BeginOffset) + ".");
2892
2893 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2894 if (VecTy || IntTy)
2895 assert(CanSROA);
2896 return CanSROA;
2897 }
2898
2899 /// Attempts to rewrite a partition using tree-structured merge optimization.
2900 ///
2901 /// This function analyzes a partition to determine if it can be optimized
2902   /// using a tree-structured merge pattern, where multiple non-overlapping
2903   /// stores completely fill an alloca and there is no load from the alloca in
2904   /// the middle of the stores. Such patterns can be optimized by eliminating
2905 /// the intermediate stores and directly constructing the final vector by
2906 /// using shufflevectors.
2907 ///
2908 /// Example transformation:
2909 /// Before: (stores do not have to be in order)
2910 /// %alloca = alloca <8 x float>
2911 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2912 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2913 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2914 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2915 ///
2916 /// After:
2917 /// %alloca = alloca <8 x float>
2918 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2919 /// i32 3>
2920 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2921 /// i32 3>
2922 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2923 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2924 /// store %shuffle2, ptr %alloca
2925 ///
2926 /// The optimization looks for partitions that:
2927 /// 1. Have no overlapping split slice tails
2928 /// 2. Contain non-overlapping stores that cover the entire alloca
2929 /// 3. Have exactly one load that reads the complete alloca structure and not
2930 /// in the middle of the stores (TODO: maybe we can relax the constraint
2931 /// about reading the entire alloca structure)
2932 ///
2933 /// \param P The partition to analyze and potentially rewrite
2934 /// \return An optional vector of values that were deleted during the rewrite
2935 /// process, or std::nullopt if the partition cannot be optimized
2936 /// using tree-structured merge
2937 std::optional<SmallVector<Value *, 4>>
2938 rewriteTreeStructuredMerge(Partition &P) {
2939 // No tail slices that overlap with the partition
2940 if (P.splitSliceTails().size() > 0)
2941 return std::nullopt;
2942
2943 SmallVector<Value *, 4> DeletedValues;
2944 LoadInst *TheLoad = nullptr;
2945
2946 // Structure to hold store information
2947 struct StoreInfo {
2948 StoreInst *Store;
2949 uint64_t BeginOffset;
2950 uint64_t EndOffset;
2951 Value *StoredValue;
2952 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2953 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2954 };
2955
2956 SmallVector<StoreInfo, 4> StoreInfos;
2957
2958 // If the new alloca is a fixed vector type, we use its element type as the
2959     // allocated element type; otherwise we use i8 as the allocated element type.
2960     Type *AllocatedEltTy =
2961         isa<FixedVectorType>(NewAI.getAllocatedType())
2962             ? cast<FixedVectorType>(NewAI.getAllocatedType())->getElementType()
2963 : Type::getInt8Ty(NewAI.getContext());
2964 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);
2965
2966     // Helper to check that a type is:
2967     // 1. a fixed vector type,
2968     // 2. whose element type is not a pointer, and
2969     // 3. whose element type size is byte-aligned.
2970     // We only handle loads/stores that meet these conditions.
2971 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2972 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
2973 return FixedVecTy &&
2974 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
2975 !FixedVecTy->getElementType()->isPointerTy();
2976 };
2977
2978 for (Slice &S : P) {
2979 auto *User = cast<Instruction>(S.getUse()->getUser());
2980 if (auto *LI = dyn_cast<LoadInst>(User)) {
2981 // Do not handle the case if
2982 // 1. There is more than one load
2983 // 2. The load is volatile
2984 // 3. The load does not read the entire alloca structure
2985 // 4. The load does not meet the conditions in the helper function
2986 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
2987 S.beginOffset() != NewAllocaBeginOffset ||
2988 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
2989 return std::nullopt;
2990 TheLoad = LI;
2991 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
2992 // Do not handle the case if
2993 // 1. The store does not meet the conditions in the helper function
2994 // 2. The store is volatile
2995 // 3. The total store size is not a multiple of the allocated element
2996 // type size
2997 if (!IsTypeValidForTreeStructuredMerge(
2998 SI->getValueOperand()->getType()) ||
2999 SI->isVolatile())
3000 return std::nullopt;
3001 auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
3002 unsigned NumElts = VecTy->getNumElements();
3003 unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
3004 if (NumElts * EltSize % AllocatedEltTySize != 0)
3005 return std::nullopt;
3006 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
3007 SI->getValueOperand());
3008 } else {
3009 // If we have instructions other than load and store, we cannot do the
3010 // tree structured merge
3011 return std::nullopt;
3012 }
3013 }
3014 // If we do not have any load, we cannot do the tree structured merge
3015 if (!TheLoad)
3016 return std::nullopt;
3017
3018 // If we do not have multiple stores, we cannot do the tree structured merge
3019 if (StoreInfos.size() < 2)
3020 return std::nullopt;
3021
3022 // Stores should not overlap and should cover the whole alloca
3023 // Sort by begin offset
3024 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
3025 return A.BeginOffset < B.BeginOffset;
3026 });
3027
3028 // Check for overlaps and coverage
3029 uint64_t ExpectedStart = NewAllocaBeginOffset;
3030 for (auto &StoreInfo : StoreInfos) {
3031 uint64_t BeginOff = StoreInfo.BeginOffset;
3032 uint64_t EndOff = StoreInfo.EndOffset;
3033
3034 // Check for gap or overlap
3035 if (BeginOff != ExpectedStart)
3036 return std::nullopt;
3037
3038 ExpectedStart = EndOff;
3039 }
3040 // Check that stores cover the entire alloca
3041 if (ExpectedStart != NewAllocaEndOffset)
3042 return std::nullopt;
3043
3044 // Stores should be in the same basic block
3045 // The load should not be in the middle of the stores
3046 // Note:
 3047 // If the load is in a different basic block from the stores, we can still
 3048 // do the tree structured merge. This is because we do not perform
 3049 // store->load forwarding here. The merged vector will be stored back to
 3050 // NewAI and the new load will load from NewAI. The forwarding will be
 3051 // handled later when we try to promote NewAI.
3052 BasicBlock *LoadBB = TheLoad->getParent();
3053 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
3054
3055 for (auto &StoreInfo : StoreInfos) {
3056 if (StoreInfo.Store->getParent() != StoreBB)
3057 return std::nullopt;
3058 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
3059 return std::nullopt;
3060 }
3061
 3062 // If we reach here, the partition can be rewritten with a tree structured
 3063 // merge.
3064 LLVM_DEBUG({
3065 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
3066 << "\n Ordered stores:\n";
3067 for (auto [i, Info] : enumerate(StoreInfos))
3068 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
3069 << Info.EndOffset << ") \tStore: " << *Info.Store
3070 << "\tValue: " << *Info.StoredValue << "\n";
3071 });
3072
3073 // Instead of having these stores, we merge all the stored values into a
3074 // vector and store the merged value into the alloca
3075 std::queue<Value *> VecElements;
3076 IRBuilder<> Builder(StoreInfos.back().Store);
3077 for (const auto &Info : StoreInfos) {
3078 DeletedValues.push_back(Info.Store);
3079 VecElements.push(Info.StoredValue);
3080 }
3081
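 // Illustrative example (hypothetical values, for exposition only): with five
 // queued values the pairwise reduction below proceeds level by level,
 //   [v0, v1, v2, v3, v4] -> [v01, v23, v4] -> [v0123, v4] -> [v01234]
 // merging adjacent pairs via mergeTwoVectors and carrying any odd leftover
 // value over to the next round until a single wide vector remains.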
3082 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3083 while (VecElements.size() > 1) {
3084 const auto NumElts = VecElements.size();
3085 for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
3086 Value *V0 = VecElements.front();
3087 VecElements.pop();
3088 Value *V1 = VecElements.front();
3089 VecElements.pop();
3090 Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
3091 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3092 VecElements.push(Merged);
3093 }
3094 if (NumElts % 2 == 1) {
3095 Value *V = VecElements.front();
3096 VecElements.pop();
3097 VecElements.push(V);
3098 }
3099 }
3100
3101 // Store the merged value into the alloca
3102 Value *MergedValue = VecElements.front();
3103 Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
3104
3105 IRBuilder<> LoadBuilder(TheLoad);
3106 TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
3107 TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
3108 TheLoad->getName() + ".sroa.new.load"));
3109 DeletedValues.push_back(TheLoad);
3110
3111 return DeletedValues;
3112 }
3113
3114private:
3115 // Make sure the other visit overloads are visible.
3116 using Base::visit;
3117
3118 // Every instruction which can end up as a user must have a rewrite rule.
3119 bool visitInstruction(Instruction &I) {
3120 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3121 llvm_unreachable("No rewrite rule for this instruction!");
3122 }
3123
3124 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3125 // Note that the offset computation can use BeginOffset or NewBeginOffset
3126 // interchangeably for unsplit slices.
3127 assert(IsSplit || BeginOffset == NewBeginOffset);
3128 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3129
3130#ifndef NDEBUG
3131 StringRef OldName = OldPtr->getName();
3132 // Skip through the last '.sroa.' component of the name.
3133 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3134 if (LastSROAPrefix != StringRef::npos) {
3135 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3136 // Look for an SROA slice index.
3137 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3138 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3139 // Strip the index and look for the offset.
3140 OldName = OldName.substr(IndexEnd + 1);
3141 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3142 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3143 // Strip the offset.
3144 OldName = OldName.substr(OffsetEnd + 1);
3145 }
3146 }
3147 // Strip any SROA suffixes as well.
3148 OldName = OldName.substr(0, OldName.find(".sroa_"));
3149#endif
3150
3151 return getAdjustedPtr(IRB, DL, &NewAI,
3152 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3153 PointerTy,
3154#ifndef NDEBUG
3155 Twine(OldName) + "."
3156#else
3157 Twine()
3158#endif
3159 );
3160 }
3161
 3162 /// Compute a suitable alignment to access this slice of the *new*
 3163 /// alloca.
 3164 ///
 3165 /// The result is the common alignment of the new alloca's alignment and
 3166 /// the slice's offset within it.
3167 Align getSliceAlign() {
3168 return commonAlignment(NewAI.getAlign(),
3169 NewBeginOffset - NewAllocaBeginOffset);
3170 }
3171
3172 unsigned getIndex(uint64_t Offset) {
3173 assert(VecTy && "Can only call getIndex when rewriting a vector");
3174 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3175 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3176 uint32_t Index = RelOffset / ElementSize;
3177 assert(Index * ElementSize == RelOffset);
3178 return Index;
3179 }
3180
 3181 void deleteIfTriviallyDead(Value *V) {
 3182 Instruction *I = cast<Instruction>(V);
 3183 if (isInstructionTriviallyDead(I))
 3184 Pass.DeadInsts.push_back(I);
3185 }
3186
3187 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3188 unsigned BeginIndex = getIndex(NewBeginOffset);
3189 unsigned EndIndex = getIndex(NewEndOffset);
3190 assert(EndIndex > BeginIndex && "Empty vector!");
3191
3192 LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3193 NewAI.getAlign(), "load");
3194
3195 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3196 LLVMContext::MD_access_group});
3197 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3198 }
3199
3200 Value *rewriteIntegerLoad(LoadInst &LI) {
3201 assert(IntTy && "We cannot insert an integer to the alloca");
3202 assert(!LI.isVolatile());
3203 Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3204 NewAI.getAlign(), "load");
3205 V = convertValue(DL, IRB, V, IntTy);
3206 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3207 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3208 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3209 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3210 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3211 }
3212 // It is possible that the extracted type is not the load type. This
3213 // happens if there is a load past the end of the alloca, and as
3214 // a consequence the slice is narrower but still a candidate for integer
3215 // lowering. To handle this case, we just zero extend the extracted
3216 // integer.
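 // For example (illustrative): an i32 load whose slice covers only two bytes
 // yields a 16-bit value at this point and is widened back to i32 by the zext
 // below.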
3217 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3218 "Can only handle an extract for an overly wide load");
3219 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3220 V = IRB.CreateZExt(V, LI.getType());
3221 return V;
3222 }
3223
3224 bool visitLoadInst(LoadInst &LI) {
3225 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3226 Value *OldOp = LI.getOperand(0);
3227 assert(OldOp == OldPtr);
3228
3229 AAMDNodes AATags = LI.getAAMetadata();
3230
3231 unsigned AS = LI.getPointerAddressSpace();
3232
3233 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3234 : LI.getType();
3235 bool IsPtrAdjusted = false;
3236 Value *V;
3237 if (VecTy) {
3238 V = rewriteVectorizedLoadInst(LI);
3239 } else if (IntTy && LI.getType()->isIntegerTy()) {
3240 V = rewriteIntegerLoad(LI);
3241 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3242 NewEndOffset == NewAllocaEndOffset &&
3243 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3244 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3245 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3246 !LI.isVolatile()))) {
3247 Value *NewPtr =
3248 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3249 LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
3250 NewAI.getAlign(), LI.isVolatile(),
3251 LI.getName());
3252 if (LI.isVolatile())
3253 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3254 if (NewLI->isAtomic())
3255 NewLI->setAlignment(LI.getAlign());
3256
3257 // Copy any metadata that is valid for the new load. This may require
3258 // conversion to a different kind of metadata, e.g. !nonnull might change
3259 // to !range or vice versa.
3260 copyMetadataForLoad(*NewLI, LI);
3261
3262 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3263 if (AATags)
3264 NewLI->setAAMetadata(AATags.adjustForAccess(
3265 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3266
3267 // Try to preserve nonnull metadata
3268 V = NewLI;
3269
3270 // If this is an integer load past the end of the slice (which means the
3271 // bytes outside the slice are undef or this load is dead) just forcibly
3272 // fix the integer size with correct handling of endianness.
3273 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3274 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3275 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3276 V = IRB.CreateZExt(V, TITy, "load.ext");
3277 if (DL.isBigEndian())
3278 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3279 "endian_shift");
3280 }
3281 } else {
3282 Type *LTy = IRB.getPtrTy(AS);
3283 LoadInst *NewLI =
3284 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3285 getSliceAlign(), LI.isVolatile(), LI.getName());
3286
3287 if (AATags)
3288 NewLI->setAAMetadata(AATags.adjustForAccess(
3289 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3290
3291 if (LI.isVolatile())
3292 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3293 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3294 LLVMContext::MD_access_group});
3295
3296 V = NewLI;
3297 IsPtrAdjusted = true;
3298 }
3299 V = convertValue(DL, IRB, V, TargetTy);
3300
3301 if (IsSplit) {
3302 assert(!LI.isVolatile());
3303 assert(LI.getType()->isIntegerTy() &&
3304 "Only integer type loads and stores are split");
3305 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3306 "Split load isn't smaller than original load");
3307 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3308 "Non-byte-multiple bit width");
3309 // Move the insertion point just past the load so that we can refer to it.
3310 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3311 // Ensure the insertion point comes before any debug-info immediately
3312 // after the load, so that variable values referring to the load are
3313 // dominated by it.
3314 LIIt.setHeadBit(true);
3315 IRB.SetInsertPoint(LI.getParent(), LIIt);
3316 // Create a placeholder value with the same type as LI to use as the
3317 // basis for the new value. This allows us to replace the uses of LI with
3318 // the computed value, and then replace the placeholder with LI, leaving
3319 // LI only used for this computation.
3320 Value *Placeholder =
3321 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3322 false, Align(1));
3323 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3324 "insert");
3325 LI.replaceAllUsesWith(V);
3326 Placeholder->replaceAllUsesWith(&LI);
3327 Placeholder->deleteValue();
3328 } else {
3329 LI.replaceAllUsesWith(V);
3330 }
3331
3332 Pass.DeadInsts.push_back(&LI);
3333 deleteIfTriviallyDead(OldOp);
3334 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3335 return !LI.isVolatile() && !IsPtrAdjusted;
3336 }
3337
3338 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3339 AAMDNodes AATags) {
3340 // Capture V for the purpose of debug-info accounting once it's converted
3341 // to a vector store.
3342 Value *OrigV = V;
3343 if (V->getType() != VecTy) {
3344 unsigned BeginIndex = getIndex(NewBeginOffset);
3345 unsigned EndIndex = getIndex(NewEndOffset);
3346 assert(EndIndex > BeginIndex && "Empty vector!");
3347 unsigned NumElements = EndIndex - BeginIndex;
3348 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3349 "Too many elements!");
3350 Type *SliceTy = (NumElements == 1)
3351 ? ElementTy
3352 : FixedVectorType::get(ElementTy, NumElements);
3353 if (V->getType() != SliceTy)
3354 V = convertValue(DL, IRB, V, SliceTy);
3355
3356 // Mix in the existing elements.
3357 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3358 NewAI.getAlign(), "load");
3359 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3360 }
3361 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3362 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3363 LLVMContext::MD_access_group});
3364 if (AATags)
3365 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3366 V->getType(), DL));
3367 Pass.DeadInsts.push_back(&SI);
3368
3369 // NOTE: Careful to use OrigV rather than V.
3370 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3371 Store, Store->getPointerOperand(), OrigV, DL);
3372 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3373 return true;
3374 }
3375
3376 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3377 assert(IntTy && "We cannot extract an integer from the alloca");
3378 assert(!SI.isVolatile());
3379 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3380 IntTy->getBitWidth()) {
3381 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3382 NewAI.getAlign(), "oldload");
3383 Old = convertValue(DL, IRB, Old, IntTy);
3384 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3385 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3386 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3387 }
3388 V = convertValue(DL, IRB, V, NewAllocaTy);
3389 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3390 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3391 LLVMContext::MD_access_group});
3392 if (AATags)
3393 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3394 V->getType(), DL));
3395
3396 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3397 Store, Store->getPointerOperand(),
3398 Store->getValueOperand(), DL);
3399
3400 Pass.DeadInsts.push_back(&SI);
3401 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3402 return true;
3403 }
3404
3405 bool visitStoreInst(StoreInst &SI) {
3406 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3407 Value *OldOp = SI.getOperand(1);
3408 assert(OldOp == OldPtr);
3409
3410 AAMDNodes AATags = SI.getAAMetadata();
3411 Value *V = SI.getValueOperand();
3412
3413 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3414 // alloca that should be re-examined after promoting this alloca.
3415 if (V->getType()->isPointerTy())
3416 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3417 Pass.PostPromotionWorklist.insert(AI);
3418
3419 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3420 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3421 assert(!SI.isVolatile());
3422 assert(V->getType()->isIntegerTy() &&
3423 "Only integer type loads and stores are split");
3424 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3425 "Non-byte-multiple bit width");
3426 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3427 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3428 "extract");
3429 }
3430
3431 if (VecTy)
3432 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3433 if (IntTy && V->getType()->isIntegerTy())
3434 return rewriteIntegerStore(V, SI, AATags);
3435
3436 StoreInst *NewSI;
3437 if (NewBeginOffset == NewAllocaBeginOffset &&
3438 NewEndOffset == NewAllocaEndOffset &&
3439 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3440 V = convertValue(DL, IRB, V, NewAllocaTy);
3441 Value *NewPtr =
3442 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3443
3444 NewSI =
3445 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3446 } else {
3447 unsigned AS = SI.getPointerAddressSpace();
3448 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3449 NewSI =
3450 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3451 }
3452 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3453 LLVMContext::MD_access_group});
3454 if (AATags)
3455 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3456 V->getType(), DL));
3457 if (SI.isVolatile())
3458 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3459 if (NewSI->isAtomic())
3460 NewSI->setAlignment(SI.getAlign());
3461
3462 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3463 NewSI, NewSI->getPointerOperand(),
3464 NewSI->getValueOperand(), DL);
3465
3466 Pass.DeadInsts.push_back(&SI);
3467 deleteIfTriviallyDead(OldOp);
3468
3469 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3470 return NewSI->getPointerOperand() == &NewAI &&
3471 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3472 !SI.isVolatile();
3473 }
3474
3475 /// Compute an integer value from splatting an i8 across the given
3476 /// number of bytes.
3477 ///
3478 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3479 /// call this routine.
3480 /// FIXME: Heed the advice above.
3481 ///
3482 /// \param V The i8 value to splat.
3483 /// \param Size The number of bytes in the output (assuming i8 is one byte)
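 /// For example (an illustrative sketch, assuming 8-bit bytes): splatting the
 /// byte 0xAB across 4 bytes zero-extends it to i32 and multiplies it by
 /// 0xFFFFFFFF / 0xFF = 0x01010101, producing 0xABABABAB.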
3484 Value *getIntegerSplat(Value *V, unsigned Size) {
3485 assert(Size > 0 && "Expected a positive number of bytes.");
3486 IntegerType *VTy = cast<IntegerType>(V->getType());
3487 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3488 if (Size == 1)
3489 return V;
3490
3491 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3492 V = IRB.CreateMul(
3493 IRB.CreateZExt(V, SplatIntTy, "zext"),
3494 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3495 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3496 SplatIntTy)),
3497 "isplat");
3498 return V;
3499 }
3500
3501 /// Compute a vector splat for a given element value.
3502 Value *getVectorSplat(Value *V, unsigned NumElements) {
3503 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3504 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3505 return V;
3506 }
3507
3508 bool visitMemSetInst(MemSetInst &II) {
3509 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3510 assert(II.getRawDest() == OldPtr);
3511
3512 AAMDNodes AATags = II.getAAMetadata();
3513
3514 // If the memset has a variable size, it cannot be split, just adjust the
3515 // pointer to the new alloca.
3516 if (!isa<ConstantInt>(II.getLength())) {
3517 assert(!IsSplit);
3518 assert(NewBeginOffset == BeginOffset);
3519 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3520 II.setDestAlignment(getSliceAlign());
3521 // In theory we should call migrateDebugInfo here. However, we do not
3522 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3523 // constant geps, or storing a variable number of bytes.
 3524 assert(at::getDVRAssignmentMarkers(&II).empty() &&
 3525 "AT: Unexpected link to non-const GEP");
3526 deleteIfTriviallyDead(OldPtr);
3527 return false;
3528 }
3529
3530 // Record this instruction for deletion.
3531 Pass.DeadInsts.push_back(&II);
3532
3533 Type *AllocaTy = NewAI.getAllocatedType();
3534 Type *ScalarTy = AllocaTy->getScalarType();
3535
3536 const bool CanContinue = [&]() {
3537 if (VecTy || IntTy)
3538 return true;
3539 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3540 return false;
3541 // Length must be in range for FixedVectorType.
3542 auto *C = cast<ConstantInt>(II.getLength());
3543 const uint64_t Len = C->getLimitedValue();
3544 if (Len > std::numeric_limits<unsigned>::max())
3545 return false;
3546 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3547 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3548 return canConvertValue(DL, SrcTy, AllocaTy) &&
3549 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3550 }();
3551
3552 // If this doesn't map cleanly onto the alloca type, and that type isn't
3553 // a single value type, just emit a memset.
3554 if (!CanContinue) {
3555 Type *SizeTy = II.getLength()->getType();
3556 unsigned Sz = NewEndOffset - NewBeginOffset;
3557 Constant *Size = ConstantInt::get(SizeTy, Sz);
3558 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3559 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3560 MaybeAlign(getSliceAlign()), II.isVolatile()));
3561 if (AATags)
3562 New->setAAMetadata(
3563 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3564
3565 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3566 New, New->getRawDest(), nullptr, DL);
3567
3568 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3569 return false;
3570 }
3571
3572 // If we can represent this as a simple value, we have to build the actual
3573 // value to store, which requires expanding the byte present in memset to
3574 // a sensible representation for the alloca type. This is essentially
3575 // splatting the byte to a sufficiently wide integer, splatting it across
3576 // any desired vector width, and bitcasting to the final type.
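 // For example (illustrative): a memset of the byte 0x01 over an alloca of
 // type <4 x i32> becomes the integer splat 0x01010101, vector-splatted to
 // <4 x i32> and written back with a single wide store below.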
3577 Value *V;
3578
3579 if (VecTy) {
3580 // If this is a memset of a vectorized alloca, insert it.
3581 assert(ElementTy == ScalarTy);
3582
3583 unsigned BeginIndex = getIndex(NewBeginOffset);
3584 unsigned EndIndex = getIndex(NewEndOffset);
3585 assert(EndIndex > BeginIndex && "Empty vector!");
3586 unsigned NumElements = EndIndex - BeginIndex;
3587 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3588 "Too many elements!");
3589
3590 Value *Splat = getIntegerSplat(
3591 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3592 Splat = convertValue(DL, IRB, Splat, ElementTy);
3593 if (NumElements > 1)
3594 Splat = getVectorSplat(Splat, NumElements);
3595
3596 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3597 NewAI.getAlign(), "oldload");
3598 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3599 } else if (IntTy) {
3600 // If this is a memset on an alloca where we can widen stores, insert the
3601 // set integer.
3602 assert(!II.isVolatile());
3603
3604 uint64_t Size = NewEndOffset - NewBeginOffset;
3605 V = getIntegerSplat(II.getValue(), Size);
3606
3607 if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3608 EndOffset != NewAllocaBeginOffset)) {
3609 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3610 NewAI.getAlign(), "oldload");
3611 Old = convertValue(DL, IRB, Old, IntTy);
3612 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3613 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3614 } else {
3615 assert(V->getType() == IntTy &&
3616 "Wrong type for an alloca wide integer!");
3617 }
3618 V = convertValue(DL, IRB, V, AllocaTy);
3619 } else {
3620 // Established these invariants above.
3621 assert(NewBeginOffset == NewAllocaBeginOffset);
3622 assert(NewEndOffset == NewAllocaEndOffset);
3623
3624 V = getIntegerSplat(II.getValue(),
3625 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3626 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
3627 V = getVectorSplat(
3628 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3629
3630 V = convertValue(DL, IRB, V, AllocaTy);
3631 }
3632
3633 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3634 StoreInst *New =
3635 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3636 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3637 LLVMContext::MD_access_group});
3638 if (AATags)
3639 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3640 V->getType(), DL));
3641
3642 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3643 New, New->getPointerOperand(), V, DL);
3644
3645 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3646 return !II.isVolatile();
3647 }
3648
3649 bool visitMemTransferInst(MemTransferInst &II) {
3650 // Rewriting of memory transfer instructions can be a bit tricky. We break
3651 // them into two categories: split intrinsics and unsplit intrinsics.
3652
3653 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3654
3655 AAMDNodes AATags = II.getAAMetadata();
3656
3657 bool IsDest = &II.getRawDestUse() == OldUse;
3658 assert((IsDest && II.getRawDest() == OldPtr) ||
3659 (!IsDest && II.getRawSource() == OldPtr));
3660
3661 Align SliceAlign = getSliceAlign();
 3662 // For unsplit intrinsics, we simply modify the source and destination
 3663 // pointers in place. This isn't just an optimization; it is a matter of
 3664 // correctness. With unsplit intrinsics we may be dealing with transfers
 3665 // within a single alloca before SROA ran, or with transfers that have
 3666 // a variable length. We may also be dealing with memmove instead of
 3667 // memcpy, and so simply updating the pointers is all that is necessary to
 3668 // update both source and dest of a single call.
3669 if (!IsSplittable) {
3670 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3671 if (IsDest) {
3672 // Update the address component of linked dbg.assigns.
3673 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3674 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3675 DbgAssign->getAddress() == II.getDest())
3676 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3677 }
3678 II.setDest(AdjustedPtr);
3679 II.setDestAlignment(SliceAlign);
3680 } else {
3681 II.setSource(AdjustedPtr);
3682 II.setSourceAlignment(SliceAlign);
3683 }
3684
3685 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3686 deleteIfTriviallyDead(OldPtr);
3687 return false;
3688 }
3689 // For split transfer intrinsics we have an incredibly useful assurance:
3690 // the source and destination do not reside within the same alloca, and at
3691 // least one of them does not escape. This means that we can replace
3692 // memmove with memcpy, and we don't need to worry about all manner of
3693 // downsides to splitting and transforming the operations.
3694
3695 // If this doesn't map cleanly onto the alloca type, and that type isn't
3696 // a single value type, just emit a memcpy.
3697 bool EmitMemCpy =
3698 !VecTy && !IntTy &&
3699 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3700 SliceSize !=
3701 DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
3702 !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
 3703 !NewAI.getAllocatedType()->isSingleValueType());
 3704
3705 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3706 // size hasn't been shrunk based on analysis of the viable range, this is
3707 // a no-op.
3708 if (EmitMemCpy && &OldAI == &NewAI) {
3709 // Ensure the start lines up.
3710 assert(NewBeginOffset == BeginOffset);
3711
3712 // Rewrite the size as needed.
3713 if (NewEndOffset != EndOffset)
3714 II.setLength(NewEndOffset - NewBeginOffset);
3715 return false;
3716 }
3717 // Record this instruction for deletion.
3718 Pass.DeadInsts.push_back(&II);
3719
3720 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3721 // alloca that should be re-examined after rewriting this instruction.
3722 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3723 if (AllocaInst *AI =
 3724 dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
 3725 assert(AI != &OldAI && AI != &NewAI &&
3726 "Splittable transfers cannot reach the same alloca on both ends.");
3727 Pass.Worklist.insert(AI);
3728 }
3729
3730 Type *OtherPtrTy = OtherPtr->getType();
3731 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3732
3733 // Compute the relative offset for the other pointer within the transfer.
3734 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3735 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3736 Align OtherAlign =
3737 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3738 OtherAlign =
3739 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3740
3741 if (EmitMemCpy) {
3742 // Compute the other pointer, folding as much as possible to produce
3743 // a single, simple GEP in most cases.
3744 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3745 OtherPtr->getName() + ".");
3746
3747 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3748 Type *SizeTy = II.getLength()->getType();
3749 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3750
3751 Value *DestPtr, *SrcPtr;
3752 MaybeAlign DestAlign, SrcAlign;
3753 // Note: IsDest is true iff we're copying into the new alloca slice
3754 if (IsDest) {
3755 DestPtr = OurPtr;
3756 DestAlign = SliceAlign;
3757 SrcPtr = OtherPtr;
3758 SrcAlign = OtherAlign;
3759 } else {
3760 DestPtr = OtherPtr;
3761 DestAlign = OtherAlign;
3762 SrcPtr = OurPtr;
3763 SrcAlign = SliceAlign;
3764 }
3765 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3766 Size, II.isVolatile());
3767 if (AATags)
3768 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3769
3770 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3771 if (IsDest) {
3772 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3773 &II, New, DestPtr, nullptr, DL);
3774 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
 3775 OtherPtr->stripAndAccumulateConstantOffsets(
 3776 DL, Offset, /*AllowNonInbounds*/ true))) {
3777 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3778 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3779 }
3780 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3781 return false;
3782 }
3783
3784 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3785 NewEndOffset == NewAllocaEndOffset;
3786 uint64_t Size = NewEndOffset - NewBeginOffset;
3787 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3788 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3789 unsigned NumElements = EndIndex - BeginIndex;
3790 IntegerType *SubIntTy =
3791 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3792
3793 // Reset the other pointer type to match the register type we're going to
3794 // use, but using the address space of the original other pointer.
3795 Type *OtherTy;
3796 if (VecTy && !IsWholeAlloca) {
3797 if (NumElements == 1)
3798 OtherTy = VecTy->getElementType();
3799 else
3800 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3801 } else if (IntTy && !IsWholeAlloca) {
3802 OtherTy = SubIntTy;
3803 } else {
3804 OtherTy = NewAllocaTy;
3805 }
3806
3807 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3808 OtherPtr->getName() + ".");
3809 MaybeAlign SrcAlign = OtherAlign;
3810 MaybeAlign DstAlign = SliceAlign;
3811 if (!IsDest)
3812 std::swap(SrcAlign, DstAlign);
3813
3814 Value *SrcPtr;
3815 Value *DstPtr;
3816
3817 if (IsDest) {
3818 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3819 SrcPtr = AdjPtr;
3820 } else {
3821 DstPtr = AdjPtr;
3822 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3823 }
3824
3825 Value *Src;
3826 if (VecTy && !IsWholeAlloca && !IsDest) {
3827 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3828 NewAI.getAlign(), "load");
3829 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3830 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3831 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3832 NewAI.getAlign(), "load");
3833 Src = convertValue(DL, IRB, Src, IntTy);
3834 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3835 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3836 } else {
3837 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3838 II.isVolatile(), "copyload");
3839 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3840 LLVMContext::MD_access_group});
3841 if (AATags)
3842 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3843 Load->getType(), DL));
3844 Src = Load;
3845 }
3846
3847 if (VecTy && !IsWholeAlloca && IsDest) {
3848 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3849 NewAI.getAlign(), "oldload");
3850 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3851 } else if (IntTy && !IsWholeAlloca && IsDest) {
3852 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3853 NewAI.getAlign(), "oldload");
3854 Old = convertValue(DL, IRB, Old, IntTy);
3855 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3856 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3857 Src = convertValue(DL, IRB, Src, NewAllocaTy);
3858 }
3859
3860 StoreInst *Store = cast<StoreInst>(
3861 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3862 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3863 LLVMContext::MD_access_group});
3864 if (AATags)
3865 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3866 Src->getType(), DL));
3867
3868 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3869 if (IsDest) {
3870
3871 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3872 Store, DstPtr, Src, DL);
3873 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
 3874 OtherPtr->stripAndAccumulateConstantOffsets(
 3875 DL, Offset, /*AllowNonInbounds*/ true))) {
3876 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3877 &II, Store, DstPtr, Src, DL);
3878 }
3879
3880 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3881 return !II.isVolatile();
3882 }
3883
3884 bool visitIntrinsicInst(IntrinsicInst &II) {
3885 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3886 "Unexpected intrinsic!");
3887 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3888
3889 // Record this instruction for deletion.
3890 Pass.DeadInsts.push_back(&II);
3891
3892 if (II.isDroppable()) {
3893 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3894 // TODO For now we forget assumed information, this can be improved.
3895 OldPtr->dropDroppableUsesIn(II);
3896 return true;
3897 }
3898
3899 assert(II.getArgOperand(0) == OldPtr);
3900 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3901 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3902 Value *New;
3903 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3904 New = IRB.CreateLifetimeStart(Ptr);
3905 else
3906 New = IRB.CreateLifetimeEnd(Ptr);
3907
3908 (void)New;
3909 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3910
3911 return true;
3912 }
3913
3914 void fixLoadStoreAlign(Instruction &Root) {
3915 // This algorithm implements the same visitor loop as
3916 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3917 // or store found.
3918 SmallPtrSet<Instruction *, 4> Visited;
3919 SmallVector<Instruction *, 4> Uses;
3920 Visited.insert(&Root);
3921 Uses.push_back(&Root);
3922 do {
3923 Instruction *I = Uses.pop_back_val();
3924
3925 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3926 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3927 continue;
3928 }
3929 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3930 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3931 continue;
3932 }
3933
 3934 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
 3935 isa<PHINode>(I) || isa<SelectInst>(I) ||
 3936 isa<GetElementPtrInst>(I));
 3937 for (User *U : I->users())
3938 if (Visited.insert(cast<Instruction>(U)).second)
3939 Uses.push_back(cast<Instruction>(U));
3940 } while (!Uses.empty());
3941 }
3942
3943 bool visitPHINode(PHINode &PN) {
3944 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3945 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3946 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3947
3948 // We would like to compute a new pointer in only one place, but have it be
3949 // as local as possible to the PHI. To do that, we re-use the location of
3950 // the old pointer, which necessarily must be in the right position to
3951 // dominate the PHI.
3952 IRBuilderBase::InsertPointGuard Guard(IRB);
3953 if (isa<PHINode>(OldPtr))
3954 IRB.SetInsertPoint(OldPtr->getParent(),
3955 OldPtr->getParent()->getFirstInsertionPt());
3956 else
3957 IRB.SetInsertPoint(OldPtr);
3958 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3959
3960 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3961 // Replace the operands which were using the old pointer.
3962 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3963
3964 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3965 deleteIfTriviallyDead(OldPtr);
3966
3967 // Fix the alignment of any loads or stores using this PHI node.
3968 fixLoadStoreAlign(PN);
3969
3970 // PHIs can't be promoted on their own, but often can be speculated. We
3971 // check the speculation outside of the rewriter so that we see the
3972 // fully-rewritten alloca.
3973 PHIUsers.insert(&PN);
3974 return true;
3975 }
3976
3977 bool visitSelectInst(SelectInst &SI) {
3978 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3979 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3980 "Pointer isn't an operand!");
3981 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3982 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3983
3984 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3985 // Replace the operands which were using the old pointer.
3986 if (SI.getOperand(1) == OldPtr)
3987 SI.setOperand(1, NewPtr);
3988 if (SI.getOperand(2) == OldPtr)
3989 SI.setOperand(2, NewPtr);
3990
3991 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3992 deleteIfTriviallyDead(OldPtr);
3993
3994 // Fix the alignment of any loads or stores using this select.
3995 fixLoadStoreAlign(SI);
3996
3997 // Selects can't be promoted on their own, but often can be speculated. We
3998 // check the speculation outside of the rewriter so that we see the
3999 // fully-rewritten alloca.
4000 SelectUsers.insert(&SI);
4001 return true;
4002 }
4003};
4004
4005/// Visitor to rewrite aggregate loads and stores as scalar.
4006///
4007/// This pass aggressively rewrites all aggregate loads and stores on
4008/// a particular pointer (or any pointer derived from it which we can identify)
4009/// with scalar loads and stores.
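/// For example (an illustrative sketch, not taken from a test case): a load
/// such as
///   %v = load { i32, i64 }, ptr %p
/// is rewritten into one scalar load per member through a GEP, recombined
/// with insertvalue, so that the underlying alloca can later be sliced.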
4010class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
4011 // Befriend the base class so it can delegate to private visit methods.
4012 friend class InstVisitor<AggLoadStoreRewriter, bool>;
4013
4014 /// Queue of pointer uses to analyze and potentially rewrite.
 4015 SmallVector<Use *, 8> Queue;
 4016
4017 /// Set to prevent us from cycling with phi nodes and loops.
4018 SmallPtrSet<User *, 8> Visited;
4019
4020 /// The current pointer use being rewritten. This is used to dig up the used
4021 /// value (as opposed to the user).
4022 Use *U = nullptr;
4023
4024 /// Used to calculate offsets, and hence alignment, of subobjects.
4025 const DataLayout &DL;
4026
4027 IRBuilderTy &IRB;
4028
4029public:
4030 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
4031 : DL(DL), IRB(IRB) {}
4032
4033 /// Rewrite loads and stores through a pointer and all pointers derived from
4034 /// it.
4035 bool rewrite(Instruction &I) {
4036 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
4037 enqueueUsers(I);
4038 bool Changed = false;
4039 while (!Queue.empty()) {
4040 U = Queue.pop_back_val();
4041 Changed |= visit(cast<Instruction>(U->getUser()));
4042 }
4043 return Changed;
4044 }
4045
4046private:
4047 /// Enqueue all the users of the given instruction for further processing.
4048 /// This uses a set to de-duplicate users.
4049 void enqueueUsers(Instruction &I) {
4050 for (Use &U : I.uses())
4051 if (Visited.insert(U.getUser()).second)
4052 Queue.push_back(&U);
4053 }
4054
4055 // Conservative default is to not rewrite anything.
4056 bool visitInstruction(Instruction &I) { return false; }
4057
4058 /// Generic recursive split emission class.
4059 template <typename Derived> class OpSplitter {
4060 protected:
4061 /// The builder used to form new instructions.
4062 IRBuilderTy &IRB;
4063
 4064 /// The indices to be used with insert- or extractvalue to select the
 4065 /// appropriate value within the aggregate.
4066 SmallVector<unsigned, 4> Indices;
4067
4068 /// The indices to a GEP instruction which will move Ptr to the correct slot
4069 /// within the aggregate.
4070 SmallVector<Value *, 4> GEPIndices;
4071
4072 /// The base pointer of the original op, used as a base for GEPing the
4073 /// split operations.
4074 Value *Ptr;
4075
4076 /// The base pointee type being GEPed into.
4077 Type *BaseTy;
4078
4079 /// Known alignment of the base pointer.
4080 Align BaseAlign;
4081
 4082 /// Used to calculate the offset of each component so we can correctly
 4083 /// deduce alignments.
4084 const DataLayout &DL;
4085
4086 /// Initialize the splitter with an insertion point, Ptr and start with a
4087 /// single zero GEP index.
4088 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4089 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4090 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4091 BaseAlign(BaseAlign), DL(DL) {
4092 IRB.SetInsertPoint(InsertionPoint);
4093 }
4094
4095 public:
4096 /// Generic recursive split emission routine.
4097 ///
4098 /// This method recursively splits an aggregate op (load or store) into
4099 /// scalar or vector ops. It splits recursively until it hits a single value
4100 /// and emits that single value operation via the template argument.
4101 ///
4102 /// The logic of this routine relies on GEPs and insertvalue and
4103 /// extractvalue all operating with the same fundamental index list, merely
4104 /// formatted differently (GEPs need actual values).
4105 ///
4106 /// \param Ty The type being split recursively into smaller ops.
4107 /// \param Agg The aggregate value being built up or stored, depending on
4108 /// whether this is splitting a load or a store respectively.
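 /// For example (illustrative): splitting an op on { i32, [2 x float] } emits
 /// three leaf operations, for the index paths {0}, {1,0} and {1,1}, each
 /// addressed through a GEP built from the same indices.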
4109 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4110 if (Ty->isSingleValueType()) {
4111 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4112 return static_cast<Derived *>(this)->emitFunc(
4113 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4114 }
4115
4116 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4117 unsigned OldSize = Indices.size();
4118 (void)OldSize;
4119 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4120 ++Idx) {
4121 assert(Indices.size() == OldSize && "Did not return to the old size");
4122 Indices.push_back(Idx);
4123 GEPIndices.push_back(IRB.getInt32(Idx));
4124 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4125 GEPIndices.pop_back();
4126 Indices.pop_back();
4127 }
4128 return;
4129 }
4130
4131 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4132 unsigned OldSize = Indices.size();
4133 (void)OldSize;
4134 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4135 ++Idx) {
4136 assert(Indices.size() == OldSize && "Did not return to the old size");
4137 Indices.push_back(Idx);
4138 GEPIndices.push_back(IRB.getInt32(Idx));
4139 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4140 GEPIndices.pop_back();
4141 Indices.pop_back();
4142 }
4143 return;
4144 }
4145
4146 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4147 }
4148 };
4149
4150 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4151 AAMDNodes AATags;
4152 // A vector to hold the split components that we want to emit
4153 // separate fake uses for.
4154 SmallVector<Value *, 4> Components;
4155 // A vector to hold all the fake uses of the struct that we are splitting.
4156 // Usually there should only be one, but we are handling the general case.
 4157 SmallVector<Instruction *, 4> FakeUses;
 4158
4159 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4160 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4161 IRBuilderTy &IRB)
4162 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4163 IRB),
4164 AATags(AATags) {}
4165
4166 /// Emit a leaf load of a single value. This is called at the leaves of the
4167 /// recursive emission to actually load values.
4168 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
 4169 assert(Ty->isSingleValueType());
 4170 // Load the single value and insert it using the indices.
4171 Value *GEP =
4172 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4173 LoadInst *Load =
4174 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4175
4176 APInt Offset(
4177 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4178 if (AATags &&
4179 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4180 Load->setAAMetadata(
4181 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4182 // Record the load so we can generate a fake use for this aggregate
4183 // component.
4184 Components.push_back(Load);
4185
4186 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4187 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4188 }
4189
4190 // Stash the fake uses that use the value generated by this instruction.
4191 void recordFakeUses(LoadInst &LI) {
4192 for (Use &U : LI.uses())
4193 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4194 if (II->getIntrinsicID() == Intrinsic::fake_use)
4195 FakeUses.push_back(II);
4196 }
4197
4198 // Replace all fake uses of the aggregate with a series of fake uses, one
4199 // for each split component.
4200 void emitFakeUses() {
4201 for (Instruction *I : FakeUses) {
4202 IRB.SetInsertPoint(I);
4203 for (auto *V : Components)
4204 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4205 I->eraseFromParent();
4206 }
4207 }
4208 };
4209
4210 bool visitLoadInst(LoadInst &LI) {
4211 assert(LI.getPointerOperand() == *U);
4212 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4213 return false;
4214
4215 // We have an aggregate being loaded, split it apart.
4216 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4217 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4218 getAdjustedAlignment(&LI, 0), DL, IRB);
4219 Splitter.recordFakeUses(LI);
 4220 Value *V = PoisonValue::get(LI.getType());
 4221 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4222 Splitter.emitFakeUses();
4223 Visited.erase(&LI);
4224 LI.replaceAllUsesWith(V);
4225 LI.eraseFromParent();
4226 return true;
4227 }
4228
4229 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4230 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4231 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4232 const DataLayout &DL, IRBuilderTy &IRB)
4233 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4234 DL, IRB),
4235 AATags(AATags), AggStore(AggStore) {}
4236 AAMDNodes AATags;
4237 StoreInst *AggStore;
4238 /// Emit a leaf store of a single value. This is called at the leaves of the
4239 /// recursive emission to actually produce stores.
4240 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
 4241 assert(Ty->isSingleValueType());
 4242 // Extract the single value and store it using the indices.
4243 //
4244 // The gep and extractvalue values are factored out of the CreateStore
4245 // call to make the output independent of the argument evaluation order.
4246 Value *ExtractValue =
4247 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4248 Value *InBoundsGEP =
4249 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4250 StoreInst *Store =
4251 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4252
4253 APInt Offset(
4254 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4255 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4256 if (AATags) {
4257 Store->setAAMetadata(AATags.adjustForAccess(
4258 Offset.getZExtValue(), ExtractValue->getType(), DL));
4259 }
4260
4261 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4262 // If we cannot (because there's an intervening non-const or unbounded
4263 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4264 // this instruction.
 4265 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
 4266 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4267 uint64_t SizeInBits =
4268 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4269 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4270 SizeInBits, AggStore, Store,
4271 Store->getPointerOperand(), Store->getValueOperand(),
4272 DL);
4273 } else {
 4274 assert(at::getDVRAssignmentMarkers(AggStore).empty() &&
 4275 "AT: unexpected debug.assign linked to store through "
4276 "unbounded GEP");
4277 }
4278 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4279 }
4280 };
4281
4282 bool visitStoreInst(StoreInst &SI) {
4283 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4284 return false;
4285 Value *V = SI.getValueOperand();
4286 if (V->getType()->isSingleValueType())
4287 return false;
4288
4289 // We have an aggregate being stored, split it apart.
4290 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4291 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4292 getAdjustedAlignment(&SI, 0), DL, IRB);
4293 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4294 Visited.erase(&SI);
4295 // The stores replacing SI each have markers describing fragments of the
4296 // assignment so delete the assignment markers linked to SI.
 4297 at::deleteAssignmentMarkers(&SI);
 4298 SI.eraseFromParent();
4299 return true;
4300 }
4301
4302 bool visitBitCastInst(BitCastInst &BC) {
4303 enqueueUsers(BC);
4304 return false;
4305 }
4306
4307 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4308 enqueueUsers(ASC);
4309 return false;
4310 }
4311
4312 // Unfold gep (select cond, ptr1, ptr2), idx
4313 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4314 // and gep ptr, (select cond, idx1, idx2)
4315 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4316 // We also allow for i1 zext indices, which are equivalent to selects.
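 // An illustrative IR sketch (hypothetical value names):
 //   %idx = select i1 %c, i64 1, i64 2
 //   %gep = getelementptr i32, ptr %p, i64 %idx
 // becomes
 //   %gep.t = getelementptr i32, ptr %p, i64 1
 //   %gep.f = getelementptr i32, ptr %p, i64 2
 //   %gep   = select i1 %c, ptr %gep.t, ptr %gep.f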
4317 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4318 // Check whether the GEP has exactly one select operand and all indices
4319 // will become constant after the transform.
 4320 Instruction *Sel = nullptr;
 4321 for (Value *Op : GEPI.indices()) {
4322 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4323 if (Sel)
4324 return false;
4325
4326 Sel = SI;
4327 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4328 !isa<ConstantInt>(SI->getFalseValue()))
4329 return false;
4330 continue;
4331 }
4332 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4333 if (Sel)
4334 return false;
4335 Sel = ZI;
4336 if (!ZI->getSrcTy()->isIntegerTy(1))
4337 return false;
4338 continue;
4339 }
4340
4341 if (!isa<ConstantInt>(Op))
4342 return false;
4343 }
4344
4345 if (!Sel)
4346 return false;
4347
4348 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4349 dbgs() << " original: " << *Sel << "\n";
4350 dbgs() << " " << GEPI << "\n";);
4351
4352 auto GetNewOps = [&](Value *SelOp) {
4353 SmallVector<Value *> NewOps;
4354 for (Value *Op : GEPI.operands())
4355 if (Op == Sel)
4356 NewOps.push_back(SelOp);
4357 else
4358 NewOps.push_back(Op);
4359 return NewOps;
4360 };
4361
4362 Value *Cond, *True, *False;
4363 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4364 Cond = SI->getCondition();
4365 True = SI->getTrueValue();
4366 False = SI->getFalseValue();
4367 } else {
4368 Cond = Sel->getOperand(0);
4369 True = ConstantInt::get(Sel->getType(), 1);
4370 False = ConstantInt::get(Sel->getType(), 0);
4371 }
4372 SmallVector<Value *> TrueOps = GetNewOps(True);
4373 SmallVector<Value *> FalseOps = GetNewOps(False);
4374
4375 IRB.SetInsertPoint(&GEPI);
4376 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4377
4378 Type *Ty = GEPI.getSourceElementType();
4379 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4380 True->getName() + ".sroa.gep", NW);
4381
4382 Value *NFalse =
4383 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4384 False->getName() + ".sroa.gep", NW);
4385
4386 Value *NSel =
4387 IRB.CreateSelect(Cond, NTrue, NFalse, Sel->getName() + ".sroa.sel");
4388 Visited.erase(&GEPI);
4389 GEPI.replaceAllUsesWith(NSel);
4390 GEPI.eraseFromParent();
4391 Instruction *NSelI = cast<Instruction>(NSel);
4392 Visited.insert(NSelI);
4393 enqueueUsers(*NSelI);
4394
4395 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4396 dbgs() << " " << *NFalse << "\n";
4397 dbgs() << " " << *NSel << "\n";);
4398
4399 return true;
4400 }
4401
4402 // Unfold gep (phi ptr1, ptr2), idx
4403 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4404 // and gep ptr, (phi idx1, idx2)
4405 // => phi ((gep ptr, idx1), (gep ptr, idx2))
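 // An illustrative IR sketch (hypothetical value names; the new GEPs are
 // placed at the end of the entry block, as required below):
 //   %p   = phi ptr [ %a, %bb1 ], [ %b, %bb2 ]
 //   %gep = getelementptr i32, ptr %p, i64 1
 // becomes
 //   %gep.a = getelementptr i32, ptr %a, i64 1
 //   %gep.b = getelementptr i32, ptr %b, i64 1
 //   %gep   = phi ptr [ %gep.a, %bb1 ], [ %gep.b, %bb2 ]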
4406 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
 4407 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
 4408 // operand (looking through the phi if it is the phi we want to unfold) is
 4409 // an instruction other than a static alloca.
4410 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4411 auto IsInvalidPointerOperand = [](Value *V) {
4412 if (!isa<Instruction>(V))
4413 return false;
4414 if (auto *AI = dyn_cast<AllocaInst>(V))
4415 return !AI->isStaticAlloca();
4416 return true;
4417 };
4418 if (Phi) {
4419 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4420 return false;
4421 } else {
4422 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4423 return false;
4424 }
4425 // Check whether the GEP has exactly one phi operand (including the pointer
4426 // operand) and all indices will become constant after the transform.
4427 for (Value *Op : GEPI.indices()) {
4428 if (auto *SI = dyn_cast<PHINode>(Op)) {
4429 if (Phi)
4430 return false;
4431
4432 Phi = SI;
4433 if (!all_of(Phi->incoming_values(),
4434 [](Value *V) { return isa<ConstantInt>(V); }))
4435 return false;
4436 continue;
4437 }
4438
4439 if (!isa<ConstantInt>(Op))
4440 return false;
4441 }
4442
4443 if (!Phi)
4444 return false;
4445
4446 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4447 dbgs() << " original: " << *Phi << "\n";
4448 dbgs() << " " << GEPI << "\n";);
4449
4450 auto GetNewOps = [&](Value *PhiOp) {
4451 SmallVector<Value *> NewOps;
4452 for (Value *Op : GEPI.operands())
4453 if (Op == Phi)
4454 NewOps.push_back(PhiOp);
4455 else
4456 NewOps.push_back(Op);
4457 return NewOps;
4458 };
4459
4460 IRB.SetInsertPoint(Phi);
4461 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4462 Phi->getName() + ".sroa.phi");
4463
4464 Type *SourceTy = GEPI.getSourceElementType();
4465 // We only handle arguments, constants, and static allocas here, so we can
4466 // insert GEPs at the end of the entry block.
4467 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4468 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4469 Value *Op = Phi->getIncomingValue(I);
4470 BasicBlock *BB = Phi->getIncomingBlock(I);
4471 Value *NewGEP;
4472 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4473 NewGEP = NewPhi->getIncomingValue(NI);
4474 } else {
4475 SmallVector<Value *> NewOps = GetNewOps(Op);
4476 NewGEP =
4477 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4478 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4479 }
4480 NewPhi->addIncoming(NewGEP, BB);
4481 }
4482
4483 Visited.erase(&GEPI);
4484 GEPI.replaceAllUsesWith(NewPhi);
4485 GEPI.eraseFromParent();
4486 Visited.insert(NewPhi);
4487 enqueueUsers(*NewPhi);
4488
4489 LLVM_DEBUG(dbgs() << " to: ";
4490 for (Value *In
4491 : NewPhi->incoming_values()) dbgs()
4492 << "\n " << *In;
4493 dbgs() << "\n " << *NewPhi << '\n');
4494
4495 return true;
4496 }
4497
4498 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4499 if (unfoldGEPSelect(GEPI))
4500 return true;
4501
4502 if (unfoldGEPPhi(GEPI))
4503 return true;
4504
4505 enqueueUsers(GEPI);
4506 return false;
4507 }
4508
4509 bool visitPHINode(PHINode &PN) {
4510 enqueueUsers(PN);
4511 return false;
4512 }
4513
4514 bool visitSelectInst(SelectInst &SI) {
4515 enqueueUsers(SI);
4516 return false;
4517 }
4518};
4519
4520} // end anonymous namespace
4521
4522/// Strip aggregate type wrapping.
4523///
4524/// This removes no-op aggregate types wrapping an underlying type. It will
4525/// strip as many layers of types as it can without changing either the type
4526/// size or the allocated size.
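// For example (illustrative types, assuming the usual layout rules): a type
// such as { [1 x { float }] } wraps a single float without adding any bytes,
// so it is stripped down to float. In contrast, { i32, i32 } is returned
// unchanged, because its first element is smaller than the struct itself and
// unwrapping it would change the allocated size.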
4527static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4528 if (Ty->isSingleValueType())
4529 return Ty;
4530
4531 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4532 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4533
4534 Type *InnerTy;
4535 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4536 InnerTy = ArrTy->getElementType();
4537 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4538 const StructLayout *SL = DL.getStructLayout(STy);
4539 unsigned Index = SL->getElementContainingOffset(0);
4540 InnerTy = STy->getElementType(Index);
4541 } else {
4542 return Ty;
4543 }
4544
4545 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4546 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4547 return Ty;
4548
4549 return stripAggregateTypeWrapping(DL, InnerTy);
4550}
4551
4552/// Try to find a partition of the aggregate type passed in for a given
4553/// offset and size.
4554///
4555/// This recurses through the aggregate type and tries to compute a subtype
4556/// based on the offset and size. When the offset and size span a sub-section
4557/// of an array, it will even compute a new array type for that sub-section,
4558/// and the same for structs.
4559///
4560/// Note that this routine is very strict and tries to find a partition of the
4561/// type which produces the *exact* right offset and size. It is not forgiving
4562/// when the size or offset causes either end of the type-based partition to
4563/// be off. Also, this is a best-effort routine. It is reasonable to give up
4564/// and not return a type if necessary.
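// A worked example (hypothetical types, assuming a typical 64-bit layout):
// for Ty = { i32, i32, i64 }, a query with Offset=0 and Size=8 peels off the
// leading pair and returns the sub-struct { i32, i32 }, Offset=4 with Size=4
// returns the second i32, and Offset=2 with Size=4 straddles two elements and
// yields nullptr.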
4565static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4566 uint64_t Size) {
4567 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4568 return stripAggregateTypeWrapping(DL, Ty);
4569 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4570 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4571 return nullptr;
4572
4573 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4574 Type *ElementTy;
4575 uint64_t TyNumElements;
4576 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4577 ElementTy = AT->getElementType();
4578 TyNumElements = AT->getNumElements();
4579 } else {
4580 // FIXME: This isn't right for vectors with non-byte-sized or
4581 // non-power-of-two sized elements.
4582 auto *VT = cast<FixedVectorType>(Ty);
4583 ElementTy = VT->getElementType();
4584 TyNumElements = VT->getNumElements();
4585 }
4586 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4587 uint64_t NumSkippedElements = Offset / ElementSize;
4588 if (NumSkippedElements >= TyNumElements)
4589 return nullptr;
4590 Offset -= NumSkippedElements * ElementSize;
4591
4592 // First check if we need to recurse.
4593 if (Offset > 0 || Size < ElementSize) {
4594 // Bail if the partition ends in a different array element.
4595 if ((Offset + Size) > ElementSize)
4596 return nullptr;
4597 // Recurse through the element type trying to peel off offset bytes.
4598 return getTypePartition(DL, ElementTy, Offset, Size);
4599 }
4600 assert(Offset == 0);
4601
4602 if (Size == ElementSize)
4603 return stripAggregateTypeWrapping(DL, ElementTy);
4604 assert(Size > ElementSize);
4605 uint64_t NumElements = Size / ElementSize;
4606 if (NumElements * ElementSize != Size)
4607 return nullptr;
4608 return ArrayType::get(ElementTy, NumElements);
4609 }
4610
4611 StructType *STy = dyn_cast<StructType>(Ty);
4612 if (!STy)
4613 return nullptr;
4614
4615 const StructLayout *SL = DL.getStructLayout(STy);
4616
4617 if (SL->getSizeInBits().isScalable())
4618 return nullptr;
4619
4620 if (Offset >= SL->getSizeInBytes())
4621 return nullptr;
4622 uint64_t EndOffset = Offset + Size;
4623 if (EndOffset > SL->getSizeInBytes())
4624 return nullptr;
4625
4626 unsigned Index = SL->getElementContainingOffset(Offset);
4627 Offset -= SL->getElementOffset(Index);
4628
4629 Type *ElementTy = STy->getElementType(Index);
4630 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4631 if (Offset >= ElementSize)
4632 return nullptr; // The offset points into alignment padding.
4633
4634 // See if any partition must be contained by the element.
4635 if (Offset > 0 || Size < ElementSize) {
4636 if ((Offset + Size) > ElementSize)
4637 return nullptr;
4638 return getTypePartition(DL, ElementTy, Offset, Size);
4639 }
4640 assert(Offset == 0);
4641
4642 if (Size == ElementSize)
4643 return stripAggregateTypeWrapping(DL, ElementTy);
4644
4645 StructType::element_iterator EI = STy->element_begin() + Index,
4646 EE = STy->element_end();
4647 if (EndOffset < SL->getSizeInBytes()) {
4648 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4649 if (Index == EndIndex)
4650 return nullptr; // Within a single element and its padding.
4651
4652 // Don't try to form "natural" types if the elements don't line up with the
4653 // expected size.
4654 // FIXME: We could potentially recurse down through the last element in the
4655 // sub-struct to find a natural end point.
4656 if (SL->getElementOffset(EndIndex) != EndOffset)
4657 return nullptr;
4658
4659 assert(Index < EndIndex);
4660 EE = STy->element_begin() + EndIndex;
4661 }
4662
4663 // Try to build up a sub-structure.
4664 StructType *SubTy =
4665 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4666 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4667 if (Size != SubSL->getSizeInBytes())
4668 return nullptr; // The sub-struct doesn't have quite the size needed.
4669
4670 return SubTy;
4671}
4672
4673/// Pre-split loads and stores to simplify rewriting.
4674///
4675/// We want to break up the splittable load+store pairs as much as
4676/// possible. This is important to do as a preprocessing step, as once we
4677/// start rewriting the accesses to partitions of the alloca we lose the
4678/// necessary information to correctly split apart paired loads and stores
4679/// which both point into this alloca. The case to consider is something like
4680/// the following:
4681///
4682/// %a = alloca [12 x i8]
4683/// %gep1 = getelementptr i8, ptr %a, i32 0
4684/// %gep2 = getelementptr i8, ptr %a, i32 4
4685/// %gep3 = getelementptr i8, ptr %a, i32 8
4686/// store float 0.0, ptr %gep1
4687/// store float 1.0, ptr %gep2
4688/// %v = load i64, ptr %gep1
4689/// store i64 %v, ptr %gep2
4690/// %f1 = load float, ptr %gep2
4691/// %f2 = load float, ptr %gep3
4692///
4693/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4694/// promote everything so we recover the 2 SSA values that should have been
4695/// there all along.
4696///
4697/// \returns true if any changes are made.
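// A rough sketch of the intended result (illustrative IR, not verbatim SROA
// output; the rewriter really builds adjusted pointers rather than reusing the
// GEPs): the splittable i64 access pair above is broken at the 4-byte
// partition boundaries into
//   %v.0 = load i32, ptr %gep1              ; bytes [0,4) of %a
//   %v.1 = load i32, ptr %gep2              ; bytes [4,8) of %a
//   store i32 %v.0, ptr %gep2               ; bytes [4,8) of %a
//   store i32 %v.1, ptr %gep3               ; bytes [8,12) of %a
// after which every partition is touched only by 4-byte accesses and can be
// rewritten and promoted independently.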
4698bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4699 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4700
4701 // Track the loads and stores which are candidates for pre-splitting here, in
4702 // the order they first appear during the partition scan. These give stable
4703 // iteration order and a basis for tracking which loads and stores we
4704 // actually split.
4705 SmallVector<LoadInst *, 4> Loads;
4706 SmallVector<StoreInst *, 4> Stores;
4707
4708 // We need to accumulate the splits required of each load or store where we
4709 // can find them via a direct lookup. This is important to cross-check loads
4710 // and stores against each other. We also track the slice so that we can kill
4711 // all the slices that end up split.
4712 struct SplitOffsets {
4713 Slice *S;
4714 std::vector<uint64_t> Splits;
4715 };
4716 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4717
4718 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4719 // This is important as we also cannot pre-split stores of those loads!
4720 // FIXME: This is all pretty gross. It means that we can be more aggressive
4721 // in pre-splitting when the load feeding the store happens to come from
4722 // a separate alloca. Put another way, the effectiveness of SROA would be
4723 // decreased by a frontend which just concatenated all of its local allocas
4724 // into one big flat alloca. But defeating such patterns is exactly the job
4725 // SROA is tasked with! Sadly, to not have this discrepancy we would have
4726 // change store pre-splitting to actually force pre-splitting of the load
4727 // to change store pre-splitting to actually force pre-splitting of the load
4728 // maybe it would make it more principled?
4729 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4730
4731 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4732 for (auto &P : AS.partitions()) {
4733 for (Slice &S : P) {
4734 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4735 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4736 // If this is a load we have to track that it can't participate in any
4737 // pre-splitting. If this is a store of a load we have to track that
4738 // that load also can't participate in any pre-splitting.
4739 if (auto *LI = dyn_cast<LoadInst>(I))
4740 UnsplittableLoads.insert(LI);
4741 else if (auto *SI = dyn_cast<StoreInst>(I))
4742 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4743 UnsplittableLoads.insert(LI);
4744 continue;
4745 }
4746 assert(P.endOffset() > S.beginOffset() &&
4747 "Empty or backwards partition!");
4748
4749 // Determine if this is a pre-splittable slice.
4750 if (auto *LI = dyn_cast<LoadInst>(I)) {
4751 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4752
4753 // The load must be used exclusively to store into other pointers for
4754 // us to be able to arbitrarily pre-split it. The stores must also be
4755 // simple to avoid changing semantics.
4756 auto IsLoadSimplyStored = [](LoadInst *LI) {
4757 for (User *LU : LI->users()) {
4758 auto *SI = dyn_cast<StoreInst>(LU);
4759 if (!SI || !SI->isSimple())
4760 return false;
4761 }
4762 return true;
4763 };
4764 if (!IsLoadSimplyStored(LI)) {
4765 UnsplittableLoads.insert(LI);
4766 continue;
4767 }
4768
4769 Loads.push_back(LI);
4770 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4771 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4772 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4773 continue;
4774 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4775 if (!StoredLoad || !StoredLoad->isSimple())
4776 continue;
4777 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4778
4779 Stores.push_back(SI);
4780 } else {
4781 // Other uses cannot be pre-split.
4782 continue;
4783 }
4784
4785 // Record the initial split.
4786 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4787 auto &Offsets = SplitOffsetsMap[I];
4788 assert(Offsets.Splits.empty() &&
4789 "Should not have splits the first time we see an instruction!");
4790 Offsets.S = &S;
4791 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4792 }
4793
4794 // Now scan the already split slices, and add a split for any of them which
4795 // we're going to pre-split.
4796 for (Slice *S : P.splitSliceTails()) {
4797 auto SplitOffsetsMapI =
4798 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4799 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4800 continue;
4801 auto &Offsets = SplitOffsetsMapI->second;
4802
4803 assert(Offsets.S == S && "Found a mismatched slice!");
4804 assert(!Offsets.Splits.empty() &&
4805 "Cannot have an empty set of splits on the second partition!");
4806 assert(Offsets.Splits.back() ==
4807 P.beginOffset() - Offsets.S->beginOffset() &&
4808 "Previous split does not end where this one begins!");
4809
4810 // Record each split. The last partition's end isn't needed as the size
4811 // of the slice dictates that.
4812 if (S->endOffset() > P.endOffset())
4813 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4814 }
4815 }
4816
4817 // We may have split loads where some of their stores are split stores. For
4818 // such loads and stores, we can only pre-split them if their splits exactly
4819 // match relative to their starting offset. We have to verify this prior to
4820 // any rewriting.
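// For instance (hypothetical offsets): if a load's slice must be split at the
// relative offsets {4} while the store of that loaded value would have to be
// split at {8}, the two split patterns cannot be stitched together, so the
// store is dropped from the candidate list here and the load is marked
// unsplittable.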
4821 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4822 // Lookup the load we are storing in our map of split
4823 // offsets.
4824 auto *LI = cast<LoadInst>(SI->getValueOperand());
4825 // If it was completely unsplittable, then we're done,
4826 // and this store can't be pre-split.
4827 if (UnsplittableLoads.count(LI))
4828 return true;
4829
4830 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4831 if (LoadOffsetsI == SplitOffsetsMap.end())
4832 return false; // Unrelated loads are definitely safe.
4833 auto &LoadOffsets = LoadOffsetsI->second;
4834
4835 // Now lookup the store's offsets.
4836 auto &StoreOffsets = SplitOffsetsMap[SI];
4837
4838 // If the relative offsets of each split in the load and
4839 // store match exactly, then we can split them and we
4840 // don't need to remove them here.
4841 if (LoadOffsets.Splits == StoreOffsets.Splits)
4842 return false;
4843
4844 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4845 << " " << *LI << "\n"
4846 << " " << *SI << "\n");
4847
4848 // We've found a store and load that we need to split
4849 // with mismatched relative splits. Just give up on them
4850 // and remove both instructions from our list of
4851 // candidates.
4852 UnsplittableLoads.insert(LI);
4853 return true;
4854 });
4855 // Now we have to go *back* through all the stores, because a later store may
4856 // have caused an earlier store's load to become unsplittable and if it is
4857 // unsplittable for the later store, then we can't rely on it being split in
4858 // the earlier store either.
4859 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4860 auto *LI = cast<LoadInst>(SI->getValueOperand());
4861 return UnsplittableLoads.count(LI);
4862 });
4863 // Once we've established all the loads that can't be split for some reason,
4864 // filter any that made it into our list out.
4865 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4866 return UnsplittableLoads.count(LI);
4867 });
4868
4869 // If no loads or stores are left, there is no pre-splitting to be done for
4870 // this alloca.
4871 if (Loads.empty() && Stores.empty())
4872 return false;
4873
4874 // From here on, we can't fail and will be building new accesses, so rig up
4875 // an IR builder.
4876 IRBuilderTy IRB(&AI);
4877
4878 // Collect the new slices which we will merge into the alloca slices.
4879 SmallVector<Slice, 4> NewSlices;
4880
4881 // Track any allocas we end up splitting loads and stores for so we iterate
4882 // on them.
4883 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4884
4885 // At this point, we have collected all of the loads and stores we can
4886 // pre-split, and the specific splits needed for them. We actually do the
4887 // splitting in a specific order so that we can handle the case when one of
4888 // the loads is the value operand to one of the stores.
4889 //
4890 // First, we rewrite all of the split loads, and just accumulate each split
4891 // load in a parallel structure. We also build the slices for them and append
4892 // them to the alloca slices.
4893 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4894 std::vector<LoadInst *> SplitLoads;
4895 const DataLayout &DL = AI.getDataLayout();
4896 for (LoadInst *LI : Loads) {
4897 SplitLoads.clear();
4898
4899 auto &Offsets = SplitOffsetsMap[LI];
4900 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4901 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4902 "Load must have type size equal to store size");
4903 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4904 "Load must be >= slice size");
4905
4906 uint64_t BaseOffset = Offsets.S->beginOffset();
4907 assert(BaseOffset + SliceSize > BaseOffset &&
4908 "Cannot represent alloca access size using 64-bit integers!");
4909
4910 Value *BasePtr = LI->getPointerOperand();
4911 IRB.SetInsertPoint(LI);
4912
4913 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4914
4915 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4916 int Idx = 0, Size = Offsets.Splits.size();
4917 for (;;) {
4918 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4919 auto AS = LI->getPointerAddressSpace();
4920 auto *PartPtrTy = LI->getPointerOperandType();
4921 LoadInst *PLoad = IRB.CreateAlignedLoad(
4922 PartTy,
4923 getAdjustedPtr(IRB, DL, BasePtr,
4924 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4925 PartPtrTy, BasePtr->getName() + "."),
4926 getAdjustedAlignment(LI, PartOffset),
4927 /*IsVolatile*/ false, LI->getName());
4928 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4929 LLVMContext::MD_access_group});
4930
4931 // Append this load onto the list of split loads so we can find it later
4932 // to rewrite the stores.
4933 SplitLoads.push_back(PLoad);
4934
4935 // Now build a new slice for the alloca.
4936 NewSlices.push_back(
4937 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4938 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4939 /*IsSplittable*/ false));
4940 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4941 << ", " << NewSlices.back().endOffset()
4942 << "): " << *PLoad << "\n");
4943
4944 // See if we've handled all the splits.
4945 if (Idx >= Size)
4946 break;
4947
4948 // Setup the next partition.
4949 PartOffset = Offsets.Splits[Idx];
4950 ++Idx;
4951 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4952 }
4953
4954 // Now that we have the split loads, do the slow walk over all uses of the
4955 // load and rewrite them as split stores, or save the split loads to use
4956 // below if the store is going to be split there anyways.
4957 bool DeferredStores = false;
4958 for (User *LU : LI->users()) {
4959 StoreInst *SI = cast<StoreInst>(LU);
4960 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4961 DeferredStores = true;
4962 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4963 << "\n");
4964 continue;
4965 }
4966
4967 Value *StoreBasePtr = SI->getPointerOperand();
4968 IRB.SetInsertPoint(SI);
4969 AAMDNodes AATags = SI->getAAMetadata();
4970
4971 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4972
4973 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4974 LoadInst *PLoad = SplitLoads[Idx];
4975 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4976 auto *PartPtrTy = SI->getPointerOperandType();
4977
4978 auto AS = SI->getPointerAddressSpace();
4979 StoreInst *PStore = IRB.CreateAlignedStore(
4980 PLoad,
4981 getAdjustedPtr(IRB, DL, StoreBasePtr,
4982 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4983 PartPtrTy, StoreBasePtr->getName() + "."),
4984 getAdjustedAlignment(SI, PartOffset),
4985 /*IsVolatile*/ false);
4986 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4987 LLVMContext::MD_access_group,
4988 LLVMContext::MD_DIAssignID});
4989
4990 if (AATags)
4991 PStore->setAAMetadata(
4992 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
4993 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4994 }
4995
4996 // We want to immediately iterate on any allocas impacted by splitting
4997 // this store, and we have to track any promotable alloca (indicated by
4998 // a direct store) as needing to be resplit because it is no longer
4999 // promotable.
5000 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
5001 ResplitPromotableAllocas.insert(OtherAI);
5002 Worklist.insert(OtherAI);
5003 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5004 StoreBasePtr->stripInBoundsOffsets())) {
5005 Worklist.insert(OtherAI);
5006 }
5007
5008 // Mark the original store as dead.
5009 DeadInsts.push_back(SI);
5010 }
5011
5012 // Save the split loads if there are deferred stores among the users.
5013 if (DeferredStores)
5014 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
5015
5016 // Mark the original load as dead and kill the original slice.
5017 DeadInsts.push_back(LI);
5018 Offsets.S->kill();
5019 }
5020
5021 // Second, we rewrite all of the split stores. At this point, we know that
5022 // all loads from this alloca have been split already. For stores of such
5023 // loads, we can simply look up the pre-existing split loads. For stores of
5024 // other loads, we split those loads first and then write split stores of
5025 // them.
5026 for (StoreInst *SI : Stores) {
5027 auto *LI = cast<LoadInst>(SI->getValueOperand());
5028 IntegerType *Ty = cast<IntegerType>(LI->getType());
5029 assert(Ty->getBitWidth() % 8 == 0);
5030 uint64_t StoreSize = Ty->getBitWidth() / 8;
5031 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
5032
5033 auto &Offsets = SplitOffsetsMap[SI];
5034 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
5035 "Slice size should always match load size exactly!");
5036 uint64_t BaseOffset = Offsets.S->beginOffset();
5037 assert(BaseOffset + StoreSize > BaseOffset &&
5038 "Cannot represent alloca access size using 64-bit integers!");
5039
5040 Value *LoadBasePtr = LI->getPointerOperand();
5041 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
5042
5043 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
5044
5045 // Check whether we have an already split load.
5046 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
5047 std::vector<LoadInst *> *SplitLoads = nullptr;
5048 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5049 SplitLoads = &SplitLoadsMapI->second;
5050 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5051 "Too few split loads for the number of splits in the store!");
5052 } else {
5053 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5054 }
5055
5056 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5057 int Idx = 0, Size = Offsets.Splits.size();
5058 for (;;) {
5059 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
5060 auto *LoadPartPtrTy = LI->getPointerOperandType();
5061 auto *StorePartPtrTy = SI->getPointerOperandType();
5062
5063 // Either lookup a split load or create one.
5064 LoadInst *PLoad;
5065 if (SplitLoads) {
5066 PLoad = (*SplitLoads)[Idx];
5067 } else {
5068 IRB.SetInsertPoint(LI);
5069 auto AS = LI->getPointerAddressSpace();
5070 PLoad = IRB.CreateAlignedLoad(
5071 PartTy,
5072 getAdjustedPtr(IRB, DL, LoadBasePtr,
5073 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5074 LoadPartPtrTy, LoadBasePtr->getName() + "."),
5075 getAdjustedAlignment(LI, PartOffset),
5076 /*IsVolatile*/ false, LI->getName());
5077 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5078 LLVMContext::MD_access_group});
5079 }
5080
5081 // And store this partition.
5082 IRB.SetInsertPoint(SI);
5083 auto AS = SI->getPointerAddressSpace();
5084 StoreInst *PStore = IRB.CreateAlignedStore(
5085 PLoad,
5086 getAdjustedPtr(IRB, DL, StoreBasePtr,
5087 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5088 StorePartPtrTy, StoreBasePtr->getName() + "."),
5089 getAdjustedAlignment(SI, PartOffset),
5090 /*IsVolatile*/ false);
5091 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5092 LLVMContext::MD_access_group});
5093
5094 // Now build a new slice for the alloca.
5095 NewSlices.push_back(
5096 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5097 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5098 /*IsSplittable*/ false));
5099 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5100 << ", " << NewSlices.back().endOffset()
5101 << "): " << *PStore << "\n");
5102 if (!SplitLoads) {
5103 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5104 }
5105
5106 // See if we've finished all the splits.
5107 if (Idx >= Size)
5108 break;
5109
5110 // Setup the next partition.
5111 PartOffset = Offsets.Splits[Idx];
5112 ++Idx;
5113 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5114 }
5115
5116 // We want to immediately iterate on any allocas impacted by splitting
5117 // this load, which is only relevant if it isn't a load of this alloca and
5118 // thus we didn't already split the loads above. We also have to keep track
5119 // of any promotable allocas we split loads on as they can no longer be
5120 // promoted.
5121 if (!SplitLoads) {
5122 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5123 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5124 ResplitPromotableAllocas.insert(OtherAI);
5125 Worklist.insert(OtherAI);
5126 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5127 LoadBasePtr->stripInBoundsOffsets())) {
5128 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5129 Worklist.insert(OtherAI);
5130 }
5131 }
5132
5133 // Mark the original store as dead now that we've split it up and kill its
5134 // slice. Note that we leave the original load in place unless this store
5135 // was its only use. It may in turn be split up if it is an alloca load
5136 // for some other alloca, but it may be a normal load. This may introduce
5137 // redundant loads, but where those can be merged the rest of the optimizer
5138 // should handle the merging, and this uncovers SSA splits which is more
5139 // important. In practice, the original loads will almost always be fully
5140 // split and removed eventually, and the splits will be merged by any
5141 // trivial CSE, including instcombine.
5142 if (LI->hasOneUse()) {
5143 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5144 DeadInsts.push_back(LI);
5145 }
5146 DeadInsts.push_back(SI);
5147 Offsets.S->kill();
5148 }
5149
5150 // Remove the killed slices that have been pre-split.
5151 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5152
5153 // Insert our new slices. This will sort and merge them into the sorted
5154 // sequence.
5155 AS.insert(NewSlices);
5156
5157 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5158#ifndef NDEBUG
5159 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5160 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5161#endif
5162
5163 // Finally, don't try to promote any allocas that now require re-splitting.
5164 // They have already been added to the worklist above.
5165 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5166
5167 return true;
5168}
5169
5170/// Rewrite an alloca partition's users.
5171///
5172/// This routine drives both of the rewriting goals of the SROA pass. It tries
5173/// to rewrite uses of an alloca partition to be conducive for SSA value
5174/// promotion. If the partition needs a new, more refined alloca, this will
5175/// build that new alloca, preserving as much type information as possible, and
5176/// rewrite the uses of the old alloca to point at the new one and have the
5177/// appropriate new offsets. It also evaluates how successful the rewrite was
5178/// at enabling promotion and if it was successful queues the alloca to be
5179/// promoted.
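// As a small illustration (hypothetical names): for an alloca such as
//   %x = alloca { i32, float }
// whose two fields are only ever accessed separately, each field becomes its
// own partition and is rewritten to its own alloca (e.g. %x.sroa.0 and a
// sibling for the float half), and each new alloca is queued for promotion if
// its rewritten uses permit it.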
5180AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
5181 Partition &P) {
5182 // Try to compute a friendly type for this partition of the alloca. This
5183 // won't always succeed, in which case we fall back to a legal integer type
5184 // or an i8 array of an appropriate size.
5185 Type *SliceTy = nullptr;
5186 VectorType *SliceVecTy = nullptr;
5187 const DataLayout &DL = AI.getDataLayout();
5188 unsigned VScale = AI.getFunction()->getVScaleValue();
5189
5190 std::pair<Type *, IntegerType *> CommonUseTy =
5191 findCommonType(P.begin(), P.end(), P.endOffset());
5192 // Do all uses operate on the same type?
5193 if (CommonUseTy.first) {
5194 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
5195 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5196 SliceTy = CommonUseTy.first;
5197 SliceVecTy = dyn_cast<VectorType>(SliceTy);
5198 }
5199 }
5200 // If not, can we find an appropriate subtype in the original allocated type?
5201 if (!SliceTy)
5202 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5203 P.beginOffset(), P.size()))
5204 SliceTy = TypePartitionTy;
5205
5206 // If still not, can we use the largest bitwidth integer type used?
5207 if (!SliceTy && CommonUseTy.second)
5208 if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
5209 SliceTy = CommonUseTy.second;
5210 SliceVecTy = dyn_cast<VectorType>(SliceTy);
5211 }
5212 if ((!SliceTy || (SliceTy->isArrayTy() &&
5213 SliceTy->getArrayElementType()->isIntegerTy())) &&
5214 DL.isLegalInteger(P.size() * 8)) {
5215 SliceTy = Type::getIntNTy(*C, P.size() * 8);
5216 }
5217
5218 // If the common use types are not viable for promotion then attempt to find
5219 // another type that is viable.
5220 if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
5221 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5222 P.beginOffset(), P.size())) {
5223 VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
5224 if (TypePartitionVecTy &&
5225 checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
5226 SliceTy = TypePartitionTy;
5227 }
5228
5229 if (!SliceTy)
5230 SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
5231 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
5232
5233 bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
5234
5235 VectorType *VecTy =
5236 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
5237 if (VecTy)
5238 SliceTy = VecTy;
5239
5240 // Check for the case where we're going to rewrite to a new alloca of the
5241 // exact same type as the original, and with the same access offsets. In that
5242 // case, re-use the existing alloca, but still run through the rewriter to
5243 // perform phi and select speculation.
5244 // P.beginOffset() can be non-zero even with the same type in a case with
5245 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5246 AllocaInst *NewAI;
5247 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5248 NewAI = &AI;
5249 // FIXME: We should be able to bail at this point with "nothing changed".
5250 // FIXME: We might want to defer PHI speculation until after here.
5251 // FIXME: return nullptr;
5252 } else {
5253 // Make sure the alignment is compatible with P.beginOffset().
5254 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5255 // If we will get at least this much alignment from the type alone, leave
5256 // the alloca's alignment unconstrained.
5257 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
5258 NewAI = new AllocaInst(
5259 SliceTy, AI.getAddressSpace(), nullptr,
5260 IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
5261 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5262 AI.getIterator());
5263 // Copy the old AI debug location over to the new one.
5264 NewAI->setDebugLoc(AI.getDebugLoc());
5265 ++NumNewAllocas;
5266 }
5267
5268 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5269 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5270
5271 // Track the high watermark on the worklist as it is only relevant for
5272 // promoted allocas. We will reset it to this point if the alloca is not in
5273 // fact scheduled for promotion.
5274 unsigned PPWOldSize = PostPromotionWorklist.size();
5275 unsigned NumUses = 0;
5276 SmallSetVector<PHINode *, 8> PHIUsers;
5277 SmallSetVector<SelectInst *, 8> SelectUsers;
5278
5279 AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
5280 P.endOffset(), IsIntegerPromotable, VecTy,
5281 PHIUsers, SelectUsers);
5282 bool Promotable = true;
5283 // Check whether we can have tree-structured merge.
5284 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5285 NumUses += DeletedValues->size() + 1;
5286 for (Value *V : *DeletedValues)
5287 DeadInsts.push_back(V);
5288 } else {
5289 for (Slice *S : P.splitSliceTails()) {
5290 Promotable &= Rewriter.visit(S);
5291 ++NumUses;
5292 }
5293 for (Slice &S : P) {
5294 Promotable &= Rewriter.visit(&S);
5295 ++NumUses;
5296 }
5297 }
5298
5299 NumAllocaPartitionUses += NumUses;
5300 MaxUsesPerAllocaPartition.updateMax(NumUses);
5301
5302 // Now that we've processed all the slices in the new partition, check if any
5303 // PHIs or Selects would block promotion.
5304 for (PHINode *PHI : PHIUsers)
5305 if (!isSafePHIToSpeculate(*PHI)) {
5306 Promotable = false;
5307 PHIUsers.clear();
5308 SelectUsers.clear();
5309 break;
5310 }
5311
5312 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5313 NewSelectsToRewrite;
5314 NewSelectsToRewrite.reserve(SelectUsers.size());
5315 for (SelectInst *Sel : SelectUsers) {
5316 std::optional<RewriteableMemOps> Ops =
5317 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5318 if (!Ops) {
5319 Promotable = false;
5320 PHIUsers.clear();
5321 SelectUsers.clear();
5322 NewSelectsToRewrite.clear();
5323 break;
5324 }
5325 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5326 }
5327
5328 if (Promotable) {
5329 for (Use *U : AS.getDeadUsesIfPromotable()) {
5330 auto *OldInst = dyn_cast<Instruction>(U->get());
5331 Value::dropDroppableUse(*U);
5332 if (OldInst)
5333 if (isInstructionTriviallyDead(OldInst))
5334 DeadInsts.push_back(OldInst);
5335 }
5336 if (PHIUsers.empty() && SelectUsers.empty()) {
5337 // Promote the alloca.
5338 PromotableAllocas.insert(NewAI);
5339 } else {
5340 // If we have either PHIs or Selects to speculate, add them to those
5341 // worklists and re-queue the new alloca so that we promote it on the
5342 // next iteration.
5343 SpeculatablePHIs.insert_range(PHIUsers);
5344 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5345 NewSelectsToRewrite.size());
5346 for (auto &&KV : llvm::make_range(
5347 std::make_move_iterator(NewSelectsToRewrite.begin()),
5348 std::make_move_iterator(NewSelectsToRewrite.end())))
5349 SelectsToRewrite.insert(std::move(KV));
5350 Worklist.insert(NewAI);
5351 }
5352 } else {
5353 // Drop any post-promotion work items if promotion didn't happen.
5354 while (PostPromotionWorklist.size() > PPWOldSize)
5355 PostPromotionWorklist.pop_back();
5356
5357 // We couldn't promote and we didn't create a new partition, nothing
5358 // happened.
5359 if (NewAI == &AI)
5360 return nullptr;
5361
5362 // If we can't promote the alloca, iterate on it to check for new
5363 // refinements exposed by splitting the current alloca. Don't iterate on an
5364 // alloca which didn't actually change and didn't get promoted.
5365 Worklist.insert(NewAI);
5366 }
5367
5368 return NewAI;
5369}
5370
5371// There isn't a shared interface to get the "address" parts out of a
5372// dbg.declare and dbg.assign, so provide some wrappers.
5373static bool isKillAddress(const DbgVariableRecord *DVR) {
5374 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5375 return DVR->isKillAddress();
5376 return DVR->isKillLocation();
5377}
5378
5379static const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5380 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5381 return DVR->getAddressExpression();
5382 return DVR->getExpression();
5383}
5384
5385/// Create or replace an existing fragment in a DIExpression with \p Frag.
5386/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5387/// operation, add \p BitExtractOffset to the offset part.
5388///
5389/// Returns the new expression, or nullptr if this fails (see details below).
5390///
5391/// This function is similar to DIExpression::createFragmentExpression except
5392/// for 3 important distinctions:
5393/// 1. The new fragment isn't relative to an existing fragment.
5394/// 2. It assumes the computed location is a memory location. This means we
5395/// don't need to perform checks that creating the fragment preserves the
5396/// expression semantics.
5397/// 3. Existing extract_bits are modified independently of fragment changes
5398/// using \p BitExtractOffset. A change to the fragment offset or size
5399/// may affect a bit extract. But a bit extract offset can change
5400/// independently of the fragment dimensions.
5401///
5402/// Returns the new expression, or nullptr if one couldn't be created.
5403/// Ideally this is only used to signal that a bit-extract has become
5404/// zero-sized (and thus the new debug record has no size and can be
5405/// dropped), however, it fails for other reasons too - see the FIXME below.
5406///
5407/// FIXME: To keep the change that introduces this function NFC it bails
5408/// in some situations unnecessarily, e.g. when fragment and bit extract
5409/// sizes differ.
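// A worked example (hypothetical operands): for an expression carrying
// DW_OP_LLVM_extract_bits_zext with offset 16 and size 8, passing
// Frag = {OffsetInBits: 0, SizeInBits: 8} and BitExtractOffset = -16 rewrites
// the bit extract to offset 0; because the expression already contains a bit
// extract, no DW_OP_LLVM_fragment is appended.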
5410static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5411 DIExpression::FragmentInfo Frag,
5412 int64_t BitExtractOffset) {
5413 SmallVector<uint64_t, 8> Ops;
5414 bool HasFragment = false;
5415 bool HasBitExtract = false;
5416
5417 for (auto &Op : Expr->expr_ops()) {
5418 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5419 HasFragment = true;
5420 continue;
5421 }
5422 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5423 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5424 HasBitExtract = true;
5425 int64_t ExtractOffsetInBits = Op.getArg(0);
5426 int64_t ExtractSizeInBits = Op.getArg(1);
5427
5428 // DIExpression::createFragmentExpression doesn't know how to handle
5429 // a fragment that is smaller than the extract. Copy the behaviour
5430 // (bail) to avoid non-NFC changes.
5431 // FIXME: Don't do this.
5432 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5433 return nullptr;
5434
5435 assert(BitExtractOffset <= 0);
5436 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5437
5438 // DIExpression::createFragmentExpression doesn't know what to do
5439 // if the new extract starts "outside" the existing one. Copy the
5440 // behaviour (bail) to avoid non-NFC changes.
5441 // FIXME: Don't do this.
5442 if (AdjustedOffset < 0)
5443 return nullptr;
5444
5445 Ops.push_back(Op.getOp());
5446 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5447 Ops.push_back(ExtractSizeInBits);
5448 continue;
5449 }
5450 Op.appendToVector(Ops);
5451 }
5452
5453 // Unsupported by createFragmentExpression, so don't support it here yet to
5454 // preserve NFC-ness.
5455 if (HasFragment && HasBitExtract)
5456 return nullptr;
5457
5458 if (!HasBitExtract) {
5459 Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5460 Ops.push_back(Frag.OffsetInBits);
5461 Ops.push_back(Frag.SizeInBits);
5462 }
5463 return DIExpression::get(Expr->getContext(), Ops);
5464}
5465
5466/// Insert a new DbgRecord.
5467/// \p Orig Original to copy record type, debug loc and variable from, and
5468/// additionally value and value expression for dbg_assign records.
5469/// \p NewAddr Location's new base address.
5470/// \p NewAddrExpr New expression to apply to address.
5471/// \p BeforeInst Insert position.
5472/// \p NewFragment New fragment (absolute, non-relative).
5473/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5474static void
5475insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5476 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5477 std::optional<DIExpression::FragmentInfo> NewFragment,
5478 int64_t BitExtractAdjustment) {
5479 (void)DIB;
5480
5481 // A dbg_assign puts fragment info in the value expression only. The address
5482 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5483 // new fragment info into NewAddrExpr (as it only has one expression).
5484 DIExpression *NewFragmentExpr =
5485 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5486 if (NewFragment)
5487 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5488 BitExtractAdjustment);
5489 if (!NewFragmentExpr)
5490 return;
5491
5492 if (Orig->isDbgDeclare()) {
5493 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5494 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5495 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5496 BeforeInst->getIterator());
5497 return;
5498 }
5499
5500 if (Orig->isDbgValue()) {
5501 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5502 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5503 // Drop debug information if the expression doesn't start with a
5504 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5505 // describes the address of the alloca rather than the value inside it.
5506 if (!NewFragmentExpr->startsWithDeref())
5507 DVR->setKillAddress();
5508 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5509 BeforeInst->getIterator());
5510 return;
5511 }
5512
5513 // Apply a DIAssignID to the store if it doesn't already have it.
5514 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5515 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5516 DIAssignID::getDistinct(NewAddr->getContext()));
5517 }
5518
5519 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5520 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5521 NewAddrExpr, Orig->getDebugLoc());
5522 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5523 (void)NewAssign;
5524}
5525
5526/// Walks the slices of an alloca and form partitions based on them,
5527/// rewriting each of their uses.
5528bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5529 if (AS.begin() == AS.end())
5530 return false;
5531
5532 unsigned NumPartitions = 0;
5533 bool Changed = false;
5534 const DataLayout &DL = AI.getModule()->getDataLayout();
5535
5536 // First try to pre-split loads and stores.
5537 Changed |= presplitLoadsAndStores(AI, AS);
5538
5539 // Now that we have identified any pre-splitting opportunities,
5540 // mark loads and stores unsplittable except for the following case.
5541 // We leave a slice splittable if all other slices are disjoint or fully
5542 // included in the slice, such as whole-alloca loads and stores.
5543 // If we fail to split these during pre-splitting, we want to force them
5544 // to be rewritten into a partition.
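// For example (hypothetical offsets): with a 12-byte alloca covered by three
// 4-byte stores at offsets 0, 4 and 8, only the interior offsets 1-3, 5-7 and
// 9-11 are cleared, so a splittable whole-alloca load or store over [0,12)
// remains splittable, while a splittable load or store over [2,10) begins and
// ends inside other accesses and is made unsplittable below.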
5545 bool IsSorted = true;
5546
5547 uint64_t AllocaSize =
5548 DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
5549 const uint64_t MaxBitVectorSize = 1024;
5550 if (AllocaSize <= MaxBitVectorSize) {
5551 // If a byte boundary is included in any load or store, a slice starting or
5552 // ending at the boundary is not splittable.
5553 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5554 for (Slice &S : AS)
5555 for (unsigned O = S.beginOffset() + 1;
5556 O < S.endOffset() && O < AllocaSize; O++)
5557 SplittableOffset.reset(O);
5558
5559 for (Slice &S : AS) {
5560 if (!S.isSplittable())
5561 continue;
5562
5563 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5564 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5565 continue;
5566
5567 if (isa<LoadInst>(S.getUse()->getUser()) ||
5568 isa<StoreInst>(S.getUse()->getUser())) {
5569 S.makeUnsplittable();
5570 IsSorted = false;
5571 }
5572 }
5573 } else {
5574 // We only allow whole-alloca splittable loads and stores
5575 // for a large alloca to avoid creating an overly large BitVector.
5576 for (Slice &S : AS) {
5577 if (!S.isSplittable())
5578 continue;
5579
5580 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5581 continue;
5582
5583 if (isa<LoadInst>(S.getUse()->getUser()) ||
5584 isa<StoreInst>(S.getUse()->getUser())) {
5585 S.makeUnsplittable();
5586 IsSorted = false;
5587 }
5588 }
5589 }
5590
5591 if (!IsSorted)
5592 llvm::stable_sort(AS);
5593
5594 /// Describes the allocas introduced by rewritePartition in order to migrate
5595 /// the debug info.
5596 struct Fragment {
5597 AllocaInst *Alloca;
5598 uint64_t Offset;
5599 uint64_t Size;
5600 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5601 : Alloca(AI), Offset(O), Size(S) {}
5602 };
5603 SmallVector<Fragment, 4> Fragments;
5604
5605 // Rewrite each partition.
5606 for (auto &P : AS.partitions()) {
5607 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5608 Changed = true;
5609 if (NewAI != &AI) {
5610 uint64_t SizeOfByte = 8;
5611 uint64_t AllocaSize =
5612 DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
5613 // Don't include any padding.
5614 uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
5615 Fragments.push_back(
5616 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5617 }
5618 }
5619 ++NumPartitions;
5620 }
5621
5622 NumAllocaPartitions += NumPartitions;
5623 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5624
5625 // Migrate debug information from the old alloca to the new alloca(s)
5626 // and the individual partitions.
5627 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5628 // Can't overlap with undef memory.
5629 if (isKillAddress(DbgVariable))
5630 return;
5631
5632 const Value *DbgPtr = DbgVariable->getAddress();
5633 DIExpression::FragmentInfo VarFrag =
5634 DbgVariable->getFragmentOrEntireVariable();
5635 // Get the address expression constant offset if one exists and the ops
5636 // that come after it.
5637 int64_t CurrentExprOffsetInBytes = 0;
5638 SmallVector<uint64_t> PostOffsetOps;
5639 if (!getAddressExpression(DbgVariable)
5640 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5641 return; // Couldn't interpret this DIExpression - drop the var.
5642
5643 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5644 int64_t ExtractOffsetInBits = 0;
5645 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5646 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5647 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5648 ExtractOffsetInBits = Op.getArg(0);
5649 break;
5650 }
5651 }
5652
5653 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5654 for (auto Fragment : Fragments) {
5655 int64_t OffsetFromLocationInBits;
5656 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5657 // Find the variable fragment that the new alloca slice covers.
5658 // Drop debug info for this variable fragment if we can't compute an
5659 // intersect between it and the alloca slice.
5660 if (!at::calculateFragmentIntersect(
5661 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5662 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5663 NewDbgFragment, OffsetFromLocationInBits))
5664 continue; // Do not migrate this fragment to this slice.
5665
5666 // Zero sized fragment indicates there's no intersect between the variable
5667 // fragment and the alloca slice. Skip this slice for this variable
5668 // fragment.
5669 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5670 continue; // Do not migrate this fragment to this slice.
5671
5672 // No fragment indicates DbgVariable's variable or fragment exactly
5673 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5674 if (!NewDbgFragment)
5675 NewDbgFragment = DbgVariable->getFragment();
5676
5677 // Reduce the new expression offset by the bit-extract offset since
5678 // we'll be keeping that.
5679 int64_t OffestFromNewAllocaInBits =
5680 OffsetFromLocationInBits - ExtractOffsetInBits;
5681 // We need to adjust an existing bit extract if the offset expression
5682 // can't eat the slack (i.e., if the new offset would be negative).
5683 int64_t BitExtractOffset =
5684 std::min<int64_t>(0, OffestFromNewAllocaInBits);
5685 // The magnitude of a negative value indicates the number of bits into
5686 // the existing variable fragment that the memory region begins. The new
5687 // variable fragment already excludes those bits - the new DbgPtr offset
5688 // only needs to be applied if it's positive.
5689 OffestFromNewAllocaInBits =
5690 std::max(int64_t(0), OffestFromNewAllocaInBits);
5691
5692 // Rebuild the expression:
5693 // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5694 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5695 // address expression but the value expression instead.
5696 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5697 if (OffestFromNewAllocaInBits > 0) {
5698 int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5699 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5700 }
5701
5702 // Remove any existing intrinsics on the new alloca describing
5703 // the variable fragment.
5704 auto RemoveOne = [DbgVariable](auto *OldDII) {
5705 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5706 return LHS->getVariable() == RHS->getVariable() &&
5707 LHS->getDebugLoc()->getInlinedAt() ==
5708 RHS->getDebugLoc()->getInlinedAt();
5709 };
5710 if (SameVariableFragment(OldDII, DbgVariable))
5711 OldDII->eraseFromParent();
5712 };
5713 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5714 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5715 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5716 NewDbgFragment, BitExtractOffset);
5717 }
5718 };
5719
5720 // Migrate debug information from the old alloca to the new alloca(s)
5721 // and the individual partitions.
5722 for_each(findDVRDeclares(&AI), MigrateOne);
5723 for_each(findDVRValues(&AI), MigrateOne);
5724 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5725
5726 return Changed;
5727}
5728
5729/// Clobber a use with poison, deleting the used value if it becomes dead.
5730void SROA::clobberUse(Use &U) {
5731 Value *OldV = U;
5732 // Replace the use with a poison value.
5733 U = PoisonValue::get(OldV->getType());
5734
5735 // Check for this making an instruction dead. We have to garbage collect
5736 // all the dead instructions to ensure the uses of any alloca end up being
5737 // minimal.
5738 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5739 if (isInstructionTriviallyDead(OldI)) {
5740 DeadInsts.push_back(OldI);
5741 }
5742}
5743
5744/// A basic LoadAndStorePromoter that does not remove store nodes.
5745class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5746public:
5747 BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5748 Type *ZeroType)
5749 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5750 bool shouldDelete(Instruction *I) const override {
5751 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5752 }
5753
5754 Value *getValueToUseForAlloca(Instruction *I) const override {
5755 return UndefValue::get(ZeroType);
5756 }
5757
5758private:
5759 Type *ZeroType;
5760};
5761
5762bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5763 // Look through each "partition", looking for slices with the same start/end
5764 // that do not overlap with any before them. The slices are sorted by
5765 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5766 // sophisticated algorithm that takes splittable slices into account.
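// A sketch of the intended effect (hypothetical IR): for a read-only-escaped
// alloca whose direct accesses are all whole-alloca i32 loads and stores, e.g.
//   store i32 %a, ptr %buf
//   call void @observe(ptr readonly %buf)
//   %r = load i32, ptr %buf
// the promoter run below forwards the stored %a to the later load while
// leaving the stores and the alloca itself in place for the read-only callee
// to observe.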
5767 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5768 bool AllSameAndValid = true;
5769 Type *PartitionType = nullptr;
5770 SmallVector<Instruction *, 4> Insts;
5771 uint64_t BeginOffset = 0;
5772 uint64_t EndOffset = 0;
5773
5774 auto Flush = [&]() {
5775 if (AllSameAndValid && !Insts.empty()) {
5776 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5777 << EndOffset << ")\n");
5778 SmallVector<PHINode *, 4> NewPHIs;
5779 SSAUpdater SSA(&NewPHIs);
5780 Insts.push_back(&AI);
5781 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5782 Promoter.run(Insts);
5783 }
5784 AllSameAndValid = true;
5785 PartitionType = nullptr;
5786 Insts.clear();
5787 };
5788
5789 for (Slice &S : AS) {
5790 auto *User = cast<Instruction>(S.getUse()->getUser());
5791 if (isAssumeLikeIntrinsic(User)) {
5792 LLVM_DEBUG({
5793 dbgs() << "Ignoring slice: ";
5794 AS.print(dbgs(), &S);
5795 });
5796 continue;
5797 }
5798 if (S.beginOffset() >= EndOffset) {
5799 Flush();
5800 BeginOffset = S.beginOffset();
5801 EndOffset = S.endOffset();
5802 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5803 if (AllSameAndValid) {
5804 LLVM_DEBUG({
5805 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5806 << EndOffset << ")";
5807 AS.print(dbgs(), &S);
5808 });
5809 AllSameAndValid = false;
5810 }
5811 EndOffset = std::max(EndOffset, S.endOffset());
5812 continue;
5813 }
5814
5815 if (auto *LI = dyn_cast<LoadInst>(User)) {
5816 Type *UserTy = LI->getType();
5817 // LoadAndStorePromoter requires all the types to be the same.
5818 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5819 AllSameAndValid = false;
5820 PartitionType = UserTy;
5821 Insts.push_back(User);
5822 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5823 Type *UserTy = SI->getValueOperand()->getType();
5824 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5825 AllSameAndValid = false;
5826 PartitionType = UserTy;
5827 Insts.push_back(User);
5828 } else {
5829 AllSameAndValid = false;
5830 }
5831 }
5832
5833 Flush();
5834 return true;
5835}
5836
5837/// Analyze an alloca for SROA.
5838///
5839/// This analyzes the alloca to ensure we can reason about it, builds
5840/// the slices of the alloca, and then hands it off to be split and
5841/// rewritten as needed.
5842std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5843SROA::runOnAlloca(AllocaInst &AI) {
5844 bool Changed = false;
5845 bool CFGChanged = false;
5846
5847 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5848 ++NumAllocasAnalyzed;
5849
5850 // Special case dead allocas, as they're trivial.
5851 if (AI.use_empty()) {
5852 AI.eraseFromParent();
5853 Changed = true;
5854 return {Changed, CFGChanged};
5855 }
5856 const DataLayout &DL = AI.getDataLayout();
5857
5858 // Skip alloca forms that this analysis can't handle.
5859 auto *AT = AI.getAllocatedType();
5860 TypeSize Size = DL.getTypeAllocSize(AT);
5861 if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
5862 Size.getFixedValue() == 0)
5863 return {Changed, CFGChanged};
5864
5865 // First, split any FCA loads and stores touching this alloca to promote
5866 // better splitting and promotion opportunities.
5867 IRBuilderTy IRB(&AI);
5868 AggLoadStoreRewriter AggRewriter(DL, IRB);
5869 Changed |= AggRewriter.rewrite(AI);
5870
5871 // Build the slices using a recursive instruction-visiting builder.
5872 AllocaSlices AS(DL, AI);
5873 LLVM_DEBUG(AS.print(dbgs()));
5874 if (AS.isEscaped())
5875 return {Changed, CFGChanged};
5876
5877 if (AS.isEscapedReadOnly()) {
5878 Changed |= propagateStoredValuesToLoads(AI, AS);
5879 return {Changed, CFGChanged};
5880 }
5881
5882 // Delete all the dead users of this alloca before splitting and rewriting it.
5883 for (Instruction *DeadUser : AS.getDeadUsers()) {
5884 // Free up everything used by this instruction.
5885 for (Use &DeadOp : DeadUser->operands())
5886 clobberUse(DeadOp);
5887
5888 // Now replace the uses of this instruction.
5889 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5890
5891 // And mark it for deletion.
5892 DeadInsts.push_back(DeadUser);
5893 Changed = true;
5894 }
5895 for (Use *DeadOp : AS.getDeadOperands()) {
5896 clobberUse(*DeadOp);
5897 Changed = true;
5898 }
5899
5900 // No slices to split. Leave the dead alloca for a later pass to clean up.
5901 if (AS.begin() == AS.end())
5902 return {Changed, CFGChanged};
5903
5904 Changed |= splitAlloca(AI, AS);
5905
5906 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5907 while (!SpeculatablePHIs.empty())
5908 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5909
5910 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5911 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5912 while (!RemainingSelectsToRewrite.empty()) {
5913 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5914 CFGChanged |=
5915 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5916 }
5917
5918 return {Changed, CFGChanged};
5919}
5920
5921/// Delete the dead instructions accumulated in this run.
5922///
5923/// Recursively deletes the dead instructions we've accumulated. This is done
5924/// at the very end to maximize locality of the recursive delete and to
5925/// minimize the problems of invalidated instruction pointers as such pointers
5926/// are used heavily in the intermediate stages of the algorithm.
5927///
5928/// We also record the alloca instructions deleted here so that they aren't
5929/// subsequently handed to mem2reg to promote.
5930bool SROA::deleteDeadInstructions(
5931 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5932 bool Changed = false;
5933 while (!DeadInsts.empty()) {
5934 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5935 if (!I)
5936 continue;
5937 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5938
5939 // If the instruction is an alloca, find the possible dbg.declare connected
5940 // to it, and remove it too. We must do this before calling RAUW or we will
5941 // not be able to find it.
5942 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5943 DeletedAllocas.insert(AI);
5944 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5945 OldDII->eraseFromParent();
5946 }
5947
5948 at::deleteAssignmentMarkers(I);
5947
5949 I->replaceAllUsesWith(UndefValue::get(I->getType()));
5950
5951 for (Use &Operand : I->operands())
5952 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5953 // Zero out the operand and see if it becomes trivially dead.
5954 Operand = nullptr;
5955 if (isInstructionTriviallyDead(U))
5956 DeadInsts.push_back(U);
5957 }
5958
5959 ++NumDeleted;
5960 I->eraseFromParent();
5961 Changed = true;
5962 }
5963 return Changed;
5964}
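// --- Illustrative sketch (not part of SROA.cpp) -----------------------------
// The same deferred, worklist-style deletion is available as a generic
// utility in llvm/Transforms/Utils/Local.h for passes that do not need
// SROA's extra alloca bookkeeping. `eraseIfTriviallyDead` is a made-up
// helper name used only for illustration.
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

static void eraseIfTriviallyDead(llvm::Instruction *I) {
  // Deletes I if its result is unused and it has no side effects, then
  // recursively deletes any operands that become trivially dead in turn.
  llvm::RecursivelyDeleteTriviallyDeadInstructions(I);
}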
5965/// Promote the allocas, using the best available technique.
5966///
5967/// This attempts to promote whatever allocas have been identified as viable in
5968/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5969/// This function returns whether any promotion occurred.
5970bool SROA::promoteAllocas() {
5971 if (PromotableAllocas.empty())
5972 return false;
5973
5974 if (SROASkipMem2Reg) {
5975 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5976 } else {
5977 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5978 NumPromoted += PromotableAllocas.size();
5979 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
5980 }
5981
5982 PromotableAllocas.clear();
5983 return true;
5984}
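// --- Illustrative sketch (not part of SROA.cpp) -----------------------------
// The mem2reg utility used above can also be driven directly: collect the
// promotable entry-block allocas of a function and rewrite them into SSA
// values. `promoteEntryBlockAllocas` is a made-up helper name used only for
// illustration.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"

static void promoteEntryBlockAllocas(llvm::Function &F,
                                     llvm::AssumptionCache &AC) {
  llvm::DominatorTree DT(F);
  llvm::SmallVector<llvm::AllocaInst *, 8> Allocas;
  for (llvm::Instruction &I : F.getEntryBlock())
    if (auto *AI = llvm::dyn_cast<llvm::AllocaInst>(&I))
      if (llvm::isAllocaPromotable(AI))
        Allocas.push_back(AI);
  if (!Allocas.empty())
    llvm::PromoteMemToReg(Allocas, DT, &AC);
}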
5985
5986std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
5987 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
5988
5989 const DataLayout &DL = F.getDataLayout();
5990 BasicBlock &EntryBB = F.getEntryBlock();
5991 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
5992 I != E; ++I) {
5993 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5994 if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
5995 isAllocaPromotable(AI))
5996 PromotableAllocas.insert(AI);
5997 else
5998 Worklist.insert(AI);
5999 }
6000 }
6001
6002 bool Changed = false;
6003 bool CFGChanged = false;
6004 // A set of deleted alloca instruction pointers which should be removed from
6005 // the list of promotable allocas.
6006 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
6007
6008 do {
6009 while (!Worklist.empty()) {
6010 auto [IterationChanged, IterationCFGChanged] =
6011 runOnAlloca(*Worklist.pop_back_val());
6012 Changed |= IterationChanged;
6013 CFGChanged |= IterationCFGChanged;
6014
6015 Changed |= deleteDeadInstructions(DeletedAllocas);
6016
6017 // Remove the deleted allocas from various lists so that we don't try to
6018 // continue processing them.
6019 if (!DeletedAllocas.empty()) {
6020 Worklist.set_subtract(DeletedAllocas);
6021 PostPromotionWorklist.set_subtract(DeletedAllocas);
6022 PromotableAllocas.set_subtract(DeletedAllocas);
6023 DeletedAllocas.clear();
6024 }
6025 }
6026
6027 Changed |= promoteAllocas();
6028
6029 Worklist = PostPromotionWorklist;
6030 PostPromotionWorklist.clear();
6031 } while (!Worklist.empty());
6032
6033 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6034 assert((!CFGChanged || !PreserveCFG) &&
6035 "Should not have modified the CFG when told to preserve it.");
6036
6037 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
6038 for (auto &BB : F) {
6039 RemoveRedundantDbgInstrs(&BB);
6040 }
6041 }
6042
6043 return {Changed, CFGChanged};
6044}
6045
6046PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6047 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
6048 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
6049 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6050 auto [Changed, CFGChanged] =
6051 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6052 if (!Changed)
6053 return PreservedAnalyses::all();
6054 PreservedAnalyses PA;
6055 if (!CFGChanged)
6056 PA.preserveSet<CFGAnalyses>();
6057 PA.preserve<DominatorTreeAnalysis>();
6058 return PA;
6059}
6060
6061void SROAPass::printPipeline(
6062 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6063 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6064 OS, MapClassName2PassName);
6065 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6066 : "<modify-cfg>");
6067}
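// --- Illustrative sketch (not part of SROA.cpp) -----------------------------
// Once the class name is mapped to its pipeline name, the element printed
// above reads "sroa<preserve-cfg>" or "sroa<modify-cfg>", which is exactly
// what the pass builder's pipeline parser (and `opt -passes=...`) accepts.
// `buildSROAPipeline` is a made-up helper name used only for illustration.
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"

static bool buildSROAPipeline(llvm::FunctionPassManager &FPM) {
  llvm::PassBuilder PB;
  if (llvm::Error Err = PB.parsePassPipeline(FPM, "sroa<preserve-cfg>")) {
    llvm::consumeError(std::move(Err)); // Real code should report the error.
    return false;
  }
  return true;
}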
6068
6069SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6070
6071namespace {
6072
6073/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6074class SROALegacyPass : public FunctionPass {
6075 SROAOptions PreserveCFG;
6076
6077public:
6078 static char ID;
6079
6080 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6081 : FunctionPass(ID), PreserveCFG(PreserveCFG) {
6082 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6083 }
6084
6085 bool runOnFunction(Function &F) override {
6086 if (skipFunction(F))
6087 return false;
6088
6089 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6090 AssumptionCache &AC =
6091 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6092 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6093 auto [Changed, _] =
6094 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6095 return Changed;
6096 }
6097
6098 void getAnalysisUsage(AnalysisUsage &AU) const override {
6099 AU.addRequired<AssumptionCacheTracker>();
6100 AU.addRequired<DominatorTreeWrapperPass>();
6101 AU.addPreserved<GlobalsAAWrapperPass>();
6102 AU.addPreserved<DominatorTreeWrapperPass>();
6103 }
6104
6105 StringRef getPassName() const override { return "SROA"; }
6106};
6107
6108} // end anonymous namespace
6109
6110char SROALegacyPass::ID = 0;
6111
6112FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6113 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6114 : SROAOptions::ModifyCFG);
6115}
6116
6117INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6118 "Scalar Replacement Of Aggregates", false, false)
6121INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6122 false, false)