//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This transformation implements the well-known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
///
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <queue>
#include <string>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "sroa"

STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumLoadsPredicated,
          "Number of loads rewritten into predicated loads to allow promotion");
STATISTIC(
    NumStoresPredicated,
    "Number of stores rewritten into predicated stores to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");

namespace llvm {
/// Disable running mem2reg during SROA in order to test or debug SROA.
static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
                                     cl::Hidden);
} // namespace llvm

namespace {

class AllocaSliceRewriter;
class AllocaSlices;
class Partition;

class SelectHandSpeculativity {
  unsigned char Storage = 0; // None are speculatable by default.
  using TrueVal = Bitfield::Element<bool, 0, 1>;  // Bit 0.
  using FalseVal = Bitfield::Element<bool, 1, 1>; // Bit 1.

public:
  SelectHandSpeculativity() = default;
  SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
  bool isSpeculatable(bool isTrueVal) const;
  bool areAllSpeculatable() const;
  bool areAnySpeculatable() const;
  bool areNoneSpeculatable() const;
  // For interop as int half of PointerIntPair.
  explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
  explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
};
static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
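
// For illustration only (not part of the original source): a sketch of how
// the Bitfield elements above are typically driven, assuming the real
// definitions of these accessors live elsewhere in this file:
//
//   SelectHandSpeculativity &
//   SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
//     if (isTrueVal)
//       Bitfield::set<TrueVal>(Storage, true);
//     else
//       Bitfield::set<FalseVal>(Storage, true);
//     return *this;
//   }
//
//   bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
//     return isTrueVal ? Bitfield::get<TrueVal>(Storage)
//                      : Bitfield::get<FalseVal>(Storage);
//   }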

using PossiblySpeculatableLoad =
    PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
using UnspeculatableStore = StoreInst *;
using RewriteableMemOp =
    std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;

/// An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
///    are used to try to split them into smaller allocations, ideally of
///    a single scalar data type. It will split up memcpy and memset accesses
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion. This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them
///    to this form. By doing so, it will enable promotion of vector aggregates
///    to SSA vector values.
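///
/// For illustration only (not from the original source), a hypothetical
/// example of steps 1 and 2 on a two-field aggregate:
///   %pair = alloca { i32, i32 }
///   %a = getelementptr inbounds { i32, i32 }, ptr %pair, i32 0, i32 0
///   store i32 1, ptr %a
///   %b = getelementptr inbounds { i32, i32 }, ptr %pair, i32 0, i32 1
///   store i32 2, ptr %b
///   %v = load i32, ptr %a
/// Step 1 splits %pair into two independent i32 allocas; promotion then
/// replaces %v with the SSA value `i32 1`, eliminating the memory traffic.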
class SROA {
  LLVMContext *const C;
  DomTreeUpdater *const DTU;
  AssumptionCache *const AC;
  const bool PreserveCFG;

  /// Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if
  /// not already present to ensure it is re-visited.
  SmallSetVector<AllocaInst *, 16> Worklist;

  /// A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
  SmallVector<WeakVH, 8> DeadInsts;

  /// Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;

  /// A collection of alloca instructions we can directly promote.
  SetVector<AllocaInst *, SmallVector<AllocaInst *>,
            SmallPtrSet<AllocaInst *, 16>, 16>
      PromotableAllocas;

  /// A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SmallSetVector<PHINode *, 8> SpeculatablePHIs;

  /// A worklist of select instructions to rewrite prior to promoting
  /// allocas.
  SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;

  /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the result,
  /// allowing the load of the alloca to be promoted.
  /// From this:
  ///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
  ///   %V = load <type>, ptr %P2
  /// to:
  ///   %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
  ///   %V2 = load <type>, ptr %Other
  ///   %V = select i1 %cond, <type> %V1, <type> %V2
  ///
  /// We can do this to a select if its only uses are loads
  /// and if either the operand to the select can be loaded unconditionally,
  /// or if we are allowed to perform CFG modifications.
  /// If we find an intervening bitcast with a single use of the load,
  /// we allow the promotion.
  static std::optional<RewriteableMemOps>
  isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
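
  // For illustration only (not from the original source): when one side of
  // the select cannot be speculatively loaded, a hypothetical CFG-modifying
  // rewrite predicates the load instead of speculating it, turning
  //   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
  //   %V = load i32, ptr %P2
  // into roughly
  //   br i1 %cond, label %then, label %else
  // then:
  //   %V1 = load i32, ptr %Alloca    ; will be mem2reg'd
  //   br label %merge
  // else:
  //   %V2 = load i32, ptr %Other
  //   br label %merge
  // merge:
  //   %V = phi i32 [ %V1, %then ], [ %V2, %else ]
  // (block names here are purely illustrative).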

public:
  SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
       SROAOptions PreserveCFG_)
      : C(C), DTU(DTU), AC(AC),
        PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}

  /// Main run method used by both the SROAPass and by the legacy pass.
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);

private:
  friend class AllocaSliceRewriter;

  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  std::pair<AllocaInst *, uint64_t>
  rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas();
};

} // end anonymous namespace

/// Calculate the fragment of a variable to use when slicing a store
/// based on the slice dimensions, existing fragment, and base storage
/// fragment.
/// Results:
///   UseFrag - Use Target as the new fragment.
///   UseNoFrag - The new slice already covers the whole variable.
///   Skip - The new alloca slice doesn't include this variable.
/// FIXME: Can we use calculateFragmentIntersect instead?
namespace {
enum FragCalcResult { UseFrag, UseNoFrag, Skip };
}
static FragCalcResult
calculateFragment(DILocalVariable *Variable,
                  uint64_t NewStorageSliceOffsetInBits,
                  uint64_t NewStorageSliceSizeInBits,
                  std::optional<DIExpression::FragmentInfo> StorageFragment,
                  std::optional<DIExpression::FragmentInfo> CurrentFragment,
                  DIExpression::FragmentInfo &Target) {
  // If the base storage describes part of the variable, apply the offset and
  // the size constraint.
  if (StorageFragment) {
    Target.SizeInBits =
        std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
    Target.OffsetInBits =
        NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
  } else {
    Target.SizeInBits = NewStorageSliceSizeInBits;
    Target.OffsetInBits = NewStorageSliceOffsetInBits;
  }

  // If this slice extracts the entirety of an independent variable from a
  // larger alloca, do not produce a fragment expression, as the variable is
  // not fragmented.
  if (!CurrentFragment) {
    if (auto Size = Variable->getSizeInBits()) {
      // Treat the current fragment as covering the whole variable.
      CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
      if (Target == CurrentFragment)
        return UseNoFrag;
    }
  }

  // No additional work to do if there isn't a fragment already, or there is
  // but it already exactly describes the new assignment.
  if (!CurrentFragment || *CurrentFragment == Target)
    return UseFrag;

  // Reject the target fragment if it doesn't fit wholly within the current
  // fragment. TODO: We could instead chop up the target to fit in the case of
  // a partial overlap.
  if (Target.startInBits() < CurrentFragment->startInBits() ||
      Target.endInBits() > CurrentFragment->endInBits())
    return Skip;

  // Target fits within the current fragment, return it.
  return UseFrag;
}
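
// For illustration only (not from the original source): given a 64-bit
// variable whose base storage fragment is (SizeInBits=64, OffsetInBits=0),
// a 32-bit store slice at bit offset 32 produces Target = (SizeInBits=32,
// OffsetInBits=32). With no fragment on the existing expression this returns
// UseFrag, whereas a slice covering all 64 bits matches the whole variable
// and returns UseNoFrag.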

static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
  return DebugVariable(DVR->getVariable(), std::nullopt,
                       DVR->getDebugLoc().getInlinedAt());
}

/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
/// value component is copied from the old dbg.assign to the new.
/// \param OldAlloca             Alloca for the variable before splitting.
/// \param IsSplit               True if the store (not necessarily alloca)
///                              is being split.
/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
/// \param SliceSizeInBits       New number of bits being written to.
/// \param OldInst               Instruction that is being split.
/// \param Inst                  New instruction performing this part of the
///                              split store.
/// \param Dest                  Store destination.
/// \param Value                 Stored value.
/// \param DL                    Datalayout.
static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
                             uint64_t OldAllocaOffsetInBits,
                             uint64_t SliceSizeInBits, Instruction *OldInst,
                             Instruction *Inst, Value *Dest, Value *Value,
                             const DataLayout &DL) {
  // If we want allocas to be migrated using this helper then we need to ensure
  // that the BaseFragments map code still works. A simple solution would be
  // to choose to always clone alloca dbg_assigns (rather than sometimes
  // "stealing" them).
  assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");

  auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
  // Nothing to do if OldInst has no linked dbg.assign intrinsics.
  if (DVRAssignMarkerRange.empty())
    return;

  LLVM_DEBUG(dbgs() << "  migrateDebugInfo\n");
  LLVM_DEBUG(dbgs() << "    OldAlloca: " << *OldAlloca << "\n");
  LLVM_DEBUG(dbgs() << "    IsSplit: " << IsSplit << "\n");
  LLVM_DEBUG(dbgs() << "    OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
                    << "\n");
  LLVM_DEBUG(dbgs() << "    SliceSizeInBits: " << SliceSizeInBits << "\n");
  LLVM_DEBUG(dbgs() << "    OldInst: " << *OldInst << "\n");
  LLVM_DEBUG(dbgs() << "    Inst: " << *Inst << "\n");
  LLVM_DEBUG(dbgs() << "    Dest: " << *Dest << "\n");
  if (Value)
    LLVM_DEBUG(dbgs() << "    Value: " << *Value << "\n");

  /// Map of aggregate variables to their fragment associated with OldAlloca.
  SmallDenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
      BaseFragments;
  for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
    BaseFragments[getAggregateVariable(DVR)] =
        DVR->getExpression()->getFragmentInfo();

  // The new inst needs a DIAssignID unique metadata tag (if OldInst has
  // one). It shouldn't already have one: assert this assumption.
  assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
  DIAssignID *NewID = nullptr;
  auto &Ctx = Inst->getContext();
  DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
  assert(OldAlloca->isStaticAlloca());

  auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
    LLVM_DEBUG(dbgs() << "      existing dbg.assign is: " << *DbgAssign
                      << "\n");
    auto *Expr = DbgAssign->getExpression();
    bool SetKillLocation = false;

    if (IsSplit) {
      std::optional<DIExpression::FragmentInfo> BaseFragment;
      {
        auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
        if (R == BaseFragments.end())
          return;
        BaseFragment = R->second;
      }
      std::optional<DIExpression::FragmentInfo> CurrentFragment =
          Expr->getFragmentInfo();
      DIExpression::FragmentInfo NewFragment;
      FragCalcResult Result = calculateFragment(
          DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
          BaseFragment, CurrentFragment, NewFragment);

      if (Result == Skip)
        return;
      if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
        if (CurrentFragment) {
          // Rewrite NewFragment to be relative to the existing one (this is
          // what createFragmentExpression wants). CalculateFragment has
          // already resolved the size for us. FIXME: Should it return the
          // relative fragment too?
          NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
        }
        // Add the new fragment info to the existing expression if possible.
        if (auto E = DIExpression::createFragmentExpression(
                Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
          Expr = *E;
        } else {
          // Otherwise, add the new fragment info to an empty expression and
          // discard the value component of this dbg.assign as the value cannot
          // be computed with the new fragment.
          Expr = *DIExpression::createFragmentExpression(
              DIExpression::get(Expr->getContext(), {}),
              NewFragment.OffsetInBits, NewFragment.SizeInBits);
          SetKillLocation = true;
        }
      }
    }

    // If we haven't created a DIAssignID yet, do that now and attach it to
    // Inst.
    if (!NewID) {
      NewID = DIAssignID::getDistinct(Ctx);
      Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
    }

    DbgVariableRecord *NewAssign;
    if (IsSplit) {
      ::Value *NewValue = Value ? Value : DbgAssign->getValue();
      NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
          DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
                              Dest, DIExpression::get(Expr->getContext(), {}),
                              DbgAssign->getDebugLoc())));
    } else {
      // The store is not split, simply steal the existing dbg_assign.
      NewAssign = DbgAssign;
      NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
      NewAssign->setAddress(Dest);
      if (Value)
        NewAssign->replaceVariableLocationOp(0u, Value);
      assert(Expr == NewAssign->getExpression());
    }

    // If we've updated the value but the original dbg.assign has an arglist
    // then kill it now - we can't use the requested new value.
    // We can't replace the DIArgList with the new value as it'd leave
    // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
    // an arglist). And we can't keep the DIArgList in case the linked store
    // is being split - in which case the DIArgList + expression may no longer
    // be computing the correct value.
    // This should be a very rare situation as it requires the value being
    // stored to differ from the dbg.assign (i.e., the value has been
    // represented differently in the debug intrinsic for some reason).
    SetKillLocation |=
        Value && (DbgAssign->hasArgList() ||
                  !DbgAssign->getExpression()->isSingleLocationExpression());
    if (SetKillLocation)
      NewAssign->setKillLocation();

    // We could use more precision here at the cost of some additional (code)
    // complexity - if the original dbg.assign was adjacent to its store, we
    // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
    // what we get now:
    //   split store !1
    //   split store !2
    //   dbg.assign !1
    //   dbg.assign !2
    // This (current behavior) results in debug assignments being
    // noted as slightly offset (in code) from the store. In practice this
    // should have little effect on the debugging experience due to the fact
    // that all the split stores should get the same line number.
    if (NewAssign != DbgAssign) {
      NewAssign->moveBefore(DbgAssign->getIterator());
      NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
    }
    LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
  };

  for_each(DVRAssignMarkerRange, MigrateDbgAssign);
}

namespace {

/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
  std::string Prefix;

  Twine getNameWithPrefix(const Twine &Name) const {
    return Name.isTriviallyEmpty() ? Name : Prefix + Name;
  }

public:
  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }

  void InsertHelper(Instruction *I, const Twine &Name,
                    BasicBlock::iterator InsertPt) const override {
    IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
                                           InsertPt);
  }
};

/// Provide a type for IRBuilder that drops names in release builds.
using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;

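// For illustration only (not from the original source): a rewriter using this
// builder type might set a prefix derived from a (hypothetical) new alloca so
// that, in Assert builds, every value it creates carries a recognizable name:
//   IRBuilderTy IRB(&OldInst);
//   IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + ".");
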
/// A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
class Slice {
  /// The beginning offset of the range.
  uint64_t BeginOffset = 0;

  /// The ending offset, not included in the range.
  uint64_t EndOffset = 0;

  /// Storage for both the use of this slice and whether it can be
  /// split.
  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;

public:
  Slice() = default;

  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
      : BeginOffset(BeginOffset), EndOffset(EndOffset),
        UseAndIsSplittable(U, IsSplittable) {}

  uint64_t beginOffset() const { return BeginOffset; }
  uint64_t endOffset() const { return EndOffset; }

  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }

  Use *getUse() const { return UseAndIsSplittable.getPointer(); }

  bool isDead() const { return getUse() == nullptr; }
  void kill() { UseAndIsSplittable.setPointer(nullptr); }

  /// Support for ordering ranges.
  ///
  /// This provides an ordering over ranges such that start offsets are
  /// always increasing, and within equal start offsets, the end offsets are
  /// decreasing. Thus the spanning range comes first in a cluster with the
  /// same start position.
  bool operator<(const Slice &RHS) const {
    if (beginOffset() < RHS.beginOffset())
      return true;
    if (beginOffset() > RHS.beginOffset())
      return false;
    if (isSplittable() != RHS.isSplittable())
      return !isSplittable();
    if (endOffset() > RHS.endOffset())
      return true;
    return false;
  }
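
  // For illustration only (not from the original source): under this
  // ordering, the slices [0,8) unsplittable, [0,8) splittable, [0,4)
  // splittable, and [4,8) splittable sort exactly in that sequence: ties on
  // the start offset put unsplittable slices first, and wider (spanning)
  // slices come before narrower ones.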

  /// Support comparison with a single offset to allow binary searches.
  [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
    return LHS.beginOffset() < RHSOffset;
  }
  [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
    return LHSOffset < RHS.beginOffset();
  }

  bool operator==(const Slice &RHS) const {
    return isSplittable() == RHS.isSplittable() &&
           beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
  }
  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
};

/// Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
/// for the slices used and we reflect that in this structure. The uses are
/// stored, sorted by increasing beginning offset and with unsplittable slices
/// starting at a particular offset before splittable slices.
class AllocaSlices {
public:
  /// Construct the slices of a particular alloca.
  AllocaSlices(const DataLayout &DL, AllocaInst &AI);

  /// Test whether a pointer to the allocation escapes our analysis.
  ///
  /// If this is true, the slices are never fully built and should be
  /// ignored.
  bool isEscaped() const { return PointerEscapingInstr; }
  bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }

  /// Support for iterating over the slices.
  /// @{
  using iterator = SmallVectorImpl<Slice>::iterator;
  using range = iterator_range<iterator>;

  iterator begin() { return Slices.begin(); }
  iterator end() { return Slices.end(); }

  using const_iterator = SmallVectorImpl<Slice>::const_iterator;
  using const_range = iterator_range<const_iterator>;

  const_iterator begin() const { return Slices.begin(); }
  const_iterator end() const { return Slices.end(); }
  /// @}

  /// Erase a range of slices.
  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }

  /// Insert new slices for this alloca.
  ///
  /// This moves the slices into the alloca's slices collection, and re-sorts
  /// everything so that the usual ordering properties of the alloca's slices
  /// hold.
  void insert(ArrayRef<Slice> NewSlices) {
    int OldSize = Slices.size();
    Slices.append(NewSlices.begin(), NewSlices.end());
    auto SliceI = Slices.begin() + OldSize;
    std::stable_sort(SliceI, Slices.end());
    std::inplace_merge(Slices.begin(), SliceI, Slices.end());
  }
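
  // For illustration only (not from the original source): since the existing
  // slices are already sorted, it suffices to sort just the appended tail and
  // merge the two sorted runs, e.g.
  //   existing (sorted): [0,4) [8,12)   appended: [6,10) [2,6)
  //   after stable_sort of the tail:    [2,6) [6,10)
  //   after inplace_merge:              [0,4) [2,6) [6,10) [8,12)
  // which is cheaper than re-sorting the whole vector for small insertions.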

  // Forward declare the iterator and range accessor for walking the
  // partitions.
  class partition_iterator;
  iterator_range<partition_iterator> partitions();

  /// Access the dead users for this alloca.
  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }

  /// Access Uses that should be dropped if the alloca is promotable.
  ArrayRef<Use *> getDeadUsesIfPromotable() const {
    return DeadUseIfPromotable;
  }

  /// Access the dead operands referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
  void printSlice(raw_ostream &OS, const_iterator I,
                  StringRef Indent = "  ") const;
  void printUse(raw_ostream &OS, const_iterator I,
                StringRef Indent = "  ") const;
  void print(raw_ostream &OS) const;
  void dump(const_iterator I) const;
  void dump() const;
#endif

private:
  template <typename DerivedT, typename RetT = void> class BuilderBase;
  class SliceBuilder;

  friend class AllocaSlices::SliceBuilder;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Handle to alloca instruction to simplify method interfaces.
  AllocaInst &AI;
#endif

  /// The instruction responsible for this alloca not having a known set
  /// of slices.
  ///
  /// When an instruction (potentially) escapes the pointer to the alloca, we
  /// store a pointer to that here and abort trying to form slices of the
  /// alloca. This will be null if the alloca slices are analyzed successfully.
  Instruction *PointerEscapingInstr;
  Instruction *PointerEscapingInstrReadOnly;

  /// The slices of the alloca.
  ///
  /// We store a vector of the slices formed by uses of the alloca here. This
  /// vector is sorted by increasing begin offset, and then the unsplittable
  /// slices before the splittable ones. See the Slice inner class for more
  /// details.
  SmallVector<Slice, 8> Slices;

  /// Instructions which will become dead if we rewrite the alloca.
  ///
  /// Note that these are not separated by slice. This is because we expect an
  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
  /// all these instructions can simply be removed and replaced with poison as
  /// they come from outside of the allocated space.
  SmallVector<Instruction *, 8> DeadUsers;

  /// Uses which will become dead if we can promote the alloca.
  SmallVector<Use *, 8> DeadUseIfPromotable;

  /// Operands which will become dead if we rewrite the alloca.
  ///
  /// These are operands that in their particular use can be replaced with
  /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
  /// want to swap this particular input for poison to simplify the use lists
  /// of the alloca.
  SmallVector<Use *, 8> DeadOperands;
};

/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
/// memory which cannot be split, and provides access to all of the slices
/// overlapping some part of the partition.
///
/// Objects of this type are produced by traversing the alloca's slices, but
/// are only ephemeral and not persistent.
class Partition {
private:
  friend class AllocaSlices;
  friend class AllocaSlices::partition_iterator;

  using iterator = AllocaSlices::iterator;

  /// The beginning and ending offsets of the alloca for this
  /// partition.
  uint64_t BeginOffset = 0, EndOffset = 0;

  /// The start and end iterators of this partition.
  iterator SI, SJ;

  /// A collection of split slice tails overlapping the partition.
  SmallVector<Slice *, 4> SplitTails;

  /// Raw constructor builds an empty partition starting and ending at
  /// the given iterator.
  Partition(iterator SI) : SI(SI), SJ(SI) {}

public:
  /// The start offset of this partition.
  ///
  /// All of the contained slices start at or after this offset.
  uint64_t beginOffset() const { return BeginOffset; }

  /// The end offset of this partition.
  ///
  /// All of the contained slices end at or before this offset.
  uint64_t endOffset() const { return EndOffset; }

  /// The size of the partition.
  ///
  /// Note that this can never be zero.
  uint64_t size() const {
    assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
    return EndOffset - BeginOffset;
  }

  /// Test whether this partition contains no slices, and merely spans
  /// a region occupied by split slices.
  bool empty() const { return SI == SJ; }

  /// \name Iterate slices that start within the partition.
  /// These may be splittable or unsplittable. They have a begin offset >= the
  /// partition begin offset.
  /// @{
  // FIXME: We should probably define a "concat_iterator" helper and use that
  // to stitch together pointee_iterators over the split tails and the
  // contiguous iterators of the partition. That would give a much nicer
  // interface here. We could then additionally expose filtered iterators for
  // split, unsplit, and unsplittable slices based on the usage patterns.
  iterator begin() const { return SI; }
  iterator end() const { return SJ; }
  /// @}

  /// Get the sequence of split slice tails.
  ///
  /// These tails are of slices which start before this partition but are
  /// split and overlap into the partition. We accumulate these while forming
  /// partitions.
  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};

} // end anonymous namespace

/// An iterator over partitions of the alloca's slices.
///
/// This iterator implements the core algorithm for partitioning the alloca's
/// slices. It is a forward iterator as we don't support backtracking for
/// efficiency reasons, and re-use a single storage area to maintain the
/// current set of split slices.
///
/// It is templated on the slice iterator type to use so that it can operate
/// with either const or non-const slice iterators.
class AllocaSlices::partition_iterator
    : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
                                  Partition> {
  friend class AllocaSlices;

  /// Most of the state for walking the partitions is held in a class
  /// with a nice interface for examining them.
  Partition P;

  /// We need to keep the end of the slices to know when to stop.
  AllocaSlices::iterator SE;

  /// We also need to keep track of the maximum split end offset seen.
  /// FIXME: Do we really?
  uint64_t MaxSplitSliceEndOffset = 0;

  /// Sets the partition to be empty at given iterator, and sets the
  /// end iterator.
  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
      : P(SI), SE(SE) {
    // If not already at the end, advance our state to form the initial
    // partition.
    if (SI != SE)
      advance();
  }

  /// Advance the iterator to the next partition.
  ///
  /// Requires that the iterator not be at the end of the slices.
  void advance() {
    assert((P.SI != SE || !P.SplitTails.empty()) &&
           "Cannot advance past the end of the slices!");

    // Clear out any split uses which have ended.
    if (!P.SplitTails.empty()) {
      if (P.EndOffset >= MaxSplitSliceEndOffset) {
        // If we've finished all splits, this is easy.
        P.SplitTails.clear();
        MaxSplitSliceEndOffset = 0;
      } else {
        // Remove the uses which have ended in the prior partition. This
        // cannot change the max split slice end because we just checked that
        // the prior partition ended prior to that max.
        llvm::erase_if(P.SplitTails,
                       [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
        assert(llvm::any_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() == MaxSplitSliceEndOffset;
                            }) &&
               "Could not find the current max split slice offset!");
        assert(llvm::all_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() <= MaxSplitSliceEndOffset;
                            }) &&
               "Max split slice end offset is not actually the max!");
      }
    }

    // If P.SI is already at the end, then we've cleared the split tail and
    // now have an end iterator.
    if (P.SI == SE) {
      assert(P.SplitTails.empty() && "Failed to clear the split slices!");
      return;
    }

    // If we had a non-empty partition previously, set up the state for
    // subsequent partitions.
    if (P.SI != P.SJ) {
      // Accumulate all the splittable slices which started in the old
      // partition into the split list.
      for (Slice &S : P)
        if (S.isSplittable() && S.endOffset() > P.EndOffset) {
          P.SplitTails.push_back(&S);
          MaxSplitSliceEndOffset =
              std::max(S.endOffset(), MaxSplitSliceEndOffset);
        }

      // Start from the end of the previous partition.
      P.SI = P.SJ;

      // If P.SI is now at the end, we at most have a tail of split slices.
      if (P.SI == SE) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = MaxSplitSliceEndOffset;
        return;
      }

      // If we have split slices and the next slice is after a gap and is
      // not splittable, immediately form an empty partition for the split
      // slices up until the next slice begins.
      if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
          !P.SI->isSplittable()) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = P.SI->beginOffset();
        return;
      }
    }

    // OK, we need to consume new slices. Set the end offset based on the
    // current slice, and step SJ past it. The beginning offset of the
    // partition is the beginning offset of the next slice unless we have
    // pre-existing split slices that are continuing, in which case we begin
    // at the prior end offset.
    P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
    P.EndOffset = P.SI->endOffset();
    ++P.SJ;

    // There are two strategies to form a partition based on whether the
    // partition starts with an unsplittable slice or a splittable slice.
    if (!P.SI->isSplittable()) {
      // When we're forming an unsplittable region, it must always start at
      // the first slice and will extend through its end.
      assert(P.BeginOffset == P.SI->beginOffset());

      // Form a partition including all of the overlapping slices with this
      // unsplittable slice.
      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
        if (!P.SJ->isSplittable())
          P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
        ++P.SJ;
      }

      // We have a partition across a set of overlapping unsplittable
      // partitions.
      return;
    }

    // If we're starting with a splittable slice, then we need to form
    // a synthetic partition spanning it and any other overlapping splittable
    // slices.
    assert(P.SI->isSplittable() && "Forming a splittable partition!");

    // Collect all of the overlapping splittable slices.
    while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
           P.SJ->isSplittable()) {
      P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
      ++P.SJ;
    }

    // Back up P.EndOffset if we ended the span early when encountering an
    // unsplittable slice. This synthesizes the early end offset of
    // a partition spanning only splittable slices.
    if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
      assert(!P.SJ->isSplittable());
      P.EndOffset = P.SJ->beginOffset();
    }
  }

public:
  bool operator==(const partition_iterator &RHS) const {
    assert(SE == RHS.SE &&
           "End iterators don't match between compared partition iterators!");

    // The observed positions of partitions are marked by the P.SI iterator and
    // the emptiness of the split slices. The latter is only relevant when
    // P.SI == SE, as the end iterator will additionally have an empty split
    // slices list, but the prior may have the same P.SI and a tail of split
    // slices.
    if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
      assert(P.SJ == RHS.P.SJ &&
             "Same set of slices formed two different sized partitions!");
      assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
             "Same slice position with differently sized non-empty split "
             "slice tails!");
      return true;
    }
    return false;
  }

  partition_iterator &operator++() {
    advance();
    return *this;
  }

  Partition &operator*() { return P; }
};

/// A forward range over the partitions of the alloca's slices.
///
/// This accesses an iterator range over the partitions of the alloca's
/// slices. It computes these partitions on the fly based on the overlapping
/// offsets of the slices and the ability to split them. It will visit "empty"
/// partitions to cover regions of the alloca only accessed via split
/// slices.
iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
  return make_range(partition_iterator(begin(), end()),
                    partition_iterator(end(), end()));
}
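
// For illustration only (not from the original source): given the sorted
// slices [0,8) unsplittable, [4,8) unsplittable, [8,16) splittable, and
// [12,16) unsplittable, this walk yields three partitions: [0,8) containing
// the two overlapping unsplittable slices, [8,12) covering only the head of
// the splittable slice, and [12,16) where the splittable slice's tail
// overlaps the final unsplittable slice.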

static Value *foldSelectInst(SelectInst &SI) {
  // If the condition being selected on is a constant or the same value is
  // being selected between, fold the select. Yes this does (rarely) happen
  // early on.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
    return SI.getOperand(1 + CI->isZero());
  if (SI.getOperand(1) == SI.getOperand(2))
    return SI.getOperand(1);

  return nullptr;
}

/// A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
    // If PN merges together the same value, return that value.
    return PN->hasConstantValue();
  }
  return foldSelectInst(cast<SelectInst>(I));
}

/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
  friend class PtrUseVisitor<SliceBuilder>;
  friend class InstVisitor<SliceBuilder>;

  using Base = PtrUseVisitor<SliceBuilder>;

  const uint64_t AllocSize;
  AllocaSlices &AS;

  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;

  /// Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
      : PtrUseVisitor<SliceBuilder>(DL),
        AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I).second)
      AS.DeadUsers.push_back(&I);
  }

  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                 bool IsSplittable = false) {
    // Completely skip uses which have a zero size or start either before or
    // past the end of the allocation.
    if (Size == 0 || Offset.uge(AllocSize)) {
      LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
                        << Offset
                        << " which has zero size or starts outside of the "
                        << AllocSize << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << I << "\n");
      return markAsDead(I);
    }

    uint64_t BeginOffset = Offset.getZExtValue();
    uint64_t EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    // This may appear superficially to be something we could ignore entirely,
    // but that is not so! There may be widened loads or PHI-node uses where
    // some instructions are dead but not others. We can't completely ignore
    // them, and so have to record at least the information here.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset) {
      LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
                        << Offset << " to remain within the " << AllocSize
                        << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << I << "\n");
      EndOffset = AllocSize;
    }

    AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
  }

  void visitBitCastInst(BitCastInst &BC) {
    if (BC.use_empty())
      return markAsDead(BC);

    return Base::visitBitCastInst(BC);
  }

  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
    if (ASC.use_empty())
      return markAsDead(ASC);

    return Base::visitAddrSpaceCastInst(ASC);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (GEPI.use_empty())
      return markAsDead(GEPI);

    return Base::visitGetElementPtrInst(GEPI);
  }

  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                         uint64_t Size, bool IsVolatile) {
    // We allow splitting of non-volatile loads and stores where the type is an
    // integer type. These may be used to implement 'memcpy' or other "transfer
    // of bits" patterns.
    bool IsSplittable =
        Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);

    insertUse(I, Offset, Size, IsSplittable);
  }

  void visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");

    // If there is a load with an unknown offset, we can still perform store
    // to load forwarding for other known-offset loads.
    if (!IsOffsetKnown)
      return PI.setEscapedReadOnly(&LI);

    TypeSize Size = DL.getTypeStoreSize(LI.getType());
    if (Size.isScalable()) {
      unsigned VScale = LI.getFunction()->getVScaleValue();
      if (!VScale)
        return PI.setAborted(&LI);

      Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
    }

    return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
                             LI.isVolatile());
  }

  void visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    if (ValOp == *U)
      return PI.setEscapedAndAborted(&SI);
    if (!IsOffsetKnown)
      return PI.setAborted(&SI);

    TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
    if (StoreSize.isScalable()) {
      unsigned VScale = SI.getFunction()->getVScaleValue();
      if (!VScale)
        return PI.setAborted(&SI);

      StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
    }

    uint64_t Size = StoreSize.getFixedValue();

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
      LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
                        << Offset << " which extends past the end of the "
                        << AllocSize << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << SI << "\n");
      return markAsDead(SI);
    }

    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
           "All simple FCA stores should have been pre-split");
    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
  }
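
  // For illustration only (not from the original source): a store such as
  //   %a = alloca i32
  //   store i64 0, ptr %a
  // statically writes past the 4-byte allocation, so the visitor above drops
  // it as undefined behavior instead of clamping it the way insertUse clamps
  // partially overlapping uses.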

  void visitMemSetInst(MemSetInst &II) {
    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if ((Length && Length->getValue() == 0) ||
        (IsOffsetKnown && Offset.uge(AllocSize)))
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    insertUse(II, Offset,
              Length ? Length->getLimitedValue()
                     : AllocSize - Offset.getLimitedValue(),
              (bool)Length);
  }

  void visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if (Length && Length->getValue() == 0)
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    // Because we can visit these intrinsics twice, also check to see if the
    // first time marked this instruction as dead. If so, skip it.
    if (VisitedDeadInsts.count(&II))
      return;

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    // This side of the transfer is completely out-of-bounds, and so we can
    // nuke the entire transfer. However, we also need to nuke the other side
    // if already added to our partitions.
    // FIXME: Yet another place we really should bypass this when
    // instrumenting for ASan.
    if (Offset.uge(AllocSize)) {
      SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
          MemTransferSliceMap.find(&II);
      if (MTPI != MemTransferSliceMap.end())
        AS.Slices[MTPI->second].kill();
      return markAsDead(II);
    }

    uint64_t RawOffset = Offset.getLimitedValue();
    uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;

    // Check for the special case where the same exact value is used for both
    // source and dest.
    if (*U == II.getRawDest() && *U == II.getRawSource()) {
      // For non-volatile transfers this is a no-op.
      if (!II.isVolatile())
        return markAsDead(II);

      return insertUse(II, Offset, Size, /*IsSplittable=*/false);
    }

    // If we have seen both source and destination for a mem transfer, then
    // they both point to the same alloca.
    bool Inserted;
    SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
    std::tie(MTPI, Inserted) =
        MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
    unsigned PrevIdx = MTPI->second;
    if (!Inserted) {
      Slice &PrevP = AS.Slices[PrevIdx];

      // Check if the begin offsets match and this is a non-volatile transfer.
      // In that case, we can completely elide the transfer.
      if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
        PrevP.kill();
        return markAsDead(II);
      }

      // Otherwise we have an offset transfer within the same alloca. We can't
      // split those.
      PrevP.makeUnsplittable();
    }

    // Insert the use now that we've fixed up the splittable nature.
    insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);

    // Check that we ended up with a valid index in the map.
    assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
           "Map index doesn't point back to a slice with this user.");
  }

  // Disable SROA for any intrinsics except for lifetime invariants.
  // FIXME: What about debug intrinsics? This matches old behavior, but
  // doesn't make sense.
  void visitIntrinsicInst(IntrinsicInst &II) {
    if (II.isDroppable()) {
      AS.DeadUseIfPromotable.push_back(U);
      return;
    }

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    if (II.isLifetimeStartOrEnd()) {
      insertUse(II, Offset, AllocSize, true);
      return;
    }

    Base::visitIntrinsicInst(II);
  }

  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    // We consider any PHI or select that results in a direct load or store of
    // the same offset to be a viable use for slicing purposes. These uses
    // are considered unsplittable and the size is the maximum loaded or stored
    // size.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
    Visited.insert(Root);
    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    const DataLayout &DL = Root->getDataLayout();
    // If there are no loads or stores, the access is dead. We mark that as
    // a size zero access.
    Size = 0;
    do {
      Instruction *I, *UsedI;
      std::tie(UsedI, I) = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
        if (LoadSize.isScalable()) {
          PI.setAborted(LI);
          return nullptr;
        }
        Size = std::max(Size, LoadSize.getFixedValue());
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        Value *Op = SI->getOperand(0);
        if (Op == UsedI)
          return SI;
        TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
        if (StoreSize.isScalable()) {
          PI.setAborted(SI);
          return nullptr;
        }
        Size = std::max(Size, StoreSize.getFixedValue());
        continue;
      }

      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
        if (!GEP->hasAllZeroIndices())
          return GEP;
      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
                 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
        return I;
      }

      for (User *U : I->users())
        if (Visited.insert(cast<Instruction>(U)).second)
          Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
    } while (!Uses.empty());

    return nullptr;
  }

  void visitPHINodeOrSelectInst(Instruction &I) {
    assert(isa<PHINode>(I) || isa<SelectInst>(I));
    if (I.use_empty())
      return markAsDead(I);

    // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
    // instructions in this BB, which may be required during rewriting. Bail out
    // on these cases.
    if (isa<PHINode>(I) && !I.getParent()->hasInsertionPt())
      return PI.setAborted(&I);

    // TODO: We could use simplifyInstruction here to fold PHINodes and
    // SelectInsts. However, doing so requires changing the current
    // dead-operand-tracking mechanism. For instance, suppose neither loading
    // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
    // trap either. However, if we simply replace %U with undef using the
    // current dead-operand-tracking mechanism, "load (select undef, undef,
    // %other)" may trap because the select may return the first operand
    // "undef".
    if (Value *Result = foldPHINodeOrSelectInst(I)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the PHI/select as if we had RAUW'ed it.
        enqueueUsers(I);
      else
        // Otherwise the operand to the PHI/select is dead, and we can replace
        // it with poison.
        AS.DeadOperands.push_back(U);

      return;
    }

    if (!IsOffsetKnown)
      return PI.setAborted(&I);

    // See if we already have computed info on this node.
    uint64_t &Size = PHIOrSelectSizes[&I];
    if (!Size) {
      // This is a new PHI/Select, check for an unsafe use of it.
      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
        return PI.setAborted(UnsafeI);
    }

    // For PHI and select operands outside the alloca, we can't nuke the entire
    // phi or select -- the other side might still be relevant, so we special
    // case them here and use a separate structure to track the operands
    // themselves which should be replaced with poison.
    // FIXME: This should instead be escaped in the event we're instrumenting
    // for address sanitization.
    if (Offset.uge(AllocSize)) {
      AS.DeadOperands.push_back(U);
      return;
    }

    insertUse(I, Offset, Size);
  }

  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }

  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }

  /// Disable SROA entirely if there are unhandled users of the alloca.
  void visitInstruction(Instruction &I) { PI.setAborted(&I); }

  void visitCallBase(CallBase &CB) {
    // If the call operand is read-only and only does a read-only or address
    // capture, then we mark it as EscapedReadOnly.
    if (CB.isDataOperand(U) &&
        !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
        CB.onlyReadsMemory(U->getOperandNo())) {
      PI.setEscapedReadOnly(&CB);
      return;
    }

    Base::visitCallBase(CB);
  }
};

AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
    :
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      AI(AI),
#endif
      PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
  SliceBuilder PB(DL, AI, *this);
  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
  if (PtrI.isEscaped() || PtrI.isAborted()) {
    // FIXME: We should sink the escape vs. abort info into the caller nicely,
    // possibly by just storing the PtrInfo in the AllocaSlices.
    PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
                                                  : PtrI.getAbortingInst();
    assert(PointerEscapingInstr && "Did not track a bad instruction");
    return;
  }
  PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();

  llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });

  // Sort the uses. This arranges for the offsets to be in ascending order,
  // and the sizes to be in descending order.
  llvm::stable_sort(Slices);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

void AllocaSlices::print(raw_ostream &OS, const_iterator I,
                         StringRef Indent) const {
  printSlice(OS, I, Indent);
  OS << "\n";
  printUse(OS, I, Indent);
}

void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
                              StringRef Indent) const {
  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
     << " slice #" << (I - begin())
     << (I->isSplittable() ? " (splittable)" : "");
}

void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
                            StringRef Indent) const {
  OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
}

void AllocaSlices::print(raw_ostream &OS) const {
  if (PointerEscapingInstr) {
    OS << "Can't analyze slices for alloca: " << AI << "\n"
       << "  A pointer to this alloca escaped by:\n"
       << "  " << *PointerEscapingInstr << "\n";
    return;
  }

  if (PointerEscapingInstrReadOnly)
    OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";

  OS << "Slices of alloca: " << AI << "\n";
  for (const_iterator I = begin(), E = end(); I != E; ++I)
    print(OS, I);
}

LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
  print(dbgs(), I);
}
LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }

#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
static std::pair<Type *, IntegerType *>
findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
               uint64_t EndOffset) {
  Type *Ty = nullptr;
  bool TyIsCommon = true;
  IntegerType *ITy = nullptr;

  // Note that we need to look at *every* alloca slice's Use to ensure we
  // always get consistent results regardless of the order of slices.
  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
    Use *U = I->getUse();
    if (isa<IntrinsicInst>(*U->getUser()))
      continue;
    if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
      continue;

    Type *UserTy = nullptr;
    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
      UserTy = LI->getType();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
      UserTy = SI->getValueOperand()->getType();
    }

    if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
      // If the type is larger than the partition, skip it. We only encounter
      // this for split integer operations where we want to use the type of the
      // entity causing the split. Also skip if the type is not a byte width
      // multiple.
      if (UserITy->getBitWidth() % 8 != 0 ||
          UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
        continue;

      // Track the largest bitwidth integer type used in this way in case there
      // is no common type.
      if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
        ITy = UserITy;
    }

    // To avoid depending on the order of slices, Ty and TyIsCommon must not
    // depend on types skipped above.
    if (!UserTy || (Ty && Ty != UserTy))
      TyIsCommon = false; // Give up on anything but an iN type.
    else
      Ty = UserTy;
  }

  return {TyIsCommon ? Ty : nullptr, ITy};
}
1532
1533/// PHI instructions that use an alloca and are subsequently loaded can be
1534/// rewritten to load both input pointers in the pred blocks and then PHI the
1535/// results, allowing the load of the alloca to be promoted.
1536/// From this:
1537/// %P2 = phi [i32* %Alloca, i32* %Other]
1538/// %V = load i32* %P2
1539/// to:
1540/// %V1 = load i32* %Alloca -> will be mem2reg'd
1541/// ...
1542/// %V2 = load i32* %Other
1543/// ...
1544/// %V = phi [i32 %V1, i32 %V2]
1545///
1546/// We can do this to a select if its only uses are loads and if the operands
1547/// to the select can be loaded unconditionally.
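///
/// A sketch of that select rewrite (illustrative, mirroring the PHI example):
/// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
/// %V = load i32, ptr %P2
/// becomes:
/// %V1 = load i32, ptr %Alloca -> will be mem2reg'd
/// %V2 = load i32, ptr %Other
/// %V = select i1 %cond, i32 %V1, i32 %V2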
1548///
1549/// FIXME: This should be hoisted into a generic utility, likely in
1550/// Transforms/Util/Local.h
1551 static bool isSafePHIToSpeculate(PHINode &PN) {
1552 const DataLayout &DL = PN.getDataLayout();
1553
1554 // For now, we can only do this promotion if the load is in the same block
1555 // as the PHI, and if there are no stores between the phi and load.
1556 // TODO: Allow recursive phi users.
1557 // TODO: Allow stores.
1558 BasicBlock *BB = PN.getParent();
1559 Align MaxAlign;
1560 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1561 Type *LoadType = nullptr;
1562 for (User *U : PN.users()) {
1563 LoadInst *LI = dyn_cast<LoadInst>(U);
1564 if (!LI || !LI->isSimple())
1565 return false;
1566
1567 // For now we only allow loads in the same block as the PHI. This is
1568 // a common case that happens when instcombine merges two loads through
1569 // a PHI.
1570 if (LI->getParent() != BB)
1571 return false;
1572
1573 if (LoadType) {
1574 if (LoadType != LI->getType())
1575 return false;
1576 } else {
1577 LoadType = LI->getType();
1578 }
1579
1580 // Ensure that there are no instructions between the PHI and the load that
1581 // could store.
1582 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1583 if (BBI->mayWriteToMemory())
1584 return false;
1585
1586 MaxAlign = std::max(MaxAlign, LI->getAlign());
1587 }
1588
1589 if (!LoadType)
1590 return false;
1591
1592 APInt LoadSize =
1593 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1594
1595 // We can only transform this if it is safe to push the loads into the
1596 // predecessor blocks. The only thing to watch out for is that we can't put
1597 // a possibly trapping load in the predecessor if it is a critical edge.
1598 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1599 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1600 Value *InVal = PN.getIncomingValue(Idx);
1601
1602 // If the value is produced by the terminator of the predecessor (an
1603 // invoke) or it has side-effects, there is no valid place to put a load
1604 // in the predecessor.
1605 if (TI == InVal || TI->mayHaveSideEffects())
1606 return false;
1607
1608 // If the predecessor has a single successor, then the edge isn't
1609 // critical.
1610 if (TI->getNumSuccessors() == 1)
1611 continue;
1612
1613 // If this pointer is always safe to load, or if we can prove that there
1614 // is already a load in the block, then we can move the load to the pred
1615 // block.
1616 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1617 continue;
1618
1619 return false;
1620 }
1621
1622 return true;
1623}
1624
1625static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1626 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1627
1628 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1629 Type *LoadTy = SomeLoad->getType();
1630 IRB.SetInsertPoint(&PN);
1631 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1632 PN.getName() + ".sroa.speculated");
1633
1634 // Get the AA tags and alignment to use from one of the loads. It does not
1635 // matter which one we pick, even if they differ.
1636 AAMDNodes AATags = SomeLoad->getAAMetadata();
1637 Align Alignment = SomeLoad->getAlign();
1638
1639 // Rewrite all loads of the PN to use the new PHI.
1640 while (!PN.use_empty()) {
1641 LoadInst *LI = cast<LoadInst>(PN.user_back());
1642 LI->replaceAllUsesWith(NewPN);
1643 LI->eraseFromParent();
1644 }
1645
1646 // Inject loads into all of the pred blocks.
1647 DenseMap<BasicBlock *, Value *> InjectedLoads;
1648 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1649 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1650 Value *InVal = PN.getIncomingValue(Idx);
1651
1652 // A PHI node is allowed to have multiple (duplicated) entries for the same
1653 // basic block, as long as the value is the same. So if we already injected
1654 // a load in the predecessor, then we should reuse the same load for all
1655 // duplicated entries.
1656 if (Value *V = InjectedLoads.lookup(Pred)) {
1657 NewPN->addIncoming(V, Pred);
1658 continue;
1659 }
1660
1661 Instruction *TI = Pred->getTerminator();
1662 IRB.SetInsertPoint(TI);
1663
1664 LoadInst *Load = IRB.CreateAlignedLoad(
1665 LoadTy, InVal, Alignment,
1666 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1667 ++NumLoadsSpeculated;
1668 if (AATags)
1669 Load->setAAMetadata(AATags);
1670 NewPN->addIncoming(Load, Pred);
1671 InjectedLoads[Pred] = Load;
1672 }
1673
1674 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1675 PN.eraseFromParent();
1676}
1677
1678SelectHandSpeculativity &
1679SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1680 if (isTrueVal)
1681 Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1682 else
1683 Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1684 return *this;
1685}
1686
1687bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1688 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1689 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1690}
1691
1692bool SelectHandSpeculativity::areAllSpeculatable() const {
1693 return isSpeculatable(/*isTrueVal=*/true) &&
1694 isSpeculatable(/*isTrueVal=*/false);
1695}
1696
1697bool SelectHandSpeculativity::areAnySpeculatable() const {
1698 return isSpeculatable(/*isTrueVal=*/true) ||
1699 isSpeculatable(/*isTrueVal=*/false);
1700}
1701bool SelectHandSpeculativity::areNoneSpeculatable() const {
1702 return !areAnySpeculatable();
1703}
1704
1705static SelectHandSpeculativity
1706 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1707 assert(LI.isSimple() && "Only for simple loads");
1708 SelectHandSpeculativity Spec;
1709
1710 const DataLayout &DL = SI.getDataLayout();
1711 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1712 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1713 &LI))
1714 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1715 else if (PreserveCFG)
1716 return Spec;
1717
1718 return Spec;
1719}
1720
1721std::optional<RewriteableMemOps>
1722SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1723 RewriteableMemOps Ops;
1724
1725 for (User *U : SI.users()) {
1726 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1727 U = *BC->user_begin();
1728
1729 if (auto *Store = dyn_cast<StoreInst>(U)) {
1730 // Note that atomic stores can be transformed; atomic semantics do not
1731 // have any meaning for a local alloca. Stores are not speculatable,
1732 // however, so if we can't turn it into a predicated store, we are done.
1733 if (Store->isVolatile() || PreserveCFG)
1734 return {}; // Give up on this `select`.
1735 Ops.emplace_back(Store);
1736 continue;
1737 }
1738
1739 auto *LI = dyn_cast<LoadInst>(U);
1740
1741 // Note that atomic loads can be transformed;
1742 // atomic semantics do not have any meaning for a local alloca.
1743 if (!LI || LI->isVolatile())
1744 return {}; // Give up on this `select`.
1745
1746 PossiblySpeculatableLoad Load(LI);
1747 if (!LI->isSimple()) {
1748 // If the `load` is not simple, we can't speculatively execute it,
1749 // but we may be able to handle it via a CFG modification instead.
1750 if (PreserveCFG)
1751 return {}; // Give up on this `select`.
1752 Ops.emplace_back(Load);
1753 continue;
1754 }
1755
1756 SelectHandSpeculativity Spec =
1757 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1758 if (PreserveCFG && !Spec.areAllSpeculatable())
1759 return {}; // Give up on this `select`.
1760
1761 Load.setInt(Spec);
1762 Ops.emplace_back(Load);
1763 }
1764
1765 return Ops;
1766}
1767
1768 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1769 IRBuilderTy &IRB) {
1770 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1771
1772 Value *TV = SI.getTrueValue();
1773 Value *FV = SI.getFalseValue();
1774 // Replace the given load of the select with a select of two loads.
1775
1776 assert(LI.isSimple() && "We only speculate simple loads");
1777
1778 IRB.SetInsertPoint(&LI);
1779
1780 LoadInst *TL =
1781 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1782 LI.getName() + ".sroa.speculate.load.true");
1783 LoadInst *FL =
1784 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1785 LI.getName() + ".sroa.speculate.load.false");
1786 NumLoadsSpeculated += 2;
1787
1788 // Transfer alignment and AA info if present.
1789 TL->setAlignment(LI.getAlign());
1790 FL->setAlignment(LI.getAlign());
1791
1792 AAMDNodes Tags = LI.getAAMetadata();
1793 if (Tags) {
1794 TL->setAAMetadata(Tags);
1795 FL->setAAMetadata(Tags);
1796 }
1797
1798 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1799 LI.getName() + ".sroa.speculated",
1800 ProfcheckDisableMetadataFixes ? nullptr : &SI);
1801
1802 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1803 LI.replaceAllUsesWith(V);
1804}
1805
1806template <typename T>
1807 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1808 SelectHandSpeculativity Spec,
1809 DomTreeUpdater &DTU) {
1810 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1811 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1812 BasicBlock *Head = I.getParent();
1813 Instruction *ThenTerm = nullptr;
1814 Instruction *ElseTerm = nullptr;
1815 if (Spec.areNoneSpeculatable())
1816 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1817 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1818 else {
1819 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1820 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1821 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1822 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1823 cast<CondBrInst>(Head->getTerminator())->swapSuccessors();
1824 }
1825 auto *HeadBI = cast<CondBrInst>(Head->getTerminator());
1826 Spec = {}; // Do not use `Spec` beyond this point.
1827 BasicBlock *Tail = I.getParent();
1828 Tail->setName(Head->getName() + ".cont");
1829 PHINode *PN;
1830 if (isa<LoadInst>(I))
1831 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1832 for (BasicBlock *SuccBB : successors(Head)) {
1833 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1834 int SuccIdx = IsThen ? 0 : 1;
1835 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1836 auto &CondMemOp = cast<T>(*I.clone());
1837 if (NewMemOpBB != Head) {
1838 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1839 if (isa<LoadInst>(I))
1840 ++NumLoadsPredicated;
1841 else
1842 ++NumStoresPredicated;
1843 } else {
1844 CondMemOp.dropUBImplyingAttrsAndMetadata();
1845 ++NumLoadsSpeculated;
1846 }
1847 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1848 Value *Ptr = SI.getOperand(1 + SuccIdx);
1849 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1850 if (isa<LoadInst>(I)) {
1851 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1852 PN->addIncoming(&CondMemOp, NewMemOpBB);
1853 } else
1854 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1855 }
1856 if (isa<LoadInst>(I)) {
1857 PN->takeName(&I);
1858 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1859 I.replaceAllUsesWith(PN);
1860 }
1861}
1862
1863 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1864 SelectHandSpeculativity Spec,
1865 DomTreeUpdater &DTU) {
1866 if (auto *LI = dyn_cast<LoadInst>(&I))
1867 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1868 else if (auto *SI = dyn_cast<StoreInst>(&I))
1869 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1870 else
1871 llvm_unreachable_internal("Only for load and store.");
1872}
1873
1874 static bool rewriteSelectInstMemOps(SelectInst &SI,
1875 const RewriteableMemOps &Ops,
1876 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1877 bool CFGChanged = false;
1878 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1879
1880 for (const RewriteableMemOp &Op : Ops) {
1881 SelectHandSpeculativity Spec;
1882 Instruction *I;
1883 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1884 I = *US;
1885 } else {
1886 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1887 I = PSL.getPointer();
1888 Spec = PSL.getInt();
1889 }
1890 if (Spec.areAllSpeculatable()) {
1891 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1892 } else {
1893 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1894 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1895 CFGChanged = true;
1896 }
1897 I->eraseFromParent();
1898 }
1899
1900 for (User *U : make_early_inc_range(SI.users()))
1901 cast<BitCastInst>(U)->eraseFromParent();
1902 SI.eraseFromParent();
1903 return CFGChanged;
1904}
1905
1906/// Compute an adjusted pointer from Ptr by Offset bytes where the
1907/// resulting pointer has PointerTy.
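///
/// (Illustrative sketch of the emitted IR, names assumed: for a non-zero
/// Offset this produces
/// %<prefix>sroa_idx = getelementptr inbounds i8, ptr %Ptr, i64 <Offset>
/// followed by a pointer cast named %<prefix>sroa_cast only when an address
/// space change is actually required.)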
1908static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1909 APInt Offset, Type *PointerTy,
1910 const Twine &NamePrefix) {
1911 if (Offset != 0)
1912 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1913 NamePrefix + "sroa_idx");
1914 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1915 NamePrefix + "sroa_cast");
1916}
1917
1918/// Compute the adjusted alignment for a load or store from an offset.
1919 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1920 return commonAlignment(getLoadStoreAlignment(I), Offset);
1921 }
1922
1923/// Test whether we can convert a value from the old to the new type.
1924///
1925/// This predicate should be used to guard calls to convertValue in order to
1926/// ensure that we only try to convert viable values. The strategy is that we
1927/// will peel off single element struct and array wrappings to get to an
1928/// underlying value, and convert that value.
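///
/// (Illustrative examples, assuming a 64-bit DataLayout with integral
/// pointers: i64 <-> <2 x i32> and i64 <-> ptr are convertible because the
/// sizes match, whereas i32 <-> i64 never is, and target extension types are
/// never converted.)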
1929static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1930 unsigned VScale = 0) {
1931 if (OldTy == NewTy)
1932 return true;
1933
1934 // For integer types, we can't handle any bit-width differences. This would
1935 // break both vector conversions with extension and introduce endianness
1936 // issues when in conjunction with loads and stores.
1937 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1938 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1939 cast<IntegerType>(NewTy)->getBitWidth() &&
1940 "We can't have the same bitwidth for different int types");
1941 return false;
1942 }
1943
1944 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1945 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1946
1947 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1948 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1949 // Conversion is only possible when the size of scalable vectors is known.
1950 if (!VScale)
1951 return false;
1952
1953 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1954 // a single domain (either fixed or scalable). Any additional conversion
1955 // between fixed and scalable types is handled through integer types.
1956 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1957 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1958
1959 if (isa<ScalableVectorType>(NewTy)) {
1961 return false;
1962
1963 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
1964 } else {
1966 return false;
1967
1968 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
1969 }
1970 }
1971
1972 if (NewSize != OldSize)
1973 return false;
1974 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1975 return false;
1976
1977 // We can convert pointers to integers and vice-versa. Same for vectors
1978 // of pointers and integers.
1979 OldTy = OldTy->getScalarType();
1980 NewTy = NewTy->getScalarType();
1981 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1982 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1983 unsigned OldAS = OldTy->getPointerAddressSpace();
1984 unsigned NewAS = NewTy->getPointerAddressSpace();
1985 // Convert pointers if they are pointers from the same address space or
1986 // different integral (not non-integral) address spaces with the same
1987 // pointer size.
1988 return OldAS == NewAS ||
1989 (!DL.isNonIntegralAddressSpace(OldAS) &&
1990 !DL.isNonIntegralAddressSpace(NewAS) &&
1991 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1992 }
1993
1994 // We can convert integers to integral pointers, but not to non-integral
1995 // pointers.
1996 if (OldTy->isIntegerTy())
1997 return !DL.isNonIntegralPointerType(NewTy);
1998
1999 // We can convert integral pointers to integers, but non-integral pointers
2000 // need to remain pointers.
2001 if (!DL.isNonIntegralPointerType(OldTy))
2002 return NewTy->isIntegerTy();
2003
2004 return false;
2005 }
2006
2007 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2008 return false;
2009
2010 return true;
2011}
2012
2013/// Test whether the given slice use can be promoted to a vector.
2014///
2015/// This function is called to test each entry in a partition which is slated
2016/// for a single slice.
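///
/// (Illustrative example: with Ty = <4 x i32> and ElementSize = 4, a slice
/// covering bytes [4, 12) of the partition maps onto elements [1, 3) and can
/// be rewritten as an i64 or <2 x i32> access, while a slice covering bytes
/// [2, 6) is rejected because it does not fall on element boundaries.)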
2017static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2018 VectorType *Ty,
2019 uint64_t ElementSize,
2020 const DataLayout &DL,
2021 unsigned VScale) {
2022 // First validate the slice offsets.
2023 uint64_t BeginOffset =
2024 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2025 uint64_t BeginIndex = BeginOffset / ElementSize;
2026 if (BeginIndex * ElementSize != BeginOffset ||
2027 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2028 return false;
2029 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2030 uint64_t EndIndex = EndOffset / ElementSize;
2031 if (EndIndex * ElementSize != EndOffset ||
2032 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2033 return false;
2034
2035 assert(EndIndex > BeginIndex && "Empty vector!");
2036 uint64_t NumElements = EndIndex - BeginIndex;
2037 Type *SliceTy = (NumElements == 1)
2038 ? Ty->getElementType()
2039 : FixedVectorType::get(Ty->getElementType(), NumElements);
2040
2041 Type *SplitIntTy =
2042 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2043
2044 Use *U = S.getUse();
2045
2046 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2047 if (MI->isVolatile())
2048 return false;
2049 if (!S.isSplittable())
2050 return false; // Skip any unsplittable intrinsics.
2051 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2052 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2053 return false;
2054 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2055 if (LI->isVolatile())
2056 return false;
2057 Type *LTy = LI->getType();
2058 // Disable vector promotion when there are loads or stores of an FCA.
2059 if (LTy->isStructTy())
2060 return false;
2061 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2062 assert(LTy->isIntegerTy());
2063 LTy = SplitIntTy;
2064 }
2065 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2066 return false;
2067 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2068 if (SI->isVolatile())
2069 return false;
2070 Type *STy = SI->getValueOperand()->getType();
2071 // Disable vector promotion when there are loads or stores of an FCA.
2072 if (STy->isStructTy())
2073 return false;
2074 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2075 assert(STy->isIntegerTy());
2076 STy = SplitIntTy;
2077 }
2078 if (!canConvertValue(DL, STy, SliceTy, VScale))
2079 return false;
2080 } else {
2081 return false;
2082 }
2083
2084 return true;
2085}
2086
2087/// Test whether any vector type in \p CandidateTys is viable for promotion.
2088///
2089/// This implements the necessary checking for \c isVectorPromotionViable over
2090/// all slices of the alloca for the given VectorType.
2091static VectorType *
2092 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2093 SmallVectorImpl<VectorType *> &CandidateTys,
2094 bool HaveCommonEltTy, Type *CommonEltTy,
2095 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2096 VectorType *CommonVecPtrTy, unsigned VScale) {
2097 // If we didn't find a vector type, nothing to do here.
2098 if (CandidateTys.empty())
2099 return nullptr;
2100
2101 // Pointer-ness is sticky: if we had a vector-of-pointers candidate type,
2102 // then we should choose it, not some other alternative.
2103 // But, we can't perform a no-op pointer address space change via bitcast,
2104 // so if we didn't have a common pointer element type, bail.
2105 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2106 return nullptr;
2107
2108 // Try to pick the "best" element type out of the choices.
2109 if (!HaveCommonEltTy && HaveVecPtrTy) {
2110 // If there was a pointer element type, there's really only one choice.
2111 CandidateTys.clear();
2112 CandidateTys.push_back(CommonVecPtrTy);
2113 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2114 // Integer-ify vector types.
2115 for (VectorType *&VTy : CandidateTys) {
2116 if (!VTy->getElementType()->isIntegerTy())
2117 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2118 VTy->getContext(), VTy->getScalarSizeInBits())));
2119 }
2120
2121 // Rank the remaining candidate vector types. This is easy because we know
2122 // they're all integer vectors. We sort by ascending number of elements.
2123 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2124 (void)DL;
2125 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2126 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2127 "Cannot have vector types of different sizes!");
2128 assert(RHSTy->getElementType()->isIntegerTy() &&
2129 "All non-integer types eliminated!");
2130 assert(LHSTy->getElementType()->isIntegerTy() &&
2131 "All non-integer types eliminated!");
2132 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2133 cast<FixedVectorType>(LHSTy)->getNumElements();
2134 };
2135 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2136 (void)DL;
2137 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2138 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2139 "Cannot have vector types of different sizes!");
2140 assert(RHSTy->getElementType()->isIntegerTy() &&
2141 "All non-integer types eliminated!");
2142 assert(LHSTy->getElementType()->isIntegerTy() &&
2143 "All non-integer types eliminated!");
2144 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2145 cast<FixedVectorType>(LHSTy)->getNumElements();
2146 };
2147 llvm::sort(CandidateTys, RankVectorTypesComp);
2148 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2149 CandidateTys.end());
2150 } else {
2151// The only way to have the same element type in every vector type is to
2152// have the same vector type. Check that and remove all but one.
2153#ifndef NDEBUG
2154 for (VectorType *VTy : CandidateTys) {
2155 assert(VTy->getElementType() == CommonEltTy &&
2156 "Unaccounted for element type!");
2157 assert(VTy == CandidateTys[0] &&
2158 "Different vector types with the same element type!");
2159 }
2160#endif
2161 CandidateTys.resize(1);
2162 }
2163
2164 // FIXME: hack. Do we have a named constant for this?
2165 // SDAG SDNode can't have more than 65535 operands.
2166 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2167 return cast<FixedVectorType>(VTy)->getNumElements() >
2168 std::numeric_limits<unsigned short>::max();
2169 });
2170
2171 // Find a vector type viable for promotion by iterating over all slices.
2172 auto *VTy = llvm::find_if(CandidateTys, [&](VectorType *VTy) -> bool {
2173 uint64_t ElementSize =
2174 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2175
2176 // While the definition of LLVM vectors is bitpacked, we don't support sizes
2177 // that aren't byte sized.
2178 if (ElementSize % 8)
2179 return false;
2180 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2181 "vector size not a multiple of element size?");
2182 ElementSize /= 8;
2183
2184 for (const Slice &S : P)
2185 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2186 return false;
2187
2188 for (const Slice *S : P.splitSliceTails())
2189 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2190 return false;
2191
2192 return true;
2193 });
2194 return VTy != CandidateTys.end() ? *VTy : nullptr;
2195}
2196
2197 static VectorType *createAndCheckVectorTypesForPromotion(
2198 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2199 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2200 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2201 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2202 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2203 [[maybe_unused]] VectorType *OriginalElt =
2204 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2205 // Consider additional vector types where the element type size is a
2206 // multiple of load/store element size.
2207 for (Type *Ty : OtherTys) {
2208 if (!VectorType::isValidElementType(Ty))
2209 continue;
2210 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2211 // Make a copy of CandidateTys and iterate through it, because we
2212 // might append to CandidateTys in the loop.
2213 for (VectorType *const VTy : CandidateTysCopy) {
2214 // The elements in the copy should remain invariant throughout the loop
2215 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2216 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2217 unsigned ElementSize =
2218 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2219 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2220 VectorSize % TypeSize == 0) {
2221 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2222 CheckCandidateType(NewVTy);
2223 }
2224 }
2225 }
2226
2227 return checkVectorTypesForPromotion(
2228 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2229 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2230}
2231
2232/// Test whether the given alloca partitioning and range of slices can be
2233/// promoted to a vector.
2234///
2235/// This is a quick test to check whether we can rewrite a particular alloca
2236/// partition (and its newly formed alloca) into a vector alloca with only
2237/// whole-vector loads and stores such that it could be promoted to a vector
2238/// SSA value. We only can ensure this for a limited set of operations, and we
2239/// don't want to do the rewrites unless we are confident that the result will
2240/// be promotable, so we have an early test here.
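///
/// (Illustrative example: a 16-byte partition covered by whole-partition
/// <4 x float> loads and stores plus float accesses on 4-byte element
/// boundaries is viable as a <4 x float> alloca, whereas an i16 access at
/// byte offset 1 would reject the candidate type.)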
2241 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2242 unsigned VScale) {
2243 // Collect the candidate types for vector-based promotion. Also track whether
2244 // we have different element types.
2245 SmallVector<VectorType *, 4> CandidateTys;
2246 SetVector<Type *> LoadStoreTys;
2247 SetVector<Type *> DeferredTys;
2248 Type *CommonEltTy = nullptr;
2249 VectorType *CommonVecPtrTy = nullptr;
2250 bool HaveVecPtrTy = false;
2251 bool HaveCommonEltTy = true;
2252 bool HaveCommonVecPtrTy = true;
2253 auto CheckCandidateType = [&](Type *Ty) {
2254 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2255 // Bail out if this candidate's total size in bits differs from the others.
2256 if (!CandidateTys.empty()) {
2257 VectorType *V = CandidateTys[0];
2258 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2259 DL.getTypeSizeInBits(V).getFixedValue()) {
2260 CandidateTys.clear();
2261 return;
2262 }
2263 }
2264 CandidateTys.push_back(VTy);
2265 Type *EltTy = VTy->getElementType();
2266
2267 if (!CommonEltTy)
2268 CommonEltTy = EltTy;
2269 else if (CommonEltTy != EltTy)
2270 HaveCommonEltTy = false;
2271
2272 if (EltTy->isPointerTy()) {
2273 HaveVecPtrTy = true;
2274 if (!CommonVecPtrTy)
2275 CommonVecPtrTy = VTy;
2276 else if (CommonVecPtrTy != VTy)
2277 HaveCommonVecPtrTy = false;
2278 }
2279 }
2280 };
2281
2282 // Put load and store types into a set for de-duplication.
2283 for (const Slice &S : P) {
2284 Type *Ty;
2285 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2286 Ty = LI->getType();
2287 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2288 Ty = SI->getValueOperand()->getType();
2289 else
2290 continue;
2291
2292 auto CandTy = Ty->getScalarType();
2293 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2294 S.endOffset() != P.endOffset())) {
2295 DeferredTys.insert(Ty);
2296 continue;
2297 }
2298
2299 LoadStoreTys.insert(Ty);
2300 // Consider the types of loads and stores that exactly cover the partition.
2301 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2302 CheckCandidateType(Ty);
2303 }
2304
2305 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2306 if (VectorType *VTy = createAndCheckVectorTypesForPromotion(
2307 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2308 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2309 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2310 return VTy;
2311
2312 CandidateTys.clear();
2313 return createAndCheckVectorTypesForPromotion(
2314 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2315 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2316 CommonVecPtrTy, VScale);
2317}
2318
2319/// Test whether a slice of an alloca is valid for integer widening.
2320///
2321/// This implements the necessary checking for the \c isIntegerWideningViable
2322/// test below on a single slice of the alloca.
2323static bool isIntegerWideningViableForSlice(const Slice &S,
2324 uint64_t AllocBeginOffset,
2325 Type *AllocaTy,
2326 const DataLayout &DL,
2327 bool &WholeAllocaOp) {
2328 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2329
2330 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2331 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2332
2333 Use *U = S.getUse();
2334
2335 // Lifetime intrinsics operate over the whole alloca whose sizes are usually
2336 // larger than other load/store slices (RelEnd > Size). But lifetimes are
2337 // always promotable and should not impact other slices' promotability of the
2338 // partition.
2339 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2340 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2341 return true;
2342 }
2343
2344 // We can't reasonably handle cases where the load or store extends past
2345 // the end of the alloca's type and into its padding.
2346 if (RelEnd > Size)
2347 return false;
2348
2349 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2350 if (LI->isVolatile())
2351 return false;
2352 // We can't handle loads that extend past the allocated memory.
2353 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2354 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2355 return false;
2356 // So far, AllocaSliceRewriter does not support widening split slice tails
2357 // in rewriteIntegerLoad.
2358 if (S.beginOffset() < AllocBeginOffset)
2359 return false;
2360 // Note that we don't count vector loads or stores as whole-alloca
2361 // operations which enable integer widening because we would prefer to use
2362 // vector widening instead.
2363 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2364 WholeAllocaOp = true;
2365 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2366 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2367 return false;
2368 } else if (RelBegin != 0 || RelEnd != Size ||
2369 !canConvertValue(DL, AllocaTy, LI->getType())) {
2370 // Non-integer loads need to be convertible from the alloca type so that
2371 // they are promotable.
2372 return false;
2373 }
2374 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2375 Type *ValueTy = SI->getValueOperand()->getType();
2376 if (SI->isVolatile())
2377 return false;
2378 // We can't handle stores that extend past the allocated memory.
2379 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2380 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2381 return false;
2382 // So far, AllocaSliceRewriter does not support widening split slice tails
2383 // in rewriteIntegerStore.
2384 if (S.beginOffset() < AllocBeginOffset)
2385 return false;
2386 // Note that we don't count vector loads or stores as whole-alloca
2387 // operations which enable integer widening because we would prefer to use
2388 // vector widening instead.
2389 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2390 WholeAllocaOp = true;
2391 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2392 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2393 return false;
2394 } else if (RelBegin != 0 || RelEnd != Size ||
2395 !canConvertValue(DL, ValueTy, AllocaTy)) {
2396 // Non-integer stores need to be convertible to the alloca type so that
2397 // they are promotable.
2398 return false;
2399 }
2400 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2401 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2402 return false;
2403 if (!S.isSplittable())
2404 return false; // Skip any unsplittable intrinsics.
2405 } else {
2406 return false;
2407 }
2408
2409 return true;
2410}
2411
2412/// Test whether the given alloca partition's integer operations can be
2413/// widened to promotable ones.
2414///
2415/// This is a quick test to check whether we can rewrite the integer loads and
2416/// stores to a particular alloca into wider loads and stores and be able to
2417/// promote the resulting alloca.
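///
/// (Illustrative example: an 8-byte alloca with a covering i64 load plus i32
/// stores at offsets 0 and 4 is viable; each narrow access is rewritten as a
/// shift/mask read-modify-write of one i64, which can then be promoted. With
/// no covering integer load or store, widening is not attempted.)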
2418static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2419 const DataLayout &DL) {
2420 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2421 // Don't create integer types larger than the maximum bitwidth.
2422 if (SizeInBits > IntegerType::MAX_INT_BITS)
2423 return false;
2424
2425 // Don't try to handle allocas with bit-padding.
2426 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2427 return false;
2428
2429 // We need to ensure that an integer type with the appropriate bitwidth can
2430 // be converted to the alloca type, whatever that is. We don't want to force
2431 // the alloca itself to have an integer type if there is a more suitable one.
2432 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2433 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2434 !canConvertValue(DL, IntTy, AllocaTy))
2435 return false;
2436
2437 // While examining uses, we ensure that the alloca has a covering load or
2438 // store. We don't want to widen the integer operations only to fail to
2439 // promote due to some other unsplittable entry (which we may make splittable
2440 // later). However, if there are only splittable uses, go ahead and assume
2441 // that we cover the alloca.
2442 // FIXME: We shouldn't consider split slices that happen to start in the
2443 // partition here...
2444 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2445
2446 for (const Slice &S : P)
2447 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2448 WholeAllocaOp))
2449 return false;
2450
2451 for (const Slice *S : P.splitSliceTails())
2452 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2453 WholeAllocaOp))
2454 return false;
2455
2456 return WholeAllocaOp;
2457}
2458
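// A worked example of the extract/insert helpers below (illustrative,
// little-endian): extracting an i16 at byte offset 2 from an i64 %V emits
// %x.shift = lshr i64 %V, 16
// %x.trunc = trunc i64 %x.shift to i16
// and insertInteger mirrors this with zext/shl, masking the destination bits
// out of the old value before the final or.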
2459static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2460 IntegerType *Ty, uint64_t Offset,
2461 const Twine &Name) {
2462 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2463 IntegerType *IntTy = cast<IntegerType>(V->getType());
2464 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2465 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2466 "Element extends past full value");
2467 uint64_t ShAmt = 8 * Offset;
2468 if (DL.isBigEndian())
2469 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2470 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2471 if (ShAmt) {
2472 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2473 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2474 }
2475 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2476 "Cannot extract to a larger integer!");
2477 if (Ty != IntTy) {
2478 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2479 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2480 }
2481 return V;
2482}
2483
2484static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2485 Value *V, uint64_t Offset, const Twine &Name) {
2486 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2487 IntegerType *Ty = cast<IntegerType>(V->getType());
2488 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2489 "Cannot insert a larger integer!");
2490 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2491 if (Ty != IntTy) {
2492 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2493 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2494 }
2495 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2496 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2497 "Element store outside of alloca store");
2498 uint64_t ShAmt = 8 * Offset;
2499 if (DL.isBigEndian())
2500 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2501 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2502 if (ShAmt) {
2503 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2504 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2505 }
2506
2507 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2508 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2509 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2510 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2511 V = IRB.CreateOr(Old, V, Name + ".insert");
2512 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2513 }
2514 return V;
2515}
2516
2517static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2518 unsigned EndIndex, const Twine &Name) {
2519 auto *VecTy = cast<FixedVectorType>(V->getType());
2520 unsigned NumElements = EndIndex - BeginIndex;
2521 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2522
2523 if (NumElements == VecTy->getNumElements())
2524 return V;
2525
2526 if (NumElements == 1) {
2527 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2528 Name + ".extract");
2529 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2530 return V;
2531 }
2532
2533 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2534 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2535 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2536 return V;
2537}
2538
2539static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2540 unsigned BeginIndex, const Twine &Name) {
2541 VectorType *VecTy = cast<VectorType>(Old->getType());
2542 assert(VecTy && "Can only insert a vector into a vector");
2543
2544 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2545 if (!Ty) {
2546 // Single element to insert.
2547 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2548 Name + ".insert");
2549 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2550 return V;
2551 }
2552
2553 unsigned NumSubElements = cast<FixedVectorType>(Ty)->getNumElements();
2554 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
2555
2556 assert(NumSubElements <= NumElements && "Too many elements!");
2557 if (NumSubElements == NumElements) {
2558 assert(V->getType() == VecTy && "Vector type mismatch");
2559 return V;
2560 }
2561 unsigned EndIndex = BeginIndex + NumSubElements;
2562
2563 // When inserting a smaller vector into a larger one for storing, we first
2564 // use a shuffle vector to widen it with undef elements, and then
2565 // a second shuffle vector to select between the loaded vector and the
2566 // incoming vector.
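// (Illustrative example: inserting <2 x i32> at BeginIndex 1 into <4 x i32>
// uses the expand mask <-1, 0, 1, -1> and then the blend mask <4, 1, 2, 7>,
// where indices >= 4 select the corresponding lane of Old.)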
2567 SmallVector<int, 8> Mask;
2568 Mask.reserve(NumElements);
2569 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2570 if (Idx >= BeginIndex && Idx < EndIndex)
2571 Mask.push_back(Idx - BeginIndex);
2572 else
2573 Mask.push_back(-1);
2574 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2575 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2576
2577 Mask.clear();
2578 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2579 if (Idx >= BeginIndex && Idx < EndIndex)
2580 Mask.push_back(Idx);
2581 else
2582 Mask.push_back(Idx + NumElements);
2583 V = IRB.CreateShuffleVector(V, Old, Mask, Name + "blend");
2584 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2585 return V;
2586}
2587
2588/// This function takes two vector values and combines them into a single vector
2589/// by concatenating their elements. The function handles:
2590///
2591/// 1. Element type mismatch: If either vector's element type differs from
2592/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2593/// preserving the total bit width (adjusting the number of elements
2594/// accordingly).
2595///
2596/// 2. Size mismatch: After transforming the vectors to have the desired element
2597/// type, if the two vectors have different numbers of elements, the smaller
2598/// vector is extended with poison values to match the size of the larger
2599/// vector before concatenation.
2600///
2601/// 3. Concatenation: The vectors are merged using a shuffle operation that
2602/// places all elements of V0 first, followed by all elements of V1.
2603///
2604/// \param V0 The first vector to merge (must be a vector type)
2605/// \param V1 The second vector to merge (must be a vector type)
2606/// \param DL The data layout for size calculations
2607/// \param NewAIEltTy The desired element type for the result vector
2608/// \param Builder IRBuilder for creating new instructions
2609/// \return A new vector containing all elements from V0 followed by all
2610/// elements from V1
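///
/// (Illustrative example: merging <2 x float> V0 with <4 x float> V1 for an
/// f32 element type first widens V0 to <4 x float> with two poison lanes,
/// then concatenates with the mask <0, 1, 4, 5, 6, 7>, producing a
/// <6 x float> result.)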
2611 static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
2612 Type *NewAIEltTy, IRBuilder<> &Builder) {
2613 // V0 and V1 are vectors
2614 // Create a new vector type with combined elements
2615 // Use ShuffleVector to concatenate the vectors
2616 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2617 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2618
2619 // If V0/V1 element types are different from NewAIEltTy,
2620 // we need to introduce bitcasts before merging them.
2621 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2622 const char *DebugName) {
2623 Type *EltType = VecType->getElementType();
2624 if (EltType != NewAIEltTy) {
2625 // Calculate new number of elements to maintain same bit width
2626 unsigned TotalBits =
2627 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2628 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2629
2630 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2631 V = Builder.CreateBitCast(V, NewVecType);
2632 VecType = NewVecType;
2633 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2634 }
2635 };
2636
2637 BitcastIfNeeded(V0, VecType0, "V0");
2638 BitcastIfNeeded(V1, VecType1, "V1");
2639
2640 unsigned NumElts0 = VecType0->getNumElements();
2641 unsigned NumElts1 = VecType1->getNumElements();
2642
2643 SmallVector<int, 16> ShuffleMask;
2644
2645 if (NumElts0 == NumElts1) {
2646 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2647 ShuffleMask.push_back(i);
2648 } else {
2649 // If two vectors have different sizes, we need to extend
2650 // the smaller vector to the size of the larger vector.
2651 unsigned SmallSize = std::min(NumElts0, NumElts1);
2652 unsigned LargeSize = std::max(NumElts0, NumElts1);
2653 bool IsV0Smaller = NumElts0 < NumElts1;
2654 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2655 SmallVector<int, 16> ExtendMask;
2656 for (unsigned i = 0; i < SmallSize; ++i)
2657 ExtendMask.push_back(i);
2658 for (unsigned i = SmallSize; i < LargeSize; ++i)
2659 ExtendMask.push_back(PoisonMaskElem);
2660 ExtendedVec = Builder.CreateShuffleVector(
2661 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2662 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2663 for (unsigned i = 0; i < NumElts0; ++i)
2664 ShuffleMask.push_back(i);
2665 for (unsigned i = 0; i < NumElts1; ++i)
2666 ShuffleMask.push_back(LargeSize + i);
2667 }
2668
2669 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2670}
2671
2672namespace {
2673
2674 /// Visitor to rewrite instructions using a particular slice of an alloca
2675/// to use a new alloca.
2676///
2677/// Also implements the rewriting to vector-based accesses when the partition
2678/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2679/// lives here.
2680class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2681 // Befriend the base class so it can delegate to private visit methods.
2682 friend class InstVisitor<AllocaSliceRewriter, bool>;
2683
2684 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2685
2686 const DataLayout &DL;
2687 AllocaSlices &AS;
2688 SROA &Pass;
2689 AllocaInst &OldAI, &NewAI;
2690 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2691 Type *NewAllocaTy;
2692
2693 // This is a convenience and flag variable that will be null unless the new
2694 // alloca's integer operations should be widened to this integer type due to
2695 // passing isIntegerWideningViable above. If it is non-null, the desired
2696 // integer type will be stored here for easy access during rewriting.
2697 IntegerType *IntTy;
2698
2699 // If we are rewriting an alloca partition which can be written as pure
2700 // vector operations, we stash extra information here. When VecTy is
2701 // non-null, we have some strict guarantees about the rewritten alloca:
2702 // - The new alloca is exactly the size of the vector type here.
2703 // - The accesses all either map to the entire vector or to a single
2704 // element.
2705 // - The set of accessing instructions is only one of those handled above
2706 // in isVectorPromotionViable. Generally these are the same access kinds
2707 // which are promotable via mem2reg.
2708 VectorType *VecTy;
2709 Type *ElementTy;
2710 uint64_t ElementSize;
2711
2712 // The original offset of the slice currently being rewritten relative to
2713 // the original alloca.
2714 uint64_t BeginOffset = 0;
2715 uint64_t EndOffset = 0;
2716
2717 // The new offsets of the slice currently being rewritten relative to the
2718 // original alloca.
2719 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2720
2721 uint64_t SliceSize = 0;
2722 bool IsSplittable = false;
2723 bool IsSplit = false;
2724 Use *OldUse = nullptr;
2725 Instruction *OldPtr = nullptr;
2726
2727 // Track post-rewrite users which are PHI nodes and Selects.
2728 SmallSetVector<PHINode *, 8> &PHIUsers;
2729 SmallSetVector<SelectInst *, 8> &SelectUsers;
2730
2731 // Utility IR builder, whose name prefix is set up for each visited use, and
2732 // whose insertion point is set to the user.
2733 IRBuilderTy IRB;
2734
2735 // Return the new alloca, addrspacecasted if required to avoid changing the
2736 // addrspace of a volatile access.
2737 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2738 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2739 return &NewAI;
2740
2741 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2742 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2743 }
2744
2745public:
2746 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2747 AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
2748 uint64_t NewAllocaBeginOffset,
2749 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2750 VectorType *PromotableVecTy,
2751 SmallSetVector<PHINode *, 8> &PHIUsers,
2752 SmallSetVector<SelectInst *, 8> &SelectUsers)
2753 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2754 NewAllocaBeginOffset(NewAllocaBeginOffset),
2755 NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
2756 IntTy(IsIntegerPromotable
2757 ? Type::getIntNTy(
2758 NewAI.getContext(),
2759 DL.getTypeSizeInBits(NewAllocaTy).getFixedValue())
2760 : nullptr),
2761 VecTy(PromotableVecTy),
2762 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2763 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2764 : 0),
2765 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2766 IRB(NewAI.getContext(), ConstantFolder()) {
2767 if (VecTy) {
2768 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2769 "Only multiple-of-8 sized vector elements are viable");
2770 ++NumVectorized;
2771 }
2772 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2773 }
2774
2775 bool visit(AllocaSlices::const_iterator I) {
2776 bool CanSROA = true;
2777 BeginOffset = I->beginOffset();
2778 EndOffset = I->endOffset();
2779 IsSplittable = I->isSplittable();
2780 IsSplit =
2781 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2782 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2783 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2784 LLVM_DEBUG(dbgs() << "\n");
2785
2786 // Compute the intersecting offset range.
2787 assert(BeginOffset < NewAllocaEndOffset);
2788 assert(EndOffset > NewAllocaBeginOffset);
2789 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2790 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2791
2792 SliceSize = NewEndOffset - NewBeginOffset;
2793 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2794 << ") NewBegin:(" << NewBeginOffset << ", "
2795 << NewEndOffset << ") NewAllocaBegin:("
2796 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2797 << ")\n");
2798 assert(IsSplit || NewBeginOffset == BeginOffset);
2799 OldUse = I->getUse();
2800 OldPtr = cast<Instruction>(OldUse->get());
2801
2802 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2803 IRB.SetInsertPoint(OldUserI);
2804 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2805 // Avoid materializing the name prefix when it is discarded anyway.
2806 if (!IRB.getContext().shouldDiscardValueNames())
2807 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2808 Twine(BeginOffset) + ".");
2809
2810 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2811 if (VecTy || IntTy)
2812 assert(CanSROA);
2813 return CanSROA;
2814 }
2815
2816 /// Attempts to rewrite a partition using tree-structured merge optimization.
2817 ///
2818 /// This function analyzes a partition to determine if it can be optimized
2819 /// using a tree-structured merge pattern, where multiple non-overlapping
2820 /// stores completely fill an alloca and no load from the alloca occurs
2821 /// between those stores. Such patterns can be optimized by eliminating
2822 /// the intermediate stores and directly constructing the final vector by
2823 /// using shufflevectors.
2824 ///
2825 /// Example transformation:
2826 /// Before: (stores do not have to be in order)
2827 /// %alloca = alloca <8 x float>
2828 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2829 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2830 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2831 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2832 ///
2833 /// After:
2834 /// %alloca = alloca <8 x float>
2835 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2836 /// i32 3>
2837 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2838 /// i32 3>
2839 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2840 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2841 /// store %shuffle2, ptr %alloca
2842 ///
2843 /// The optimization looks for partitions that:
2844 /// 1. Have no overlapping split slice tails
2845 /// 2. Contain non-overlapping stores that cover the entire alloca
2846 /// 3. Have exactly one load that reads the complete alloca structure and
2847 /// does not occur in the middle of the stores (TODO: maybe we can relax
2848 /// the constraint about reading the entire alloca structure)
2849 ///
2850 /// \param P The partition to analyze and potentially rewrite
2851 /// \return An optional vector of values that were deleted during the rewrite
2852 /// process, or std::nullopt if the partition cannot be optimized
2853 /// using tree-structured merge
2854 std::optional<SmallVector<Value *, 4>>
2855 rewriteTreeStructuredMerge(Partition &P) {
2856 // No tail slices that overlap with the partition
2857 if (P.splitSliceTails().size() > 0)
2858 return std::nullopt;
2859
2860 SmallVector<Value *, 4> DeletedValues;
2861 LoadInst *TheLoad = nullptr;
2862
2863 // Structure to hold store information
2864 struct StoreInfo {
2865 StoreInst *Store;
2866 uint64_t BeginOffset;
2867 uint64_t EndOffset;
2868 Value *StoredValue;
2869 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2870 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2871 };
2872
2873 SmallVector<StoreInfo, 4> StoreInfos;
2874
2875 // If the new alloca is a fixed vector type, we use its element type as the
2876 // allocated element type; otherwise we use i8 as the allocated element type.
2877 Type *AllocatedEltTy =
2878 isa<FixedVectorType>(NewAllocaTy)
2879 ? cast<FixedVectorType>(NewAllocaTy)->getElementType()
2880 : Type::getInt8Ty(NewAI.getContext());
2881 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);
2882
2883 // Helper to check if a type is
2884 // 1. A fixed vector type
2885 // 2. The element type is not a pointer
2886 // 3. The element type size is byte-aligned
2887 // We only handle loads/stores whose types meet all of these conditions.
2888 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2889 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
2890 return FixedVecTy &&
2891 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
2892 !FixedVecTy->getElementType()->isPointerTy();
2893 };
2894
2895 for (Slice &S : P) {
2896 auto *User = cast<Instruction>(S.getUse()->getUser());
2897 if (auto *LI = dyn_cast<LoadInst>(User)) {
2898 // Do not handle the case if
2899 // 1. There is more than one load
2900 // 2. The load is volatile
2901 // 3. The load does not read the entire alloca structure
2902 // 4. The load does not meet the conditions in the helper function
2903 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
2904 S.beginOffset() != NewAllocaBeginOffset ||
2905 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
2906 return std::nullopt;
2907 TheLoad = LI;
2908 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
2909 // Do not handle the case if
2910 // 1. The store does not meet the conditions in the helper function
2911 // 2. The store is volatile
2912 // 3. The total store size is not a multiple of the allocated element
2913 // type size
2914 if (!IsTypeValidForTreeStructuredMerge(
2915 SI->getValueOperand()->getType()) ||
2916 SI->isVolatile())
2917 return std::nullopt;
2918 auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
2919 unsigned NumElts = VecTy->getNumElements();
2920 unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
2921 if (NumElts * EltSize % AllocatedEltTySize != 0)
2922 return std::nullopt;
2923 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
2924 SI->getValueOperand());
2925 } else {
2926 // If we have instructions other than load and store, we cannot do the
2927 // tree structured merge
2928 return std::nullopt;
2929 }
2930 }
2931 // If we do not have any load, we cannot do the tree structured merge
2932 if (!TheLoad)
2933 return std::nullopt;
2934
2935 // If we do not have multiple stores, we cannot do the tree structured merge
2936 if (StoreInfos.size() < 2)
2937 return std::nullopt;
2938
2939 // Stores should not overlap and should cover the whole alloca
2940 // Sort by begin offset
2941 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
2942 return A.BeginOffset < B.BeginOffset;
2943 });
2944
2945 // Check for overlaps and coverage
2946 uint64_t ExpectedStart = NewAllocaBeginOffset;
2947 for (auto &StoreInfo : StoreInfos) {
2948 uint64_t BeginOff = StoreInfo.BeginOffset;
2949 uint64_t EndOff = StoreInfo.EndOffset;
2950
2951 // Check for gap or overlap
2952 if (BeginOff != ExpectedStart)
2953 return std::nullopt;
2954
2955 ExpectedStart = EndOff;
2956 }
2957 // Check that stores cover the entire alloca
2958 if (ExpectedStart != NewAllocaEndOffset)
2959 return std::nullopt;
2960
2961 // Stores must all be in the same basic block, and the load must not be
2962 // in the middle of the stores.
2963 // Note:
2964 // If the load is in a different basic block from the stores, we can still
2965 // do the tree structured merge. This is because we do not have the
2966 // store->load forwarding here. The merged vector will be stored back to
2967 // NewAI and the new load will load from NewAI. The forwarding will be
2968 // handled later when we try to promote NewAI.
2969 BasicBlock *LoadBB = TheLoad->getParent();
2970 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
2971
2972 for (auto &StoreInfo : StoreInfos) {
2973 if (StoreInfo.Store->getParent() != StoreBB)
2974 return std::nullopt;
2975 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
2976 return std::nullopt;
2977 }
2978
2979 // If we reach here, the partition can be merged with a tree structured
2980 // merge
2981 LLVM_DEBUG({
2982 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
2983 << "\n Ordered stores:\n";
2984 for (auto [i, Info] : enumerate(StoreInfos))
2985 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
2986 << Info.EndOffset << ") \tStore: " << *Info.Store
2987 << "\tValue: " << *Info.StoredValue << "\n";
2988 });
2989
2990 // Instead of having these stores, we merge all the stored values into a
2991 // vector and store the merged value into the alloca
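// For example (illustrative IR; %v0..%v3 are the stored <2 x float> values
// of four stores covering an <8 x float> alloca), the queue is merged
// pairwise, halving each round:
//   %m01 = shufflevector <2 x float> %v0, <2 x float> %v1,
//                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %m23 = shufflevector <2 x float> %v2, <2 x float> %v3,
//                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
//   %m = shufflevector <4 x float> %m01, <4 x float> %m23,
//        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// and %m is then stored to the new alloca by the single store below.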
2992 std::queue<Value *> VecElements;
2993 // StoreInfos is sorted by offset, not by block order. Anchoring to
2994 // StoreInfos.back().Store (last by offset) can place shuffles before
2995 // operands that appear later in the block (invalid SSA). Insert before
2996 // TheLoad when it shares the store block (after all stores, before any
2997 // later IR in that block). Otherwise insert before the store block's
2998 // terminator so the merge runs after every store and any trailing
2999 // instructions in that block.
3000 IRBuilder<> Builder(LoadBB == StoreBB ? TheLoad : StoreBB->getTerminator());
3001 for (const auto &Info : StoreInfos) {
3002 DeletedValues.push_back(Info.Store);
3003 VecElements.push(Info.StoredValue);
3004 }
3005
3006 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3007 while (VecElements.size() > 1) {
3008 const auto NumElts = VecElements.size();
3009 for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
3010 Value *V0 = VecElements.front();
3011 VecElements.pop();
3012 Value *V1 = VecElements.front();
3013 VecElements.pop();
3014 Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
3015 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3016 VecElements.push(Merged);
3017 }
3018 if (NumElts % 2 == 1) {
3019 Value *V = VecElements.front();
3020 VecElements.pop();
3021 VecElements.push(V);
3022 }
3023 }
3024
3025 // Store the merged value into the alloca
3026 Value *MergedValue = VecElements.front();
3027 Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
3028
3029 IRBuilder<> LoadBuilder(TheLoad);
3030 TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
3031 TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
3032 TheLoad->getName() + ".sroa.new.load"));
3033 DeletedValues.push_back(TheLoad);
3034
3035 return DeletedValues;
3036 }
3037
3038private:
3039 // Make sure the other visit overloads are visible.
3040 using Base::visit;
3041
3042 // Every instruction which can end up as a user must have a rewrite rule.
3043 bool visitInstruction(Instruction &I) {
3044 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3045 llvm_unreachable("No rewrite rule for this instruction!");
3046 }
3047
3048 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3049 // Note that the offset computation can use BeginOffset or NewBeginOffset
3050 // interchangeably for unsplit slices.
3051 assert(IsSplit || BeginOffset == NewBeginOffset);
3052 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3053
3054 StringRef OldName = OldPtr->getName();
3055 // Skip through the last '.sroa.' component of the name.
3056 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3057 if (LastSROAPrefix != StringRef::npos) {
3058 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3059 // Look for an SROA slice index.
3060 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3061 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3062 // Strip the index and look for the offset.
3063 OldName = OldName.substr(IndexEnd + 1);
3064 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3065 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3066 // Strip the offset.
3067 OldName = OldName.substr(OffsetEnd + 1);
3068 }
3069 }
3070 // Strip any SROA suffixes as well.
3071 OldName = OldName.substr(0, OldName.find(".sroa_"));
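// For example, an old pointer named "a.sroa.3.16.copyload.sroa_cast"
// yields "copyload": the ".sroa." run, slice index "3", offset "16", and
// the trailing ".sroa_" suffix are all stripped.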
3072
3073 return getAdjustedPtr(IRB, DL, &NewAI,
3074 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3075 PointerTy, Twine(OldName) + ".");
3076 }
3077
3078 /// Compute suitable alignment to access this slice of the *new*
3079 /// alloca.
3080 ///
3081 /// This is the alloca's own alignment, reduced as necessary to account
3082 /// for this slice's offset within the new alloca.
3083 Align getSliceAlign() {
3084 return commonAlignment(NewAI.getAlign(),
3085 NewBeginOffset - NewAllocaBeginOffset);
3086 }
3087
3088 unsigned getIndex(uint64_t Offset) {
3089 assert(VecTy && "Can only call getIndex when rewriting a vector");
3090 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3091 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3092 uint32_t Index = RelOffset / ElementSize;
3093 assert(Index * ElementSize == RelOffset);
3094 return Index;
3095 }
3096
3097 void deleteIfTriviallyDead(Value *V) {
3098 Instruction *I = cast<Instruction>(V);
3099 if (isInstructionTriviallyDead(I))
3100 Pass.DeadInsts.push_back(I);
3101 }
3102
3103 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3104 unsigned BeginIndex = getIndex(NewBeginOffset);
3105 unsigned EndIndex = getIndex(NewEndOffset);
3106 assert(EndIndex > BeginIndex && "Empty vector!");
3107
3108 LoadInst *Load =
3109 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3110
3111 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3112 LLVMContext::MD_access_group});
3113 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3114 }
3115
3116 Value *rewriteIntegerLoad(LoadInst &LI) {
3117 assert(IntTy && "We cannot insert an integer to the alloca");
3118 assert(!LI.isVolatile());
3119 Value *V =
3120 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3121 V = IRB.CreateBitPreservingCastChain(DL, V, IntTy);
3122 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3123 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3124 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3125 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3126 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3127 }
3128 // It is possible that the extracted type is not the load type. This
3129 // happens if there is a load past the end of the alloca, and as
3130 // a consequence the slice is narrower but still a candidate for integer
3131 // lowering. To handle this case, we just zero extend the extracted
3132 // integer.
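// For example (illustrative), an i64 load over a 4-byte slice extracts
// an i32 from the alloca-wide integer and zero extends it back to i64.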
3133 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3134 "Can only handle an extract for an overly wide load");
3135 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3136 V = IRB.CreateZExt(V, LI.getType());
3137 return V;
3138 }
3139
3140 bool visitLoadInst(LoadInst &LI) {
3141 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3142 Value *OldOp = LI.getOperand(0);
3143 assert(OldOp == OldPtr);
3144
3145 AAMDNodes AATags = LI.getAAMetadata();
3146
3147 unsigned AS = LI.getPointerAddressSpace();
3148
3149 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3150 : LI.getType();
3151 bool IsPtrAdjusted = false;
3152 Value *V;
3153 if (VecTy) {
3154 V = rewriteVectorizedLoadInst(LI);
3155 } else if (IntTy && LI.getType()->isIntegerTy()) {
3156 V = rewriteIntegerLoad(LI);
3157 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3158 NewEndOffset == NewAllocaEndOffset &&
3159 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3160 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3161 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3162 !LI.isVolatile()))) {
3163 Value *NewPtr =
3164 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3165 LoadInst *NewLI = IRB.CreateAlignedLoad(
3166 NewAllocaTy, NewPtr, NewAI.getAlign(), LI.isVolatile(), LI.getName());
3167 if (LI.isVolatile())
3168 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3169 if (NewLI->isAtomic())
3170 NewLI->setAlignment(LI.getAlign());
3171
3172 // Copy any metadata that is valid for the new load. This may require
3173 // conversion to a different kind of metadata, e.g. !nonnull might change
3174 // to !range or vice versa.
3175 copyMetadataForLoad(*NewLI, LI);
3176
3177 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3178 if (AATags)
3179 NewLI->setAAMetadata(AATags.adjustForAccess(
3180 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3181
3182 // Try to preserve nonnull metadata
3183 V = NewLI;
3184
3185 // If this is an integer load past the end of the slice (which means the
3186 // bytes outside the slice are undef or this load is dead) just forcibly
3187 // fix the integer size with correct handling of endianness.
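// For example (illustrative), an i16 alloca feeding an i32 load is
// zero extended to i32; on big-endian targets it is also shifted left by
// 16 so the defined bytes land in the most significant half.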
3188 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3189 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3190 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3191 V = IRB.CreateZExt(V, TITy, "load.ext");
3192 if (DL.isBigEndian())
3193 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3194 "endian_shift");
3195 }
3196 } else {
3197 Type *LTy = IRB.getPtrTy(AS);
3198 LoadInst *NewLI =
3199 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3200 getSliceAlign(), LI.isVolatile(), LI.getName());
3201
3202 if (AATags)
3203 NewLI->setAAMetadata(AATags.adjustForAccess(
3204 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3205
3206 if (LI.isVolatile())
3207 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3208 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3209 LLVMContext::MD_access_group});
3210
3211 V = NewLI;
3212 IsPtrAdjusted = true;
3213 }
3214 V = IRB.CreateBitPreservingCastChain(DL, V, TargetTy);
3215
3216 if (IsSplit) {
3217 assert(!LI.isVolatile());
3218 assert(LI.getType()->isIntegerTy() &&
3219 "Only integer type loads and stores are split");
3220 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3221 "Split load isn't smaller than original load");
3222 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3223 "Non-byte-multiple bit width");
3224 // Move the insertion point just past the load so that we can refer to it.
3225 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3226 // Ensure the insertion point comes before any debug-info immediately
3227 // after the load, so that variable values referring to the load are
3228 // dominated by it.
3229 LIIt.setHeadBit(true);
3230 IRB.SetInsertPoint(LI.getParent(), LIIt);
3231 // Create a placeholder value with the same type as LI to use as the
3232 // basis for the new value. This allows us to replace the uses of LI with
3233 // the computed value, and then replace the placeholder with LI, leaving
3234 // LI only used for this computation.
3235 Value *Placeholder =
3236 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3237 false, Align(1));
3238 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3239 "insert");
3240 LI.replaceAllUsesWith(V);
3241 Placeholder->replaceAllUsesWith(&LI);
3242 Placeholder->deleteValue();
3243 } else {
3244 LI.replaceAllUsesWith(V);
3245 }
3246
3247 Pass.DeadInsts.push_back(&LI);
3248 deleteIfTriviallyDead(OldOp);
3249 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3250 return !LI.isVolatile() && !IsPtrAdjusted;
3251 }
3252
3253 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3254 AAMDNodes AATags) {
3255 // Capture V for the purpose of debug-info accounting once it's converted
3256 // to a vector store.
3257 Value *OrigV = V;
3258 if (V->getType() != VecTy) {
3259 unsigned BeginIndex = getIndex(NewBeginOffset);
3260 unsigned EndIndex = getIndex(NewEndOffset);
3261 assert(EndIndex > BeginIndex && "Empty vector!");
3262 unsigned NumElements = EndIndex - BeginIndex;
3263 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3264 "Too many elements!");
3265 Type *SliceTy = (NumElements == 1)
3266 ? ElementTy
3267 : FixedVectorType::get(ElementTy, NumElements);
3268 if (V->getType() != SliceTy)
3269 V = IRB.CreateBitPreservingCastChain(DL, V, SliceTy);
3270
3271 // Mix in the existing elements.
3272 Value *Old =
3273 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3274 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3275 }
3276 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3277 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3278 LLVMContext::MD_access_group});
3279 if (AATags)
3280 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3281 V->getType(), DL));
3282 Pass.DeadInsts.push_back(&SI);
3283
3284 // NOTE: Careful to use OrigV rather than V.
3285 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3286 Store, Store->getPointerOperand(), OrigV, DL);
3287 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3288 return true;
3289 }
3290
3291 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3292 assert(IntTy && "We cannot extract an integer from the alloca");
3293 assert(!SI.isVolatile());
3294 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3295 IntTy->getBitWidth()) {
3296 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3297 "oldload");
3298 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3299 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3300 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3301 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3302 }
3303 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3304 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3305 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3306 LLVMContext::MD_access_group});
3307 if (AATags)
3308 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3309 V->getType(), DL));
3310
3311 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3312 Store, Store->getPointerOperand(),
3313 Store->getValueOperand(), DL);
3314
3315 Pass.DeadInsts.push_back(&SI);
3316 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3317 return true;
3318 }
3319
3320 bool visitStoreInst(StoreInst &SI) {
3321 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3322 Value *OldOp = SI.getOperand(1);
3323 assert(OldOp == OldPtr);
3324
3325 AAMDNodes AATags = SI.getAAMetadata();
3326 Value *V = SI.getValueOperand();
3327
3328 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3329 // alloca that should be re-examined after promoting this alloca.
3330 if (V->getType()->isPointerTy())
3331 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3332 Pass.PostPromotionWorklist.insert(AI);
3333
3334 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3335 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3336 assert(!SI.isVolatile());
3337 assert(V->getType()->isIntegerTy() &&
3338 "Only integer type loads and stores are split");
3339 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3340 "Non-byte-multiple bit width");
3341 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3342 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3343 "extract");
3344 }
3345
3346 if (VecTy)
3347 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3348 if (IntTy && V->getType()->isIntegerTy())
3349 return rewriteIntegerStore(V, SI, AATags);
3350
3351 StoreInst *NewSI;
3352 if (NewBeginOffset == NewAllocaBeginOffset &&
3353 NewEndOffset == NewAllocaEndOffset &&
3354 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3355 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3356 Value *NewPtr =
3357 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3358
3359 NewSI =
3360 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3361 } else {
3362 unsigned AS = SI.getPointerAddressSpace();
3363 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3364 NewSI =
3365 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3366 }
3367 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3368 LLVMContext::MD_access_group});
3369 if (AATags)
3370 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3371 V->getType(), DL));
3372 if (SI.isVolatile())
3373 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3374 if (NewSI->isAtomic())
3375 NewSI->setAlignment(SI.getAlign());
3376
3377 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3378 NewSI, NewSI->getPointerOperand(),
3379 NewSI->getValueOperand(), DL);
3380
3381 Pass.DeadInsts.push_back(&SI);
3382 deleteIfTriviallyDead(OldOp);
3383
3384 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3385 return NewSI->getPointerOperand() == &NewAI &&
3386 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3387 !SI.isVolatile();
3388 }
3389
3390 /// Compute an integer value from splatting an i8 across the given
3391 /// number of bytes.
3392 ///
3393 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3394 /// call this routine.
3395 /// FIXME: Heed the advice above.
3396 ///
3397 /// \param V The i8 value to splat.
3398 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3399 Value *getIntegerSplat(Value *V, unsigned Size) {
3400 assert(Size > 0 && "Expected a positive number of bytes.");
3401 IntegerType *VTy = cast<IntegerType>(V->getType());
3402 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3403 if (Size == 1)
3404 return V;
3405
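// The multiplier below is (2^(8*Size) - 1) / (2^8 - 1) = 0x0101...01, so
// e.g. for Size == 4 the zext'd byte is multiplied by 0x01010101,
// replicating it into every byte of the result.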
3406 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3407 V = IRB.CreateMul(
3408 IRB.CreateZExt(V, SplatIntTy, "zext"),
3409 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3410 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3411 SplatIntTy)),
3412 "isplat");
3413 return V;
3414 }
3415
3416 /// Compute a vector splat for a given element value.
3417 Value *getVectorSplat(Value *V, unsigned NumElements) {
3418 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3419 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3420 return V;
3421 }
3422
3423 bool visitMemSetInst(MemSetInst &II) {
3424 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3425 assert(II.getRawDest() == OldPtr);
3426
3427 AAMDNodes AATags = II.getAAMetadata();
3428
3429 // If the memset has a variable size, it cannot be split, just adjust the
3430 // pointer to the new alloca.
3431 if (!isa<ConstantInt>(II.getLength())) {
3432 assert(!IsSplit);
3433 assert(NewBeginOffset == BeginOffset);
3434 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3435 II.setDestAlignment(getSliceAlign());
3436 // In theory we should call migrateDebugInfo here. However, we do not
3437 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3438 // constant geps, or storing a variable number of bytes.
3440 "AT: Unexpected link to non-const GEP");
3441 deleteIfTriviallyDead(OldPtr);
3442 return false;
3443 }
3444
3445 // Record this instruction for deletion.
3446 Pass.DeadInsts.push_back(&II);
3447
3448 Type *ScalarTy = NewAllocaTy->getScalarType();
3449
3450 const bool CanContinue = [&]() {
3451 if (VecTy || IntTy)
3452 return true;
3453 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3454 return false;
3455 // Length must be in range for FixedVectorType.
3456 auto *C = cast<ConstantInt>(II.getLength());
3457 const uint64_t Len = C->getLimitedValue();
3458 if (Len > std::numeric_limits<unsigned>::max())
3459 return false;
3460 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3461 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3462 return canConvertValue(DL, SrcTy, NewAllocaTy) &&
3463 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3464 }();
3465
3466 // If this doesn't map cleanly onto the alloca type, and that type isn't
3467 // a single value type, just emit a memset.
3468 if (!CanContinue) {
3469 Type *SizeTy = II.getLength()->getType();
3470 unsigned Sz = NewEndOffset - NewBeginOffset;
3471 Constant *Size = ConstantInt::get(SizeTy, Sz);
3472 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3473 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3474 MaybeAlign(getSliceAlign()), II.isVolatile()));
3475 if (AATags)
3476 New->setAAMetadata(
3477 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3478
3479 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3480 New, New->getRawDest(), nullptr, DL);
3481
3482 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3483 return false;
3484 }
3485
3486 // If we can represent this as a simple value, we have to build the actual
3487 // value to store, which requires expanding the byte present in memset to
3488 // a sensible representation for the alloca type. This is essentially
3489 // splatting the byte to a sufficiently wide integer, splatting it across
3490 // any desired vector width, and bitcasting to the final type.
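// For example (illustrative), a memset of byte 0x41 over a <4 x i32>
// alloca becomes a store of the <4 x i32> splat of i32 0x41414141.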
3491 Value *V;
3492
3493 if (VecTy) {
3494 // If this is a memset of a vectorized alloca, insert it.
3495 assert(ElementTy == ScalarTy);
3496
3497 unsigned BeginIndex = getIndex(NewBeginOffset);
3498 unsigned EndIndex = getIndex(NewEndOffset);
3499 assert(EndIndex > BeginIndex && "Empty vector!");
3500 unsigned NumElements = EndIndex - BeginIndex;
3501 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3502 "Too many elements!");
3503
3504 Value *Splat = getIntegerSplat(
3505 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3506 Splat = IRB.CreateBitPreservingCastChain(DL, Splat, ElementTy);
3507 if (NumElements > 1)
3508 Splat = getVectorSplat(Splat, NumElements);
3509
3510 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3511 "oldload");
3512 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3513 } else if (IntTy) {
3514 // If this is a memset on an alloca where we can widen stores, insert the
3515 // set integer.
3516 assert(!II.isVolatile());
3517
3518 uint64_t Size = NewEndOffset - NewBeginOffset;
3519 V = getIntegerSplat(II.getValue(), Size);
3520
3521 if (IntTy && (NewBeginOffset != NewAllocaBeginOffset ||
3522 NewEndOffset != NewAllocaEndOffset)) {
3523 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI,
3524 NewAI.getAlign(), "oldload");
3525 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3526 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3527 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3528 } else {
3529 assert(V->getType() == IntTy &&
3530 "Wrong type for an alloca wide integer!");
3531 }
3532 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3533 } else {
3534 // Established these invariants above.
3535 assert(NewBeginOffset == NewAllocaBeginOffset);
3536 assert(NewEndOffset == NewAllocaEndOffset);
3537
3538 V = getIntegerSplat(II.getValue(),
3539 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3540 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(NewAllocaTy))
3541 V = getVectorSplat(
3542 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3543
3544 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3545 }
3546
3547 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3548 StoreInst *New =
3549 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3550 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3551 LLVMContext::MD_access_group});
3552 if (AATags)
3553 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3554 V->getType(), DL));
3555
3556 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3557 New, New->getPointerOperand(), V, DL);
3558
3559 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3560 return !II.isVolatile();
3561 }
3562
3563 bool visitMemTransferInst(MemTransferInst &II) {
3564 // Rewriting of memory transfer instructions can be a bit tricky. We break
3565 // them into two categories: split intrinsics and unsplit intrinsics.
3566
3567 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3568
3569 AAMDNodes AATags = II.getAAMetadata();
3570
3571 bool IsDest = &II.getRawDestUse() == OldUse;
3572 assert((IsDest && II.getRawDest() == OldPtr) ||
3573 (!IsDest && II.getRawSource() == OldPtr));
3574
3575 Align SliceAlign = getSliceAlign();
3576 // For unsplit intrinsics, we simply modify the source and destination
3577 // pointers in place. This isn't just an optimization, it is a matter of
3578 // correctness. With unsplit intrinsics we may be dealing with transfers
3579 // within a single alloca before SROA ran, or with transfers that have
3580 // a variable length. We may also be dealing with memmove instead of
3581 // memcpy, and so simply updating the pointers is necessary for us to
3582 // update both source and dest of a single call.
3583 if (!IsSplittable) {
3584 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3585 if (IsDest) {
3586 // Update the address component of linked dbg.assigns.
3587 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3588 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3589 DbgAssign->getAddress() == II.getDest())
3590 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3591 }
3592 II.setDest(AdjustedPtr);
3593 II.setDestAlignment(SliceAlign);
3594 } else {
3595 II.setSource(AdjustedPtr);
3596 II.setSourceAlignment(SliceAlign);
3597 }
3598
3599 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3600 deleteIfTriviallyDead(OldPtr);
3601 return false;
3602 }
3603 // For split transfer intrinsics we have an incredibly useful assurance:
3604 // the source and destination do not reside within the same alloca, and at
3605 // least one of them does not escape. This means that we can replace
3606 // memmove with memcpy, and we don't need to worry about all manner of
3607 // downsides to splitting and transforming the operations.
3608
3609 // If this doesn't map cleanly onto the alloca type, and that type isn't
3610 // a single value type, just emit a memcpy.
3611 bool EmitMemCpy =
3612 !VecTy && !IntTy &&
3613 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3614 SliceSize != DL.getTypeStoreSize(NewAllocaTy).getFixedValue() ||
3615 !DL.typeSizeEqualsStoreSize(NewAllocaTy) ||
3616 !NewAllocaTy->isSingleValueType());
3617
3618 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3619 // size hasn't been shrunk based on analysis of the viable range, this is
3620 // a no-op.
3621 if (EmitMemCpy && &OldAI == &NewAI) {
3622 // Ensure the start lines up.
3623 assert(NewBeginOffset == BeginOffset);
3624
3625 // Rewrite the size as needed.
3626 if (NewEndOffset != EndOffset)
3627 II.setLength(NewEndOffset - NewBeginOffset);
3628 return false;
3629 }
3630 // Record this instruction for deletion.
3631 Pass.DeadInsts.push_back(&II);
3632
3633 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3634 // alloca that should be re-examined after rewriting this instruction.
3635 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3636 if (AllocaInst *AI =
3637 dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3638 assert(AI != &OldAI && AI != &NewAI &&
3639 "Splittable transfers cannot reach the same alloca on both ends.");
3640 Pass.Worklist.insert(AI);
3641 }
3642
3643 Type *OtherPtrTy = OtherPtr->getType();
3644 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3645
3646 // Compute the relative offset for the other pointer within the transfer.
3647 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3648 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3649 Align OtherAlign =
3650 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3651 OtherAlign =
3652 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3653
3654 if (EmitMemCpy) {
3655 // Compute the other pointer, folding as much as possible to produce
3656 // a single, simple GEP in most cases.
3657 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3658 OtherPtr->getName() + ".");
3659
3660 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3661 Type *SizeTy = II.getLength()->getType();
3662 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3663
3664 Value *DestPtr, *SrcPtr;
3665 MaybeAlign DestAlign, SrcAlign;
3666 // Note: IsDest is true iff we're copying into the new alloca slice
3667 if (IsDest) {
3668 DestPtr = OurPtr;
3669 DestAlign = SliceAlign;
3670 SrcPtr = OtherPtr;
3671 SrcAlign = OtherAlign;
3672 } else {
3673 DestPtr = OtherPtr;
3674 DestAlign = OtherAlign;
3675 SrcPtr = OurPtr;
3676 SrcAlign = SliceAlign;
3677 }
3678 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3679 Size, II.isVolatile());
3680 if (AATags)
3681 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3682
3683 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3684 if (IsDest) {
3685 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3686 &II, New, DestPtr, nullptr, DL);
3687 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3688 OtherPtr->stripAndAccumulateConstantOffsets(
3689 DL, Offset, /*AllowNonInbounds*/ true))) {
3690 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3691 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3692 }
3693 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3694 return false;
3695 }
3696
3697 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3698 NewEndOffset == NewAllocaEndOffset;
3699 uint64_t Size = NewEndOffset - NewBeginOffset;
3700 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3701 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3702 unsigned NumElements = EndIndex - BeginIndex;
3703 IntegerType *SubIntTy =
3704 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3705
3706 // Reset the other pointer type to match the register type we're going to
3707 // use, but using the address space of the original other pointer.
3708 Type *OtherTy;
3709 if (VecTy && !IsWholeAlloca) {
3710 if (NumElements == 1)
3711 OtherTy = VecTy->getElementType();
3712 else
3713 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3714 } else if (IntTy && !IsWholeAlloca) {
3715 OtherTy = SubIntTy;
3716 } else {
3717 OtherTy = NewAllocaTy;
3718 }
3719
3720 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3721 OtherPtr->getName() + ".");
3722 MaybeAlign SrcAlign = OtherAlign;
3723 MaybeAlign DstAlign = SliceAlign;
3724 if (!IsDest)
3725 std::swap(SrcAlign, DstAlign);
3726
3727 Value *SrcPtr;
3728 Value *DstPtr;
3729
3730 if (IsDest) {
3731 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3732 SrcPtr = AdjPtr;
3733 } else {
3734 DstPtr = AdjPtr;
3735 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3736 }
3737
3738 Value *Src;
3739 if (VecTy && !IsWholeAlloca && !IsDest) {
3740 Src =
3741 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3742 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3743 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3744 Src =
3745 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3746 Src = IRB.CreateBitPreservingCastChain(DL, Src, IntTy);
3747 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3748 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3749 } else {
3750 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3751 II.isVolatile(), "copyload");
3752 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3753 LLVMContext::MD_access_group});
3754 if (AATags)
3755 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3756 Load->getType(), DL));
3757 Src = Load;
3758 }
3759
3760 if (VecTy && !IsWholeAlloca && IsDest) {
3761 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3762 "oldload");
3763 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3764 } else if (IntTy && !IsWholeAlloca && IsDest) {
3765 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3766 "oldload");
3767 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3768 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3769 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3770 Src = IRB.CreateBitPreservingCastChain(DL, Src, NewAllocaTy);
3771 }
3772
3773 StoreInst *Store = cast<StoreInst>(
3774 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3775 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3776 LLVMContext::MD_access_group});
3777 if (AATags)
3778 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3779 Src->getType(), DL));
3780
3781 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3782 if (IsDest) {
3783
3784 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3785 Store, DstPtr, Src, DL);
3786 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3787 OtherPtr->stripAndAccumulateConstantOffsets(
3788 DL, Offset, /*AllowNonInbounds*/ true))) {
3789 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3790 &II, Store, DstPtr, Src, DL);
3791 }
3792
3793 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3794 return !II.isVolatile();
3795 }
3796
3797 bool visitIntrinsicInst(IntrinsicInst &II) {
3798 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3799 "Unexpected intrinsic!");
3800 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3801
3802 // Record this instruction for deletion.
3803 Pass.DeadInsts.push_back(&II);
3804
3805 if (II.isDroppable()) {
3806 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3807 // TODO For now we forget assumed information, this can be improved.
3808 OldPtr->dropDroppableUsesIn(II);
3809 return true;
3810 }
3811
3812 assert(II.getArgOperand(0) == OldPtr);
3813 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3814 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3815 Value *New;
3816 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3817 New = IRB.CreateLifetimeStart(Ptr);
3818 else
3819 New = IRB.CreateLifetimeEnd(Ptr);
3820
3821 (void)New;
3822 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3823
3824 return true;
3825 }
3826
3827 void fixLoadStoreAlign(Instruction &Root) {
3828 // This algorithm implements the same visitor loop as
3829 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3830 // or store found.
3831 SmallPtrSet<Instruction *, 4> Visited;
3832 SmallVector<Instruction *, 4> Uses;
3833 Visited.insert(&Root);
3834 Uses.push_back(&Root);
3835 do {
3836 Instruction *I = Uses.pop_back_val();
3837
3838 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3839 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3840 continue;
3841 }
3842 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3843 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3844 continue;
3845 }
3846
3847 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3848 isa<PHINode>(I) || isa<SelectInst>(I) ||
3849 isa<GetElementPtrInst>(I));
3850 for (User *U : I->users())
3851 if (Visited.insert(cast<Instruction>(U)).second)
3852 Uses.push_back(cast<Instruction>(U));
3853 } while (!Uses.empty());
3854 }
3855
3856 bool visitPHINode(PHINode &PN) {
3857 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3858 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3859 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3860
3861 // We would like to compute a new pointer in only one place, but have it be
3862 // as local as possible to the PHI. To do that, we re-use the location of
3863 // the old pointer, which necessarily must be in the right position to
3864 // dominate the PHI.
3865 IRBuilderBase::InsertPointGuard Guard(IRB);
3866 if (isa<PHINode>(OldPtr))
3867 IRB.SetInsertPoint(OldPtr->getParent(),
3868 OldPtr->getParent()->getFirstInsertionPt());
3869 else
3870 IRB.SetInsertPoint(OldPtr);
3871 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3872
3873 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3874 // Replace the operands which were using the old pointer.
3875 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3876
3877 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3878 deleteIfTriviallyDead(OldPtr);
3879
3880 // Fix the alignment of any loads or stores using this PHI node.
3881 fixLoadStoreAlign(PN);
3882
3883 // PHIs can't be promoted on their own, but often can be speculated. We
3884 // check the speculation outside of the rewriter so that we see the
3885 // fully-rewritten alloca.
3886 PHIUsers.insert(&PN);
3887 return true;
3888 }
3889
3890 bool visitSelectInst(SelectInst &SI) {
3891 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3892 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3893 "Pointer isn't an operand!");
3894 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3895 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3896
3897 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3898 // Replace the operands which were using the old pointer.
3899 if (SI.getOperand(1) == OldPtr)
3900 SI.setOperand(1, NewPtr);
3901 if (SI.getOperand(2) == OldPtr)
3902 SI.setOperand(2, NewPtr);
3903
3904 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3905 deleteIfTriviallyDead(OldPtr);
3906
3907 // Fix the alignment of any loads or stores using this select.
3908 fixLoadStoreAlign(SI);
3909
3910 // Selects can't be promoted on their own, but often can be speculated. We
3911 // check the speculation outside of the rewriter so that we see the
3912 // fully-rewritten alloca.
3913 SelectUsers.insert(&SI);
3914 return true;
3915 }
3916};
3917
3918/// Visitor to rewrite aggregate loads and stores as scalar.
3919///
3920/// This pass aggressively rewrites all aggregate loads and stores on
3921/// a particular pointer (or any pointer derived from it which we can identify)
3922/// with scalar loads and stores.
3923class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3924 // Befriend the base class so it can delegate to private visit methods.
3925 friend class InstVisitor<AggLoadStoreRewriter, bool>;
3926
3927 /// Queue of pointer uses to analyze and potentially rewrite.
3928 SmallVector<Use *, 8> Queue;
3929 
3930 /// Set to prevent us from cycling with phi nodes and loops.
3931 SmallPtrSet<User *, 8> Visited;
3932
3933 /// The current pointer use being rewritten. This is used to dig up the used
3934 /// value (as opposed to the user).
3935 Use *U = nullptr;
3936
3937 /// Used to calculate offsets, and hence alignment, of subobjects.
3938 const DataLayout &DL;
3939
3940 IRBuilderTy &IRB;
3941
3942public:
3943 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3944 : DL(DL), IRB(IRB) {}
3945
3946 /// Rewrite loads and stores through a pointer and all pointers derived from
3947 /// it.
3948 bool rewrite(Instruction &I) {
3949 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3950 enqueueUsers(I);
3951 bool Changed = false;
3952 while (!Queue.empty()) {
3953 U = Queue.pop_back_val();
3954 Changed |= visit(cast<Instruction>(U->getUser()));
3955 }
3956 return Changed;
3957 }
3958
3959private:
3960 /// Enqueue all the users of the given instruction for further processing.
3961 /// This uses a set to de-duplicate users.
3962 void enqueueUsers(Instruction &I) {
3963 for (Use &U : I.uses())
3964 if (Visited.insert(U.getUser()).second)
3965 Queue.push_back(&U);
3966 }
3967
3968 // Conservative default is to not rewrite anything.
3969 bool visitInstruction(Instruction &I) { return false; }
3970
3971 /// Generic recursive split emission class.
3972 template <typename Derived> class OpSplitter {
3973 protected:
3974 /// The builder used to form new instructions.
3975 IRBuilderTy &IRB;
3976
3977 /// The indices to be used with insert- or extractvalue to select the
3978 /// appropriate value within the aggregate.
3979 SmallVector<unsigned, 4> Indices;
3980
3981 /// The indices to a GEP instruction which will move Ptr to the correct slot
3982 /// within the aggregate.
3983 SmallVector<Value *, 4> GEPIndices;
3984
3985 /// The base pointer of the original op, used as a base for GEPing the
3986 /// split operations.
3987 Value *Ptr;
3988
3989 /// The base pointee type being GEPed into.
3990 Type *BaseTy;
3991
3992 /// Known alignment of the base pointer.
3993 Align BaseAlign;
3994
3995 /// To calculate offset of each component so we can correctly deduce
3996 /// alignments.
3997 const DataLayout &DL;
3998
3999 /// Initialize the splitter with an insertion point, Ptr and start with a
4000 /// single zero GEP index.
4001 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4002 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4003 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4004 BaseAlign(BaseAlign), DL(DL) {
4005 IRB.SetInsertPoint(InsertionPoint);
4006 }
4007
4008 public:
4009 /// Generic recursive split emission routine.
4010 ///
4011 /// This method recursively splits an aggregate op (load or store) into
4012 /// scalar or vector ops. It splits recursively until it hits a single value
4013 /// and emits that single value operation via the template argument.
4014 ///
4015 /// The logic of this routine relies on GEPs and insertvalue and
4016 /// extractvalue all operating with the same fundamental index list, merely
4017 /// formatted differently (GEPs need actual values).
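///
/// For example (illustrative), splitting an op on { i32, [2 x float] }
/// emits one leaf op for index (0) and one each for (1, 0) and (1, 1),
/// i.e. one scalar op per non-aggregate field.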
4018 ///
4019 /// \param Ty The type being split recursively into smaller ops.
4020 /// \param Agg The aggregate value being built up or stored, depending on
4021 /// whether this is splitting a load or a store respectively.
4022 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4023 if (Ty->isSingleValueType()) {
4024 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4025 return static_cast<Derived *>(this)->emitFunc(
4026 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4027 }
4028
4029 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4030 unsigned OldSize = Indices.size();
4031 (void)OldSize;
4032 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4033 ++Idx) {
4034 assert(Indices.size() == OldSize && "Did not return to the old size");
4035 Indices.push_back(Idx);
4036 GEPIndices.push_back(IRB.getInt32(Idx));
4037 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4038 GEPIndices.pop_back();
4039 Indices.pop_back();
4040 }
4041 return;
4042 }
4043
4044 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4045 unsigned OldSize = Indices.size();
4046 (void)OldSize;
4047 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4048 ++Idx) {
4049 assert(Indices.size() == OldSize && "Did not return to the old size");
4050 Indices.push_back(Idx);
4051 GEPIndices.push_back(IRB.getInt32(Idx));
4052 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4053 GEPIndices.pop_back();
4054 Indices.pop_back();
4055 }
4056 return;
4057 }
4058
4059 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4060 }
4061 };
4062
4063 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4064 AAMDNodes AATags;
4065 // A vector to hold the split components that we want to emit
4066 // separate fake uses for.
4067 SmallVector<Value *, 4> Components;
4068 // A vector to hold all the fake uses of the struct that we are splitting.
4069 // Usually there should only be one, but we are handling the general case.
4070 SmallVector<Instruction *, 2> FakeUses;
4071 
4072 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4073 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4074 IRBuilderTy &IRB)
4075 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4076 IRB),
4077 AATags(AATags) {}
4078
4079 /// Emit a leaf load of a single value. This is called at the leaves of the
4080 /// recursive emission to actually load values.
4081 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4082 assert(Ty->isSingleValueType());
4083 // Load the single value and insert it using the indices.
4084 Value *GEP =
4085 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4086 LoadInst *Load =
4087 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4088
4089 APInt Offset(
4090 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4091 if (AATags &&
4092 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4093 Load->setAAMetadata(
4094 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4095 // Record the load so we can generate a fake use for this aggregate
4096 // component.
4097 Components.push_back(Load);
4098
4099 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4100 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4101 }
4102
4103 // Stash the fake uses that use the value generated by this instruction.
4104 void recordFakeUses(LoadInst &LI) {
4105 for (Use &U : LI.uses())
4106 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4107 if (II->getIntrinsicID() == Intrinsic::fake_use)
4108 FakeUses.push_back(II);
4109 }
4110
4111 // Replace all fake uses of the aggregate with a series of fake uses, one
4112 // for each split component.
4113 void emitFakeUses() {
4114 for (Instruction *I : FakeUses) {
4115 IRB.SetInsertPoint(I);
4116 for (auto *V : Components)
4117 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4118 I->eraseFromParent();
4119 }
4120 }
4121 };
4122
4123 bool visitLoadInst(LoadInst &LI) {
4124 assert(LI.getPointerOperand() == *U);
4125 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4126 return false;
4127
4128 // We have an aggregate being loaded, split it apart.
4129 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4130 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4131 getAdjustedAlignment(&LI, 0), DL, IRB);
4132 Splitter.recordFakeUses(LI);
4133 Value *V = PoisonValue::get(LI.getType());
4134 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4135 Splitter.emitFakeUses();
4136 Visited.erase(&LI);
4137 LI.replaceAllUsesWith(V);
4138 LI.eraseFromParent();
4139 return true;
4140 }
4141
4142 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4143 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4144 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4145 const DataLayout &DL, IRBuilderTy &IRB)
4146 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4147 DL, IRB),
4148 AATags(AATags), AggStore(AggStore) {}
4149 AAMDNodes AATags;
4150 StoreInst *AggStore;
4151 /// Emit a leaf store of a single value. This is called at the leaves of the
4152 /// recursive emission to actually produce stores.
4153 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4154 assert(Ty->isSingleValueType());
4155 // Extract the single value and store it using the indices.
4156 //
4157 // The gep and extractvalue values are factored out of the CreateStore
4158 // call to make the output independent of the argument evaluation order.
4159 Value *ExtractValue =
4160 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4161 Value *InBoundsGEP =
4162 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4163 StoreInst *Store =
4164 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4165
4166 APInt Offset(
4167 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4168 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4169 if (AATags) {
4170 Store->setAAMetadata(AATags.adjustForAccess(
4171 Offset.getZExtValue(), ExtractValue->getType(), DL));
4172 }
4173
4174 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4175 // If we cannot (because there's an intervening non-const or unbounded
4176 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4177 // this instruction.
4178 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4179 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4180 uint64_t SizeInBits =
4181 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4182 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4183 SizeInBits, AggStore, Store,
4184 Store->getPointerOperand(), Store->getValueOperand(),
4185 DL);
4186 } else {
4188 "AT: unexpected debug.assign linked to store through "
4189 "unbounded GEP");
4190 }
4191 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4192 }
4193 };
4194
4195 bool visitStoreInst(StoreInst &SI) {
4196 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4197 return false;
4198 Value *V = SI.getValueOperand();
4199 if (V->getType()->isSingleValueType())
4200 return false;
4201
4202 // We have an aggregate being stored, split it apart.
4203 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4204 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4205 getAdjustedAlignment(&SI, 0), DL, IRB);
4206 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4207 Visited.erase(&SI);
4208 // The stores replacing SI each have markers describing fragments of the
4209 // assignment, so delete the assignment markers linked to SI.
4210 at::deleteAssignmentMarkers(&SI);
4211 SI.eraseFromParent();
4212 return true;
4213 }
4214
4215 bool visitBitCastInst(BitCastInst &BC) {
4216 enqueueUsers(BC);
4217 return false;
4218 }
4219
4220 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4221 enqueueUsers(ASC);
4222 return false;
4223 }
4224
4225 // Unfold gep (select cond, ptr1, ptr2), idx
4226 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4227 // and gep ptr, (select cond, idx1, idx2)
4228 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4229 // We also allow for i1 zext indices, which are equivalent to selects.
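// For example (illustrative IR):
//   %sel = select i1 %c, i64 1, i64 2
//   %gep = getelementptr inbounds [4 x i32], ptr %p, i64 0, i64 %sel
// becomes
//   %gep.t = getelementptr inbounds [4 x i32], ptr %p, i64 0, i64 1
//   %gep.f = getelementptr inbounds [4 x i32], ptr %p, i64 0, i64 2
//   %sel.new = select i1 %c, ptr %gep.t, ptr %gep.f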
4230 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4231 // Check whether the GEP has exactly one select operand and all indices
4232 // will become constant after the transform.
4233 Instruction *Sel = nullptr;
4234 for (Value *Op : GEPI.indices()) {
4235 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4236 if (Sel)
4237 return false;
4238
4239 Sel = SI;
4240 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4241 !isa<ConstantInt>(SI->getFalseValue()))
4242 return false;
4243 continue;
4244 }
4245 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4246 if (Sel)
4247 return false;
4248 Sel = ZI;
4249 if (!ZI->getSrcTy()->isIntegerTy(1))
4250 return false;
4251 continue;
4252 }
4253
4254 if (!isa<ConstantInt>(Op))
4255 return false;
4256 }
4257
4258 if (!Sel)
4259 return false;
4260
4261 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4262 dbgs() << " original: " << *Sel << "\n";
4263 dbgs() << " " << GEPI << "\n";);
4264
4265 auto GetNewOps = [&](Value *SelOp) {
4266 SmallVector<Value *> NewOps;
4267 for (Value *Op : GEPI.operands())
4268 if (Op == Sel)
4269 NewOps.push_back(SelOp);
4270 else
4271 NewOps.push_back(Op);
4272 return NewOps;
4273 };
4274
4275 Value *Cond, *True, *False;
4276 Instruction *MDFrom = nullptr;
4277 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4278 Cond = SI->getCondition();
4279 True = SI->getTrueValue();
4280 False = SI->getFalseValue();
4281 if (!ProfcheckDisableMetadataFixes)
4282 MDFrom = SI;
4283 } else {
4284 Cond = Sel->getOperand(0);
4285 True = ConstantInt::get(Sel->getType(), 1);
4286 False = ConstantInt::get(Sel->getType(), 0);
4287 }
4288 SmallVector<Value *> TrueOps = GetNewOps(True);
4289 SmallVector<Value *> FalseOps = GetNewOps(False);
4290
4291 IRB.SetInsertPoint(&GEPI);
4292 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4293
4294 Type *Ty = GEPI.getSourceElementType();
4295 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4296 True->getName() + ".sroa.gep", NW);
4297
4298 Value *NFalse =
4299 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4300 False->getName() + ".sroa.gep", NW);
4301
4302 Value *NSel = MDFrom
4303 ? IRB.CreateSelect(Cond, NTrue, NFalse,
4304 Sel->getName() + ".sroa.sel", MDFrom)
4305 : IRB.CreateSelectWithUnknownProfile(
4306 Cond, NTrue, NFalse, DEBUG_TYPE,
4307 Sel->getName() + ".sroa.sel");
4308 Visited.erase(&GEPI);
4309 GEPI.replaceAllUsesWith(NSel);
4310 GEPI.eraseFromParent();
4311 Instruction *NSelI = cast<Instruction>(NSel);
4312 Visited.insert(NSelI);
4313 enqueueUsers(*NSelI);
4314
4315 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4316 dbgs() << " " << *NFalse << "\n";
4317 dbgs() << " " << *NSel << "\n";);
4318
4319 return true;
4320 }
4321
4322 // Unfold gep (phi ptr1, ptr2), idx
4323 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4324 // and gep ptr, (phi idx1, idx2)
4325 // => phi ((gep ptr, idx1), (gep ptr, idx2))
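// For example (illustrative IR, %a and %b static allocas):
//   %phi = phi ptr [ %a, %bb1 ], [ %b, %bb2 ]
//   %gep = getelementptr i32, ptr %phi, i64 1
// becomes (with the new GEPs placed in the entry block)
//   %gep.a = getelementptr i32, ptr %a, i64 1
//   %gep.b = getelementptr i32, ptr %b, i64 1
//   %phi.new = phi ptr [ %gep.a, %bb1 ], [ %gep.b, %bb2 ]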
4326 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4327 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4328 // operand (looking through the phi if it is the phi we want to unfold) is
4329 // an instruction besides a static alloca.
4330 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4331 auto IsInvalidPointerOperand = [](Value *V) {
4332 if (!isa<Instruction>(V))
4333 return false;
4334 if (auto *AI = dyn_cast<AllocaInst>(V))
4335 return !AI->isStaticAlloca();
4336 return true;
4337 };
4338 if (Phi) {
4339 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4340 return false;
4341 } else {
4342 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4343 return false;
4344 }
4345 // Check whether the GEP has exactly one phi operand (including the pointer
4346 // operand) and all indices will become constant after the transform.
4347 for (Value *Op : GEPI.indices()) {
4348 if (auto *SI = dyn_cast<PHINode>(Op)) {
4349 if (Phi)
4350 return false;
4351
4352 Phi = SI;
4353 if (!all_of(Phi->incoming_values(),
4354 [](Value *V) { return isa<ConstantInt>(V); }))
4355 return false;
4356 continue;
4357 }
4358
4359 if (!isa<ConstantInt>(Op))
4360 return false;
4361 }
4362
4363 if (!Phi)
4364 return false;
4365
4366 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4367 dbgs() << " original: " << *Phi << "\n";
4368 dbgs() << " " << GEPI << "\n";);
4369
4370 auto GetNewOps = [&](Value *PhiOp) {
4371 SmallVector<Value *> NewOps;
4372 for (Value *Op : GEPI.operands())
4373 if (Op == Phi)
4374 NewOps.push_back(PhiOp);
4375 else
4376 NewOps.push_back(Op);
4377 return NewOps;
4378 };
4379
4380 IRB.SetInsertPoint(Phi);
4381 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4382 Phi->getName() + ".sroa.phi");
4383
4384 Type *SourceTy = GEPI.getSourceElementType();
4385 // We only handle arguments, constants, and static allocas here, so we can
4386 // insert GEPs at the end of the entry block.
4387 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4388 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4389 Value *Op = Phi->getIncomingValue(I);
4390 BasicBlock *BB = Phi->getIncomingBlock(I);
4391 Value *NewGEP;
4392 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4393 NewGEP = NewPhi->getIncomingValue(NI);
4394 } else {
4395 SmallVector<Value *> NewOps = GetNewOps(Op);
4396 NewGEP =
4397 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4398 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4399 }
4400 NewPhi->addIncoming(NewGEP, BB);
4401 }
4402
4403 Visited.erase(&GEPI);
4404 GEPI.replaceAllUsesWith(NewPhi);
4405 GEPI.eraseFromParent();
4406 Visited.insert(NewPhi);
4407 enqueueUsers(*NewPhi);
4408
4409 LLVM_DEBUG(dbgs() << " to: ";
4410 for (Value *In
4411 : NewPhi->incoming_values()) dbgs()
4412 << "\n " << *In;
4413 dbgs() << "\n " << *NewPhi << '\n');
4414
4415 return true;
4416 }
4417
4418 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4419 if (unfoldGEPSelect(GEPI))
4420 return true;
4421
4422 if (unfoldGEPPhi(GEPI))
4423 return true;
4424
4425 enqueueUsers(GEPI);
4426 return false;
4427 }
4428
4429 bool visitPHINode(PHINode &PN) {
4430 enqueueUsers(PN);
4431 return false;
4432 }
4433
4434 bool visitSelectInst(SelectInst &SI) {
4435 enqueueUsers(SI);
4436 return false;
4437 }
4438};
4439
4440} // end anonymous namespace
4441
4442/// Strip aggregate type wrapping.
4443///
4444/// This removes no-op aggregate types wrapping an underlying type. It will
4445/// strip as many layers of types as it can without changing either the type
4446/// size or the allocated size.
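/// For example, [1 x i32] and { { i32 } } are both stripped down to i32,
/// while { i32, i32 } and [2 x i32] are returned unchanged.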
4447 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4448 if (Ty->isSingleValueType())
4449 return Ty;
4450
4451 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4452 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4453
4454 Type *InnerTy;
4455 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4456 InnerTy = ArrTy->getElementType();
4457 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4458 const StructLayout *SL = DL.getStructLayout(STy);
4459 unsigned Index = SL->getElementContainingOffset(0);
4460 InnerTy = STy->getElementType(Index);
4461 } else {
4462 return Ty;
4463 }
4464
4465 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4466 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4467 return Ty;
4468
4469 return stripAggregateTypeWrapping(DL, InnerTy);
4470}
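
// For example, under a typical data layout (a sketch, not exhaustive):
//
//   {[1 x {float}]} -> float      ; every wrapper layer has the same size
//   {i32, i32}      -> unchanged  ; the inner i32 is smaller than the struct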
4471
4472/// Try to find a partition of the aggregate type passed in for a given
4473/// offset and size.
4474///
4475/// This recurses through the aggregate type and tries to compute a subtype
4476/// based on the offset and size. When the offset and size span a sub-section
4477/// of an array, it will even compute a new array type for that sub-section,
4478/// and the same for structs.
4479///
4480/// Note that this routine is very strict and tries to find a partition of the
4481/// type which produces the *exact* right offset and size. It is not forgiving
4482 /// when the size or offset causes either end of the type-based partition to be off.
4483/// Also, this is a best-effort routine. It is reasonable to give up and not
4484/// return a type if necessary.
4485 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4486 uint64_t Size) {
4487 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4488 return stripAggregateTypeWrapping(DL, Ty);
4489 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4490 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4491 return nullptr;
4492
4493 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4494 Type *ElementTy;
4495 uint64_t TyNumElements;
4496 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4497 ElementTy = AT->getElementType();
4498 TyNumElements = AT->getNumElements();
4499 } else {
4500 // FIXME: This isn't right for vectors with non-byte-sized or
4501 // non-power-of-two sized elements.
4502 auto *VT = cast<FixedVectorType>(Ty);
4503 ElementTy = VT->getElementType();
4504 TyNumElements = VT->getNumElements();
4505 }
4506 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4507 uint64_t NumSkippedElements = Offset / ElementSize;
4508 if (NumSkippedElements >= TyNumElements)
4509 return nullptr;
4510 Offset -= NumSkippedElements * ElementSize;
4511
4512 // First check if we need to recurse.
4513 if (Offset > 0 || Size < ElementSize) {
4514 // Bail if the partition ends in a different array element.
4515 if ((Offset + Size) > ElementSize)
4516 return nullptr;
4517 // Recurse through the element type trying to peel off offset bytes.
4518 return getTypePartition(DL, ElementTy, Offset, Size);
4519 }
4520 assert(Offset == 0);
4521
4522 if (Size == ElementSize)
4523 return stripAggregateTypeWrapping(DL, ElementTy);
4524 assert(Size > ElementSize);
4525 uint64_t NumElements = Size / ElementSize;
4526 if (NumElements * ElementSize != Size)
4527 return nullptr;
4528 return ArrayType::get(ElementTy, NumElements);
4529 }
4530
4531   StructType *STy = dyn_cast<StructType>(Ty);
4532 if (!STy)
4533 return nullptr;
4534
4535 const StructLayout *SL = DL.getStructLayout(STy);
4536
4537 if (SL->getSizeInBits().isScalable())
4538 return nullptr;
4539
4540 if (Offset >= SL->getSizeInBytes())
4541 return nullptr;
4542 uint64_t EndOffset = Offset + Size;
4543 if (EndOffset > SL->getSizeInBytes())
4544 return nullptr;
4545
4546 unsigned Index = SL->getElementContainingOffset(Offset);
4547 Offset -= SL->getElementOffset(Index);
4548
4549 Type *ElementTy = STy->getElementType(Index);
4550 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4551 if (Offset >= ElementSize)
4552 return nullptr; // The offset points into alignment padding.
4553
4554 // See if any partition must be contained by the element.
4555 if (Offset > 0 || Size < ElementSize) {
4556 if ((Offset + Size) > ElementSize)
4557 return nullptr;
4558 return getTypePartition(DL, ElementTy, Offset, Size);
4559 }
4560 assert(Offset == 0);
4561
4562 if (Size == ElementSize)
4563 return stripAggregateTypeWrapping(DL, ElementTy);
4564
4565 StructType::element_iterator EI = STy->element_begin() + Index,
4566 EE = STy->element_end();
4567 if (EndOffset < SL->getSizeInBytes()) {
4568 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4569 if (Index == EndIndex)
4570 return nullptr; // Within a single element and its padding.
4571
4572 // Don't try to form "natural" types if the elements don't line up with the
4573 // expected size.
4574 // FIXME: We could potentially recurse down through the last element in the
4575 // sub-struct to find a natural end point.
4576 if (SL->getElementOffset(EndIndex) != EndOffset)
4577 return nullptr;
4578
4579 assert(Index < EndIndex);
4580 EE = STy->element_begin() + EndIndex;
4581 }
4582
4583 // Try to build up a sub-structure.
4584 StructType *SubTy =
4585 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4586 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4587 if (Size != SubSL->getSizeInBytes())
4588 return nullptr; // The sub-struct doesn't have quite the size needed.
4589
4590 return SubTy;
4591}
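
// For illustration, some hypothetical queries under a typical data layout:
//
//   getTypePartition(DL, {i32, i32, i64}, /*Offset=*/0, /*Size=*/8)
//       -> {i32, i32}   ; a leading sub-struct of exactly the right size
//   getTypePartition(DL, [4 x i16], /*Offset=*/4, /*Size=*/4)
//       -> [2 x i16]    ; a sub-array carved out of the middle
//   getTypePartition(DL, {i32, i64}, /*Offset=*/2, /*Size=*/4)
//       -> nullptr      ; runs off the end of the i32 element, so give up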
4592
4593/// Pre-split loads and stores to simplify rewriting.
4594///
4595/// We want to break up the splittable load+store pairs as much as
4596/// possible. This is important to do as a preprocessing step, as once we
4597/// start rewriting the accesses to partitions of the alloca we lose the
4598/// necessary information to correctly split apart paired loads and stores
4599/// which both point into this alloca. The case to consider is something like
4600/// the following:
4601///
4602/// %a = alloca [12 x i8]
4603/// %gep1 = getelementptr i8, ptr %a, i32 0
4604/// %gep2 = getelementptr i8, ptr %a, i32 4
4605/// %gep3 = getelementptr i8, ptr %a, i32 8
4606/// store float 0.0, ptr %gep1
4607/// store float 1.0, ptr %gep2
4608/// %v = load i64, ptr %gep1
4609/// store i64 %v, ptr %gep2
4610/// %f1 = load float, ptr %gep2
4611/// %f2 = load float, ptr %gep3
4612///
4613/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4614/// promote everything so we recover the 2 SSA values that should have been
4615/// there all along.
4616///
4617/// \returns true if any changes are made.
4618bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4619 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4620
4621 // Track the loads and stores which are candidates for pre-splitting here, in
4622 // the order they first appear during the partition scan. These give stable
4623 // iteration order and a basis for tracking which loads and stores we
4624 // actually split.
4625   SmallVector<LoadInst *, 4> Loads;
4626   SmallVector<StoreInst *, 4> Stores;
4627
4628 // We need to accumulate the splits required of each load or store where we
4629 // can find them via a direct lookup. This is important to cross-check loads
4630 // and stores against each other. We also track the slice so that we can kill
4631 // all the slices that end up split.
4632 struct SplitOffsets {
4633 Slice *S;
4634 std::vector<uint64_t> Splits;
4635 };
4636 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4637
4638 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4639 // This is important as we also cannot pre-split stores of those loads!
4640 // FIXME: This is all pretty gross. It means that we can be more aggressive
4641 // in pre-splitting when the load feeding the store happens to come from
4642 // a separate alloca. Put another way, the effectiveness of SROA would be
4643 // decreased by a frontend which just concatenated all of its local allocas
4644 // into one big flat alloca. But defeating such patterns is exactly the job
4645 // SROA is tasked with! Sadly, to not have this discrepancy we would have
4646 // change store pre-splitting to actually force pre-splitting of the load
4647 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4648 // maybe it would make it more principled?
4649 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4650
4651 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4652 for (auto &P : AS.partitions()) {
4653 for (Slice &S : P) {
4654 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4655 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4656 // If this is a load we have to track that it can't participate in any
4657 // pre-splitting. If this is a store of a load we have to track that
4658 // that load also can't participate in any pre-splitting.
4659 if (auto *LI = dyn_cast<LoadInst>(I))
4660 UnsplittableLoads.insert(LI);
4661 else if (auto *SI = dyn_cast<StoreInst>(I))
4662 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4663 UnsplittableLoads.insert(LI);
4664 continue;
4665 }
4666 assert(P.endOffset() > S.beginOffset() &&
4667 "Empty or backwards partition!");
4668
4669 // Determine if this is a pre-splittable slice.
4670 if (auto *LI = dyn_cast<LoadInst>(I)) {
4671 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4672
4673 // The load must be used exclusively to store into other pointers for
4674 // us to be able to arbitrarily pre-split it. The stores must also be
4675 // simple to avoid changing semantics.
4676 auto IsLoadSimplyStored = [](LoadInst *LI) {
4677 for (User *LU : LI->users()) {
4678 auto *SI = dyn_cast<StoreInst>(LU);
4679 if (!SI || !SI->isSimple())
4680 return false;
4681 }
4682 return true;
4683 };
4684 if (!IsLoadSimplyStored(LI)) {
4685 UnsplittableLoads.insert(LI);
4686 continue;
4687 }
4688
4689 Loads.push_back(LI);
4690 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4691 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4692 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4693 continue;
4694 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4695 if (!StoredLoad || !StoredLoad->isSimple())
4696 continue;
4697 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4698
4699 Stores.push_back(SI);
4700 } else {
4701 // Other uses cannot be pre-split.
4702 continue;
4703 }
4704
4705 // Record the initial split.
4706 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4707 auto &Offsets = SplitOffsetsMap[I];
4708 assert(Offsets.Splits.empty() &&
4709 "Should not have splits the first time we see an instruction!");
4710 Offsets.S = &S;
4711 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4712 }
4713
4714 // Now scan the already split slices, and add a split for any of them which
4715 // we're going to pre-split.
4716 for (Slice *S : P.splitSliceTails()) {
4717 auto SplitOffsetsMapI =
4718 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4719 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4720 continue;
4721 auto &Offsets = SplitOffsetsMapI->second;
4722
4723 assert(Offsets.S == S && "Found a mismatched slice!");
4724 assert(!Offsets.Splits.empty() &&
4725 "Cannot have an empty set of splits on the second partition!");
4726 assert(Offsets.Splits.back() ==
4727 P.beginOffset() - Offsets.S->beginOffset() &&
4728 "Previous split does not end where this one begins!");
4729
4730 // Record each split. The last partition's end isn't needed as the size
4731 // of the slice dictates that.
4732 if (S->endOffset() > P.endOffset())
4733 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4734 }
4735 }
4736
4737 // We may have split loads where some of their stores are split stores. For
4738 // such loads and stores, we can only pre-split them if their splits exactly
4739 // match relative to their starting offset. We have to verify this prior to
4740 // any rewriting.
4741 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4742 // Lookup the load we are storing in our map of split
4743 // offsets.
4744 auto *LI = cast<LoadInst>(SI->getValueOperand());
4745 // If it was completely unsplittable, then we're done,
4746 // and this store can't be pre-split.
4747 if (UnsplittableLoads.count(LI))
4748 return true;
4749
4750 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4751 if (LoadOffsetsI == SplitOffsetsMap.end())
4752 return false; // Unrelated loads are definitely safe.
4753 auto &LoadOffsets = LoadOffsetsI->second;
4754
4755 // Now lookup the store's offsets.
4756 auto &StoreOffsets = SplitOffsetsMap[SI];
4757
4758 // If the relative offsets of each split in the load and
4759 // store match exactly, then we can split them and we
4760 // don't need to remove them here.
4761 if (LoadOffsets.Splits == StoreOffsets.Splits)
4762 return false;
4763
4764 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4765 << " " << *LI << "\n"
4766 << " " << *SI << "\n");
4767
4768 // We've found a store and load that we need to split
4769 // with mismatched relative splits. Just give up on them
4770 // and remove both instructions from our list of
4771 // candidates.
4772 UnsplittableLoads.insert(LI);
4773 return true;
4774 });
4775 // Now we have to go *back* through all the stores, because a later store may
4776 // have caused an earlier store's load to become unsplittable and if it is
4777 // unsplittable for the later store, then we can't rely on it being split in
4778 // the earlier store either.
4779 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4780 auto *LI = cast<LoadInst>(SI->getValueOperand());
4781 return UnsplittableLoads.count(LI);
4782 });
4783 // Once we've established all the loads that can't be split for some reason,
4784 // filter any that made it into our list out.
4785 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4786 return UnsplittableLoads.count(LI);
4787 });
4788
4789 // If no loads or stores are left, there is no pre-splitting to be done for
4790 // this alloca.
4791 if (Loads.empty() && Stores.empty())
4792 return false;
4793
4794 // From here on, we can't fail and will be building new accesses, so rig up
4795 // an IR builder.
4796 IRBuilderTy IRB(&AI);
4797
4798 // Collect the new slices which we will merge into the alloca slices.
4799 SmallVector<Slice, 4> NewSlices;
4800
4801 // Track any allocas we end up splitting loads and stores for so we iterate
4802 // on them.
4803 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4804
4805 // At this point, we have collected all of the loads and stores we can
4806 // pre-split, and the specific splits needed for them. We actually do the
4807 // splitting in a specific order to handle the case when one of the loads is
4808 // the value operand to one of the stores.
4809 //
4810 // First, we rewrite all of the split loads, and just accumulate each split
4811 // load in a parallel structure. We also build the slices for them and append
4812 // them to the alloca slices.
4813 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4814 std::vector<LoadInst *> SplitLoads;
4815 const DataLayout &DL = AI.getDataLayout();
4816 for (LoadInst *LI : Loads) {
4817 SplitLoads.clear();
4818
4819 auto &Offsets = SplitOffsetsMap[LI];
4820 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4821 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4822 "Load must have type size equal to store size");
4823 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4824 "Load must be >= slice size");
4825
4826 uint64_t BaseOffset = Offsets.S->beginOffset();
4827 assert(BaseOffset + SliceSize > BaseOffset &&
4828 "Cannot represent alloca access size using 64-bit integers!");
4829
4830     Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4831 IRB.SetInsertPoint(LI);
4832
4833 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4834
4835 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4836 int Idx = 0, Size = Offsets.Splits.size();
4837 for (;;) {
4838 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4839 auto AS = LI->getPointerAddressSpace();
4840 auto *PartPtrTy = LI->getPointerOperandType();
4841 LoadInst *PLoad = IRB.CreateAlignedLoad(
4842 PartTy,
4843 getAdjustedPtr(IRB, DL, BasePtr,
4844 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4845 PartPtrTy, BasePtr->getName() + "."),
4846 getAdjustedAlignment(LI, PartOffset),
4847 /*IsVolatile*/ false, LI->getName());
4848 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4849 LLVMContext::MD_access_group});
4850
4851 // Append this load onto the list of split loads so we can find it later
4852 // to rewrite the stores.
4853 SplitLoads.push_back(PLoad);
4854
4855 // Now build a new slice for the alloca.
4856 NewSlices.push_back(
4857 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4858 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4859 /*IsSplittable*/ false));
4860 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4861 << ", " << NewSlices.back().endOffset()
4862 << "): " << *PLoad << "\n");
4863
4864 // See if we've handled all the splits.
4865 if (Idx >= Size)
4866 break;
4867
4868 // Setup the next partition.
4869 PartOffset = Offsets.Splits[Idx];
4870 ++Idx;
4871 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4872 }
4873
4874 // Now that we have the split loads, do the slow walk over all uses of the
4875 // load and rewrite them as split stores, or save the split loads to use
4876 // below if the store is going to be split there anyways.
4877 bool DeferredStores = false;
4878 for (User *LU : LI->users()) {
4879 StoreInst *SI = cast<StoreInst>(LU);
4880 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4881 DeferredStores = true;
4882 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4883 << "\n");
4884 continue;
4885 }
4886
4887 Value *StoreBasePtr = SI->getPointerOperand();
4888 IRB.SetInsertPoint(SI);
4889 AAMDNodes AATags = SI->getAAMetadata();
4890
4891 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4892
4893 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4894 LoadInst *PLoad = SplitLoads[Idx];
4895 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4896 auto *PartPtrTy = SI->getPointerOperandType();
4897
4898 auto AS = SI->getPointerAddressSpace();
4899 StoreInst *PStore = IRB.CreateAlignedStore(
4900 PLoad,
4901 getAdjustedPtr(IRB, DL, StoreBasePtr,
4902 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4903 PartPtrTy, StoreBasePtr->getName() + "."),
4904 getAdjustedAlignment(SI, PartOffset),
4905 /*IsVolatile*/ false);
4906 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4907 LLVMContext::MD_access_group,
4908 LLVMContext::MD_DIAssignID});
4909
4910 if (AATags)
4911 PStore->setAAMetadata(
4912 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
4913 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4914 }
4915
4916 // We want to immediately iterate on any allocas impacted by splitting
4917 // this store, and we have to track any promotable alloca (indicated by
4918 // a direct store) as needing to be resplit because it is no longer
4919 // promotable.
4920 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4921 ResplitPromotableAllocas.insert(OtherAI);
4922 Worklist.insert(OtherAI);
4923 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4924 StoreBasePtr->stripInBoundsOffsets())) {
4925 Worklist.insert(OtherAI);
4926 }
4927
4928 // Mark the original store as dead.
4929 DeadInsts.push_back(SI);
4930 }
4931
4932 // Save the split loads if there are deferred stores among the users.
4933 if (DeferredStores)
4934 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4935
4936 // Mark the original load as dead and kill the original slice.
4937 DeadInsts.push_back(LI);
4938 Offsets.S->kill();
4939 }
4940
4941 // Second, we rewrite all of the split stores. At this point, we know that
4942 // all loads from this alloca have been split already. For stores of such
4943 // loads, we can simply look up the pre-existing split loads. For stores of
4944 // other loads, we split those loads first and then write split stores of
4945 // them.
4946 for (StoreInst *SI : Stores) {
4947 auto *LI = cast<LoadInst>(SI->getValueOperand());
4948 IntegerType *Ty = cast<IntegerType>(LI->getType());
4949 assert(Ty->getBitWidth() % 8 == 0);
4950 uint64_t StoreSize = Ty->getBitWidth() / 8;
4951 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4952
4953 auto &Offsets = SplitOffsetsMap[SI];
4954 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4955 "Slice size should always match load size exactly!");
4956 uint64_t BaseOffset = Offsets.S->beginOffset();
4957 assert(BaseOffset + StoreSize > BaseOffset &&
4958 "Cannot represent alloca access size using 64-bit integers!");
4959
4960 Value *LoadBasePtr = LI->getPointerOperand();
4961 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4962
4963 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4964
4965 // Check whether we have an already split load.
4966 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4967 std::vector<LoadInst *> *SplitLoads = nullptr;
4968 if (SplitLoadsMapI != SplitLoadsMap.end()) {
4969 SplitLoads = &SplitLoadsMapI->second;
4970 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4971 "Too few split loads for the number of splits in the store!");
4972 } else {
4973 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4974 }
4975
4976 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4977 int Idx = 0, Size = Offsets.Splits.size();
4978 for (;;) {
4979 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4980 auto *LoadPartPtrTy = LI->getPointerOperandType();
4981 auto *StorePartPtrTy = SI->getPointerOperandType();
4982
4983 // Either lookup a split load or create one.
4984 LoadInst *PLoad;
4985 if (SplitLoads) {
4986 PLoad = (*SplitLoads)[Idx];
4987 } else {
4988 IRB.SetInsertPoint(LI);
4989 auto AS = LI->getPointerAddressSpace();
4990 PLoad = IRB.CreateAlignedLoad(
4991 PartTy,
4992 getAdjustedPtr(IRB, DL, LoadBasePtr,
4993 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4994 LoadPartPtrTy, LoadBasePtr->getName() + "."),
4995 getAdjustedAlignment(LI, PartOffset),
4996 /*IsVolatile*/ false, LI->getName());
4997 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4998 LLVMContext::MD_access_group});
4999 }
5000
5001 // And store this partition.
5002 IRB.SetInsertPoint(SI);
5003 auto AS = SI->getPointerAddressSpace();
5004 StoreInst *PStore = IRB.CreateAlignedStore(
5005 PLoad,
5006 getAdjustedPtr(IRB, DL, StoreBasePtr,
5007 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5008 StorePartPtrTy, StoreBasePtr->getName() + "."),
5009 getAdjustedAlignment(SI, PartOffset),
5010 /*IsVolatile*/ false);
5011 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5012 LLVMContext::MD_access_group});
5013
5014 // Now build a new slice for the alloca.
5015 NewSlices.push_back(
5016 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5017 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5018 /*IsSplittable*/ false));
5019 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5020 << ", " << NewSlices.back().endOffset()
5021 << "): " << *PStore << "\n");
5022 if (!SplitLoads) {
5023 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5024 }
5025
5026 // See if we've finished all the splits.
5027 if (Idx >= Size)
5028 break;
5029
5030 // Setup the next partition.
5031 PartOffset = Offsets.Splits[Idx];
5032 ++Idx;
5033 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5034 }
5035
5036 // We want to immediately iterate on any allocas impacted by splitting
5037 // this load, which is only relevant if it isn't a load of this alloca and
5038 // thus we didn't already split the loads above. We also have to keep track
5039 // of any promotable allocas we split loads on as they can no longer be
5040 // promoted.
5041 if (!SplitLoads) {
5042 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5043 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5044 ResplitPromotableAllocas.insert(OtherAI);
5045 Worklist.insert(OtherAI);
5046 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5047 LoadBasePtr->stripInBoundsOffsets())) {
5048 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5049 Worklist.insert(OtherAI);
5050 }
5051 }
5052
5053 // Mark the original store as dead now that we've split it up and kill its
5054 // slice. Note that we leave the original load in place unless this store
5055 // was its only use. It may in turn be split up if it is an alloca load
5056 // for some other alloca, but it may be a normal load. This may introduce
5057 // redundant loads, but where those can be merged the rest of the optimizer
5058 // should handle the merging, and this uncovers SSA splits which is more
5059 // important. In practice, the original loads will almost always be fully
5060 // split and removed eventually, and the splits will be merged by any
5061 // trivial CSE, including instcombine.
5062 if (LI->hasOneUse()) {
5063 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5064 DeadInsts.push_back(LI);
5065 }
5066 DeadInsts.push_back(SI);
5067 Offsets.S->kill();
5068 }
5069
5070   // Remove the killed slices that have been pre-split.
5071 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5072
5073 // Insert our new slices. This will sort and merge them into the sorted
5074 // sequence.
5075 AS.insert(NewSlices);
5076
5077 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5078#ifndef NDEBUG
5079 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5080 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5081#endif
5082
5083   // Finally, don't try to promote any allocas that now require re-splitting.
5084 // They have already been added to the worklist above.
5085 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5086
5087 return true;
5088}
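
// For illustration, continuing the example from the function comment: the
// splittable i64 load/store pair that crosses the 4-byte partition boundary
// is rewritten into independent i32 pairs, roughly (modulo exact value and
// pointer names):
//
//   %v = load i64, ptr %gep1
//   store i64 %v, ptr %gep2
//
// becomes
//
//   %v.0 = load i32, ptr %gep1
//   %gep1.4 = getelementptr i8, ptr %gep1, i64 4
//   %v.4 = load i32, ptr %gep1.4
//   store i32 %v.0, ptr %gep2
//   %gep2.4 = getelementptr i8, ptr %gep2, i64 4
//   store i32 %v.4, ptr %gep2.4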
5089
5090/// Select a partition type for an alloca partition.
5091///
5092/// Try to compute a friendly type for this partition of the alloca. This
5093/// won't always succeed, in which case we fall back to a legal integer type
5094/// or an i8 array of an appropriate size.
5095///
5096/// \returns A tuple with the following elements:
5097/// - PartitionType: The computed type for this partition.
5098/// - IsIntegerWideningViable: True if integer widening promotion is used.
5099/// - VectorType: The vector type if vector promotion is used, otherwise
5100/// nullptr.
5101static std::tuple<Type *, bool, VectorType *>
5102 selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI,
5103 LLVMContext &C) {
5104 // First check if the partition is viable for vector promotion.
5105 //
5106 // We prefer vector promotion over integer widening promotion when:
5107 // - The vector element type is a floating-point type.
5108 // - All the loads/stores to the alloca are vector loads/stores to the
5109 // entire alloca or load/store a single element of the vector.
5110 //
5111 // Otherwise when there is an integer vector with mixed type loads/stores we
5112 // prefer integer widening promotion because it's more likely the user is
5113 // doing bitwise arithmetic and we generate better code.
5114 VectorType *VecTy =
5115       isVectorPromotionViable(P, DL);
5116 // If the vector element type is a floating-point type, we prefer vector
5117 // promotion. If the vector has one element, let the below code select
5118 // whether we promote with the vector or scalar.
5119 if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
5120 VecTy->getElementCount().getFixedValue() > 1)
5121 return {VecTy, false, VecTy};
5122
5123 // Check if there is a common type that all slices of the partition use that
5124 // spans the partition.
5125 auto [CommonUseTy, LargestIntTy] =
5126 findCommonType(P.begin(), P.end(), P.endOffset());
5127 if (CommonUseTy) {
5128 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
5129 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5130 // We prefer vector promotion here because if vector promotion is viable
5131 // and there is a common type used, then it implies the second listed
5132 // condition for preferring vector promotion is true.
5133 if (VecTy)
5134 return {VecTy, false, VecTy};
5135 return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
5136 nullptr};
5137 }
5138 }
5139
5140 // Can we find an appropriate subtype in the original allocated
5141 // type?
5142 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5143 P.beginOffset(), P.size())) {
5144 // If the partition is an integer array that can be spanned by a legal
5145 // integer type, prefer to represent it as a legal integer type because
5146 // it's more likely to be promotable.
5147 if (TypePartitionTy->isArrayTy() &&
5148 TypePartitionTy->getArrayElementType()->isIntegerTy() &&
5149 DL.isLegalInteger(P.size() * 8))
5150 TypePartitionTy = Type::getIntNTy(C, P.size() * 8);
5151 // There was no common type used, so we prefer integer widening promotion.
5152 if (isIntegerWideningViable(P, TypePartitionTy, DL))
5153 return {TypePartitionTy, true, nullptr};
5154 if (VecTy)
5155 return {VecTy, false, VecTy};
5156 // If we couldn't promote with TypePartitionTy, try with the largest
5157 // integer type used.
5158 if (LargestIntTy &&
5159 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
5160 isIntegerWideningViable(P, LargestIntTy, DL))
5161 return {LargestIntTy, true, nullptr};
5162
5163     // Fall back to TypePartitionTy; we probably won't promote.
5164 return {TypePartitionTy, false, nullptr};
5165 }
5166
5167 // Select the largest integer type used if it spans the partition.
5168 if (LargestIntTy &&
5169 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size())
5170 return {LargestIntTy, false, nullptr};
5171
5172 // Select a legal integer type if it spans the partition.
5173 if (DL.isLegalInteger(P.size() * 8))
5174 return {Type::getIntNTy(C, P.size() * 8), false, nullptr};
5175
5176   // Fall back to an i8 array.
5177 return {ArrayType::get(Type::getInt8Ty(C), P.size()), false, nullptr};
5178}
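
// For illustration, some hypothetical outcomes under a typical 64-bit data
// layout:
//
//   - 16-byte partition whose users are all <4 x float> loads/stores
//       -> {<4 x float>, false, <4 x float>}  (vector promotion)
//   - 8-byte partition of an [8 x i8] alloca with mixed integer accesses
//       -> {i64, true, nullptr}               (legal integer, widening)
//   - 3-byte partition with no common type and no matching subtype
//       -> {[3 x i8], false, nullptr}         (i8-array fallback)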
5179
5180/// Rewrite an alloca partition's users.
5181///
5182/// This routine drives both of the rewriting goals of the SROA pass. It tries
5183/// to rewrite uses of an alloca partition to be conducive for SSA value
5184/// promotion. If the partition needs a new, more refined alloca, this will
5185/// build that new alloca, preserving as much type information as possible, and
5186/// rewrite the uses of the old alloca to point at the new one and have the
5187/// appropriate new offsets. It also evaluates how successful the rewrite was
5188/// at enabling promotion and if it was successful queues the alloca to be
5189/// promoted.
5190std::pair<AllocaInst *, uint64_t>
5191SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
5192 const DataLayout &DL = AI.getDataLayout();
5193 // Select the type for the new alloca that spans the partition.
5194 auto [PartitionTy, IsIntegerWideningViable, VecTy] =
5195 selectPartitionType(P, DL, AI, *C);
5196
5197 // Check for the case where we're going to rewrite to a new alloca of the
5198 // exact same type as the original, and with the same access offsets. In that
5199 // case, re-use the existing alloca, but still run through the rewriter to
5200 // perform phi and select speculation.
5201 // P.beginOffset() can be non-zero even with the same type in a case with
5202 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5203 AllocaInst *NewAI;
5204 if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5205 NewAI = &AI;
5206 // FIXME: We should be able to bail at this point with "nothing changed".
5207 // FIXME: We might want to defer PHI speculation until after here.
5208 // FIXME: return nullptr;
5209 } else {
5210 // Make sure the alignment is compatible with P.beginOffset().
5211 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5212 // If we will get at least this much alignment from the type alone, leave
5213 // the alloca's alignment unconstrained.
5214 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy);
5215 NewAI = new AllocaInst(
5216 PartitionTy, AI.getAddressSpace(), nullptr,
5217 IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment,
5218 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5219 AI.getIterator());
5220 // Copy the old AI debug location over to the new one.
5221 NewAI->setDebugLoc(AI.getDebugLoc());
5222 ++NumNewAllocas;
5223 }
5224
5225 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5226 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5227
5228 // Track the high watermark on the worklist as it is only relevant for
5229 // promoted allocas. We will reset it to this point if the alloca is not in
5230 // fact scheduled for promotion.
5231 unsigned PPWOldSize = PostPromotionWorklist.size();
5232 unsigned NumUses = 0;
5233 SmallSetVector<PHINode *, 8> PHIUsers;
5234 SmallSetVector<SelectInst *, 8> SelectUsers;
5235
5236 AllocaSliceRewriter Rewriter(
5237 DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
5238 IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
5239 bool Promotable = true;
5240 // Check whether we can have tree-structured merge.
5241 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5242 NumUses += DeletedValues->size() + 1;
5243 for (Value *V : *DeletedValues)
5244 DeadInsts.push_back(V);
5245 } else {
5246 for (Slice *S : P.splitSliceTails()) {
5247 Promotable &= Rewriter.visit(S);
5248 ++NumUses;
5249 }
5250 for (Slice &S : P) {
5251 Promotable &= Rewriter.visit(&S);
5252 ++NumUses;
5253 }
5254 }
5255
5256 NumAllocaPartitionUses += NumUses;
5257 MaxUsesPerAllocaPartition.updateMax(NumUses);
5258
5259 // Now that we've processed all the slices in the new partition, check if any
5260 // PHIs or Selects would block promotion.
5261 for (PHINode *PHI : PHIUsers)
5262 if (!isSafePHIToSpeculate(*PHI)) {
5263 Promotable = false;
5264 PHIUsers.clear();
5265 SelectUsers.clear();
5266 break;
5267 }
5268
5269   SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5270 NewSelectsToRewrite;
5271 NewSelectsToRewrite.reserve(SelectUsers.size());
5272 for (SelectInst *Sel : SelectUsers) {
5273 std::optional<RewriteableMemOps> Ops =
5274 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5275 if (!Ops) {
5276 Promotable = false;
5277 PHIUsers.clear();
5278 SelectUsers.clear();
5279 NewSelectsToRewrite.clear();
5280 break;
5281 }
5282 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5283 }
5284
5285 if (Promotable) {
5286 for (Use *U : AS.getDeadUsesIfPromotable()) {
5287 auto *OldInst = dyn_cast<Instruction>(U->get());
5288 Value::dropDroppableUse(*U);
5289 if (OldInst)
5290 if (isInstructionTriviallyDead(OldInst))
5291 DeadInsts.push_back(OldInst);
5292 }
5293 if (PHIUsers.empty() && SelectUsers.empty()) {
5294 // Promote the alloca.
5295 PromotableAllocas.insert(NewAI);
5296 } else {
5297 // If we have either PHIs or Selects to speculate, add them to those
5298       // worklists and re-queue the new alloca so that we promote it on the
5299 // next iteration.
5300 SpeculatablePHIs.insert_range(PHIUsers);
5301 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5302 NewSelectsToRewrite.size());
5303 for (auto &&KV : llvm::make_range(
5304 std::make_move_iterator(NewSelectsToRewrite.begin()),
5305 std::make_move_iterator(NewSelectsToRewrite.end())))
5306 SelectsToRewrite.insert(std::move(KV));
5307 Worklist.insert(NewAI);
5308 }
5309 } else {
5310 // Drop any post-promotion work items if promotion didn't happen.
5311 while (PostPromotionWorklist.size() > PPWOldSize)
5312 PostPromotionWorklist.pop_back();
5313
5314     // We couldn't promote and we didn't create a new partition, so nothing
5315     // happened.
5316 if (NewAI == &AI)
5317 return {nullptr, 0};
5318
5319 // If we can't promote the alloca, iterate on it to check for new
5320 // refinements exposed by splitting the current alloca. Don't iterate on an
5321 // alloca which didn't actually change and didn't get promoted.
5322 Worklist.insert(NewAI);
5323 }
5324
5325 return {NewAI, DL.getTypeSizeInBits(PartitionTy).getFixedValue()};
5326}
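
// For illustration, rewriting a hypothetical alloca with two disjoint
// partitions (the ".sroa.<n>" suffix is the index of the partition's first
// slice, so the exact numbers vary):
//
//   %a = alloca { i32, float }
//   ; i32 accesses at offset 0, float accesses at offset 4
//
// becomes
//
//   %a.sroa.0 = alloca i32
//   %a.sroa.2 = alloca float
//
// with each new alloca queued for promotion if its rewritten uses allow it.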
5327
5328// There isn't a shared interface to get the "address" parts out of a
5329// dbg.declare and dbg.assign, so provide some wrappers.
5330 static bool isKillAddress(const DbgVariableRecord *DVR) {
5331   if (DVR->isDbgAssign())
5332 return DVR->isKillAddress();
5333 return DVR->isKillLocation();
5334}
5335
5338 return DVR->getAddressExpression();
5339 return DVR->getExpression();
5340}
5341
5342/// Create or replace an existing fragment in a DIExpression with \p Frag.
5343/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5344/// operation, add \p BitExtractOffset to the offset part.
5345///
5346/// Returns the new expression, or nullptr if this fails (see details below).
5347///
5348/// This function is similar to DIExpression::createFragmentExpression except
5349/// for 3 important distinctions:
5350/// 1. The new fragment isn't relative to an existing fragment.
5351/// 2. It assumes the computed location is a memory location. This means we
5352/// don't need to perform checks that creating the fragment preserves the
5353/// expression semantics.
5354/// 3. Existing extract_bits are modified independently of fragment changes
5355/// using \p BitExtractOffset. A change to the fragment offset or size
5356/// may affect a bit extract. But a bit extract offset can change
5357/// independently of the fragment dimensions.
5358///
5359/// Returns the new expression, or nullptr if one couldn't be created.
5360/// Ideally this is only used to signal that a bit-extract has become
5361/// zero-sized (and thus the new debug record has no size and can be
5362/// dropped), however, it fails for other reasons too - see the FIXME below.
5363///
5364/// FIXME: To keep the change that introduces this function NFC it bails
5365 /// in some situations unnecessarily, e.g. when fragment and bit extract
5366/// sizes differ.
5367 static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5368                                              DIExpression::FragmentInfo Frag,
5369 int64_t BitExtractOffset) {
5370   SmallVector<uint64_t, 8> Ops;
5371 bool HasFragment = false;
5372 bool HasBitExtract = false;
5373
5374 for (auto &Op : Expr->expr_ops()) {
5375 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5376 HasFragment = true;
5377 continue;
5378 }
5379 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5380         Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5381 HasBitExtract = true;
5382 int64_t ExtractOffsetInBits = Op.getArg(0);
5383 int64_t ExtractSizeInBits = Op.getArg(1);
5384
5385 // DIExpression::createFragmentExpression doesn't know how to handle
5386 // a fragment that is smaller than the extract. Copy the behaviour
5387 // (bail) to avoid non-NFC changes.
5388 // FIXME: Don't do this.
5389 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5390 return nullptr;
5391
5392 assert(BitExtractOffset <= 0);
5393 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5394
5395 // DIExpression::createFragmentExpression doesn't know what to do
5396 // if the new extract starts "outside" the existing one. Copy the
5397 // behaviour (bail) to avoid non-NFC changes.
5398 // FIXME: Don't do this.
5399 if (AdjustedOffset < 0)
5400 return nullptr;
5401
5402 Ops.push_back(Op.getOp());
5403 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5404 Ops.push_back(ExtractSizeInBits);
5405 continue;
5406 }
5407 Op.appendToVector(Ops);
5408 }
5409
5410 // Unsupported by createFragmentExpression, so don't support it here yet to
5411 // preserve NFC-ness.
5412 if (HasFragment && HasBitExtract)
5413 return nullptr;
5414
5415 if (!HasBitExtract) {
5416     Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5417 Ops.push_back(Frag.OffsetInBits);
5418 Ops.push_back(Frag.SizeInBits);
5419 }
5420 return DIExpression::get(Expr->getContext(), Ops);
5421}
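
// For example (hypothetical inputs):
//
//   Expr = !DIExpression(DW_OP_LLVM_fragment, 0, 32),
//   Frag = {OffsetInBits: 32, SizeInBits: 32}
//     -> !DIExpression(DW_OP_LLVM_fragment, 32, 32)
//
//   Expr = !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 8),
//   Frag = {OffsetInBits: 0, SizeInBits: 8}, BitExtractOffset = -16
//     -> !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8)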
5422
5423/// Insert a new DbgRecord.
5424/// \p Orig Original to copy record type, debug loc and variable from, and
5425/// additionally value and value expression for dbg_assign records.
5426/// \p NewAddr Location's new base address.
5427/// \p NewAddrExpr New expression to apply to address.
5428/// \p BeforeInst Insert position.
5429/// \p NewFragment New fragment (absolute, non-relative).
5430/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5431static void
5432 insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5433 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5434 std::optional<DIExpression::FragmentInfo> NewFragment,
5435 int64_t BitExtractAdjustment) {
5436 (void)DIB;
5437
5438 // A dbg_assign puts fragment info in the value expression only. The address
5439 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5440 // new fragment info into NewAddrExpr (as it only has one expression).
5441 DIExpression *NewFragmentExpr =
5442 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5443 if (NewFragment)
5444 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5445 BitExtractAdjustment);
5446 if (!NewFragmentExpr)
5447 return;
5448
5449 if (Orig->isDbgDeclare()) {
5450     DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5451 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5452 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5453 BeforeInst->getIterator());
5454 return;
5455 }
5456
5457 if (Orig->isDbgValue()) {
5458     DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5459 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5460 // Drop debug information if the expression doesn't start with a
5461 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5462     // describes the address of the alloca rather than the value inside it.
5463 if (!NewFragmentExpr->startsWithDeref())
5464 DVR->setKillAddress();
5465 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5466 BeforeInst->getIterator());
5467 return;
5468 }
5469
5470 // Apply a DIAssignID to the store if it doesn't already have it.
5471 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5472 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5473                          DIAssignID::getDistinct(NewAddr->getContext()));
5474 }
5475
5476   DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5477 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5478 NewAddrExpr, Orig->getDebugLoc());
5479 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5480 (void)NewAssign;
5481}
5482
5483 /// Walks the slices of an alloca and forms partitions based on them,
5484/// rewriting each of their uses.
5485bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5486 if (AS.begin() == AS.end())
5487 return false;
5488
5489 unsigned NumPartitions = 0;
5490 bool Changed = false;
5491 const DataLayout &DL = AI.getModule()->getDataLayout();
5492
5493 // First try to pre-split loads and stores.
5494 Changed |= presplitLoadsAndStores(AI, AS);
5495
5496 // Now that we have identified any pre-splitting opportunities,
5497 // mark loads and stores unsplittable except for the following case.
5498 // We leave a slice splittable if all other slices are disjoint or fully
5499 // included in the slice, such as whole-alloca loads and stores.
5500 // If we fail to split these during pre-splitting, we want to force them
5501 // to be rewritten into a partition.
5502 bool IsSorted = true;
5503
5504 uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
5505 const uint64_t MaxBitVectorSize = 1024;
5506 if (AllocaSize <= MaxBitVectorSize) {
5507 // If a byte boundary is included in any load or store, a slice starting or
5508 // ending at the boundary is not splittable.
5509 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5510 for (Slice &S : AS)
5511 for (unsigned O = S.beginOffset() + 1;
5512 O < S.endOffset() && O < AllocaSize; O++)
5513 SplittableOffset.reset(O);
5514
5515 for (Slice &S : AS) {
5516 if (!S.isSplittable())
5517 continue;
5518
5519 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5520 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5521 continue;
5522
5523 if (isa<LoadInst>(S.getUse()->getUser()) ||
5524 isa<StoreInst>(S.getUse()->getUser())) {
5525 S.makeUnsplittable();
5526 IsSorted = false;
5527 }
5528 }
5529 } else {
5530 // We only allow whole-alloca splittable loads and stores
5531 // for a large alloca to avoid creating too large BitVector.
5532 for (Slice &S : AS) {
5533 if (!S.isSplittable())
5534 continue;
5535
5536 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5537 continue;
5538
5539 if (isa<LoadInst>(S.getUse()->getUser()) ||
5540 isa<StoreInst>(S.getUse()->getUser())) {
5541 S.makeUnsplittable();
5542 IsSorted = false;
5543 }
5544 }
5545 }
5546
5547 if (!IsSorted)
5548     llvm::stable_sort(AS);
5549
5550 /// Describes the allocas introduced by rewritePartition in order to migrate
5551 /// the debug info.
5552 struct Fragment {
5553 AllocaInst *Alloca;
5554 uint64_t Offset;
5555 uint64_t Size;
5556 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5557 : Alloca(AI), Offset(O), Size(S) {}
5558 };
5559 SmallVector<Fragment, 4> Fragments;
5560
5561 // Rewrite each partition.
5562 for (auto &P : AS.partitions()) {
5563 auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
5564 if (NewAI) {
5565 Changed = true;
5566 if (NewAI != &AI) {
5567 uint64_t SizeOfByte = 8;
5568 // Don't include any padding.
5569 uint64_t Size = std::min(ActiveBits, P.size() * SizeOfByte);
5570 Fragments.push_back(
5571 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5572 }
5573 }
5574 ++NumPartitions;
5575 }
5576
5577 NumAllocaPartitions += NumPartitions;
5578 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5579
5580 // Migrate debug information from the old alloca to the new alloca(s)
5581 // and the individual partitions.
5582 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5583 // Can't overlap with undef memory.
5584 if (isKillAddress(DbgVariable))
5585 return;
5586
5587 const Value *DbgPtr = DbgVariable->getAddress();
5588     DIExpression::FragmentInfo VarFrag =
5589 DbgVariable->getFragmentOrEntireVariable();
5590 // Get the address expression constant offset if one exists and the ops
5591 // that come after it.
5592 int64_t CurrentExprOffsetInBytes = 0;
5593 SmallVector<uint64_t> PostOffsetOps;
5594 if (!getAddressExpression(DbgVariable)
5595 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5596 return; // Couldn't interpret this DIExpression - drop the var.
5597
5598 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5599 int64_t ExtractOffsetInBits = 0;
5600 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5601 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5602           Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5603 ExtractOffsetInBits = Op.getArg(0);
5604 break;
5605 }
5606 }
5607
5608 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5609 for (auto Fragment : Fragments) {
5610 int64_t OffsetFromLocationInBits;
5611 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5612 // Find the variable fragment that the new alloca slice covers.
5613 // Drop debug info for this variable fragment if we can't compute an
5614 // intersect between it and the alloca slice.
5615       if (!DIExpression::calculateFragmentIntersect(
5616 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5617 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5618 NewDbgFragment, OffsetFromLocationInBits))
5619 continue; // Do not migrate this fragment to this slice.
5620
5621 // Zero sized fragment indicates there's no intersect between the variable
5622 // fragment and the alloca slice. Skip this slice for this variable
5623 // fragment.
5624 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5625 continue; // Do not migrate this fragment to this slice.
5626
5627 // No fragment indicates DbgVariable's variable or fragment exactly
5628 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5629 if (!NewDbgFragment)
5630 NewDbgFragment = DbgVariable->getFragment();
5631
5632 // Reduce the new expression offset by the bit-extract offset since
5633 // we'll be keeping that.
5634 int64_t OffestFromNewAllocaInBits =
5635 OffsetFromLocationInBits - ExtractOffsetInBits;
5636 // We need to adjust an existing bit extract if the offset expression
5637 // can't eat the slack (i.e., if the new offset would be negative).
5638 int64_t BitExtractOffset =
5639 std::min<int64_t>(0, OffestFromNewAllocaInBits);
5640 // The magnitude of a negative value indicates the number of bits into
5641 // the existing variable fragment that the memory region begins. The new
5642 // variable fragment already excludes those bits - the new DbgPtr offset
5643 // only needs to be applied if it's positive.
5644 OffestFromNewAllocaInBits =
5645 std::max(int64_t(0), OffestFromNewAllocaInBits);
5646
5647 // Rebuild the expression:
5648 // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5649 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5650 // address expression but the value expression instead.
5651 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5652 if (OffestFromNewAllocaInBits > 0) {
5653 int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5654 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5655 }
5656
5657 // Remove any existing intrinsics on the new alloca describing
5658 // the variable fragment.
5659 auto RemoveOne = [DbgVariable](auto *OldDII) {
5660 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5661 return LHS->getVariable() == RHS->getVariable() &&
5662 LHS->getDebugLoc()->getInlinedAt() ==
5663 RHS->getDebugLoc()->getInlinedAt();
5664 };
5665 if (SameVariableFragment(OldDII, DbgVariable))
5666 OldDII->eraseFromParent();
5667 };
5668 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5669 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5670 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5671 NewDbgFragment, BitExtractOffset);
5672 }
5673 };
5674
5675 // Migrate debug information from the old alloca to the new alloca(s)
5676 // and the individual partitions.
5677 for_each(findDVRDeclares(&AI), MigrateOne);
5678 for_each(findDVRValues(&AI), MigrateOne);
5679 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5680
5681 return Changed;
5682}
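
// For illustration, a hypothetical debug-info migration (variable and
// metadata names are schematic): splitting
//
//   %pair = alloca { i32, i32 }
//     #dbg_declare(ptr %pair, !var, !DIExpression(), !loc)
//
// into two 4-byte partitions produces one fragment per new alloca:
//
//   #dbg_declare(ptr %pair.sroa.0, !var,
//                !DIExpression(DW_OP_LLVM_fragment, 0, 32), !loc)
//   #dbg_declare(ptr %pair.sroa.1, !var,
//                !DIExpression(DW_OP_LLVM_fragment, 32, 32), !loc)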
5683
5684/// Clobber a use with poison, deleting the used value if it becomes dead.
5685void SROA::clobberUse(Use &U) {
5686 Value *OldV = U;
5687   // Replace the use with a poison value.
5688 U = PoisonValue::get(OldV->getType());
5689
5690 // Check for this making an instruction dead. We have to garbage collect
5691 // all the dead instructions to ensure the uses of any alloca end up being
5692 // minimal.
5693 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5694 if (isInstructionTriviallyDead(OldI)) {
5695 DeadInsts.push_back(OldI);
5696 }
5697}
5698
5699/// A basic LoadAndStorePromoter that does not remove store nodes.
5700 class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5701public:
5702   BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5703 Type *ZeroType)
5704 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5705 bool shouldDelete(Instruction *I) const override {
5706 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5707 }
5708
5709   Value *getValueToUseForAlloca(Instruction *I) const override {
5710 return UndefValue::get(ZeroType);
5711 }
5712
5713private:
5714 Type *ZeroType;
5715};
5716
5717bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5718 // Look through each "partition", looking for slices with the same start/end
5719 // that do not overlap with any before them. The slices are sorted by
5720 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5721 // sophisticated algorithm that takes splittable slices into account.
5722 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5723 bool AllSameAndValid = true;
5724 Type *PartitionType = nullptr;
5725   SmallVector<Instruction *, 4> Insts;
5726 uint64_t BeginOffset = 0;
5727 uint64_t EndOffset = 0;
5728
5729 auto Flush = [&]() {
5730 if (AllSameAndValid && !Insts.empty()) {
5731 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5732 << EndOffset << ")\n");
5733       SmallVector<PHINode *, 4> NewPHIs;
5734 SSAUpdater SSA(&NewPHIs);
5735 Insts.push_back(&AI);
5736 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5737 Promoter.run(Insts);
5738 }
5739 AllSameAndValid = true;
5740 PartitionType = nullptr;
5741 Insts.clear();
5742 };
5743
5744 for (Slice &S : AS) {
5745 auto *User = cast<Instruction>(S.getUse()->getUser());
5746 if (isAssumeLikeIntrinsic(User)) {
5747 LLVM_DEBUG({
5748 dbgs() << "Ignoring slice: ";
5749 AS.print(dbgs(), &S);
5750 });
5751 continue;
5752 }
5753 if (S.beginOffset() >= EndOffset) {
5754 Flush();
5755 BeginOffset = S.beginOffset();
5756 EndOffset = S.endOffset();
5757 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5758 if (AllSameAndValid) {
5759 LLVM_DEBUG({
5760 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5761 << EndOffset << ")";
5762 AS.print(dbgs(), &S);
5763 });
5764 AllSameAndValid = false;
5765 }
5766 EndOffset = std::max(EndOffset, S.endOffset());
5767 continue;
5768 }
5769
5770 if (auto *LI = dyn_cast<LoadInst>(User)) {
5771 Type *UserTy = LI->getType();
5772 // LoadAndStorePromoter requires all the types to be the same.
5773 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5774 AllSameAndValid = false;
5775 PartitionType = UserTy;
5776 Insts.push_back(User);
5777 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5778 Type *UserTy = SI->getValueOperand()->getType();
5779 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5780 AllSameAndValid = false;
5781 PartitionType = UserTy;
5782 Insts.push_back(User);
5783 } else {
5784 AllSameAndValid = false;
5785 }
5786 }
5787
5788 Flush();
5789 return true;
5790}
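
// For illustration, on a hypothetical alloca that escapes read-only:
//
//   %a = alloca i32
//   store i32 %x, ptr %a
//   call void @observe(ptr readonly %a)
//   %v = load i32, ptr %a
//
// the load is rewritten to use %x directly, while the store and the alloca
// are left in place for the escaped reader.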
5791
5792/// Analyze an alloca for SROA.
5793///
5794/// This analyzes the alloca to ensure we can reason about it, builds
5795/// the slices of the alloca, and then hands it off to be split and
5796/// rewritten as needed.
5797std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5798SROA::runOnAlloca(AllocaInst &AI) {
5799 bool Changed = false;
5800 bool CFGChanged = false;
5801
5802 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5803 ++NumAllocasAnalyzed;
5804
5805 // Special case dead allocas, as they're trivial.
5806 if (AI.use_empty()) {
5807 AI.eraseFromParent();
5808 Changed = true;
5809 return {Changed, CFGChanged};
5810 }
5811 const DataLayout &DL = AI.getDataLayout();
5812
5813 // Skip alloca forms that this analysis can't handle.
5814 std::optional<TypeSize> Size = AI.getAllocationSize(DL);
5815 if (AI.isArrayAllocation() || !Size || Size->isScalable() || Size->isZero())
5816 return {Changed, CFGChanged};
5817
5818 // First, split any FCA loads and stores touching this alloca to promote
5819 // better splitting and promotion opportunities.
5820 IRBuilderTy IRB(&AI);
5821 AggLoadStoreRewriter AggRewriter(DL, IRB);
5822 Changed |= AggRewriter.rewrite(AI);
5823
5824 // Build the slices using a recursive instruction-visiting builder.
5825 AllocaSlices AS(DL, AI);
5826 LLVM_DEBUG(AS.print(dbgs()));
5827 if (AS.isEscaped())
5828 return {Changed, CFGChanged};
5829
5830 if (AS.isEscapedReadOnly()) {
5831 Changed |= propagateStoredValuesToLoads(AI, AS);
5832 return {Changed, CFGChanged};
5833 }
5834
5835 // Delete all the dead users of this alloca before splitting and rewriting it.
5836 for (Instruction *DeadUser : AS.getDeadUsers()) {
5837 // Free up everything used by this instruction.
5838 for (Use &DeadOp : DeadUser->operands())
5839 clobberUse(DeadOp);
5840
5841 // Now replace the uses of this instruction.
5842 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5843
5844 // And mark it for deletion.
5845 DeadInsts.push_back(DeadUser);
5846 Changed = true;
5847 }
5848 for (Use *DeadOp : AS.getDeadOperands()) {
5849 clobberUse(*DeadOp);
5850 Changed = true;
5851 }
5852
5853 // No slices to split. Leave the dead alloca for a later pass to clean up.
5854 if (AS.begin() == AS.end())
5855 return {Changed, CFGChanged};
5856
5857 Changed |= splitAlloca(AI, AS);
5858
5859 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5860 while (!SpeculatablePHIs.empty())
5861 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5862
5863 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5864 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5865 while (!RemainingSelectsToRewrite.empty()) {
5866 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5867 CFGChanged |=
5868 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5869 }
5870
5871 return {Changed, CFGChanged};
5872}
5873
5874/// Delete the dead instructions accumulated in this run.
5875///
5876/// Recursively deletes the dead instructions we've accumulated. This is done
5877/// at the very end to maximize locality of the recursive delete and to
5878/// minimize the problems of invalidated instruction pointers as such pointers
5879/// are used heavily in the intermediate stages of the algorithm.
5880///
5881/// We also record the alloca instructions deleted here so that they aren't
5882/// subsequently handed to mem2reg to promote.
5883bool SROA::deleteDeadInstructions(
5884 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5885 bool Changed = false;
5886 while (!DeadInsts.empty()) {
5887 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5888 if (!I)
5889 continue;
5890 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5891
5892 // If the instruction is an alloca, find the possible dbg.declare connected
5893 // to it, and remove it too. We must do this before calling RAUW or we will
5894 // not be able to find it.
5895 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5896 DeletedAllocas.insert(AI);
5897 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5898 OldDII->eraseFromParent();
5899 }
5900
5901 at::deleteAssignmentMarkers(I);
5902 I->replaceAllUsesWith(UndefValue::get(I->getType()));
5903
5904 for (Use &Operand : I->operands())
5905 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5906 // Zero out the operand and see if it becomes trivially dead.
5907 Operand = nullptr;
5908 if (isInstructionTriviallyDead(U))
5909 DeadInsts.push_back(U);
5910 }
5911
5912 ++NumDeleted;
5913 I->eraseFromParent();
5914 Changed = true;
5915 }
5916 return Changed;
5917}
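// Editorial note on the cascade above: when a dead load is popped off
// DeadInsts, its pointer operand is nulled; if that operand was, say, a GEP
// whose only user was the load, the GEP now tests trivially dead and is
// queued in turn, so entire dead address-computation chains unwind without a
// separate fixed-point pass.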
5918/// Promote the allocas, using the best available technique.
5919///
5920/// This attempts to promote whatever allocas have been identified as viable in
5921/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5922/// This function returns whether any promotion occurred.
5923bool SROA::promoteAllocas() {
5924 if (PromotableAllocas.empty())
5925 return false;
5926
5927 if (SROASkipMem2Reg) {
5928 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5929 } else {
5930 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5931 NumPromoted += PromotableAllocas.size();
5932 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
5933 }
5934
5935 PromotableAllocas.clear();
5936 return true;
5937}
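// Editorial sketch of the promotion itself, assuming a trivially promotable
// alloca (this is standard mem2reg behaviour, not code from this file):
//
//   %x = alloca i32
//   store i32 %a, ptr %x
//   %v = load i32, ptr %x
//
// PromoteMemToReg replaces each load with the value of the store that reaches
// it (%v's uses become %a), inserting PHI nodes where multiple stores reach a
// load, and then erases the alloca.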
5938
5939std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
5940 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
5941
5942 const DataLayout &DL = F.getDataLayout();
5943 BasicBlock &EntryBB = F.getEntryBlock();
5944 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
5945 I != E; ++I) {
5946 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5947 std::optional<TypeSize> Size = AI->getAllocationSize(DL);
5948 if (Size && Size->isScalable() && isAllocaPromotable(AI))
5949 PromotableAllocas.insert(AI);
5950 else
5951 Worklist.insert(AI);
5952 }
5953 }
5954
5955 bool Changed = false;
5956 bool CFGChanged = false;
5957 // A set of deleted alloca instruction pointers which should be removed from
5958 // the list of promotable allocas.
5959 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
5960
5961 do {
5962 while (!Worklist.empty()) {
5963 auto [IterationChanged, IterationCFGChanged] =
5964 runOnAlloca(*Worklist.pop_back_val());
5965 Changed |= IterationChanged;
5966 CFGChanged |= IterationCFGChanged;
5967
5968 Changed |= deleteDeadInstructions(DeletedAllocas);
5969
5970 // Remove the deleted allocas from various lists so that we don't try to
5971 // continue processing them.
5972 if (!DeletedAllocas.empty()) {
5973 Worklist.set_subtract(DeletedAllocas);
5974 PostPromotionWorklist.set_subtract(DeletedAllocas);
5975 PromotableAllocas.set_subtract(DeletedAllocas);
5976 DeletedAllocas.clear();
5977 }
5978 }
5979
5980 Changed |= promoteAllocas();
5981
5982 Worklist = PostPromotionWorklist;
5983 PostPromotionWorklist.clear();
5984 } while (!Worklist.empty());
5985
5986 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
5987 assert((!CFGChanged || !PreserveCFG) &&
5988 "Should not have modified the CFG when told to preserve it.");
5989
5990 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
5991 for (auto &BB : F) {
5992 RemoveRedundantDbgInstrs(&BB);
5993 }
5994 }
5995
5996 return {Changed, CFGChanged};
5997}
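// Editorial note on the iteration scheme above: splitting may create new
// allocas that can only be reconsidered after the current round of mem2reg,
// and those are parked on PostPromotionWorklist; swapping it into Worklist
// keeps the do/while loop running until no alloca yields further work.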
5998
5999PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6000 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
6001 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
6002 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6003 auto [Changed, CFGChanged] =
6004 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6005 if (!Changed)
6006 return PreservedAnalyses::all();
6007 PreservedAnalyses PA;
6008 if (!CFGChanged)
6009 PA.preserveSet<CFGAnalyses>();
6010 PA.preserve<DominatorTreeAnalysis>();
6011 return PA;
6012}
6013
6014void SROAPass::printPipeline(
6015 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6016 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6017 OS, MapClassName2PassName);
6018 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6019 : "<modify-cfg>");
6020}
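// Editorial usage note: the text printed above matches the pass-pipeline
// syntax accepted by opt, e.g.
//
//   opt -passes='sroa<preserve-cfg>' -S input.ll
//
// with 'sroa<modify-cfg>' selecting the CFG-modifying variant.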
6021
6022SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6023
6024namespace {
6025
6026/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6027class SROALegacyPass : public FunctionPass {
6028 SROAOptions PreserveCFG;
6029
6030public:
6031 static char ID;
6032
6033 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6034 : FunctionPass(ID), PreserveCFG(PreserveCFG) {
6035 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6036 }
6037
6038 bool runOnFunction(Function &F) override {
6039 if (skipFunction(F))
6040 return false;
6041
6042 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6043 AssumptionCache &AC =
6044 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6045 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6046 auto [Changed, _] =
6047 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6048 return Changed;
6049 }
6050
6051 void getAnalysisUsage(AnalysisUsage &AU) const override {
6052 AU.addRequired<AssumptionCacheTracker>();
6053 AU.addRequired<DominatorTreeWrapperPass>();
6054 AU.addPreserved<GlobalsAAWrapperPass>();
6055 AU.addPreserved<DominatorTreeWrapperPass>();
6056 }
6057
6058 StringRef getPassName() const override { return "SROA"; }
6059};
6060
6061} // end anonymous namespace
6062
6063char SROALegacyPass::ID = 0;
6064
6065FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6066 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6067 : SROAOptions::ModifyCFG);
6068}
6069
6070INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6071 "Scalar Replacement Of Aggregates", false, false)
6072INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6073INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6074INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6075 false, false)