1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
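//
// As a brief illustration (a hedged sketch, not drawn from any particular
// test case), a trivially promotable alloca such as:
//
//   %p = alloca i32
//   store i32 %x, ptr %p
//   %v = load i32, ptr %p
//
// ends up with %v rewritten to %x and the alloca, store, and load deleted
// once promotion (SSA formation) has run.
//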
24
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/LLVMContext.h"
67#include "llvm/IR/Metadata.h"
68#include "llvm/IR/Module.h"
69#include "llvm/IR/Operator.h"
70#include "llvm/IR/PassManager.h"
71#include "llvm/IR/Type.h"
72#include "llvm/IR/Use.h"
73#include "llvm/IR/User.h"
74#include "llvm/IR/Value.h"
75#include "llvm/IR/ValueHandle.h"
77#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
89#include <algorithm>
90#include <cassert>
91#include <cstddef>
92#include <cstdint>
93#include <cstring>
94#include <iterator>
95#include <queue>
96#include <string>
97#include <tuple>
98#include <utility>
99#include <variant>
100#include <vector>
101
102using namespace llvm;
103
104#define DEBUG_TYPE "sroa"
105
106STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
107STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
108STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
109STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
110STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
111STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
112STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
113STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
114STATISTIC(NumLoadsPredicated,
115 "Number of loads rewritten into predicated loads to allow promotion");
116STATISTIC(
117 NumStoresPredicated,
118 "Number of stores rewritten into predicated stores to allow promotion");
119STATISTIC(NumDeleted, "Number of instructions deleted");
120STATISTIC(NumVectorized, "Number of vectorized aggregates");
121
122namespace llvm {
123/// Disable running mem2reg during SROA in order to test or debug SROA.
124static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
125 cl::Hidden);
127} // namespace llvm
128
129namespace {
130
131class AllocaSliceRewriter;
132class AllocaSlices;
133class Partition;
134
135class SelectHandSpeculativity {
136 unsigned char Storage = 0; // None are speculatable by default.
137 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
138 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
139public:
140 SelectHandSpeculativity() = default;
141 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
142 bool isSpeculatable(bool isTrueVal) const;
143 bool areAllSpeculatable() const;
144 bool areAnySpeculatable() const;
145 bool areNoneSpeculatable() const;
146 // For interop as int half of PointerIntPair.
147 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
148 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
149};
150static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
151
152using PossiblySpeculatableLoad =
153 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
154using UnspeculatableStore = StoreInst *;
155using RewriteableMemOp =
156 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
157using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
158
159/// An optimization pass providing Scalar Replacement of Aggregates.
160///
161/// This pass takes allocations which can be completely analyzed (that is, they
162/// don't escape) and tries to turn them into scalar SSA values. There are
163/// a few steps to this process.
164///
165/// 1) It takes allocations of aggregates and analyzes the ways in which they
166/// are used to try to split them into smaller allocations, ideally of
167/// a single scalar data type. It will split up memcpy and memset accesses
168/// as necessary and try to isolate individual scalar accesses.
169/// 2) It will transform accesses into forms which are suitable for SSA value
170/// promotion. This can be replacing a memset with a scalar store of an
171/// integer value, or it can involve speculating operations on a PHI or
172/// select to be a PHI or select of the results.
173/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
174/// onto insert and extract operations on a vector value, and convert them to
175/// this form. By doing so, it will enable promotion of vector aggregates to
176/// SSA vector values.
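///
/// As an illustrative sketch of step 3 (not taken from a specific test case):
///   %a = alloca [2 x float]
///   store float %x, ptr %a
///   %p1 = getelementptr inbounds float, ptr %a, i64 1
///   store float %y, ptr %p1
///   %v = load <2 x float>, ptr %a
/// can be rewritten so that %v is built from %x and %y with insertelement
/// instructions on a <2 x float> SSA value, removing the alloca entirely.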
177class SROA {
178 LLVMContext *const C;
179 DomTreeUpdater *const DTU;
180 AssumptionCache *const AC;
181 const bool PreserveCFG;
182
183 /// Worklist of alloca instructions to simplify.
184 ///
185 /// Each alloca in the function is added to this. Each new alloca formed gets
186 /// added to it as well to recursively simplify unless that alloca can be
187 /// directly promoted. Finally, each time we rewrite a use of an alloca other
188 /// the one being actively rewritten, we add it back onto the list if not
189 /// than the one being actively rewritten, we add it back onto the list if not
190 SmallSetVector<AllocaInst *, 16> Worklist;
191
192 /// A collection of instructions to delete.
193 /// We try to batch deletions to simplify code and make things a bit more
194 /// efficient. We also make sure there are no dangling pointers.
195 SmallVector<WeakVH, 8> DeadInsts;
196
197 /// Post-promotion worklist.
198 ///
199 /// Sometimes we discover an alloca which has a high probability of becoming
200 /// viable for SROA after a round of promotion takes place. In those cases,
201 /// the alloca is enqueued here for re-processing.
202 ///
203 /// Note that we have to be very careful to clear allocas out of this list in
204 /// the event they are deleted.
205 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
206
207 /// A collection of alloca instructions we can directly promote.
208 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
209 SmallPtrSet<AllocaInst *, 16>, 16>
210 PromotableAllocas;
211
212 /// A worklist of PHIs to speculate prior to promoting allocas.
213 ///
214 /// All of these PHIs have been checked for the safety of speculation and by
215 /// being speculated will allow promoting allocas currently in the promotable
216 /// queue.
217 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
218
219 /// A worklist of select instructions to rewrite prior to promoting
220 /// allocas.
221 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
222
223 /// Select instructions that use an alloca and are subsequently loaded can be
224 /// rewritten to load both input pointers and then select between the results,
225 /// allowing the load of the alloca to be promoted.
226 /// From this:
227 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
228 /// %V = load <type>, ptr %P2
229 /// to:
230 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
231 /// %V2 = load <type>, ptr %Other
232 /// %V = select i1 %cond, <type> %V1, <type> %V2
233 ///
234 /// We can do this to a select if its only uses are loads
235 /// and if either the operand to the select can be loaded unconditionally,
236 /// or if we are allowed to perform CFG modifications.
237 /// If an intervening bitcast with a single use of the load is found, the
238 /// promotion is still allowed.
239 static std::optional<RewriteableMemOps>
240 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
241
242public:
243 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
244 SROAOptions PreserveCFG_)
245 : C(C), DTU(DTU), AC(AC),
246 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
247
248 /// Main run method used by both the SROAPass and by the legacy pass.
249 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
250
251private:
252 friend class AllocaSliceRewriter;
253
254 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
255 std::pair<AllocaInst *, uint64_t>
256 rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
257 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
258 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
259 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
260 void clobberUse(Use &U);
261 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
262 bool promoteAllocas();
263};
264
265} // end anonymous namespace
266
267/// Calculate the fragment of a variable to use when slicing a store
268/// based on the slice dimensions, existing fragment, and base storage
269/// fragment.
270/// Results:
271/// UseFrag - Use Target as the new fragment.
272/// UseNoFrag - The new slice already covers the whole variable.
273/// Skip - The new alloca slice doesn't include this variable.
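/// For example (an illustrative sketch): slicing a store so that the new
/// slice covers bits [32, 64) of a 64-bit variable whose base storage
/// fragment is the whole variable yields Target = {OffsetInBits: 32,
/// SizeInBits: 32} and the result UseFrag; if instead the slice covered all
/// 64 bits and there were no pre-existing fragment, the result would be
/// UseNoFrag.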
274/// FIXME: Can we use calculateFragmentIntersect instead?
275namespace {
276enum FragCalcResult { UseFrag, UseNoFrag, Skip };
277}
278static FragCalcResult
279calculateFragment(DILocalVariable *Variable,
280 uint64_t NewStorageSliceOffsetInBits,
281 uint64_t NewStorageSliceSizeInBits,
282 std::optional<DIExpression::FragmentInfo> StorageFragment,
283 std::optional<DIExpression::FragmentInfo> CurrentFragment,
284 DIExpression::FragmentInfo &Target) {
285 // If the base storage describes part of the variable, apply the offset and
286 // the size constraint.
287 if (StorageFragment) {
288 Target.SizeInBits =
289 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
290 Target.OffsetInBits =
291 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
292 } else {
293 Target.SizeInBits = NewStorageSliceSizeInBits;
294 Target.OffsetInBits = NewStorageSliceOffsetInBits;
295 }
296
297 // If this slice extracts the entirety of an independent variable from a
298 // larger alloca, do not produce a fragment expression, as the variable is
299 // not fragmented.
300 if (!CurrentFragment) {
301 if (auto Size = Variable->getSizeInBits()) {
302 // Treat the current fragment as covering the whole variable.
303 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
304 if (Target == CurrentFragment)
305 return UseNoFrag;
306 }
307 }
308
309 // No additional work to do if there isn't a fragment already, or there is
310 // but it already exactly describes the new assignment.
311 if (!CurrentFragment || *CurrentFragment == Target)
312 return UseFrag;
313
314 // Reject the target fragment if it doesn't fit wholly within the current
315 // fragment. TODO: We could instead chop up the target to fit in the case of
316 // a partial overlap.
317 if (Target.startInBits() < CurrentFragment->startInBits() ||
318 Target.endInBits() > CurrentFragment->endInBits())
319 return Skip;
320
321 // Target fits within the current fragment, return it.
322 return UseFrag;
323}
324
325static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
326 return DebugVariable(DVR->getVariable(), std::nullopt,
327 DVR->getDebugLoc().getInlinedAt());
328}
329
330/// Find linked dbg.assign and generate a new one with the correct
331/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
332/// value component is copied from the old dbg.assign to the new.
333/// \param OldAlloca Alloca for the variable before splitting.
334/// \param IsSplit True if the store (not necessarily alloca)
335/// is being split.
336/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
337/// \param SliceSizeInBits New number of bits being written to.
338/// \param OldInst Instruction that is being split.
339/// \param Inst New instruction performing this part of the
340/// split store.
341/// \param Dest Store destination.
342/// \param Value Stored value.
343/// \param DL Datalayout.
344static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
345 uint64_t OldAllocaOffsetInBits,
346 uint64_t SliceSizeInBits, Instruction *OldInst,
347 Instruction *Inst, Value *Dest, Value *Value,
348 const DataLayout &DL) {
349 // If we want allocas to be migrated using this helper then we need to ensure
350 // that the BaseFragments map code still works. A simple solution would be
351 // to choose to always clone alloca dbg_assigns (rather than sometimes
352 // "stealing" them).
353 assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
354
355 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
356 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
357 if (DVRAssignMarkerRange.empty())
358 return;
359
360 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
361 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
362 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
363 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
364 << "\n");
365 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
366 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
367 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
368 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
369 if (Value)
370 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
371
372 /// Map of aggregate variables to their fragment associated with OldAlloca.
373 SmallDenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
374 BaseFragments;
375 for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
376 BaseFragments[getAggregateVariable(DVR)] =
377 DVR->getExpression()->getFragmentInfo();
378
379 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
380 // one). It shouldn't already have one: assert this assumption.
381 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
382 DIAssignID *NewID = nullptr;
383 auto &Ctx = Inst->getContext();
384 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
385 assert(OldAlloca->isStaticAlloca());
386
387 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
388 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
389 << "\n");
390 auto *Expr = DbgAssign->getExpression();
391 bool SetKillLocation = false;
392
393 if (IsSplit) {
394 std::optional<DIExpression::FragmentInfo> BaseFragment;
395 {
396 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
397 if (R == BaseFragments.end())
398 return;
399 BaseFragment = R->second;
400 }
401 std::optional<DIExpression::FragmentInfo> CurrentFragment =
402 Expr->getFragmentInfo();
403 DIExpression::FragmentInfo NewFragment;
404 FragCalcResult Result = calculateFragment(
405 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
406 BaseFragment, CurrentFragment, NewFragment);
407
408 if (Result == Skip)
409 return;
410 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
411 if (CurrentFragment) {
412 // Rewrite NewFragment to be relative to the existing one (this is
413 // what createFragmentExpression wants). CalculateFragment has
414 // already resolved the size for us. FIXME: Should it return the
415 // relative fragment too?
416 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
417 }
418 // Add the new fragment info to the existing expression if possible.
419 if (std::optional<DIExpression *> E = DIExpression::createFragmentExpression(
420 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
421 Expr = *E;
422 } else {
423 // Otherwise, add the new fragment info to an empty expression and
424 // discard the value component of this dbg.assign as the value cannot
425 // be computed with the new fragment.
426 Expr = *DIExpression::createFragmentExpression(
427 DIExpression::get(Expr->getContext(), {}),
428 NewFragment.OffsetInBits, NewFragment.SizeInBits);
429 SetKillLocation = true;
430 }
431 }
432 }
433
434 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
435 if (!NewID) {
436 NewID = DIAssignID::getDistinct(Ctx);
437 Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
438 }
439
440 DbgVariableRecord *NewAssign;
441 if (IsSplit) {
442 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
444 DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
445 Dest, DIExpression::get(Expr->getContext(), {}),
446 DbgAssign->getDebugLoc())));
447 } else {
448 // The store is not split, simply steal the existing dbg_assign.
449 NewAssign = DbgAssign;
450 NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
451 NewAssign->setAddress(Dest);
452 if (Value)
453 NewAssign->replaceVariableLocationOp(0u, Value);
454 assert(Expr == NewAssign->getExpression());
455 }
456
457 // If we've updated the value but the original dbg.assign has an arglist
458 // then kill it now - we can't use the requested new value.
459 // We can't replace the DIArgList with the new value as it'd leave
460 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
461 // an arglist). And we can't keep the DIArgList in case the linked store
462 // is being split - in which case the DIArgList + expression may no longer
463 // be computing the correct value.
464 // This should be a very rare situation as it requires the value being
465 // stored to differ from the dbg.assign (i.e., the value has been
466 // represented differently in the debug intrinsic for some reason).
467 SetKillLocation |=
468 Value && (DbgAssign->hasArgList() ||
469 !DbgAssign->getExpression()->isSingleLocationExpression());
470 if (SetKillLocation)
471 NewAssign->setKillLocation();
472
473 // We could use more precision here at the cost of some additional (code)
474 // complexity - if the original dbg.assign was adjacent to its store, we
475 // could position this new dbg.assign adjacent to its store rather than the
476 // old dbg.assign. That would result in interleaved dbg.assigns rather than
477 // what we get now:
478 // split store !1
479 // split store !2
480 // dbg.assign !1
481 // dbg.assign !2
482 // This (current behaviour) results in debug assignments being
483 // noted as slightly offset (in code) from the store. In practice this
484 // should have little effect on the debugging experience due to the fact
485 // that all the split stores should get the same line number.
486 if (NewAssign != DbgAssign) {
487 NewAssign->moveBefore(DbgAssign->getIterator());
488 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
489 }
490 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
491 };
492
493 for_each(DVRAssignMarkerRange, MigrateDbgAssign);
494}
495
496namespace {
497
498/// A custom IRBuilder inserter which prefixes all names, but only in
499/// Assert builds.
500class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
501 std::string Prefix;
502
503 Twine getNameWithPrefix(const Twine &Name) const {
504 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
505 }
506
507public:
508 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
509
510 void InsertHelper(Instruction *I, const Twine &Name,
511 BasicBlock::iterator InsertPt) const override {
512 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
513 InsertPt);
514 }
515};
516
517/// Provide a type for IRBuilder that drops names in release builds.
518using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
519
520/// A used slice of an alloca.
521///
522/// This structure represents a slice of an alloca used by some instruction. It
523/// stores both the begin and end offsets of this use, a pointer to the use
524/// itself, and a flag indicating whether we can classify the use as splittable
525/// or not when forming partitions of the alloca.
526class Slice {
527 /// The beginning offset of the range.
528 uint64_t BeginOffset = 0;
529
530 /// The ending offset, not included in the range.
531 uint64_t EndOffset = 0;
532
533 /// Storage for both the use of this slice and whether it can be
534 /// split.
535 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
536
537public:
538 Slice() = default;
539
540 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable,
541 Value *ProtectedFieldDisc)
542 : BeginOffset(BeginOffset), EndOffset(EndOffset),
543 UseAndIsSplittable(U, IsSplittable),
544 ProtectedFieldDisc(ProtectedFieldDisc) {}
545
546 uint64_t beginOffset() const { return BeginOffset; }
547 uint64_t endOffset() const { return EndOffset; }
548
549 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
550 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
551
552 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
553
554 bool isDead() const { return getUse() == nullptr; }
555 void kill() { UseAndIsSplittable.setPointer(nullptr); }
556
557 // When this access is via an llvm.protected.field.ptr intrinsic, contains
558 // the second argument to the intrinsic, the discriminator.
559 Value *ProtectedFieldDisc;
560
561 /// Support for ordering ranges.
562 ///
563 /// This provides an ordering over ranges such that start offsets are
564 /// always increasing, and within equal start offsets, the end offsets are
565 /// decreasing. Thus the spanning range comes first in a cluster with the
566 /// same start position.
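 ///
 /// For example (illustrative only), slices with (begin, end) offsets of
 /// (0, 16), (0, 8), and (8, 12), all equally splittable, sort in exactly
 /// that order: equal begin offsets are broken by the larger end offset
 /// coming first.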
567 bool operator<(const Slice &RHS) const {
568 if (beginOffset() < RHS.beginOffset())
569 return true;
570 if (beginOffset() > RHS.beginOffset())
571 return false;
572 if (isSplittable() != RHS.isSplittable())
573 return !isSplittable();
574 if (endOffset() > RHS.endOffset())
575 return true;
576 return false;
577 }
578
579 /// Support comparison with a single offset to allow binary searches.
580 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
581 return LHS.beginOffset() < RHSOffset;
582 }
583 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
584 return LHSOffset < RHS.beginOffset();
585 }
586
587 bool operator==(const Slice &RHS) const {
588 return isSplittable() == RHS.isSplittable() &&
589 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
590 }
591 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
592};
593
594/// Representation of the alloca slices.
595///
596/// This class represents the slices of an alloca which are formed by its
597/// various uses. If a pointer escapes, we can't fully build a representation
598/// for the slices used and we reflect that in this structure. The uses are
599/// stored, sorted by increasing beginning offset and with unsplittable slices
600/// starting at a particular offset before splittable slices.
601class AllocaSlices {
602public:
603 /// Construct the slices of a particular alloca.
604 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
605
606 /// Test whether a pointer to the allocation escapes our analysis.
607 ///
608 /// If this is true, the slices are never fully built and should be
609 /// ignored.
610 bool isEscaped() const { return PointerEscapingInstr; }
611 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
612
613 /// Support for iterating over the slices.
614 /// @{
615 using iterator = SmallVectorImpl<Slice>::iterator;
616 using range = iterator_range<iterator>;
617
618 iterator begin() { return Slices.begin(); }
619 iterator end() { return Slices.end(); }
620
621 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
622 using const_range = iterator_range<const_iterator>;
623
624 const_iterator begin() const { return Slices.begin(); }
625 const_iterator end() const { return Slices.end(); }
626 /// @}
627
628 /// Erase a range of slices.
629 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
630
631 /// Insert new slices for this alloca.
632 ///
633 /// This moves the slices into the alloca's slices collection, and re-sorts
634 /// everything so that the usual ordering properties of the alloca's slices
635 /// hold.
636 void insert(ArrayRef<Slice> NewSlices) {
637 int OldSize = Slices.size();
638 Slices.append(NewSlices.begin(), NewSlices.end());
639 auto SliceI = Slices.begin() + OldSize;
640 std::stable_sort(SliceI, Slices.end());
641 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
642 }
643
644 // Forward declare the iterator and range accessor for walking the
645 // partitions.
646 class partition_iterator;
647 iterator_range<partition_iterator> partitions();
648
649 /// Access the dead users for this alloca.
650 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
651
652 /// Access the users for this alloca that are llvm.protected.field.ptr
653 /// intrinsics.
654 ArrayRef<IntrinsicInst *> getPFPUsers() const { return PFPUsers; }
655
656 /// Access Uses that should be dropped if the alloca is promotable.
657 ArrayRef<Use *> getDeadUsesIfPromotable() const {
658 return DeadUseIfPromotable;
659 }
660
661 /// Access the dead operands referring to this alloca.
662 ///
663 /// These are operands which cannot actually be used to refer to the
664 /// alloca as they are outside its range and the user doesn't correct for
665 /// that. These mostly consist of PHI node inputs and the like which we just
666 /// need to replace with undef.
667 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
668
669#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
670 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
671 void printSlice(raw_ostream &OS, const_iterator I,
672 StringRef Indent = " ") const;
673 void printUse(raw_ostream &OS, const_iterator I,
674 StringRef Indent = " ") const;
675 void print(raw_ostream &OS) const;
676 void dump(const_iterator I) const;
677 void dump() const;
678#endif
679
680private:
681 template <typename DerivedT, typename RetT = void> class BuilderBase;
682 class SliceBuilder;
683
684 friend class AllocaSlices::SliceBuilder;
685
686#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
687 /// Handle to alloca instruction to simplify method interfaces.
688 AllocaInst &AI;
689#endif
690
691 /// The instruction responsible for this alloca not having a known set
692 /// of slices.
693 ///
694 /// When an instruction (potentially) escapes the pointer to the alloca, we
695 /// store a pointer to that here and abort trying to form slices of the
696 /// alloca. This will be null if the alloca slices are analyzed successfully.
697 Instruction *PointerEscapingInstr;
698 Instruction *PointerEscapingInstrReadOnly;
699
700 /// The slices of the alloca.
701 ///
702 /// We store a vector of the slices formed by uses of the alloca here. This
703 /// vector is sorted by increasing begin offset, and then the unsplittable
704 /// slices before the splittable ones. See the Slice inner class for more
705 /// details.
706 SmallVector<Slice, 8> Slices;
707
708 /// Instructions which will become dead if we rewrite the alloca.
709 ///
710 /// Note that these are not separated by slice. This is because we expect an
711 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
712 /// all these instructions can simply be removed and replaced with poison as
713 /// they come from outside of the allocated space.
714 SmallVector<Instruction *, 8> DeadUsers;
715
716 /// Users that are llvm.protected.field.ptr intrinsics. These will be RAUW'd
717 /// to their first argument if we rewrite the alloca.
718 SmallVector<IntrinsicInst *, 8> PFPUsers;
719
720 /// Uses which will become dead if can promote the alloca.
721 SmallVector<Use *, 8> DeadUseIfPromotable;
722
723 /// Operands which will become dead if we rewrite the alloca.
724 ///
725 /// These are operands that in their particular use can be replaced with
726 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
727 /// to PHI nodes and the like. They aren't entirely dead (there might be
728 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
729 /// want to swap this particular input for poison to simplify the use lists of
730 /// the alloca.
731 SmallVector<Use *, 8> DeadOperands;
732};
733
734/// A partition of the slices.
735///
736/// An ephemeral representation for a range of slices which can be viewed as
737/// a partition of the alloca. This range represents a span of the alloca's
738/// memory which cannot be split, and provides access to all of the slices
739/// overlapping some part of the partition.
740///
741/// Objects of this type are produced by traversing the alloca's slices, but
742/// are only ephemeral and not persistent.
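///
/// As an illustrative sketch: given one unsplittable slice covering [0, 8)
/// and one splittable slice covering [0, 16), the first partition spans
/// [0, 8) and contains both slices, while the second spans [8, 16) and
/// contains no slices at all, only the split tail of the splittable one.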
743class Partition {
744private:
745 friend class AllocaSlices;
746 friend class AllocaSlices::partition_iterator;
747
748 using iterator = AllocaSlices::iterator;
749
750 /// The beginning and ending offsets of the alloca for this
751 /// partition.
752 uint64_t BeginOffset = 0, EndOffset = 0;
753
754 /// The start and end iterators of this partition.
755 iterator SI, SJ;
756
757 /// A collection of split slice tails overlapping the partition.
758 SmallVector<Slice *, 4> SplitTails;
759
760 /// Raw constructor builds an empty partition starting and ending at
761 /// the given iterator.
762 Partition(iterator SI) : SI(SI), SJ(SI) {}
763
764public:
765 /// The start offset of this partition.
766 ///
767 /// All of the contained slices start at or after this offset.
768 uint64_t beginOffset() const { return BeginOffset; }
769
770 /// The end offset of this partition.
771 ///
772 /// All of the contained slices end at or before this offset.
773 uint64_t endOffset() const { return EndOffset; }
774
775 /// The size of the partition.
776 ///
777 /// Note that this can never be zero.
778 uint64_t size() const {
779 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
780 return EndOffset - BeginOffset;
781 }
782
783 /// Test whether this partition contains no slices, and merely spans
784 /// a region occupied by split slices.
785 bool empty() const { return SI == SJ; }
786
787 /// \name Iterate slices that start within the partition.
788 /// These may be splittable or unsplittable. They have a begin offset >= the
789 /// partition begin offset.
790 /// @{
791 // FIXME: We should probably define a "concat_iterator" helper and use that
792 // to stitch together pointee_iterators over the split tails and the
793 // contiguous iterators of the partition. That would give a much nicer
794 // interface here. We could then additionally expose filtered iterators for
795 // split, unsplit, and unsplittable slices based on the usage patterns.
796 iterator begin() const { return SI; }
797 iterator end() const { return SJ; }
798 /// @}
799
800 /// Get the sequence of split slice tails.
801 ///
802 /// These tails are of slices which start before this partition but are
803 /// split and overlap into the partition. We accumulate these while forming
804 /// partitions.
805 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
806};
807
808} // end anonymous namespace
809
810/// An iterator over partitions of the alloca's slices.
811///
812/// This iterator implements the core algorithm for partitioning the alloca's
813/// slices. It is a forward iterator as we don't support backtracking for
814/// efficiency reasons, and re-use a single storage area to maintain the
815/// current set of split slices.
816///
817/// It is templated on the slice iterator type to use so that it can operate
818/// with either const or non-const slice iterators.
819class AllocaSlices::partition_iterator
820 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
821 Partition> {
822 friend class AllocaSlices;
823
824 /// Most of the state for walking the partitions is held in a class
825 /// with a nice interface for examining them.
826 Partition P;
827
828 /// We need to keep the end of the slices to know when to stop.
829 AllocaSlices::iterator SE;
830
831 /// We also need to keep track of the maximum split end offset seen.
832 /// FIXME: Do we really?
833 uint64_t MaxSplitSliceEndOffset = 0;
834
835 /// Sets the partition to be empty at given iterator, and sets the
836 /// end iterator.
837 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
838 : P(SI), SE(SE) {
839 // If not already at the end, advance our state to form the initial
840 // partition.
841 if (SI != SE)
842 advance();
843 }
844
845 /// Advance the iterator to the next partition.
846 ///
847 /// Requires that the iterator not be at the end of the slices.
848 void advance() {
849 assert((P.SI != SE || !P.SplitTails.empty()) &&
850 "Cannot advance past the end of the slices!");
851
852 // Clear out any split uses which have ended.
853 if (!P.SplitTails.empty()) {
854 if (P.EndOffset >= MaxSplitSliceEndOffset) {
855 // If we've finished all splits, this is easy.
856 P.SplitTails.clear();
857 MaxSplitSliceEndOffset = 0;
858 } else {
859 // Remove the uses which have ended in the prior partition. This
860 // cannot change the max split slice end because we just checked that
861 // the prior partition ended prior to that max.
862 llvm::erase_if(P.SplitTails,
863 [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
864 assert(llvm::any_of(P.SplitTails,
865 [&](Slice *S) {
866 return S->endOffset() == MaxSplitSliceEndOffset;
867 }) &&
868 "Could not find the current max split slice offset!");
869 assert(llvm::all_of(P.SplitTails,
870 [&](Slice *S) {
871 return S->endOffset() <= MaxSplitSliceEndOffset;
872 }) &&
873 "Max split slice end offset is not actually the max!");
874 }
875 }
876
877 // If P.SI is already at the end, then we've cleared the split tail and
878 // now have an end iterator.
879 if (P.SI == SE) {
880 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
881 return;
882 }
883
884 // If we had a non-empty partition previously, set up the state for
885 // subsequent partitions.
886 if (P.SI != P.SJ) {
887 // Accumulate all the splittable slices which started in the old
888 // partition into the split list.
889 for (Slice &S : P)
890 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
891 P.SplitTails.push_back(&S);
892 MaxSplitSliceEndOffset =
893 std::max(S.endOffset(), MaxSplitSliceEndOffset);
894 }
895
896 // Start from the end of the previous partition.
897 P.SI = P.SJ;
898
899 // If P.SI is now at the end, we at most have a tail of split slices.
900 if (P.SI == SE) {
901 P.BeginOffset = P.EndOffset;
902 P.EndOffset = MaxSplitSliceEndOffset;
903 return;
904 }
905
906 // If we have split slices and the next slice is after a gap and is
907 // not splittable immediately form an empty partition for the split
908 // slices up until the next slice begins.
909 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
910 !P.SI->isSplittable()) {
911 P.BeginOffset = P.EndOffset;
912 P.EndOffset = P.SI->beginOffset();
913 return;
914 }
915 }
916
917 // OK, we need to consume new slices. Set the end offset based on the
918 // current slice, and step SJ past it. The beginning offset of the
919 // partition is the beginning offset of the next slice unless we have
920 // pre-existing split slices that are continuing, in which case we begin
921 // at the prior end offset.
922 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
923 P.EndOffset = P.SI->endOffset();
924 ++P.SJ;
925
926 // There are two strategies to form a partition based on whether the
927 // partition starts with an unsplittable slice or a splittable slice.
928 if (!P.SI->isSplittable()) {
929 // When we're forming an unsplittable region, it must always start at
930 // the first slice and will extend through its end.
931 assert(P.BeginOffset == P.SI->beginOffset());
932
933 // Form a partition including all of the overlapping slices with this
934 // unsplittable slice.
935 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
936 if (!P.SJ->isSplittable())
937 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
938 ++P.SJ;
939 }
940
941 // We have a partition across a set of overlapping unsplittable
942 // slices.
943 return;
944 }
945
946 // If we're starting with a splittable slice, then we need to form
947 // a synthetic partition spanning it and any other overlapping splittable
948 // slices.
949 assert(P.SI->isSplittable() && "Forming a splittable partition!");
950
951 // Collect all of the overlapping splittable slices.
952 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
953 P.SJ->isSplittable()) {
954 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
955 ++P.SJ;
956 }
957
958 // Back up P.EndOffset if we ended the span early when encountering an
959 // unsplittable slice. This synthesizes the early end offset of
960 // a partition spanning only splittable slices.
961 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
962 assert(!P.SJ->isSplittable());
963 P.EndOffset = P.SJ->beginOffset();
964 }
965 }
966
967public:
968 bool operator==(const partition_iterator &RHS) const {
969 assert(SE == RHS.SE &&
970 "End iterators don't match between compared partition iterators!");
971
972 // The observed positions of partitions are marked by the P.SI iterator and
973 // the emptiness of the split slices. The latter is only relevant when
974 // P.SI == SE, as the end iterator will additionally have an empty split
975 // slices list, but the prior may have the same P.SI and a tail of split
976 // slices.
977 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
978 assert(P.SJ == RHS.P.SJ &&
979 "Same set of slices formed two different sized partitions!");
980 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
981 "Same slice position with differently sized non-empty split "
982 "slice tails!");
983 return true;
984 }
985 return false;
986 }
987
988 partition_iterator &operator++() {
989 advance();
990 return *this;
991 }
992
993 Partition &operator*() { return P; }
994};
995
996/// A forward range over the partitions of the alloca's slices.
997///
998/// This accesses an iterator range over the partitions of the alloca's
999/// slices. It computes these partitions on the fly based on the overlapping
1000/// offsets of the slices and the ability to split them. It will visit "empty"
1001/// partitions to cover regions of the alloca only accessed via split
1002/// slices.
1003iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
1004 return make_range(partition_iterator(begin(), end()),
1005 partition_iterator(end(), end()));
1006}
1007
1008static Value *foldSelectInst(SelectInst &SI) {
1009 // If the condition being selected on is a constant or the same value is
1010 // being selected between, fold the select. Yes this does (rarely) happen
1011 // early on.
1012 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
1013 return SI.getOperand(1 + CI->isZero());
1014 if (SI.getOperand(1) == SI.getOperand(2))
1015 return SI.getOperand(1);
1016
1017 return nullptr;
1018}
1019
1020/// A helper that folds a PHI node or a select.
1021static Value *foldPHINodeOrSelectInst(Instruction &I) {
1022 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
1023 // If PN merges together the same value, return that value.
1024 return PN->hasConstantValue();
1025 }
1026 return foldSelectInst(cast<SelectInst>(I));
1027}
1028
1029/// Builder for the alloca slices.
1030///
1031/// This class builds a set of alloca slices by recursively visiting the uses
1032/// of an alloca and making a slice for each load and store at each offset.
1033class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1034 friend class PtrUseVisitor<SliceBuilder>;
1035 friend class InstVisitor<SliceBuilder>;
1036
1037 using Base = PtrUseVisitor<SliceBuilder>;
1038
1039 const uint64_t AllocSize;
1040 AllocaSlices &AS;
1041
1042 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1043 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1044
1045 /// Set to de-duplicate dead instructions found in the use walk.
1046 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1047
1048 // When this access is via an llvm.protected.field.ptr intrinsic, contains
1049 // the second argument to the intrinsic, the discriminator.
1050 Value *ProtectedFieldDisc = nullptr;
1051
1052public:
1053 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1054 : PtrUseVisitor<SliceBuilder>(DL),
1055 AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}
1056
1057private:
1058 void markAsDead(Instruction &I) {
1059 if (VisitedDeadInsts.insert(&I).second)
1060 AS.DeadUsers.push_back(&I);
1061 }
1062
1063 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1064 bool IsSplittable = false) {
1065 // Completely skip uses which have a zero size or start either before or
1066 // past the end of the allocation.
1067 if (Size == 0 || Offset.uge(AllocSize)) {
1068 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1069 << Offset
1070 << " which has zero size or starts outside of the "
1071 << AllocSize << " byte alloca:\n"
1072 << " alloca: " << AS.AI << "\n"
1073 << " use: " << I << "\n");
1074 return markAsDead(I);
1075 }
1076
1077 uint64_t BeginOffset = Offset.getZExtValue();
1078 uint64_t EndOffset = BeginOffset + Size;
1079
1080 // Clamp the end offset to the end of the allocation. Note that this is
1081 // formulated to handle even the case where "BeginOffset + Size" overflows.
1082 // This may appear superficially to be something we could ignore entirely,
1083 // but that is not so! There may be widened loads or PHI-node uses where
1084 // some instructions are dead but not others. We can't completely ignore
1085 // them, and so have to record at least the information here.
1086 assert(AllocSize >= BeginOffset); // Established above.
1087 if (Size > AllocSize - BeginOffset) {
1088 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1089 << Offset << " to remain within the " << AllocSize
1090 << " byte alloca:\n"
1091 << " alloca: " << AS.AI << "\n"
1092 << " use: " << I << "\n");
1093 EndOffset = AllocSize;
1094 }
1095
1096 AS.Slices.push_back(
1097 Slice(BeginOffset, EndOffset, U, IsSplittable, ProtectedFieldDisc));
1098 }
1099
1100 void visitBitCastInst(BitCastInst &BC) {
1101 if (BC.use_empty())
1102 return markAsDead(BC);
1103
1104 return Base::visitBitCastInst(BC);
1105 }
1106
1107 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1108 if (ASC.use_empty())
1109 return markAsDead(ASC);
1110
1111 return Base::visitAddrSpaceCastInst(ASC);
1112 }
1113
1114 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1115 if (GEPI.use_empty())
1116 return markAsDead(GEPI);
1117
1118 return Base::visitGetElementPtrInst(GEPI);
1119 }
1120
1121 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1122 uint64_t Size, bool IsVolatile) {
1123 // We allow splitting of non-volatile loads and stores where the type is an
1124 // integer type. These may be used to implement 'memcpy' or other "transfer
1125 // of bits" patterns.
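    // For instance (an illustrative case), an i64 load paired with an i64
    // store that together copy 8 bytes between two allocas acts like a small
    // memcpy; treating it as splittable lets later pre-splitting carve it up
    // when only part of that range is used as a distinct scalar.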
1126 bool IsSplittable =
1127 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1128
1129 insertUse(I, Offset, Size, IsSplittable);
1130 }
1131
1132 void visitLoadInst(LoadInst &LI) {
1133 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1134 "All simple FCA loads should have been pre-split");
1135
1136 // If there is a load with an unknown offset, we can still perform store
1137 // to load forwarding for other known-offset loads.
1138 if (!IsOffsetKnown)
1139 return PI.setEscapedReadOnly(&LI);
1140
1141 TypeSize Size = DL.getTypeStoreSize(LI.getType());
1142 if (Size.isScalable()) {
1143 unsigned VScale = LI.getFunction()->getVScaleValue();
1144 if (!VScale)
1145 return PI.setAborted(&LI);
1146
1147 Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
1148 }
1149
1150 return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1151 LI.isVolatile());
1152 }
1153
1154 void visitStoreInst(StoreInst &SI) {
1155 Value *ValOp = SI.getValueOperand();
1156 if (ValOp == *U)
1157 return PI.setEscapedAndAborted(&SI);
1158 if (!IsOffsetKnown)
1159 return PI.setAborted(&SI);
1160
1161 TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1162 if (StoreSize.isScalable()) {
1163 unsigned VScale = SI.getFunction()->getVScaleValue();
1164 if (!VScale)
1165 return PI.setAborted(&SI);
1166
1167 StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
1168 }
1169
1170 uint64_t Size = StoreSize.getFixedValue();
1171
1172 // If this memory access can be shown to *statically* extend outside the
1173 // bounds of the allocation, its behavior is undefined, so simply
1174 // ignore it. Note that this is more strict than the generic clamping
1175 // behavior of insertUse. We also try to handle cases which might run the
1176 // risk of overflow.
1177 // FIXME: We should instead consider the pointer to have escaped if this
1178 // function is being instrumented for addressing bugs or race conditions.
1179 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1180 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1181 << Offset << " which extends past the end of the "
1182 << AllocSize << " byte alloca:\n"
1183 << " alloca: " << AS.AI << "\n"
1184 << " use: " << SI << "\n");
1185 return markAsDead(SI);
1186 }
1187
1188 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1189 "All simple FCA stores should have been pre-split");
1190 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1191 }
1192
1193 void visitMemSetInst(MemSetInst &II) {
1194 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1195 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1196 if ((Length && Length->getValue() == 0) ||
1197 (IsOffsetKnown && Offset.uge(AllocSize)))
1198 // Zero-length mem transfer intrinsics can be ignored entirely.
1199 return markAsDead(II);
1200
1201 if (!IsOffsetKnown)
1202 return PI.setAborted(&II);
1203
1204 insertUse(II, Offset,
1205 Length ? Length->getLimitedValue()
1206 : AllocSize - Offset.getLimitedValue(),
1207 (bool)Length);
1208 }
1209
1210 void visitMemTransferInst(MemTransferInst &II) {
1211 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1212 if (Length && Length->getValue() == 0)
1213 // Zero-length mem transfer intrinsics can be ignored entirely.
1214 return markAsDead(II);
1215
1216 // Because we can visit these intrinsics twice, also check to see if the
1217 // first time marked this instruction as dead. If so, skip it.
1218 if (VisitedDeadInsts.count(&II))
1219 return;
1220
1221 if (!IsOffsetKnown)
1222 return PI.setAborted(&II);
1223
1224 // This side of the transfer is completely out-of-bounds, and so we can
1225 // nuke the entire transfer. However, we also need to nuke the other side
1226 // if already added to our partitions.
1227 // FIXME: Yet another place we really should bypass this when
1228 // instrumenting for ASan.
1229 if (Offset.uge(AllocSize)) {
1230 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1231 MemTransferSliceMap.find(&II);
1232 if (MTPI != MemTransferSliceMap.end())
1233 AS.Slices[MTPI->second].kill();
1234 return markAsDead(II);
1235 }
1236
1237 uint64_t RawOffset = Offset.getLimitedValue();
1238 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1239
1240 // Check for the special case where the same exact value is used for both
1241 // source and dest.
1242 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1243 // For non-volatile transfers this is a no-op.
1244 if (!II.isVolatile())
1245 return markAsDead(II);
1246
1247 return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1248 }
1249
1250 // If we have seen both source and destination for a mem transfer, then
1251 // they both point to the same alloca.
1252 bool Inserted;
1253 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1254 std::tie(MTPI, Inserted) =
1255 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1256 unsigned PrevIdx = MTPI->second;
1257 if (!Inserted) {
1258 Slice &PrevP = AS.Slices[PrevIdx];
1259
1260 // Check if the begin offsets match and this is a non-volatile transfer.
1261 // In that case, we can completely elide the transfer.
1262 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1263 PrevP.kill();
1264 return markAsDead(II);
1265 }
1266
1267 // Otherwise we have an offset transfer within the same alloca. We can't
1268 // split those.
1269 PrevP.makeUnsplittable();
1270 }
1271
1272 // Insert the use now that we've fixed up the splittable nature.
1273 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1274
1275 // Check that we ended up with a valid index in the map.
1276 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1277 "Map index doesn't point back to a slice with this user.");
1278 }
1279
1280 // Disable SROA for any intrinsics except for lifetime markers.
1281 // FIXME: What about debug intrinsics? This matches old behavior, but
1282 // doesn't make sense.
1283 void visitIntrinsicInst(IntrinsicInst &II) {
1284 if (II.isDroppable()) {
1285 AS.DeadUseIfPromotable.push_back(U);
1286 return;
1287 }
1288
1289 if (!IsOffsetKnown)
1290 return PI.setAborted(&II);
1291
1292 if (II.isLifetimeStartOrEnd()) {
1293 insertUse(II, Offset, AllocSize, true);
1294 return;
1295 }
1296
1297 if (II.getIntrinsicID() == Intrinsic::protected_field_ptr) {
1298 // We only handle loads and stores as users of llvm.protected.field.ptr.
1299 // Other uses may add items to the worklist, which will cause
1300 // ProtectedFieldDisc to be tracked incorrectly.
1301 AS.PFPUsers.push_back(&II);
1302 ProtectedFieldDisc = II.getArgOperand(1);
1303 for (Use &U : II.uses()) {
1304 this->U = &U;
1305 if (auto *LI = dyn_cast<LoadInst>(U.getUser()))
1306 visitLoadInst(*LI);
1307 else if (auto *SI = dyn_cast<StoreInst>(U.getUser()))
1308 visitStoreInst(*SI);
1309 else
1310 PI.setAborted(&II);
1311 if (PI.isAborted())
1312 break;
1313 }
1314 ProtectedFieldDisc = nullptr;
1315 return;
1316 }
1317
1318 Base::visitIntrinsicInst(II);
1319 }
1320
1321 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1322 // We consider any PHI or select that results in a direct load or store of
1323 // the same offset to be a viable use for slicing purposes. These uses
1324 // are considered unsplittable and the size is the maximum loaded or stored
1325 // size.
1326 SmallPtrSet<Instruction *, 4> Visited;
1327 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1328 Visited.insert(Root);
1329 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1330 const DataLayout &DL = Root->getDataLayout();
1331 // If there are no loads or stores, the access is dead. We mark that as
1332 // a size zero access.
1333 Size = 0;
1334 do {
1335 Instruction *I, *UsedI;
1336 std::tie(UsedI, I) = Uses.pop_back_val();
1337
1338 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1339 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1340 if (LoadSize.isScalable()) {
1341 PI.setAborted(LI);
1342 return nullptr;
1343 }
1344 Size = std::max(Size, LoadSize.getFixedValue());
1345 continue;
1346 }
1347 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1348 Value *Op = SI->getOperand(0);
1349 if (Op == UsedI)
1350 return SI;
1351 TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1352 if (StoreSize.isScalable()) {
1353 PI.setAborted(SI);
1354 return nullptr;
1355 }
1356 Size = std::max(Size, StoreSize.getFixedValue());
1357 continue;
1358 }
1359
1360 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1361 if (!GEP->hasAllZeroIndices())
1362 return GEP;
1363 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1364 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1365 return I;
1366 }
1367
1368 for (User *U : I->users())
1369 if (Visited.insert(cast<Instruction>(U)).second)
1370 Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1371 } while (!Uses.empty());
1372
1373 return nullptr;
1374 }
1375
1376 void visitPHINodeOrSelectInst(Instruction &I) {
1377 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1378 if (I.use_empty())
1379 return markAsDead(I);
1380
1381 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1382 // instructions in this BB, which may be required during rewriting. Bail out
1383 // on these cases.
1384 if (isa<PHINode>(I) && !I.getParent()->hasInsertionPt())
1385 return PI.setAborted(&I);
1386
1387 // TODO: We could use simplifyInstruction here to fold PHINodes and
1388 // SelectInsts. However, doing so requires to change the current
1389 // dead-operand-tracking mechanism. For instance, suppose neither loading
1390 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1391 // trap either. However, if we simply replace %U with undef using the
1392 // current dead-operand-tracking mechanism, "load (select undef, undef,
1393 // %other)" may trap because the select may return the first operand
1394 // "undef".
1395 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1396 if (Result == *U)
1397 // If the result of the constant fold will be the pointer, recurse
1398 // through the PHI/select as if we had RAUW'ed it.
1399 enqueueUsers(I);
1400 else
1401 // Otherwise the operand to the PHI/select is dead, and we can replace
1402 // it with poison.
1403 AS.DeadOperands.push_back(U);
1404
1405 return;
1406 }
1407
1408 if (!IsOffsetKnown)
1409 return PI.setAborted(&I);
1410
1411 // See if we already have computed info on this node.
1412 uint64_t &Size = PHIOrSelectSizes[&I];
1413 if (!Size) {
1414 // This is a new PHI/Select, check for an unsafe use of it.
1415 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1416 return PI.setAborted(UnsafeI);
1417 }
1418
1419 // For PHI and select operands outside the alloca, we can't nuke the entire
1420 // phi or select -- the other side might still be relevant, so we special
1421 // case them here and use a separate structure to track the operands
1422 // themselves which should be replaced with poison.
1423 // FIXME: This should instead be escaped in the event we're instrumenting
1424 // for address sanitization.
1425 if (Offset.uge(AllocSize)) {
1426 AS.DeadOperands.push_back(U);
1427 return;
1428 }
1429
1430 insertUse(I, Offset, Size);
1431 }
1432
1433 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1434
1435 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1436
1437 /// Disable SROA entirely if there are unhandled users of the alloca.
1438 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1439
1440 void visitCallBase(CallBase &CB) {
1441 // If the call operand is read-only and only does a read-only or address
1442 // capture, then we mark it as EscapedReadOnly.
1443 if (CB.isDataOperand(U) &&
1444 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1445 CB.onlyReadsMemory(U->getOperandNo())) {
1446 PI.setEscapedReadOnly(&CB);
1447 return;
1448 }
1449
1450 Base::visitCallBase(CB);
1451 }
1452};
1453
1454AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1455 :
1456#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1457 AI(AI),
1458#endif
1459 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1460 SliceBuilder PB(DL, AI, *this);
1461 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1462 if (PtrI.isEscaped() || PtrI.isAborted()) {
1463 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1464 // possibly by just storing the PtrInfo in the AllocaSlices.
1465 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1466 : PtrI.getAbortingInst();
1467 assert(PointerEscapingInstr && "Did not track a bad instruction");
1468 return;
1469 }
1470 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1471
1472 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1473
1474 // Sort the uses. This arranges for the offsets to be in ascending order,
1475 // and the sizes to be in descending order.
1476 llvm::stable_sort(Slices);
1477}
1478
1479#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1480
1481void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1482 StringRef Indent) const {
1483 printSlice(OS, I, Indent);
1484 OS << "\n";
1485 printUse(OS, I, Indent);
1486}
1487
1488void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1489 StringRef Indent) const {
1490 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1491 << " slice #" << (I - begin())
1492 << (I->isSplittable() ? " (splittable)" : "");
1493}
1494
1495void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1496 StringRef Indent) const {
1497 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1498}
1499
1500void AllocaSlices::print(raw_ostream &OS) const {
1501 if (PointerEscapingInstr) {
1502 OS << "Can't analyze slices for alloca: " << AI << "\n"
1503 << " A pointer to this alloca escaped by:\n"
1504 << " " << *PointerEscapingInstr << "\n";
1505 return;
1506 }
1507
1508 if (PointerEscapingInstrReadOnly)
1509 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1510
1511 OS << "Slices of alloca: " << AI << "\n";
1512 for (const_iterator I = begin(), E = end(); I != E; ++I)
1513 print(OS, I);
1514}
1515
1516LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1517 print(dbgs(), I);
1518}
1519LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1520
1521#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1522
1523/// Walk the range of a partitioning looking for a common type to cover this
1524/// sequence of slices.
1525static std::pair<Type *, IntegerType *>
1526findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1527 uint64_t EndOffset) {
1528 Type *Ty = nullptr;
1529 bool TyIsCommon = true;
1530 IntegerType *ITy = nullptr;
1531
1532 // Note that we need to look at *every* alloca slice's Use to ensure we
1533 // always get consistent results regardless of the order of slices.
1534 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1535 Use *U = I->getUse();
1536 if (isa<IntrinsicInst>(*U->getUser()))
1537 continue;
1538 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1539 continue;
1540
1541 Type *UserTy = nullptr;
1542 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1543 UserTy = LI->getType();
1544 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1545 UserTy = SI->getValueOperand()->getType();
1546 }
1547
1548 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1549 // If the type is larger than the partition, skip it. We only encounter
1550 // this for split integer operations where we want to use the type of the
1551 // entity causing the split. Also skip if the type is not a byte width
1552 // multiple.
1553 if (UserITy->getBitWidth() % 8 != 0 ||
1554 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1555 continue;
1556
1557 // Track the largest bitwidth integer type used in this way in case there
1558 // is no common type.
1559 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1560 ITy = UserITy;
1561 }
1562
1563 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1564 // depend on types skipped above.
1565 if (!UserTy || (Ty && Ty != UserTy))
1566 TyIsCommon = false; // Give up on anything but an iN type.
1567 else
1568 Ty = UserTy;
1569 }
1570
1571 return {TyIsCommon ? Ty : nullptr, ITy};
1572}
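// Example (illustrative, not from the pass itself): for a partition [0, 8)
// whose whole-partition accesses are a "load i64" and a "store double", the
// user types conflict, so the common type result is null, while i64 is still
// returned as the integer fallback (ITy) for bitfield-style rewriting.
// Partial accesses that do not span the whole partition contribute to
// neither result.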
1573
1574/// PHI instructions that use an alloca and are subsequently loaded can be
1575/// rewritten to load both input pointers in the pred blocks and then PHI the
1576/// results, allowing the load of the alloca to be promoted.
1577/// From this:
1578/// %P2 = phi [i32* %Alloca, i32* %Other]
1579/// %V = load i32* %P2
1580/// to:
1581/// %V1 = load i32* %Alloca -> will be mem2reg'd
1582/// ...
1583/// %V2 = load i32* %Other
1584/// ...
1585/// %V = phi [i32 %V1, i32 %V2]
1586///
1587/// We can do this to a select if its only uses are loads and if the operands
1588/// to the select can be loaded unconditionally.
1589///
1590/// FIXME: This should be hoisted into a generic utility, likely in
1591/// Transforms/Util/Local.h
1592 static bool isSafePHIToSpeculate(PHINode &PN) {
1593 const DataLayout &DL = PN.getDataLayout();
1594
1595 // For now, we can only do this promotion if the load is in the same block
1596 // as the PHI, and if there are no stores between the phi and load.
1597 // TODO: Allow recursive phi users.
1598 // TODO: Allow stores.
1599 BasicBlock *BB = PN.getParent();
1600 Align MaxAlign;
1601 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1602 Type *LoadType = nullptr;
1603 for (User *U : PN.users()) {
1604 LoadInst *LI = dyn_cast<LoadInst>(U);
1605 if (!LI || !LI->isSimple())
1606 return false;
1607
1608 // For now we only allow loads in the same block as the PHI. This is
1609 // a common case that happens when instcombine merges two loads through
1610 // a PHI.
1611 if (LI->getParent() != BB)
1612 return false;
1613
1614 if (LoadType) {
1615 if (LoadType != LI->getType())
1616 return false;
1617 } else {
1618 LoadType = LI->getType();
1619 }
1620
1621 // Ensure that there are no instructions between the PHI and the load that
1622 // could store.
1623 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1624 if (BBI->mayWriteToMemory())
1625 return false;
1626
1627 MaxAlign = std::max(MaxAlign, LI->getAlign());
1628 }
1629
1630 if (!LoadType)
1631 return false;
1632
1633 APInt LoadSize =
1634 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1635
1636 // We can only transform this if it is safe to push the loads into the
1637 // predecessor blocks. The only thing to watch out for is that we can't put
1638 // a possibly trapping load in the predecessor if it is a critical edge.
1639 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1640 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1641 Value *InVal = PN.getIncomingValue(Idx);
1642
1643 // If the value is produced by the terminator of the predecessor (an
1644 // invoke) or it has side-effects, there is no valid place to put a load
1645 // in the predecessor.
1646 if (TI == InVal || TI->mayHaveSideEffects())
1647 return false;
1648
1649 // If the predecessor has a single successor, then the edge isn't
1650 // critical.
1651 if (TI->getNumSuccessors() == 1)
1652 continue;
1653
1654 // If this pointer is always safe to load, or if we can prove that there
1655 // is already a load in the block, then we can move the load to the pred
1656 // block.
1657 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1658 continue;
1659
1660 return false;
1661 }
1662
1663 return true;
1664}
1665
1666static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1667 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1668
1669 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1670 Type *LoadTy = SomeLoad->getType();
1671 IRB.SetInsertPoint(&PN);
1672 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1673 PN.getName() + ".sroa.speculated");
1674
1675 // Get the AA tags and alignment to use from one of the loads. It does not
1676 // matter which one we use or whether any of them differ.
1677 AAMDNodes AATags = SomeLoad->getAAMetadata();
1678 Align Alignment = SomeLoad->getAlign();
1679
1680 // Rewrite all loads of the PN to use the new PHI.
1681 while (!PN.use_empty()) {
1682 LoadInst *LI = cast<LoadInst>(PN.user_back());
1683 LI->replaceAllUsesWith(NewPN);
1684 LI->eraseFromParent();
1685 }
1686
1687 // Inject loads into all of the pred blocks.
1688 DenseMap<BasicBlock *, Value *> InjectedLoads;
1689 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1690 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1691 Value *InVal = PN.getIncomingValue(Idx);
1692
1693 // A PHI node is allowed to have multiple (duplicated) entries for the same
1694 // basic block, as long as the value is the same. So if we already injected
1695 // a load in the predecessor, then we should reuse the same load for all
1696 // duplicated entries.
1697 if (Value *V = InjectedLoads.lookup(Pred)) {
1698 NewPN->addIncoming(V, Pred);
1699 continue;
1700 }
1701
1702 Instruction *TI = Pred->getTerminator();
1703 IRB.SetInsertPoint(TI);
1704
1705 LoadInst *Load = IRB.CreateAlignedLoad(
1706 LoadTy, InVal, Alignment,
1707 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1708 ++NumLoadsSpeculated;
1709 if (AATags)
1710 Load->setAAMetadata(AATags);
1711 NewPN->addIncoming(Load, Pred);
1712 InjectedLoads[Pred] = Load;
1713 }
1714
1715 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1716 PN.eraseFromParent();
1717}
1718
1719SelectHandSpeculativity &
1720SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1721 if (isTrueVal)
1722 Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1723 else
1724 Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1725 return *this;
1726}
1727
1728bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1729 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1730 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1731}
1732
1733bool SelectHandSpeculativity::areAllSpeculatable() const {
1734 return isSpeculatable(/*isTrueVal=*/true) &&
1735 isSpeculatable(/*isTrueVal=*/false);
1736}
1737
1738bool SelectHandSpeculativity::areAnySpeculatable() const {
1739 return isSpeculatable(/*isTrueVal=*/true) ||
1740 isSpeculatable(/*isTrueVal=*/false);
1741}
1742bool SelectHandSpeculativity::areNoneSpeculatable() const {
1743 return !areAnySpeculatable();
1744}
1745
1746static SelectHandSpeculativity
1747 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1748 assert(LI.isSimple() && "Only for simple loads");
1749 SelectHandSpeculativity Spec;
1750
1751 const DataLayout &DL = SI.getDataLayout();
1752 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1753 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1754 &LI))
1755 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1756 else if (PreserveCFG)
1757 return Spec;
1758
1759 return Spec;
1760}
1761
1762std::optional<RewriteableMemOps>
1763SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1764 RewriteableMemOps Ops;
1765
1766 for (User *U : SI.users()) {
1767 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1768 U = *BC->user_begin();
1769
1770 if (auto *Store = dyn_cast<StoreInst>(U)) {
1771 // Note that atomic stores can be transformed; atomic semantics do not
1772 // have any meaning for a local alloca. Stores are not speculatable,
1773 // however, so if we can't turn it into a predicated store, we are done.
1774 if (Store->isVolatile() || PreserveCFG)
1775 return {}; // Give up on this `select`.
1776 Ops.emplace_back(Store);
1777 continue;
1778 }
1779
1780 auto *LI = dyn_cast<LoadInst>(U);
1781
1782 // Note that atomic loads can be transformed;
1783 // atomic semantics do not have any meaning for a local alloca.
1784 if (!LI || LI->isVolatile())
1785 return {}; // Give up on this `select`.
1786
1787 PossiblySpeculatableLoad Load(LI);
1788 if (!LI->isSimple()) {
1789 // If the `load` is not simple, we can't speculatively execute it,
1790 // but we could handle this via a CFG modification. But can we?
1791 if (PreserveCFG)
1792 return {}; // Give up on this `select`.
1793 Ops.emplace_back(Load);
1794 continue;
1795 }
1796
1797 SelectHandSpeculativity Spec =
1798 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1799 if (PreserveCFG && !Spec.areAllSpeculatable())
1800 return {}; // Give up on this `select`.
1801
1802 Load.setInt(Spec);
1803 Ops.emplace_back(Load);
1804 }
1805
1806 return Ops;
1807}
1808
1809 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1810 IRBuilderTy &IRB) {
1811 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1812
1813 Value *TV = SI.getTrueValue();
1814 Value *FV = SI.getFalseValue();
1815 // Replace the given load of the select with a select of two loads.
1816
1817 assert(LI.isSimple() && "We only speculate simple loads");
1818
1819 IRB.SetInsertPoint(&LI);
1820
1821 LoadInst *TL =
1822 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1823 LI.getName() + ".sroa.speculate.load.true");
1824 LoadInst *FL =
1825 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1826 LI.getName() + ".sroa.speculate.load.false");
1827 NumLoadsSpeculated += 2;
1828
1829 // Transfer alignment and AA info if present.
1830 TL->setAlignment(LI.getAlign());
1831 FL->setAlignment(LI.getAlign());
1832
1833 AAMDNodes Tags = LI.getAAMetadata();
1834 if (Tags) {
1835 TL->setAAMetadata(Tags);
1836 FL->setAAMetadata(Tags);
1837 }
1838
1839 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1840 LI.getName() + ".sroa.speculated",
1841 ProfcheckDisableMetadataFixes ? nullptr : &SI);
1842
1843 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1844 LI.replaceAllUsesWith(V);
1845}
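// Sketch of the rewrite (illustrative IR, value names invented):
//   %p = select i1 %c, ptr %a, ptr %b
//   %v = load i32, ptr %p
// becomes
//   %v.t = load i32, ptr %a
//   %v.f = load i32, ptr %b
//   %v = select i1 %c, i32 %v.t, i32 %v.f
// so that, when %a or %b is an alloca slice, the direct load can later be
// promoted by mem2reg.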
1846
1847template <typename T>
1848 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1849 SelectHandSpeculativity Spec,
1850 DomTreeUpdater &DTU) {
1851 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1852 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1853 BasicBlock *Head = I.getParent();
1854 Instruction *ThenTerm = nullptr;
1855 Instruction *ElseTerm = nullptr;
1856 if (Spec.areNoneSpeculatable())
1857 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1858 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1859 else {
1860 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1861 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1862 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1863 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1864 cast<BranchInst>(Head->getTerminator())->swapSuccessors();
1865 }
1866 auto *HeadBI = cast<BranchInst>(Head->getTerminator());
1867 Spec = {}; // Do not use `Spec` beyond this point.
1868 BasicBlock *Tail = I.getParent();
1869 Tail->setName(Head->getName() + ".cont");
1870 PHINode *PN;
1871 if (isa<LoadInst>(I))
1872 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1873 for (BasicBlock *SuccBB : successors(Head)) {
1874 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1875 int SuccIdx = IsThen ? 0 : 1;
1876 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1877 auto &CondMemOp = cast<T>(*I.clone());
1878 if (NewMemOpBB != Head) {
1879 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1880 if (isa<LoadInst>(I))
1881 ++NumLoadsPredicated;
1882 else
1883 ++NumStoresPredicated;
1884 } else {
1885 CondMemOp.dropUBImplyingAttrsAndMetadata();
1886 ++NumLoadsSpeculated;
1887 }
1888 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1889 Value *Ptr = SI.getOperand(1 + SuccIdx);
1890 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1891 if (isa<LoadInst>(I)) {
1892 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1893 PN->addIncoming(&CondMemOp, NewMemOpBB);
1894 } else
1895 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1896 }
1897 if (isa<LoadInst>(I)) {
1898 PN->takeName(&I);
1899 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1900 I.replaceAllUsesWith(PN);
1901 }
1902}
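// Sketch of the rewrite for an unspeculatable hand (illustrative IR, names
// invented): a store through "select i1 %c, ptr %a, ptr %b" becomes
//   br i1 %c, label %head.then, label %head.else
// with a clone of the store (addressing %a or %b respectively) in each
// successor, both falling through to a ".cont" block. For loads, a PHI in
// the ".cont" block merges the two loaded values; a hand that is known to be
// speculatable keeps its clone in the head block instead of a new block.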
1903
1904 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1905 SelectHandSpeculativity Spec,
1906 DomTreeUpdater &DTU) {
1907 if (auto *LI = dyn_cast<LoadInst>(&I))
1908 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1909 else if (auto *SI = dyn_cast<StoreInst>(&I))
1910 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1911 else
1912 llvm_unreachable_internal("Only for load and store.");
1913}
1914
1915 static bool rewriteSelectInstMemOps(SelectInst &SI,
1916 const RewriteableMemOps &Ops,
1917 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1918 bool CFGChanged = false;
1919 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1920
1921 for (const RewriteableMemOp &Op : Ops) {
1922 SelectHandSpeculativity Spec;
1923 Instruction *I;
1924 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1925 I = *US;
1926 } else {
1927 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1928 I = PSL.getPointer();
1929 Spec = PSL.getInt();
1930 }
1931 if (Spec.areAllSpeculatable()) {
1932 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1933 } else {
1934 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1935 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1936 CFGChanged = true;
1937 }
1938 I->eraseFromParent();
1939 }
1940
1941 for (User *U : make_early_inc_range(SI.users()))
1942 cast<BitCastInst>(U)->eraseFromParent();
1943 SI.eraseFromParent();
1944 return CFGChanged;
1945}
1946
1947/// Compute an adjusted pointer from Ptr by Offset bytes where the
1948/// resulting pointer has PointerTy.
1949static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1950 APInt Offset, Type *PointerTy,
1951 const Twine &NamePrefix) {
1952 if (Offset != 0)
1953 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1954 NamePrefix + "sroa_idx");
1955 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1956 NamePrefix + "sroa_cast");
1957}
1958
1959/// Compute the adjusted alignment for a load or store from an offset.
1960 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1961 return commonAlignment(getLoadStoreAlignment(I), Offset);
1962 }
1963
1964/// Test whether we can convert a value from the old to the new type.
1965///
1966/// This predicate should be used to guard calls to convertValue in order to
1967/// ensure that we only try to convert viable values. The strategy is that we
1968/// will peel off single element struct and array wrappings to get to an
1969/// underlying value, and convert that value.
1970static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1971 unsigned VScale = 0) {
1972 if (OldTy == NewTy)
1973 return true;
1974
1975 // For integer types, we can't handle any bit-width differences. This would
1976 // break both vector conversions with extension and introduce endianness
1977 // issues when in conjunction with loads and stores.
1978 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1979 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1980 cast<IntegerType>(NewTy)->getBitWidth() &&
1981 "We can't have the same bitwidth for different int types");
1982 return false;
1983 }
1984
1985 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1986 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1987
1988 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1989 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1990 // Conversion is only possible when the size of scalable vectors is known.
1991 if (!VScale)
1992 return false;
1993
1994 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1995 // a single domain (either fixed or scalable). Any additional conversion
1996 // between fixed and scalable types is handled through integer types.
1997 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1998 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1999
2000 if (isa<ScalableVectorType>(NewTy)) {
2001 if (!isa<ScalableVectorType>(NewVTy))
2002 return false;
2003
2004 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
2005 } else {
2006 if (!isa<ScalableVectorType>(OldVTy))
2007 return false;
2008
2009 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
2010 }
2011 }
2012
2013 if (NewSize != OldSize)
2014 return false;
2015 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
2016 return false;
2017
2018 // We can convert pointers to integers and vice-versa. Same for vectors
2019 // of pointers and integers.
2020 OldTy = OldTy->getScalarType();
2021 NewTy = NewTy->getScalarType();
2022 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
2023 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
2024 unsigned OldAS = OldTy->getPointerAddressSpace();
2025 unsigned NewAS = NewTy->getPointerAddressSpace();
2026 // Convert pointers if they are pointers from the same address space or
2027 // different integral (not non-integral) address spaces with the same
2028 // pointer size.
2029 return OldAS == NewAS ||
2030 (!DL.isNonIntegralAddressSpace(OldAS) &&
2031 !DL.isNonIntegralAddressSpace(NewAS) &&
2032 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2033 }
2034
2035 // We can convert integers to integral pointers, but not to non-integral
2036 // pointers.
2037 if (OldTy->isIntegerTy())
2038 return !DL.isNonIntegralPointerType(NewTy);
2039
2040 // We can convert integral pointers to integers, but non-integral pointers
2041 // need to remain pointers.
2042 if (!DL.isNonIntegralPointerType(OldTy))
2043 return NewTy->isIntegerTy();
2044
2045 return false;
2046 }
2047
2048 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2049 return false;
2050
2051 return true;
2052}
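// Examples (illustrative): with 64-bit integral pointers, i64 <-> double,
// i32 <-> float, i64 <-> <2 x i32>, and i64 <-> ptr are all convertible
// (equal bit sizes, single-value types), whereas i32 <-> i64 (mismatched
// integer widths) and pointer casts across differently sized or non-integral
// address spaces are not.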
2053
2054/// Test whether the given slice use can be promoted to a vector.
2055///
2056/// This function is called to test each entry in a partition which is slated
2057/// for a single slice.
2058static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2059 VectorType *Ty,
2060 uint64_t ElementSize,
2061 const DataLayout &DL,
2062 unsigned VScale) {
2063 // First validate the slice offsets.
2064 uint64_t BeginOffset =
2065 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2066 uint64_t BeginIndex = BeginOffset / ElementSize;
2067 if (BeginIndex * ElementSize != BeginOffset ||
2068 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2069 return false;
2070 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2071 uint64_t EndIndex = EndOffset / ElementSize;
2072 if (EndIndex * ElementSize != EndOffset ||
2073 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2074 return false;
2075
2076 assert(EndIndex > BeginIndex && "Empty vector!");
2077 uint64_t NumElements = EndIndex - BeginIndex;
2078 Type *SliceTy = (NumElements == 1)
2079 ? Ty->getElementType()
2080 : FixedVectorType::get(Ty->getElementType(), NumElements);
2081
2082 Type *SplitIntTy =
2083 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2084
2085 Use *U = S.getUse();
2086
2087 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2088 if (MI->isVolatile())
2089 return false;
2090 if (!S.isSplittable())
2091 return false; // Skip any unsplittable intrinsics.
2092 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2093 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2094 return false;
2095 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2096 if (LI->isVolatile())
2097 return false;
2098 Type *LTy = LI->getType();
2099 // Disable vector promotion when there are loads or stores of an FCA.
2100 if (LTy->isStructTy())
2101 return false;
2102 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2103 assert(LTy->isIntegerTy());
2104 LTy = SplitIntTy;
2105 }
2106 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2107 return false;
2108 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2109 if (SI->isVolatile())
2110 return false;
2111 Type *STy = SI->getValueOperand()->getType();
2112 // Disable vector promotion when there are loads or stores of an FCA.
2113 if (STy->isStructTy())
2114 return false;
2115 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2116 assert(STy->isIntegerTy());
2117 STy = SplitIntTy;
2118 }
2119 if (!canConvertValue(DL, STy, SliceTy, VScale))
2120 return false;
2121 } else {
2122 return false;
2123 }
2124
2125 return true;
2126}
2127
2128/// Test whether any vector type in \p CandidateTys is viable for promotion.
2129///
2130/// This implements the necessary checking for \c isVectorPromotionViable over
2131/// all slices of the alloca for the given VectorType.
2132static VectorType *
2133 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2134 SmallVectorImpl<VectorType *> &CandidateTys,
2135 bool HaveCommonEltTy, Type *CommonEltTy,
2136 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2137 VectorType *CommonVecPtrTy, unsigned VScale) {
2138 // If we didn't find a vector type, nothing to do here.
2139 if (CandidateTys.empty())
2140 return nullptr;
2141
2142 // Pointer-ness is sticky: if we had a vector-of-pointers candidate type,
2143 // then we should choose it, not some other alternative.
2144 // But, we can't perform a no-op pointer address space change via bitcast,
2145 // so if we didn't have a common pointer element type, bail.
2146 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2147 return nullptr;
2148
2149 // Try to pick the "best" element type out of the choices.
2150 if (!HaveCommonEltTy && HaveVecPtrTy) {
2151 // If there was a pointer element type, there's really only one choice.
2152 CandidateTys.clear();
2153 CandidateTys.push_back(CommonVecPtrTy);
2154 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2155 // Integer-ify vector types.
2156 for (VectorType *&VTy : CandidateTys) {
2157 if (!VTy->getElementType()->isIntegerTy())
2158 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2159 VTy->getContext(), VTy->getScalarSizeInBits())));
2160 }
2161
2162 // Rank the remaining candidate vector types. This is easy because we know
2163 // they're all integer vectors. We sort by ascending number of elements.
2164 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2165 (void)DL;
2166 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2167 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2168 "Cannot have vector types of different sizes!");
2169 assert(RHSTy->getElementType()->isIntegerTy() &&
2170 "All non-integer types eliminated!");
2171 assert(LHSTy->getElementType()->isIntegerTy() &&
2172 "All non-integer types eliminated!");
2173 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2174 cast<FixedVectorType>(LHSTy)->getNumElements();
2175 };
2176 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2177 (void)DL;
2178 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2179 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2180 "Cannot have vector types of different sizes!");
2181 assert(RHSTy->getElementType()->isIntegerTy() &&
2182 "All non-integer types eliminated!");
2183 assert(LHSTy->getElementType()->isIntegerTy() &&
2184 "All non-integer types eliminated!");
2185 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2186 cast<FixedVectorType>(LHSTy)->getNumElements();
2187 };
2188 llvm::sort(CandidateTys, RankVectorTypesComp);
2189 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2190 CandidateTys.end());
2191 } else {
2192// The only way to have the same element type in every vector type is to
2193// have the same vector type. Check that and remove all but one.
2194#ifndef NDEBUG
2195 for (VectorType *VTy : CandidateTys) {
2196 assert(VTy->getElementType() == CommonEltTy &&
2197 "Unaccounted for element type!");
2198 assert(VTy == CandidateTys[0] &&
2199 "Different vector types with the same element type!");
2200 }
2201#endif
2202 CandidateTys.resize(1);
2203 }
2204
2205 // FIXME: hack. Do we have a named constant for this?
2206 // SDAG SDNode can't have more than 65535 operands.
2207 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2208 return cast<FixedVectorType>(VTy)->getNumElements() >
2209 std::numeric_limits<unsigned short>::max();
2210 });
2211
2212 // Find a vector type viable for promotion by iterating over all slices.
2213 auto *VTy = llvm::find_if(CandidateTys, [&](VectorType *VTy) -> bool {
2214 uint64_t ElementSize =
2215 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2216
2217 // While LLVM vectors are defined to be bit-packed, we don't support element
2218 // sizes that aren't byte sized.
2219 if (ElementSize % 8)
2220 return false;
2221 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2222 "vector size not a multiple of element size?");
2223 ElementSize /= 8;
2224
2225 for (const Slice &S : P)
2226 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2227 return false;
2228
2229 for (const Slice *S : P.splitSliceTails())
2230 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2231 return false;
2232
2233 return true;
2234 });
2235 return VTy != CandidateTys.end() ? *VTy : nullptr;
2236}
2237
2238 static VectorType *createAndCheckVectorTypesForPromotion(
2239 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2240 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2241 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2242 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2243 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2244 [[maybe_unused]] VectorType *OriginalElt =
2245 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2246 // Consider additional vector types where the element type size is a
2247 // multiple of load/store element size.
2248 for (Type *Ty : OtherTys) {
2249 if (!VectorType::isValidElementType(Ty))
2250 continue;
2251 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2252 // Make a copy of CandidateTys and iterate through it, because we
2253 // might append to CandidateTys in the loop.
2254 for (VectorType *const VTy : CandidateTysCopy) {
2255 // The elements in the copy should remain invariant throughout the loop
2256 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2257 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2258 unsigned ElementSize =
2259 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2260 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2261 VectorSize % TypeSize == 0) {
2262 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2263 CheckCandidateType(NewVTy);
2264 }
2265 }
2266 }
2267
2268 return checkVectorTypesForPromotion(
2269 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2270 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2271}
2272
2273/// Test whether the given alloca partitioning and range of slices can be
2274/// promoted to a vector.
2275///
2276/// This is a quick test to check whether we can rewrite a particular alloca
2277/// partition (and its newly formed alloca) into a vector alloca with only
2278/// whole-vector loads and stores such that it could be promoted to a vector
2279/// SSA value. We only can ensure this for a limited set of operations, and we
2280/// don't want to do the rewrites unless we are confident that the result will
2281/// be promotable, so we have an early test here.
2282 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2283 unsigned VScale) {
2284 // Collect the candidate types for vector-based promotion. Also track whether
2285 // we have different element types.
2286 SmallVector<VectorType *, 4> CandidateTys;
2287 SetVector<Type *> LoadStoreTys;
2288 SetVector<Type *> DeferredTys;
2289 Type *CommonEltTy = nullptr;
2290 VectorType *CommonVecPtrTy = nullptr;
2291 bool HaveVecPtrTy = false;
2292 bool HaveCommonEltTy = true;
2293 bool HaveCommonVecPtrTy = true;
2294 auto CheckCandidateType = [&](Type *Ty) {
2295 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2296 // Bail out if this vector's total size in bits differs from the existing candidates.
2297 if (!CandidateTys.empty()) {
2298 VectorType *V = CandidateTys[0];
2299 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2300 DL.getTypeSizeInBits(V).getFixedValue()) {
2301 CandidateTys.clear();
2302 return;
2303 }
2304 }
2305 CandidateTys.push_back(VTy);
2306 Type *EltTy = VTy->getElementType();
2307
2308 if (!CommonEltTy)
2309 CommonEltTy = EltTy;
2310 else if (CommonEltTy != EltTy)
2311 HaveCommonEltTy = false;
2312
2313 if (EltTy->isPointerTy()) {
2314 HaveVecPtrTy = true;
2315 if (!CommonVecPtrTy)
2316 CommonVecPtrTy = VTy;
2317 else if (CommonVecPtrTy != VTy)
2318 HaveCommonVecPtrTy = false;
2319 }
2320 }
2321 };
2322
2323 // Put load and store types into a set for de-duplication.
2324 for (const Slice &S : P) {
2325 Type *Ty;
2326 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2327 Ty = LI->getType();
2328 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2329 Ty = SI->getValueOperand()->getType();
2330 else
2331 continue;
2332
2333 auto CandTy = Ty->getScalarType();
2334 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2335 S.endOffset() != P.endOffset())) {
2336 DeferredTys.insert(Ty);
2337 continue;
2338 }
2339
2340 LoadStoreTys.insert(Ty);
2341 // Consider any loads or stores that are the exact size of the slice.
2342 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2343 CheckCandidateType(Ty);
2344 }
2345
2346 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2347 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2348 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2349 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2350 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2351 return VTy;
2352
2353 CandidateTys.clear();
2354 return createAndCheckVectorTypesForPromotion(
2355 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2356 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2357 CommonVecPtrTy, VScale);
2358}
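// Example (illustrative): a 16-byte partition whose only memory accesses are
// whole-partition "load <4 x float>"/"store <4 x float>" instructions plus
// 4-byte float loads or stores at element-aligned offsets passes this test;
// the partition is then rewritten over a <4 x float> alloca that mem2reg can
// promote.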
2359
2360/// Test whether a slice of an alloca is valid for integer widening.
2361///
2362/// This implements the necessary checking for the \c isIntegerWideningViable
2363/// test below on a single slice of the alloca.
2364static bool isIntegerWideningViableForSlice(const Slice &S,
2365 uint64_t AllocBeginOffset,
2366 Type *AllocaTy,
2367 const DataLayout &DL,
2368 bool &WholeAllocaOp) {
2369 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2370
2371 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2372 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2373
2374 Use *U = S.getUse();
2375
2376 // Lifetime intrinsics operate over the whole alloca, whose size is usually
2377 // larger than other load/store slices (RelEnd > Size). But lifetime
2378 // intrinsics are always promotable and should not impact the promotability
2379 // of other slices in the partition.
2380 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2381 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2382 return true;
2383 }
2384
2385 // We can't reasonably handle cases where the load or store extends past
2386 // the end of the alloca's type and into its padding.
2387 if (RelEnd > Size)
2388 return false;
2389
2390 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2391 if (LI->isVolatile())
2392 return false;
2393 // We can't handle loads that extend past the allocated memory.
2394 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2395 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2396 return false;
2397 // So far, AllocaSliceRewriter does not support widening split slice tails
2398 // in rewriteIntegerLoad.
2399 if (S.beginOffset() < AllocBeginOffset)
2400 return false;
2401 // Note that we don't count vector loads or stores as whole-alloca
2402 // operations which enable integer widening because we would prefer to use
2403 // vector widening instead.
2404 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2405 WholeAllocaOp = true;
2406 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2407 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2408 return false;
2409 } else if (RelBegin != 0 || RelEnd != Size ||
2410 !canConvertValue(DL, AllocaTy, LI->getType())) {
2411 // Non-integer loads need to be convertible from the alloca type so that
2412 // they are promotable.
2413 return false;
2414 }
2415 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2416 Type *ValueTy = SI->getValueOperand()->getType();
2417 if (SI->isVolatile())
2418 return false;
2419 // We can't handle stores that extend past the allocated memory.
2420 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2421 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2422 return false;
2423 // So far, AllocaSliceRewriter does not support widening split slice tails
2424 // in rewriteIntegerStore.
2425 if (S.beginOffset() < AllocBeginOffset)
2426 return false;
2427 // Note that we don't count vector loads or stores as whole-alloca
2428 // operations which enable integer widening because we would prefer to use
2429 // vector widening instead.
2430 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2431 WholeAllocaOp = true;
2432 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2433 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2434 return false;
2435 } else if (RelBegin != 0 || RelEnd != Size ||
2436 !canConvertValue(DL, ValueTy, AllocaTy)) {
2437 // Non-integer stores need to be convertible to the alloca type so that
2438 // they are promotable.
2439 return false;
2440 }
2441 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2442 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2443 return false;
2444 if (!S.isSplittable())
2445 return false; // Skip any unsplittable intrinsics.
2446 } else {
2447 return false;
2448 }
2449
2450 return true;
2451}
2452
2453/// Test whether the given alloca partition's integer operations can be
2454/// widened to promotable ones.
2455///
2456/// This is a quick test to check whether we can rewrite the integer loads and
2457/// stores to a particular alloca into wider loads and stores and be able to
2458/// promote the resulting alloca.
2459static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2460 const DataLayout &DL) {
2461 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2462 // Don't create integer types larger than the maximum bitwidth.
2463 if (SizeInBits > IntegerType::MAX_INT_BITS)
2464 return false;
2465
2466 // Don't try to handle allocas with bit-padding.
2467 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2468 return false;
2469
2470 // We need to ensure that an integer type with the appropriate bitwidth can
2471 // be converted to the alloca type, whatever that is. We don't want to force
2472 // the alloca itself to have an integer type if there is a more suitable one.
2473 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2474 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2475 !canConvertValue(DL, IntTy, AllocaTy))
2476 return false;
2477
2478 // While examining uses, we ensure that the alloca has a covering load or
2479 // store. We don't want to widen the integer operations only to fail to
2480 // promote due to some other unsplittable entry (which we may make splittable
2481 // later). However, if there are only splittable uses, go ahead and assume
2482 // that we cover the alloca.
2483 // FIXME: We shouldn't consider split slices that happen to start in the
2484 // partition here...
2485 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2486
2487 for (const Slice &S : P)
2488 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2489 WholeAllocaOp))
2490 return false;
2491
2492 for (const Slice *S : P.splitSliceTails())
2493 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2494 WholeAllocaOp))
2495 return false;
2496
2497 return WholeAllocaOp;
2498}
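// Example (illustrative): an "alloca i64" with a covering "load i64" plus
// "store i32" accesses at byte offsets 0 and 4 is viable: the i64 load is
// the whole-alloca operation, and the narrower stores are later rewritten as
// mask-and-shift updates of the single i64 value (see insertInteger below).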
2499
2500static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2501 IntegerType *Ty, uint64_t Offset,
2502 const Twine &Name) {
2503 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2504 IntegerType *IntTy = cast<IntegerType>(V->getType());
2505 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2506 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2507 "Element extends past full value");
2508 uint64_t ShAmt = 8 * Offset;
2509 if (DL.isBigEndian())
2510 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2511 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2512 if (ShAmt) {
2513 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2514 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2515 }
2516 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2517 "Cannot extract to a larger integer!");
2518 if (Ty != IntTy) {
2519 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2520 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2521 }
2522 return V;
2523}
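// Worked example: extracting an i16 at byte offset 2 from an i64 shifts the
// value right by 8 * 2 = 16 bits on little-endian targets, or by
// 8 * (8 - 2 - 2) = 32 bits on big-endian targets, and then truncates the
// result to i16.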
2524
2525static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2526 Value *V, uint64_t Offset, const Twine &Name) {
2527 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2528 IntegerType *Ty = cast<IntegerType>(V->getType());
2529 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2530 "Cannot insert a larger integer!");
2531 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2532 if (Ty != IntTy) {
2533 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2534 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2535 }
2536 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2537 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2538 "Element store outside of alloca store");
2539 uint64_t ShAmt = 8 * Offset;
2540 if (DL.isBigEndian())
2541 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2542 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2543 if (ShAmt) {
2544 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2545 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2546 }
2547
2548 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2549 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2550 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2551 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2552 V = IRB.CreateOr(Old, V, Name + ".insert");
2553 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2554 }
2555 return V;
2556}
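// Worked example: inserting an i16 into an i64 at byte offset 2 (little
// endian) zero-extends the i16 to i64, shifts it left by 16 bits, clears the
// destination bits of the old value with the mask ~(0xffff << 16), and ORs
// the shifted value into the cleared old value.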
2557
2558static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2559 unsigned EndIndex, const Twine &Name) {
2560 auto *VecTy = cast<FixedVectorType>(V->getType());
2561 unsigned NumElements = EndIndex - BeginIndex;
2562 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2563
2564 if (NumElements == VecTy->getNumElements())
2565 return V;
2566
2567 if (NumElements == 1) {
2568 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2569 Name + ".extract");
2570 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2571 return V;
2572 }
2573
2574 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2575 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2576 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2577 return V;
2578}
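// Example (illustrative): extracting elements [2, 4) from a <8 x i32> value
// emits "shufflevector <8 x i32> %V, <8 x i32> poison, <2 x i32> <i32 2,
// i32 3>"; extracting a single element uses extractelement instead, and
// extracting all elements returns the value unchanged.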
2579
2580static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2581 unsigned BeginIndex, const Twine &Name) {
2582 VectorType *VecTy = cast<VectorType>(Old->getType());
2583 assert(VecTy && "Can only insert a vector into a vector");
2584
2585 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2586 if (!Ty) {
2587 // Single element to insert.
2588 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2589 Name + ".insert");
2590 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2591 return V;
2592 }
2593
2594 unsigned NumSubElements = cast<FixedVectorType>(Ty)->getNumElements();
2595 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
2596
2597 assert(NumSubElements <= NumElements && "Too many elements!");
2598 if (NumSubElements == NumElements) {
2599 assert(V->getType() == VecTy && "Vector type mismatch");
2600 return V;
2601 }
2602 unsigned EndIndex = BeginIndex + NumSubElements;
2603
2604 // When inserting a smaller vector into the larger one, we first
2605 // use a shuffle vector to widen it with undef elements, and then
2606 // a second shuffle vector to select between the loaded vector and the
2607 // incoming vector.
2608 SmallVector<int, 8> Mask;
2609 Mask.reserve(NumElements);
2610 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2611 if (Idx >= BeginIndex && Idx < EndIndex)
2612 Mask.push_back(Idx - BeginIndex);
2613 else
2614 Mask.push_back(-1);
2615 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2616 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2617
2618 Mask.clear();
2619 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2620 if (Idx >= BeginIndex && Idx < EndIndex)
2621 Mask.push_back(Idx);
2622 else
2623 Mask.push_back(Idx + NumElements);
2624 V = IRB.CreateShuffleVector(V, Old, Mask, Name + "blend");
2625 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2626 return V;
2627}
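// Example (illustrative): inserting a <2 x float> %V at element index 1 of a
// <4 x float> %Old first widens %V with the poison-padding mask
// <-1, 0, 1, -1>, then blends it with %Old using the mask <4, 1, 2, 7>,
// which keeps %Old's elements 0 and 3 and takes the middle two lanes from
// the widened %V.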
2628
2629/// This function takes two vector values and combines them into a single vector
2630/// by concatenating their elements. The function handles:
2631///
2632/// 1. Element type mismatch: If either vector's element type differs from
2633/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2634/// preserving the total bit width (adjusting the number of elements
2635/// accordingly).
2636///
2637/// 2. Size mismatch: After transforming the vectors to have the desired element
2638/// type, if the two vectors have different numbers of elements, the smaller
2639/// vector is extended with poison values to match the size of the larger
2640/// vector before concatenation.
2641///
2642/// 3. Concatenation: The vectors are merged using a shuffle operation that
2643/// places all elements of V0 first, followed by all elements of V1.
2644///
2645/// \param V0 The first vector to merge (must be a vector type)
2646/// \param V1 The second vector to merge (must be a vector type)
2647/// \param DL The data layout for size calculations
2648/// \param NewAIEltTy The desired element type for the result vector
2649/// \param Builder IRBuilder for creating new instructions
2650/// \return A new vector containing all elements from V0 followed by all
2651/// elements from V1
2652 static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
2653 Type *NewAIEltTy, IRBuilder<> &Builder) {
2654 // V0 and V1 are vectors
2655 // Create a new vector type with combined elements
2656 // Use ShuffleVector to concatenate the vectors
2657 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2658 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2659
2660 // If V0/V1 element types are different from NewAllocaElementType,
2661 // we need to introduce bitcasts before merging them
2662 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2663 const char *DebugName) {
2664 Type *EltType = VecType->getElementType();
2665 if (EltType != NewAIEltTy) {
2666 // Calculate new number of elements to maintain same bit width
2667 unsigned TotalBits =
2668 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2669 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2670
2671 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2672 V = Builder.CreateBitCast(V, NewVecType);
2673 VecType = NewVecType;
2674 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2675 }
2676 };
2677
2678 BitcastIfNeeded(V0, VecType0, "V0");
2679 BitcastIfNeeded(V1, VecType1, "V1");
2680
2681 unsigned NumElts0 = VecType0->getNumElements();
2682 unsigned NumElts1 = VecType1->getNumElements();
2683
2684 SmallVector<int, 16> ShuffleMask;
2685
2686 if (NumElts0 == NumElts1) {
2687 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2688 ShuffleMask.push_back(i);
2689 } else {
2690 // If two vectors have different sizes, we need to extend
2691 // the smaller vector to the size of the larger vector.
2692 unsigned SmallSize = std::min(NumElts0, NumElts1);
2693 unsigned LargeSize = std::max(NumElts0, NumElts1);
2694 bool IsV0Smaller = NumElts0 < NumElts1;
2695 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2696 SmallVector<int, 16> ExtendMask;
2697 for (unsigned i = 0; i < SmallSize; ++i)
2698 ExtendMask.push_back(i);
2699 for (unsigned i = SmallSize; i < LargeSize; ++i)
2700 ExtendMask.push_back(PoisonMaskElem);
2701 ExtendedVec = Builder.CreateShuffleVector(
2702 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2703 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2704 for (unsigned i = 0; i < NumElts0; ++i)
2705 ShuffleMask.push_back(i);
2706 for (unsigned i = 0; i < NumElts1; ++i)
2707 ShuffleMask.push_back(LargeSize + i);
2708 }
2709
2710 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2711}
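// Example (illustrative): merging a <2 x i32> %V0 with a <4 x i32> %V1 when
// the allocated element type is i32 first pads %V0 to <4 x i32> with poison
// lanes, then concatenates with the mask <0, 1, 4, 5, 6, 7>, producing a
// <6 x i32> that holds %V0's elements followed by %V1's.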
2712
2713namespace {
2714
2715 /// Visitor to rewrite instructions using a particular slice of an alloca
2716/// to use a new alloca.
2717///
2718/// Also implements the rewriting to vector-based accesses when the partition
2719/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2720/// lives here.
2721class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2722 // Befriend the base class so it can delegate to private visit methods.
2723 friend class InstVisitor<AllocaSliceRewriter, bool>;
2724
2725 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2726
2727 const DataLayout &DL;
2728 AllocaSlices &AS;
2729 SROA &Pass;
2730 AllocaInst &OldAI, &NewAI;
2731 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2732 Type *NewAllocaTy;
2733
2734 // This is a convenience and flag variable that will be null unless the new
2735 // alloca's integer operations should be widened to this integer type due to
2736 // passing isIntegerWideningViable above. If it is non-null, the desired
2737 // integer type will be stored here for easy access during rewriting.
2738 IntegerType *IntTy;
2739
2740 // If we are rewriting an alloca partition which can be written as pure
2741 // vector operations, we stash extra information here. When VecTy is
2742 // non-null, we have some strict guarantees about the rewritten alloca:
2743 // - The new alloca is exactly the size of the vector type here.
2744 // - The accesses all either map to the entire vector or to a single
2745 // element.
2746 // - The set of accessing instructions is only one of those handled above
2747 // in isVectorPromotionViable. Generally these are the same access kinds
2748 // which are promotable via mem2reg.
2749 VectorType *VecTy;
2750 Type *ElementTy;
2751 uint64_t ElementSize;
2752
2753 // The original offset of the slice currently being rewritten relative to
2754 // the original alloca.
2755 uint64_t BeginOffset = 0;
2756 uint64_t EndOffset = 0;
2757
2758 // The new offsets of the slice currently being rewritten relative to the
2759 // original alloca.
2760 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2761
2762 uint64_t SliceSize = 0;
2763 bool IsSplittable = false;
2764 bool IsSplit = false;
2765 Use *OldUse = nullptr;
2766 Instruction *OldPtr = nullptr;
2767
2768 // Track post-rewrite users which are PHI nodes and Selects.
2769 SmallSetVector<PHINode *, 8> &PHIUsers;
2770 SmallSetVector<SelectInst *, 8> &SelectUsers;
2771
2772 // Utility IR builder, whose name prefix is setup for each visited use, and
2773 // the insertion point is set to point to the user.
2774 IRBuilderTy IRB;
2775
2776 // Return the new alloca, addrspacecasted if required to avoid changing the
2777 // addrspace of a volatile access.
2778 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2779 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2780 return &NewAI;
2781
2782 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2783 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2784 }
2785
2786public:
2787 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2788 AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
2789 uint64_t NewAllocaBeginOffset,
2790 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2791 VectorType *PromotableVecTy,
2792 SmallSetVector<PHINode *, 8> &PHIUsers,
2793 SmallSetVector<SelectInst *, 8> &SelectUsers)
2794 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2795 NewAllocaBeginOffset(NewAllocaBeginOffset),
2796 NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
2797 IntTy(IsIntegerPromotable
2798 ? Type::getIntNTy(
2799 NewAI.getContext(),
2800 DL.getTypeSizeInBits(NewAllocaTy).getFixedValue())
2801 : nullptr),
2802 VecTy(PromotableVecTy),
2803 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2804 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2805 : 0),
2806 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2807 IRB(NewAI.getContext(), ConstantFolder()) {
2808 if (VecTy) {
2809 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2810 "Only multiple-of-8 sized vector elements are viable");
2811 ++NumVectorized;
2812 }
2813 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2814 }
2815
2816 bool visit(AllocaSlices::const_iterator I) {
2817 bool CanSROA = true;
2818 BeginOffset = I->beginOffset();
2819 EndOffset = I->endOffset();
2820 IsSplittable = I->isSplittable();
2821 IsSplit =
2822 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2823 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2824 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2825 LLVM_DEBUG(dbgs() << "\n");
2826
2827 // Compute the intersecting offset range.
2828 assert(BeginOffset < NewAllocaEndOffset);
2829 assert(EndOffset > NewAllocaBeginOffset);
2830 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2831 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2832
2833 SliceSize = NewEndOffset - NewBeginOffset;
2834 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2835 << ") NewBegin:(" << NewBeginOffset << ", "
2836 << NewEndOffset << ") NewAllocaBegin:("
2837 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2838 << ")\n");
2839 assert(IsSplit || NewBeginOffset == BeginOffset);
2840 OldUse = I->getUse();
2841 OldPtr = cast<Instruction>(OldUse->get());
2842
2843 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2844 IRB.SetInsertPoint(OldUserI);
2845 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2846 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2847 Twine(BeginOffset) + ".");
2848
2849 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2850 if (VecTy || IntTy)
2851 assert(CanSROA);
2852 return CanSROA;
2853 }
2854
2855 /// Attempts to rewrite a partition using tree-structured merge optimization.
2856 ///
2857 /// This function analyzes a partition to determine if it can be optimized
2858 /// using a tree-structured merge pattern, where multiple non-overlapping
2859 /// stores completely fill an alloca and there is no load from the alloca in
2860 /// the middle of the stores. Such patterns can be optimized by eliminating
2861 /// the intermediate stores and directly constructing the final vector by
2862 /// using shufflevectors.
2863 ///
2864 /// Example transformation:
2865 /// Before: (stores do not have to be in order)
2866 /// %alloca = alloca <8 x float>
2867 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2868 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2869 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2870 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2871 ///
2872 /// After:
2873 /// %alloca = alloca <8 x float>
2874 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2875 /// i32 3>
2876 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2877 /// i32 3>
2878 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2879 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2880 /// store %shuffle2, ptr %alloca
2881 ///
2882 /// The optimization looks for partitions that:
2883 /// 1. Have no overlapping split slice tails
2884 /// 2. Contain non-overlapping stores that cover the entire alloca
2885 /// 3. Have exactly one load that reads the complete alloca structure and not
2886 /// in the middle of the stores (TODO: maybe we can relax the constraint
2887 /// about reading the entire alloca structure)
2888 ///
2889 /// \param P The partition to analyze and potentially rewrite
2890 /// \return An optional vector of values that were deleted during the rewrite
2891 /// process, or std::nullopt if the partition cannot be optimized
2892 /// using tree-structured merge
2893 std::optional<SmallVector<Value *, 4>>
2894 rewriteTreeStructuredMerge(Partition &P) {
2895 // No tail slices that overlap with the partition
2896 if (P.splitSliceTails().size() > 0)
2897 return std::nullopt;
2898
2899 SmallVector<Value *, 4> DeletedValues;
2900 LoadInst *TheLoad = nullptr;
2901
2902 // Structure to hold store information
2903 struct StoreInfo {
2904 StoreInst *Store;
2905 uint64_t BeginOffset;
2906 uint64_t EndOffset;
2907 Value *StoredValue;
2908 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2909 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2910 };
2911
2912 SmallVector<StoreInfo, 4> StoreInfos;
2913
2914 // If the new alloca is a fixed vector type, we use its element type as the
2915 // allocated element type; otherwise we use i8 as the allocated element type.
2916 Type *AllocatedEltTy =
2917 isa<FixedVectorType>(NewAllocaTy)
2918 ? cast<FixedVectorType>(NewAllocaTy)->getElementType()
2919 : Type::getInt8Ty(NewAI.getContext());
2920 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);
2921
2922 // Helper to check if a type is
2923 // 1. A fixed vector type
2924 // 2. The element type is not a pointer
2925 // 3. The element type size is byte-aligned
2926 // We only handle the cases that the ld/st meet these conditions
2927 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2928 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
2929 return FixedVecTy &&
2930 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
2931 !FixedVecTy->getElementType()->isPointerTy();
2932 };
2933
2934 for (Slice &S : P) {
2935 auto *User = cast<Instruction>(S.getUse()->getUser());
2936 if (auto *LI = dyn_cast<LoadInst>(User)) {
2937 // Do not handle the case if
2938 // 1. There is more than one load
2939 // 2. The load is volatile
2940 // 3. The load does not read the entire alloca structure
2941 // 4. The load does not meet the conditions in the helper function
2942 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
2943 S.beginOffset() != NewAllocaBeginOffset ||
2944 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
2945 return std::nullopt;
2946 TheLoad = LI;
2947 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
2948 // Do not handle the case if
2949 // 1. The store does not meet the conditions in the helper function
2950 // 2. The store is volatile
2951 // 3. The total store size is not a multiple of the allocated element
2952 // type size
2953 if (!IsTypeValidForTreeStructuredMerge(
2954 SI->getValueOperand()->getType()) ||
2955 SI->isVolatile())
2956 return std::nullopt;
2957 auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
2958 unsigned NumElts = VecTy->getNumElements();
2959 unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
2960 if (NumElts * EltSize % AllocatedEltTySize != 0)
2961 return std::nullopt;
2962 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
2963 SI->getValueOperand());
2964 } else {
2965 // If we have instructions other than load and store, we cannot do the
2966 // tree structured merge
2967 return std::nullopt;
2968 }
2969 }
2970 // If we do not have any load, we cannot do the tree structured merge
2971 if (!TheLoad)
2972 return std::nullopt;
2973
2974 // If we do not have multiple stores, we cannot do the tree structured merge
2975 if (StoreInfos.size() < 2)
2976 return std::nullopt;
2977
2978 // Stores should not overlap and should cover the whole alloca
2979 // Sort by begin offset
2980 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
2981 return A.BeginOffset < B.BeginOffset;
2982 });
2983
2984 // Check for overlaps and coverage
2985 uint64_t ExpectedStart = NewAllocaBeginOffset;
2986 for (auto &StoreInfo : StoreInfos) {
2987 uint64_t BeginOff = StoreInfo.BeginOffset;
2988 uint64_t EndOff = StoreInfo.EndOffset;
2989
2990 // Check for gap or overlap
2991 if (BeginOff != ExpectedStart)
2992 return std::nullopt;
2993
2994 ExpectedStart = EndOff;
2995 }
2996 // Check that stores cover the entire alloca
2997 if (ExpectedStart != NewAllocaEndOffset)
2998 return std::nullopt;
2999
3000 // Stores should be in the same basic block
3001 // The load should not be in the middle of the stores
3002 // Note:
3003 // If the load is in a different basic block from the stores, we can still
3004 // do the tree structured merge. This is because we do not have the
3005 // store->load forwarding here. The merged vector will be stored back to
3006 // NewAI and the new load will load from NewAI. The forwarding will be
3007 // handled later when we try to promote NewAI.
3008 BasicBlock *LoadBB = TheLoad->getParent();
3009 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
3010
3011 for (auto &StoreInfo : StoreInfos) {
3012 if (StoreInfo.Store->getParent() != StoreBB)
3013 return std::nullopt;
3014 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
3015 return std::nullopt;
3016 }
3017
3018 // If we reach here, the partition can be merged with a tree structured
3019 // merge
3020 LLVM_DEBUG({
3021 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
3022 << "\n Ordered stores:\n";
3023 for (auto [i, Info] : enumerate(StoreInfos))
3024 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
3025 << Info.EndOffset << ") \tStore: " << *Info.Store
3026 << "\tValue: " << *Info.StoredValue << "\n";
3027 });
3028
3029 // Instead of having these stores, we merge all the stored values into a
3030 // vector and store the merged value into the alloca
3031 std::queue<Value *> VecElements;
3032 IRBuilder<> Builder(StoreInfos.back().Store);
3033 for (const auto &Info : StoreInfos) {
3034 DeletedValues.push_back(Info.Store);
3035 VecElements.push(Info.StoredValue);
3036 }
3037
3038 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3039 while (VecElements.size() > 1) {
3040 const auto NumElts = VecElements.size();
3041 for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
3042 Value *V0 = VecElements.front();
3043 VecElements.pop();
3044 Value *V1 = VecElements.front();
3045 VecElements.pop();
3046 Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
3047 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3048 VecElements.push(Merged);
3049 }
3050 if (NumElts % 2 == 1) {
3051 Value *V = VecElements.front();
3052 VecElements.pop();
3053 VecElements.push(V);
3054 }
3055 }
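// Illustrative trace: with five stored vectors v0..v4, the first round
// produces (v0|v1) and (v2|v3) and carries v4 over, the second round
// produces (v0|v1|v2|v3), and the final round merges in v4.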
3056
3057 // Store the merged value into the alloca
3058 Value *MergedValue = VecElements.front();
3059 Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
3060
3061 IRBuilder<> LoadBuilder(TheLoad);
3062 TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
3063 TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
3064 TheLoad->getName() + ".sroa.new.load"));
3065 DeletedValues.push_back(TheLoad);
3066
3067 return DeletedValues;
3068 }
3069
3070private:
3071 // Make sure the other visit overloads are visible.
3072 using Base::visit;
3073
3074 // Every instruction which can end up as a user must have a rewrite rule.
3075 bool visitInstruction(Instruction &I) {
3076 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3077 llvm_unreachable("No rewrite rule for this instruction!");
3078 }
3079
3080 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3081 // Note that the offset computation can use BeginOffset or NewBeginOffset
3082 // interchangeably for unsplit slices.
3083 assert(IsSplit || BeginOffset == NewBeginOffset);
3084 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3085
3086 StringRef OldName = OldPtr->getName();
3087 // Skip through the last '.sroa.' component of the name.
3088 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3089 if (LastSROAPrefix != StringRef::npos) {
3090 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3091 // Look for an SROA slice index.
3092 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3093 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3094 // Strip the index and look for the offset.
3095 OldName = OldName.substr(IndexEnd + 1);
3096 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3097 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3098 // Strip the offset.
3099 OldName = OldName.substr(OffsetEnd + 1);
3100 }
3101 }
3102 // Strip any SROA suffixes as well.
3103 OldName = OldName.substr(0, OldName.find(".sroa_"));
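// Illustrative example: an old pointer named "x.sroa.3.16.foo.sroa_cast"
// reduces to "foo", so the adjusted pointer built below gets the name
// prefix "foo.".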
3104
3105 return getAdjustedPtr(IRB, DL, &NewAI,
3106 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3107 PointerTy, Twine(OldName) + ".");
3108 }
3109
3110 /// Compute suitable alignment to access this slice of the *new*
3111 /// alloca.
3112 ///
3113 /// The result is the common alignment of the new alloca's own alignment
3114 /// and the slice's offset within it.
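 /// For example, if the new alloca is 16-byte aligned and this slice starts
 /// 4 bytes into it, the access may only assume 4-byte alignment.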
3115 Align getSliceAlign() {
3116 return commonAlignment(NewAI.getAlign(),
3117 NewBeginOffset - NewAllocaBeginOffset);
3118 }
3119
3120 unsigned getIndex(uint64_t Offset) {
3121 assert(VecTy && "Can only call getIndex when rewriting a vector");
3122 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3123 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3124 uint32_t Index = RelOffset / ElementSize;
3125 assert(Index * ElementSize == RelOffset);
3126 return Index;
3127 }
3128
3129 void deleteIfTriviallyDead(Value *V) {
3130 Instruction *I = cast<Instruction>(V);
3131 if (isInstructionTriviallyDead(I))
3132 Pass.DeadInsts.push_back(I);
3133 }
3134
3135 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3136 unsigned BeginIndex = getIndex(NewBeginOffset);
3137 unsigned EndIndex = getIndex(NewEndOffset);
3138 assert(EndIndex > BeginIndex && "Empty vector!");
3139
3140 LoadInst *Load =
3141 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3142
3143 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3144 LLVMContext::MD_access_group});
3145 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3146 }
3147
3148 Value *rewriteIntegerLoad(LoadInst &LI) {
3149 assert(IntTy && "We cannot insert an integer to the alloca");
3150 assert(!LI.isVolatile());
3151 Value *V =
3152 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3153 V = IRB.CreateBitPreservingCastChain(DL, V, IntTy);
3154 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3155 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3156 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3157 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3158 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3159 }
3160 // It is possible that the extracted type is not the load type. This
3161 // happens if there is a load past the end of the alloca, and as
3162 // a consequence the slice is narrower but still a candidate for integer
3163 // lowering. To handle this case, we just zero extend the extracted
3164 // integer.
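// For example, if an i32 load runs two bytes past the end of the alloca so
// that the slice only covers 2 bytes, the value produced above is an i16
// and is zero-extended back to i32 here.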
3165 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3166 "Can only handle an extract for an overly wide load");
3167 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3168 V = IRB.CreateZExt(V, LI.getType());
3169 return V;
3170 }
3171
3172 bool visitLoadInst(LoadInst &LI) {
3173 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3174 Value *OldOp = LI.getOperand(0);
3175 assert(OldOp == OldPtr);
3176
3177 AAMDNodes AATags = LI.getAAMetadata();
3178
3179 unsigned AS = LI.getPointerAddressSpace();
3180
3181 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3182 : LI.getType();
3183 bool IsPtrAdjusted = false;
3184 Value *V;
3185 if (VecTy) {
3186 V = rewriteVectorizedLoadInst(LI);
3187 } else if (IntTy && LI.getType()->isIntegerTy()) {
3188 V = rewriteIntegerLoad(LI);
3189 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3190 NewEndOffset == NewAllocaEndOffset &&
3191 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3192 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3193 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3194 !LI.isVolatile()))) {
3195 Value *NewPtr =
3196 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3197 LoadInst *NewLI = IRB.CreateAlignedLoad(
3198 NewAllocaTy, NewPtr, NewAI.getAlign(), LI.isVolatile(), LI.getName());
3199 if (LI.isVolatile())
3200 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3201 if (NewLI->isAtomic())
3202 NewLI->setAlignment(LI.getAlign());
3203
3204 // Copy any metadata that is valid for the new load. This may require
3205 // conversion to a different kind of metadata, e.g. !nonnull might change
3206 // to !range or vice versa.
3207 copyMetadataForLoad(*NewLI, LI);
3208
3209 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3210 if (AATags)
3211 NewLI->setAAMetadata(AATags.adjustForAccess(
3212 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3213
3214 // Try to preserve nonnull metadata
3215 V = NewLI;
3216
3217 // If this is an integer load past the end of the slice (which means the
3218 // bytes outside the slice are undef or this load is dead) just forcibly
3219 // fix the integer size with correct handling of endianness.
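// For example, if the new alloca is an i16 but the load wants an i32 on a
// big-endian target, the zero-extended value is shifted left by 16 so the
// two defined bytes keep their position as the most-significant bytes.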
3220 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3221 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3222 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3223 V = IRB.CreateZExt(V, TITy, "load.ext");
3224 if (DL.isBigEndian())
3225 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3226 "endian_shift");
3227 }
3228 } else {
3229 Type *LTy = IRB.getPtrTy(AS);
3230 LoadInst *NewLI =
3231 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3232 getSliceAlign(), LI.isVolatile(), LI.getName());
3233
3234 if (AATags)
3235 NewLI->setAAMetadata(AATags.adjustForAccess(
3236 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3237
3238 if (LI.isVolatile())
3239 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3240 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3241 LLVMContext::MD_access_group});
3242
3243 V = NewLI;
3244 IsPtrAdjusted = true;
3245 }
3246 V = IRB.CreateBitPreservingCastChain(DL, V, TargetTy);
3247
3248 if (IsSplit) {
3249 assert(!LI.isVolatile());
3250 assert(LI.getType()->isIntegerTy() &&
3251 "Only integer type loads and stores are split");
3252 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3253 "Split load isn't smaller than original load");
3254 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3255 "Non-byte-multiple bit width");
3256 // Move the insertion point just past the load so that we can refer to it.
3257 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3258 // Ensure the insertion point comes before any debug-info immediately
3259 // after the load, so that variable values referring to the load are
3260 // dominated by it.
3261 LIIt.setHeadBit(true);
3262 IRB.SetInsertPoint(LI.getParent(), LIIt);
3263 // Create a placeholder value with the same type as LI to use as the
3264 // basis for the new value. This allows us to replace the uses of LI with
3265 // the computed value, and then replace the placeholder with LI, leaving
3266 // LI only used for this computation.
3267 Value *Placeholder =
3268 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3269 false, Align(1));
3270 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3271 "insert");
3272 LI.replaceAllUsesWith(V);
3273 Placeholder->replaceAllUsesWith(&LI);
3274 Placeholder->deleteValue();
3275 } else {
3276 LI.replaceAllUsesWith(V);
3277 }
3278
3279 Pass.DeadInsts.push_back(&LI);
3280 deleteIfTriviallyDead(OldOp);
3281 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3282 return !LI.isVolatile() && !IsPtrAdjusted;
3283 }
3284
3285 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3286 AAMDNodes AATags) {
3287 // Capture V for the purpose of debug-info accounting once it's converted
3288 // to a vector store.
3289 Value *OrigV = V;
3290 if (V->getType() != VecTy) {
3291 unsigned BeginIndex = getIndex(NewBeginOffset);
3292 unsigned EndIndex = getIndex(NewEndOffset);
3293 assert(EndIndex > BeginIndex && "Empty vector!");
3294 unsigned NumElements = EndIndex - BeginIndex;
3295 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3296 "Too many elements!");
3297 Type *SliceTy = (NumElements == 1)
3298 ? ElementTy
3299 : FixedVectorType::get(ElementTy, NumElements);
3300 if (V->getType() != SliceTy)
3301 V = IRB.CreateBitPreservingCastChain(DL, V, SliceTy);
3302
3303 // Mix in the existing elements.
3304 Value *Old =
3305 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3306 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3307 }
3308 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3309 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3310 LLVMContext::MD_access_group});
3311 if (AATags)
3312 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3313 V->getType(), DL));
3314 Pass.DeadInsts.push_back(&SI);
3315
3316 // NOTE: Careful to use OrigV rather than V.
3317 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3318 Store, Store->getPointerOperand(), OrigV, DL);
3319 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3320 return true;
3321 }
3322
3323 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3324 assert(IntTy && "We cannot extract an integer from the alloca");
3325 assert(!SI.isVolatile());
3326 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3327 IntTy->getBitWidth()) {
3328 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3329 "oldload");
3330 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3331 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3332 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3333 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3334 }
3335 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3336 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3337 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3338 LLVMContext::MD_access_group});
3339 if (AATags)
3340 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3341 V->getType(), DL));
3342
3343 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3344 Store, Store->getPointerOperand(),
3345 Store->getValueOperand(), DL);
3346
3347 Pass.DeadInsts.push_back(&SI);
3348 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3349 return true;
3350 }
3351
3352 bool visitStoreInst(StoreInst &SI) {
3353 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3354 Value *OldOp = SI.getOperand(1);
3355 assert(OldOp == OldPtr);
3356
3357 AAMDNodes AATags = SI.getAAMetadata();
3358 Value *V = SI.getValueOperand();
3359
3360 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3361 // alloca that should be re-examined after promoting this alloca.
3362 if (V->getType()->isPointerTy())
3363 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3364 Pass.PostPromotionWorklist.insert(AI);
3365
3366 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3367 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3368 assert(!SI.isVolatile());
3369 assert(V->getType()->isIntegerTy() &&
3370 "Only integer type loads and stores are split");
3371 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3372 "Non-byte-multiple bit width");
3373 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3374 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3375 "extract");
3376 }
3377
3378 if (VecTy)
3379 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3380 if (IntTy && V->getType()->isIntegerTy())
3381 return rewriteIntegerStore(V, SI, AATags);
3382
3383 StoreInst *NewSI;
3384 if (NewBeginOffset == NewAllocaBeginOffset &&
3385 NewEndOffset == NewAllocaEndOffset &&
3386 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3387 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3388 Value *NewPtr =
3389 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3390
3391 NewSI =
3392 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3393 } else {
3394 unsigned AS = SI.getPointerAddressSpace();
3395 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3396 NewSI =
3397 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3398 }
3399 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3400 LLVMContext::MD_access_group});
3401 if (AATags)
3402 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3403 V->getType(), DL));
3404 if (SI.isVolatile())
3405 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3406 if (NewSI->isAtomic())
3407 NewSI->setAlignment(SI.getAlign());
3408
3409 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3410 NewSI, NewSI->getPointerOperand(),
3411 NewSI->getValueOperand(), DL);
3412
3413 Pass.DeadInsts.push_back(&SI);
3414 deleteIfTriviallyDead(OldOp);
3415
3416 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3417 return NewSI->getPointerOperand() == &NewAI &&
3418 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3419 !SI.isVolatile();
3420 }
3421
3422 /// Compute an integer value from splatting an i8 across the given
3423 /// number of bytes.
3424 ///
3425 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3426 /// call this routine.
3427 /// FIXME: Heed the advice above.
3428 ///
3429 /// \param V The i8 value to splat.
3430 /// \param Size The number of bytes in the output (assuming i8 is one byte)
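 /// For example, splatting the byte 0xAB across 4 bytes multiplies
 /// zext(0xAB) by 0xFFFFFFFF udiv 0xFF = 0x01010101, giving 0xABABABAB.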
3431 Value *getIntegerSplat(Value *V, unsigned Size) {
3432 assert(Size > 0 && "Expected a positive number of bytes.");
3433 IntegerType *VTy = cast<IntegerType>(V->getType());
3434 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3435 if (Size == 1)
3436 return V;
3437
3438 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3439 V = IRB.CreateMul(
3440 IRB.CreateZExt(V, SplatIntTy, "zext"),
3441 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3442 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3443 SplatIntTy)),
3444 "isplat");
3445 return V;
3446 }
3447
3448 /// Compute a vector splat for a given element value.
3449 Value *getVectorSplat(Value *V, unsigned NumElements) {
3450 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3451 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3452 return V;
3453 }
3454
3455 bool visitMemSetInst(MemSetInst &II) {
3456 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3457 assert(II.getRawDest() == OldPtr);
3458
3459 AAMDNodes AATags = II.getAAMetadata();
3460
3461 // If the memset has a variable size, it cannot be split, just adjust the
3462 // pointer to the new alloca.
3463 if (!isa<ConstantInt>(II.getLength())) {
3464 assert(!IsSplit);
3465 assert(NewBeginOffset == BeginOffset);
3466 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3467 II.setDestAlignment(getSliceAlign());
3468 // In theory we should call migrateDebugInfo here. However, we do not
3469 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3470 // constant geps, or storing a variable number of bytes.
3471 assert(at::getDVRAssignmentMarkers(&II).empty() &&
3472 "AT: Unexpected link to non-const GEP");
3473 deleteIfTriviallyDead(OldPtr);
3474 return false;
3475 }
3476
3477 // Record this instruction for deletion.
3478 Pass.DeadInsts.push_back(&II);
3479
3480 Type *ScalarTy = NewAllocaTy->getScalarType();
3481
3482 const bool CanContinue = [&]() {
3483 if (VecTy || IntTy)
3484 return true;
3485 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3486 return false;
3487 // Length must be in range for FixedVectorType.
3488 auto *C = cast<ConstantInt>(II.getLength());
3489 const uint64_t Len = C->getLimitedValue();
3490 if (Len > std::numeric_limits<unsigned>::max())
3491 return false;
3492 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3493 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3494 return canConvertValue(DL, SrcTy, NewAllocaTy) &&
3495 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3496 }();
3497
3498 // If this doesn't map cleanly onto the alloca type, and that type isn't
3499 // a single value type, just emit a memset.
3500 if (!CanContinue) {
3501 Type *SizeTy = II.getLength()->getType();
3502 unsigned Sz = NewEndOffset - NewBeginOffset;
3503 Constant *Size = ConstantInt::get(SizeTy, Sz);
3504 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3505 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3506 MaybeAlign(getSliceAlign()), II.isVolatile()));
3507 if (AATags)
3508 New->setAAMetadata(
3509 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3510
3511 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3512 New, New->getRawDest(), nullptr, DL);
3513
3514 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3515 return false;
3516 }
3517
3518 // If we can represent this as a simple value, we have to build the actual
3519 // value to store, which requires expanding the byte present in memset to
3520 // a sensible representation for the alloca type. This is essentially
3521 // splatting the byte to a sufficiently wide integer, splatting it across
3522 // any desired vector width, and bitcasting to the final type.
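// For instance, a memset of the byte 0xAB over a <4 x float> slice splats
// the byte to the i32 0xABABABAB, casts that to float, splats the float
// across a 4-element vector, and merges it into the loaded old value
// before the store.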
3523 Value *V;
3524
3525 if (VecTy) {
3526 // If this is a memset of a vectorized alloca, insert it.
3527 assert(ElementTy == ScalarTy);
3528
3529 unsigned BeginIndex = getIndex(NewBeginOffset);
3530 unsigned EndIndex = getIndex(NewEndOffset);
3531 assert(EndIndex > BeginIndex && "Empty vector!");
3532 unsigned NumElements = EndIndex - BeginIndex;
3533 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3534 "Too many elements!");
3535
3536 Value *Splat = getIntegerSplat(
3537 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3538 Splat = IRB.CreateBitPreservingCastChain(DL, Splat, ElementTy);
3539 if (NumElements > 1)
3540 Splat = getVectorSplat(Splat, NumElements);
3541
3542 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3543 "oldload");
3544 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3545 } else if (IntTy) {
3546 // If this is a memset on an alloca where we can widen stores, insert the
3547 // set integer.
3548 assert(!II.isVolatile());
3549
3550 uint64_t Size = NewEndOffset - NewBeginOffset;
3551 V = getIntegerSplat(II.getValue(), Size);
3552
3553 if (IntTy && (NewBeginOffset != NewAllocaBeginOffset ||
3554 NewEndOffset != NewAllocaEndOffset)) {
3555 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI,
3556 NewAI.getAlign(), "oldload");
3557 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3558 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3559 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3560 } else {
3561 assert(V->getType() == IntTy &&
3562 "Wrong type for an alloca wide integer!");
3563 }
3564 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3565 } else {
3566 // Established these invariants above.
3567 assert(NewBeginOffset == NewAllocaBeginOffset);
3568 assert(NewEndOffset == NewAllocaEndOffset);
3569
3570 V = getIntegerSplat(II.getValue(),
3571 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3572 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(NewAllocaTy))
3573 V = getVectorSplat(
3574 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3575
3576 V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
3577 }
3578
3579 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3580 StoreInst *New =
3581 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3582 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3583 LLVMContext::MD_access_group});
3584 if (AATags)
3585 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3586 V->getType(), DL));
3587
3588 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3589 New, New->getPointerOperand(), V, DL);
3590
3591 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3592 return !II.isVolatile();
3593 }
3594
3595 bool visitMemTransferInst(MemTransferInst &II) {
3596 // Rewriting of memory transfer instructions can be a bit tricky. We break
3597 // them into two categories: split intrinsics and unsplit intrinsics.
3598
3599 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3600
3601 AAMDNodes AATags = II.getAAMetadata();
3602
3603 bool IsDest = &II.getRawDestUse() == OldUse;
3604 assert((IsDest && II.getRawDest() == OldPtr) ||
3605 (!IsDest && II.getRawSource() == OldPtr));
3606
3607 Align SliceAlign = getSliceAlign();
3608 // For unsplit intrinsics, we simply modify the source and destination
3609 // pointers in place. This isn't just an optimization, it is a matter of
3610 // correctness. With unsplit intrinsics we may be dealing with transfers
3611 // within a single alloca before SROA ran, or with transfers that have
3612 // a variable length. We may also be dealing with memmove instead of
3613 // memcpy, and so simply updating the pointers is all that is needed to
3614 // update both the source and dest of a single call.
3615 if (!IsSplittable) {
3616 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3617 if (IsDest) {
3618 // Update the address component of linked dbg.assigns.
3619 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3620 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3621 DbgAssign->getAddress() == II.getDest())
3622 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3623 }
3624 II.setDest(AdjustedPtr);
3625 II.setDestAlignment(SliceAlign);
3626 } else {
3627 II.setSource(AdjustedPtr);
3628 II.setSourceAlignment(SliceAlign);
3629 }
3630
3631 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3632 deleteIfTriviallyDead(OldPtr);
3633 return false;
3634 }
3635 // For split transfer intrinsics we have an incredibly useful assurance:
3636 // the source and destination do not reside within the same alloca, and at
3637 // least one of them does not escape. This means that we can replace
3638 // memmove with memcpy, and we don't need to worry about all manner of
3639 // downsides to splitting and transforming the operations.
3640
3641 // If this doesn't map cleanly onto the alloca type, and that type isn't
3642 // a single value type, just emit a memcpy.
3643 bool EmitMemCpy =
3644 !VecTy && !IntTy &&
3645 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3646 SliceSize != DL.getTypeStoreSize(NewAllocaTy).getFixedValue() ||
3647 !DL.typeSizeEqualsStoreSize(NewAllocaTy) ||
3648 !NewAllocaTy->isSingleValueType());
3649
3650 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3651 // size hasn't been shrunk based on analysis of the viable range, this is
3652 // a no-op.
3653 if (EmitMemCpy && &OldAI == &NewAI) {
3654 // Ensure the start lines up.
3655 assert(NewBeginOffset == BeginOffset);
3656
3657 // Rewrite the size as needed.
3658 if (NewEndOffset != EndOffset)
3659 II.setLength(NewEndOffset - NewBeginOffset);
3660 return false;
3661 }
3662 // Record this instruction for deletion.
3663 Pass.DeadInsts.push_back(&II);
3664
3665 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3666 // alloca that should be re-examined after rewriting this instruction.
3667 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3668 if (AllocaInst *AI =
3669 dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3670 assert(AI != &OldAI && AI != &NewAI &&
3671 "Splittable transfers cannot reach the same alloca on both ends.");
3672 Pass.Worklist.insert(AI);
3673 }
3674
3675 Type *OtherPtrTy = OtherPtr->getType();
3676 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3677
3678 // Compute the relative offset for the other pointer within the transfer.
3679 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3680 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3681 Align OtherAlign =
3682 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3683 OtherAlign =
3684 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3685
3686 if (EmitMemCpy) {
3687 // Compute the other pointer, folding as much as possible to produce
3688 // a single, simple GEP in most cases.
3689 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3690 OtherPtr->getName() + ".");
3691
3692 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3693 Type *SizeTy = II.getLength()->getType();
3694 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3695
3696 Value *DestPtr, *SrcPtr;
3697 MaybeAlign DestAlign, SrcAlign;
3698 // Note: IsDest is true iff we're copying into the new alloca slice
3699 if (IsDest) {
3700 DestPtr = OurPtr;
3701 DestAlign = SliceAlign;
3702 SrcPtr = OtherPtr;
3703 SrcAlign = OtherAlign;
3704 } else {
3705 DestPtr = OtherPtr;
3706 DestAlign = OtherAlign;
3707 SrcPtr = OurPtr;
3708 SrcAlign = SliceAlign;
3709 }
3710 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3711 Size, II.isVolatile());
3712 if (AATags)
3713 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3714
3715 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3716 if (IsDest) {
3717 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3718 &II, New, DestPtr, nullptr, DL);
3719 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3720 DestPtr->stripAndAccumulateConstantOffsets(
3721 DL, Offset, /*AllowNonInbounds*/ true))) {
3722 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3723 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3724 }
3725 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3726 return false;
3727 }
3728
3729 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3730 NewEndOffset == NewAllocaEndOffset;
3731 uint64_t Size = NewEndOffset - NewBeginOffset;
3732 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3733 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3734 unsigned NumElements = EndIndex - BeginIndex;
3735 IntegerType *SubIntTy =
3736 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3737
3738 // Reset the other pointer type to match the register type we're going to
3739 // use, but using the address space of the original other pointer.
3740 Type *OtherTy;
3741 if (VecTy && !IsWholeAlloca) {
3742 if (NumElements == 1)
3743 OtherTy = VecTy->getElementType();
3744 else
3745 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3746 } else if (IntTy && !IsWholeAlloca) {
3747 OtherTy = SubIntTy;
3748 } else {
3749 OtherTy = NewAllocaTy;
3750 }
3751
3752 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3753 OtherPtr->getName() + ".");
3754 MaybeAlign SrcAlign = OtherAlign;
3755 MaybeAlign DstAlign = SliceAlign;
3756 if (!IsDest)
3757 std::swap(SrcAlign, DstAlign);
3758
3759 Value *SrcPtr;
3760 Value *DstPtr;
3761
3762 if (IsDest) {
3763 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3764 SrcPtr = AdjPtr;
3765 } else {
3766 DstPtr = AdjPtr;
3767 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3768 }
3769
3770 Value *Src;
3771 if (VecTy && !IsWholeAlloca && !IsDest) {
3772 Src =
3773 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3774 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3775 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3776 Src =
3777 IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
3778 Src = IRB.CreateBitPreservingCastChain(DL, Src, IntTy);
3779 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3780 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3781 } else {
3782 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3783 II.isVolatile(), "copyload");
3784 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3785 LLVMContext::MD_access_group});
3786 if (AATags)
3787 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3788 Load->getType(), DL));
3789 Src = Load;
3790 }
3791
3792 if (VecTy && !IsWholeAlloca && IsDest) {
3793 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3794 "oldload");
3795 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3796 } else if (IntTy && !IsWholeAlloca && IsDest) {
3797 Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
3798 "oldload");
3799 Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
3800 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3801 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3802 Src = IRB.CreateBitPreservingCastChain(DL, Src, NewAllocaTy);
3803 }
3804
3805 StoreInst *Store = cast<StoreInst>(
3806 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3807 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3808 LLVMContext::MD_access_group});
3809 if (AATags)
3810 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3811 Src->getType(), DL));
3812
3813 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3814 if (IsDest) {
3815
3816 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3817 Store, DstPtr, Src, DL);
3818 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3819 DstPtr->stripAndAccumulateConstantOffsets(
3820 DL, Offset, /*AllowNonInbounds*/ true))) {
3821 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3822 &II, Store, DstPtr, Src, DL);
3823 }
3824
3825 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3826 return !II.isVolatile();
3827 }
3828
3829 bool visitIntrinsicInst(IntrinsicInst &II) {
3830 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3831 "Unexpected intrinsic!");
3832 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3833
3834 // Record this instruction for deletion.
3835 Pass.DeadInsts.push_back(&II);
3836
3837 if (II.isDroppable()) {
3838 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3839 // TODO For now we forget assumed information, this can be improved.
3840 OldPtr->dropDroppableUsesIn(II);
3841 return true;
3842 }
3843
3844 assert(II.getArgOperand(0) == OldPtr);
3845 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3846 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3847 Value *New;
3848 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3849 New = IRB.CreateLifetimeStart(Ptr);
3850 else
3851 New = IRB.CreateLifetimeEnd(Ptr);
3852
3853 (void)New;
3854 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3855
3856 return true;
3857 }
3858
3859 void fixLoadStoreAlign(Instruction &Root) {
3860 // This algorithm implements the same visitor loop as
3861 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3862 // or store found.
3863 SmallPtrSet<Instruction *, 4> Visited;
3864 SmallVector<Instruction *, 4> Uses;
3865 Visited.insert(&Root);
3866 Uses.push_back(&Root);
3867 do {
3868 Instruction *I = Uses.pop_back_val();
3869
3870 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3871 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3872 continue;
3873 }
3874 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3875 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3876 continue;
3877 }
3878
3879 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3880 isa<PHINode>(I) || isa<SelectInst>(I) ||
3881 isa<GetElementPtrInst>(I));
3882 for (User *U : I->users())
3883 if (Visited.insert(cast<Instruction>(U)).second)
3884 Uses.push_back(cast<Instruction>(U));
3885 } while (!Uses.empty());
3886 }
3887
3888 bool visitPHINode(PHINode &PN) {
3889 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3890 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3891 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3892
3893 // We would like to compute a new pointer in only one place, but have it be
3894 // as local as possible to the PHI. To do that, we re-use the location of
3895 // the old pointer, which necessarily must be in the right position to
3896 // dominate the PHI.
3897 IRBuilderBase::InsertPointGuard Guard(IRB);
3898 if (isa<PHINode>(OldPtr))
3899 IRB.SetInsertPoint(OldPtr->getParent(),
3900 OldPtr->getParent()->getFirstInsertionPt());
3901 else
3902 IRB.SetInsertPoint(OldPtr);
3903 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3904
3905 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3906 // Replace the operands which were using the old pointer.
3907 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3908
3909 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3910 deleteIfTriviallyDead(OldPtr);
3911
3912 // Fix the alignment of any loads or stores using this PHI node.
3913 fixLoadStoreAlign(PN);
3914
3915 // PHIs can't be promoted on their own, but often can be speculated. We
3916 // check the speculation outside of the rewriter so that we see the
3917 // fully-rewritten alloca.
3918 PHIUsers.insert(&PN);
3919 return true;
3920 }
3921
3922 bool visitSelectInst(SelectInst &SI) {
3923 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3924 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3925 "Pointer isn't an operand!");
3926 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3927 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3928
3929 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3930 // Replace the operands which were using the old pointer.
3931 if (SI.getOperand(1) == OldPtr)
3932 SI.setOperand(1, NewPtr);
3933 if (SI.getOperand(2) == OldPtr)
3934 SI.setOperand(2, NewPtr);
3935
3936 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3937 deleteIfTriviallyDead(OldPtr);
3938
3939 // Fix the alignment of any loads or stores using this select.
3940 fixLoadStoreAlign(SI);
3941
3942 // Selects can't be promoted on their own, but often can be speculated. We
3943 // check the speculation outside of the rewriter so that we see the
3944 // fully-rewritten alloca.
3945 SelectUsers.insert(&SI);
3946 return true;
3947 }
3948};
3949
3950/// Visitor to rewrite aggregate loads and stores as scalar.
3951///
3952/// This pass aggressively rewrites all aggregate loads and stores on
3953/// a particular pointer (or any pointer derived from it which we can identify)
3954/// with scalar loads and stores.
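/// For example, a load of {i32, i64} is rewritten into two element loads
/// through GEPs plus insertvalues that rebuild the aggregate value.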
3955class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3956 // Befriend the base class so it can delegate to private visit methods.
3957 friend class InstVisitor<AggLoadStoreRewriter, bool>;
3958
3959 /// Queue of pointer uses to analyze and potentially rewrite.
3960 SmallVector<Use *, 8> Queue;
3961
3962 /// Set to prevent us from cycling with phi nodes and loops.
3963 SmallPtrSet<User *, 8> Visited;
3964
3965 /// The current pointer use being rewritten. This is used to dig up the used
3966 /// value (as opposed to the user).
3967 Use *U = nullptr;
3968
3969 /// Used to calculate offsets, and hence alignment, of subobjects.
3970 const DataLayout &DL;
3971
3972 IRBuilderTy &IRB;
3973
3974public:
3975 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3976 : DL(DL), IRB(IRB) {}
3977
3978 /// Rewrite loads and stores through a pointer and all pointers derived from
3979 /// it.
3980 bool rewrite(Instruction &I) {
3981 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3982 enqueueUsers(I);
3983 bool Changed = false;
3984 while (!Queue.empty()) {
3985 U = Queue.pop_back_val();
3986 Changed |= visit(cast<Instruction>(U->getUser()));
3987 }
3988 return Changed;
3989 }
3990
3991private:
3992 /// Enqueue all the users of the given instruction for further processing.
3993 /// This uses a set to de-duplicate users.
3994 void enqueueUsers(Instruction &I) {
3995 for (Use &U : I.uses())
3996 if (Visited.insert(U.getUser()).second)
3997 Queue.push_back(&U);
3998 }
3999
4000 // Conservative default is to not rewrite anything.
4001 bool visitInstruction(Instruction &I) { return false; }
4002
4003 /// Generic recursive split emission class.
4004 template <typename Derived> class OpSplitter {
4005 protected:
4006 /// The builder used to form new instructions.
4007 IRBuilderTy &IRB;
4008
4009 /// The indices to be used with insertvalue or extractvalue to select the
4010 /// appropriate value within the aggregate.
4011 SmallVector<unsigned, 4> Indices;
4012
4013 /// The indices to a GEP instruction which will move Ptr to the correct slot
4014 /// within the aggregate.
4015 SmallVector<Value *, 4> GEPIndices;
4016
4017 /// The base pointer of the original op, used as a base for GEPing the
4018 /// split operations.
4019 Value *Ptr;
4020
4021 /// The base pointee type being GEPed into.
4022 Type *BaseTy;
4023
4024 /// Known alignment of the base pointer.
4025 Align BaseAlign;
4026
4027 /// To calculate offset of each component so we can correctly deduce
4028 /// alignments.
4029 const DataLayout &DL;
4030
4031 /// Initialize the splitter with an insertion point, Ptr and start with a
4032 /// single zero GEP index.
4033 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4034 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4035 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4036 BaseAlign(BaseAlign), DL(DL) {
4037 IRB.SetInsertPoint(InsertionPoint);
4038 }
4039
4040 public:
4041 /// Generic recursive split emission routine.
4042 ///
4043 /// This method recursively splits an aggregate op (load or store) into
4044 /// scalar or vector ops. It splits recursively until it hits a single value
4045 /// and emits that single value operation via the template argument.
4046 ///
4047 /// The logic of this routine relies on GEPs and insertvalue and
4048 /// extractvalue all operating with the same fundamental index list, merely
4049 /// formatted differently (GEPs need actual values).
4050 ///
4051 /// \param Ty The type being split recursively into smaller ops.
4052 /// \param Agg The aggregate value being built up or stored, depending on
4053 /// whether this is splitting a load or a store respectively.
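 /// For example, splitting an op on {i32, [2 x float]} emits leaves for the
 /// index lists {0}, {1, 0} and {1, 1}, using GEP index lists [0, 0],
 /// [0, 1, 0] and [0, 1, 1] respectively.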
4054 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4055 if (Ty->isSingleValueType()) {
4056 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4057 return static_cast<Derived *>(this)->emitFunc(
4058 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4059 }
4060
4061 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4062 unsigned OldSize = Indices.size();
4063 (void)OldSize;
4064 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4065 ++Idx) {
4066 assert(Indices.size() == OldSize && "Did not return to the old size");
4067 Indices.push_back(Idx);
4068 GEPIndices.push_back(IRB.getInt32(Idx));
4069 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4070 GEPIndices.pop_back();
4071 Indices.pop_back();
4072 }
4073 return;
4074 }
4075
4076 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4077 unsigned OldSize = Indices.size();
4078 (void)OldSize;
4079 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4080 ++Idx) {
4081 assert(Indices.size() == OldSize && "Did not return to the old size");
4082 Indices.push_back(Idx);
4083 GEPIndices.push_back(IRB.getInt32(Idx));
4084 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4085 GEPIndices.pop_back();
4086 Indices.pop_back();
4087 }
4088 return;
4089 }
4090
4091 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4092 }
4093 };
4094
4095 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4096 AAMDNodes AATags;
4097 // A vector to hold the split components that we want to emit
4098 // separate fake uses for.
4099 SmallVector<Value *, 4> Components;
4100 // A vector to hold all the fake uses of the struct that we are splitting.
4101 // Usually there should only be one, but we are handling the general case.
4102 SmallVector<Instruction *, 1> FakeUses;
4103
4104 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4105 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4106 IRBuilderTy &IRB)
4107 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4108 IRB),
4109 AATags(AATags) {}
4110
4111 /// Emit a leaf load of a single value. This is called at the leaves of the
4112 /// recursive emission to actually load values.
4113 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4114 assert(Ty->isSingleValueType());
4115 // Load the single value and insert it using the indices.
4116 Value *GEP =
4117 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4118 LoadInst *Load =
4119 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4120
4121 APInt Offset(
4122 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4123 if (AATags &&
4124 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4125 Load->setAAMetadata(
4126 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4127 // Record the load so we can generate a fake use for this aggregate
4128 // component.
4129 Components.push_back(Load);
4130
4131 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4132 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4133 }
4134
4135 // Stash the fake uses that use the value generated by this instruction.
4136 void recordFakeUses(LoadInst &LI) {
4137 for (Use &U : LI.uses())
4138 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4139 if (II->getIntrinsicID() == Intrinsic::fake_use)
4140 FakeUses.push_back(II);
4141 }
4142
4143 // Replace all fake uses of the aggregate with a series of fake uses, one
4144 // for each split component.
4145 void emitFakeUses() {
4146 for (Instruction *I : FakeUses) {
4147 IRB.SetInsertPoint(I);
4148 for (auto *V : Components)
4149 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4150 I->eraseFromParent();
4151 }
4152 }
4153 };
4154
4155 bool visitLoadInst(LoadInst &LI) {
4156 assert(LI.getPointerOperand() == *U);
4157 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4158 return false;
4159
4160 // We have an aggregate being loaded, split it apart.
4161 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4162 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4163 getAdjustedAlignment(&LI, 0), DL, IRB);
4164 Splitter.recordFakeUses(LI);
4165 Value *V = PoisonValue::get(LI.getType());
4166 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4167 Splitter.emitFakeUses();
4168 Visited.erase(&LI);
4169 LI.replaceAllUsesWith(V);
4170 LI.eraseFromParent();
4171 return true;
4172 }
4173
4174 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4175 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4176 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4177 const DataLayout &DL, IRBuilderTy &IRB)
4178 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4179 DL, IRB),
4180 AATags(AATags), AggStore(AggStore) {}
4181 AAMDNodes AATags;
4182 StoreInst *AggStore;
4183 /// Emit a leaf store of a single value. This is called at the leaves of the
4184 /// recursive emission to actually produce stores.
4185 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4186 assert(Ty->isSingleValueType());
4187 // Extract the single value and store it using the indices.
4188 //
4189 // The gep and extractvalue values are factored out of the CreateStore
4190 // call to make the output independent of the argument evaluation order.
4191 Value *ExtractValue =
4192 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4193 Value *InBoundsGEP =
4194 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4195 StoreInst *Store =
4196 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4197
4198 APInt Offset(
4199 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4200 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4201 if (AATags) {
4202 Store->setAAMetadata(AATags.adjustForAccess(
4203 Offset.getZExtValue(), ExtractValue->getType(), DL));
4204 }
4205
4206 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4207 // If we cannot (because there's an intervening non-const or unbounded
4208 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4209 // this instruction.
4210 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4211 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4212 uint64_t SizeInBits =
4213 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4214 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4215 SizeInBits, AggStore, Store,
4216 Store->getPointerOperand(), Store->getValueOperand(),
4217 DL);
4218 } else {
4219 assert(at::getDVRAssignmentMarkers(AggStore).empty() &&
4220 "AT: unexpected debug.assign linked to store through "
4221 "unbounded GEP");
4222 }
4223 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4224 }
4225 };
4226
4227 bool visitStoreInst(StoreInst &SI) {
4228 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4229 return false;
4230 Value *V = SI.getValueOperand();
4231 if (V->getType()->isSingleValueType())
4232 return false;
4233
4234 // We have an aggregate being stored, split it apart.
4235 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4236 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4237 getAdjustedAlignment(&SI, 0), DL, IRB);
4238 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4239 Visited.erase(&SI);
4240 // The stores replacing SI each have markers describing fragments of the
4241 // assignment so delete the assignment markers linked to SI.
4242 at::deleteAssignmentMarkers(&SI);
4243 SI.eraseFromParent();
4244 return true;
4245 }
4246
4247 bool visitBitCastInst(BitCastInst &BC) {
4248 enqueueUsers(BC);
4249 return false;
4250 }
4251
4252 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4253 enqueueUsers(ASC);
4254 return false;
4255 }
4256
4257 // Unfold gep (select cond, ptr1, ptr2), idx
4258 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4259 // and gep ptr, (select cond, idx1, idx2)
4260 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4261 // We also allow for i1 zext indices, which are equivalent to selects.
4262 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4263 // Check whether the GEP has exactly one select operand and all indices
4264 // will become constant after the transform.
4265 Instruction *Sel = nullptr;
4266 for (Value *Op : GEPI.indices()) {
4267 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4268 if (Sel)
4269 return false;
4270
4271 Sel = SI;
4272 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4273 !isa<ConstantInt>(SI->getFalseValue()))
4274 return false;
4275 continue;
4276 }
4277 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4278 if (Sel)
4279 return false;
4280 Sel = ZI;
4281 if (!ZI->getSrcTy()->isIntegerTy(1))
4282 return false;
4283 continue;
4284 }
4285
4286 if (!isa<ConstantInt>(Op))
4287 return false;
4288 }
4289
4290 if (!Sel)
4291 return false;
4292
4293 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4294 dbgs() << " original: " << *Sel << "\n";
4295 dbgs() << " " << GEPI << "\n";);
4296
4297 auto GetNewOps = [&](Value *SelOp) {
4298 SmallVector<Value *> NewOps;
4299 for (Value *Op : GEPI.operands())
4300 if (Op == Sel)
4301 NewOps.push_back(SelOp);
4302 else
4303 NewOps.push_back(Op);
4304 return NewOps;
4305 };
4306
4307 Value *Cond, *True, *False;
4308 Instruction *MDFrom = nullptr;
4309 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4310 Cond = SI->getCondition();
4311 True = SI->getTrueValue();
4312 False = SI->getFalseValue();
4314 MDFrom = SI;
4315 } else {
4316 Cond = Sel->getOperand(0);
4317 True = ConstantInt::get(Sel->getType(), 1);
4318 False = ConstantInt::get(Sel->getType(), 0);
4319 }
4320 SmallVector<Value *> TrueOps = GetNewOps(True);
4321 SmallVector<Value *> FalseOps = GetNewOps(False);
4322
4323 IRB.SetInsertPoint(&GEPI);
4324 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4325
4326 Type *Ty = GEPI.getSourceElementType();
4327 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4328 True->getName() + ".sroa.gep", NW);
4329
4330 Value *NFalse =
4331 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4332 False->getName() + ".sroa.gep", NW);
4333
4334 Value *NSel = MDFrom
4335 ? IRB.CreateSelect(Cond, NTrue, NFalse,
4336 Sel->getName() + ".sroa.sel", MDFrom)
4337 : IRB.CreateSelectWithUnknownProfile(
4338 Cond, NTrue, NFalse, DEBUG_TYPE,
4339 Sel->getName() + ".sroa.sel");
4340 Visited.erase(&GEPI);
4341 GEPI.replaceAllUsesWith(NSel);
4342 GEPI.eraseFromParent();
4343 Instruction *NSelI = cast<Instruction>(NSel);
4344 Visited.insert(NSelI);
4345 enqueueUsers(*NSelI);
4346
4347 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4348 dbgs() << " " << *NFalse << "\n";
4349 dbgs() << " " << *NSel << "\n";);
4350
4351 return true;
4352 }
4353
4354 // Unfold gep (phi ptr1, ptr2), idx
4355 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4356 // and gep ptr, (phi idx1, idx2)
4357 // => phi ((gep ptr, idx1), (gep ptr, idx2))
4358 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4359 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4360 // operand (looking through the phi if it is the phi we want to unfold) is
4361 // an instruction besides a static alloca.
4362 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4363 auto IsInvalidPointerOperand = [](Value *V) {
4364 if (!isa<Instruction>(V))
4365 return false;
4366 if (auto *AI = dyn_cast<AllocaInst>(V))
4367 return !AI->isStaticAlloca();
4368 return true;
4369 };
4370 if (Phi) {
4371 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4372 return false;
4373 } else {
4374 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4375 return false;
4376 }
4377 // Check whether the GEP has exactly one phi operand (including the pointer
4378 // operand) and all indices will become constant after the transform.
4379 for (Value *Op : GEPI.indices()) {
4380 if (auto *SI = dyn_cast<PHINode>(Op)) {
4381 if (Phi)
4382 return false;
4383
4384 Phi = SI;
4385 if (!all_of(Phi->incoming_values(),
4386 [](Value *V) { return isa<ConstantInt>(V); }))
4387 return false;
4388 continue;
4389 }
4390
4391 if (!isa<ConstantInt>(Op))
4392 return false;
4393 }
4394
4395 if (!Phi)
4396 return false;
4397
4398 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4399 dbgs() << " original: " << *Phi << "\n";
4400 dbgs() << " " << GEPI << "\n";);
4401
4402 auto GetNewOps = [&](Value *PhiOp) {
4403 SmallVector<Value *> NewOps;
4404 for (Value *Op : GEPI.operands())
4405 if (Op == Phi)
4406 NewOps.push_back(PhiOp);
4407 else
4408 NewOps.push_back(Op);
4409 return NewOps;
4410 };
4411
4412 IRB.SetInsertPoint(Phi);
4413 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4414 Phi->getName() + ".sroa.phi");
4415
4416 Type *SourceTy = GEPI.getSourceElementType();
4417 // We only handle arguments, constants, and static allocas here, so we can
4418 // insert GEPs at the end of the entry block.
4419 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4420 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4421 Value *Op = Phi->getIncomingValue(I);
4422 BasicBlock *BB = Phi->getIncomingBlock(I);
4423 Value *NewGEP;
4424 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4425 NewGEP = NewPhi->getIncomingValue(NI);
4426 } else {
4427 SmallVector<Value *> NewOps = GetNewOps(Op);
4428 NewGEP =
4429 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4430 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4431 }
4432 NewPhi->addIncoming(NewGEP, BB);
4433 }
4434
4435 Visited.erase(&GEPI);
4436 GEPI.replaceAllUsesWith(NewPhi);
4437 GEPI.eraseFromParent();
4438 Visited.insert(NewPhi);
4439 enqueueUsers(*NewPhi);
4440
4441 LLVM_DEBUG(dbgs() << " to: ";
4442 for (Value *In
4443 : NewPhi->incoming_values()) dbgs()
4444 << "\n " << *In;
4445 dbgs() << "\n " << *NewPhi << '\n');
4446
4447 return true;
4448 }
4449
4450 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4451 if (unfoldGEPSelect(GEPI))
4452 return true;
4453
4454 if (unfoldGEPPhi(GEPI))
4455 return true;
4456
4457 enqueueUsers(GEPI);
4458 return false;
4459 }
4460
4461 bool visitPHINode(PHINode &PN) {
4462 enqueueUsers(PN);
4463 return false;
4464 }
4465
4466 bool visitSelectInst(SelectInst &SI) {
4467 enqueueUsers(SI);
4468 return false;
4469 }
4470};
4471
4472} // end anonymous namespace
4473
4474/// Strip aggregate type wrapping.
4475///
4476/// This removes no-op aggregate types wrapping an underlying type. It will
4477/// strip as many layers of types as it can without changing either the type
4478/// size or the allocated size.
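///
/// For example (a sketch, assuming a typical data layout): {{float}} and
/// [1 x float] both strip down to float because the wrapper adds no bits,
/// while {float, i8} is returned unchanged since the inner float does not
/// cover the whole allocated size.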
4479 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4480   if (Ty->isSingleValueType())
4481 return Ty;
4482
4483 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4484 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4485
4486 Type *InnerTy;
4487 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4488 InnerTy = ArrTy->getElementType();
4489 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4490 const StructLayout *SL = DL.getStructLayout(STy);
4491 unsigned Index = SL->getElementContainingOffset(0);
4492 InnerTy = STy->getElementType(Index);
4493 } else {
4494 return Ty;
4495 }
4496
4497 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4498 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4499 return Ty;
4500
4501 return stripAggregateTypeWrapping(DL, InnerTy);
4502}
4503
4504/// Try to find a partition of the aggregate type passed in for a given
4505/// offset and size.
4506///
4507/// This recurses through the aggregate type and tries to compute a subtype
4508/// based on the offset and size. When the offset and size span a sub-section
4509/// of an array, it will even compute a new array type for that sub-section,
4510/// and the same for structs.
4511///
4512/// Note that this routine is very strict and tries to find a partition of the
4513/// type which produces the *exact* right offset and size. It is not forgiving
4514 /// when the size or offset causes either end of the type-based partition to be off.
4515/// Also, this is a best-effort routine. It is reasonable to give up and not
4516/// return a type if necessary.
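///
/// As a sketch (assuming the usual layout where i64 is 8-byte aligned), for
/// the type { i32, i32, i64 } this returns i32 for (Offset=4, Size=4), the
/// sub-struct { i32, i32 } for (Offset=0, Size=8), and nullptr for
/// (Offset=2, Size=4) because that range straddles two elements.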
4517 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4518                               uint64_t Size) {
4519 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4520 return stripAggregateTypeWrapping(DL, Ty);
4521 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4522 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4523 return nullptr;
4524
4525 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4526 Type *ElementTy;
4527 uint64_t TyNumElements;
4528 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4529 ElementTy = AT->getElementType();
4530 TyNumElements = AT->getNumElements();
4531 } else {
4532 // FIXME: This isn't right for vectors with non-byte-sized or
4533 // non-power-of-two sized elements.
4534 auto *VT = cast<FixedVectorType>(Ty);
4535 ElementTy = VT->getElementType();
4536 TyNumElements = VT->getNumElements();
4537 }
4538 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4539 uint64_t NumSkippedElements = Offset / ElementSize;
4540 if (NumSkippedElements >= TyNumElements)
4541 return nullptr;
4542 Offset -= NumSkippedElements * ElementSize;
4543
4544 // First check if we need to recurse.
4545 if (Offset > 0 || Size < ElementSize) {
4546 // Bail if the partition ends in a different array element.
4547 if ((Offset + Size) > ElementSize)
4548 return nullptr;
4549 // Recurse through the element type trying to peel off offset bytes.
4550 return getTypePartition(DL, ElementTy, Offset, Size);
4551 }
4552 assert(Offset == 0);
4553
4554 if (Size == ElementSize)
4555 return stripAggregateTypeWrapping(DL, ElementTy);
4556 assert(Size > ElementSize);
4557 uint64_t NumElements = Size / ElementSize;
4558 if (NumElements * ElementSize != Size)
4559 return nullptr;
4560 return ArrayType::get(ElementTy, NumElements);
4561 }
4562
4564 if (!STy)
4565 return nullptr;
4566
4567 const StructLayout *SL = DL.getStructLayout(STy);
4568
4569 if (SL->getSizeInBits().isScalable())
4570 return nullptr;
4571
4572 if (Offset >= SL->getSizeInBytes())
4573 return nullptr;
4574 uint64_t EndOffset = Offset + Size;
4575 if (EndOffset > SL->getSizeInBytes())
4576 return nullptr;
4577
4578 unsigned Index = SL->getElementContainingOffset(Offset);
4579 Offset -= SL->getElementOffset(Index);
4580
4581 Type *ElementTy = STy->getElementType(Index);
4582 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4583 if (Offset >= ElementSize)
4584 return nullptr; // The offset points into alignment padding.
4585
4586 // See if any partition must be contained by the element.
4587 if (Offset > 0 || Size < ElementSize) {
4588 if ((Offset + Size) > ElementSize)
4589 return nullptr;
4590 return getTypePartition(DL, ElementTy, Offset, Size);
4591 }
4592 assert(Offset == 0);
4593
4594 if (Size == ElementSize)
4595 return stripAggregateTypeWrapping(DL, ElementTy);
4596
4597 StructType::element_iterator EI = STy->element_begin() + Index,
4598 EE = STy->element_end();
4599 if (EndOffset < SL->getSizeInBytes()) {
4600 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4601 if (Index == EndIndex)
4602 return nullptr; // Within a single element and its padding.
4603
4604 // Don't try to form "natural" types if the elements don't line up with the
4605 // expected size.
4606 // FIXME: We could potentially recurse down through the last element in the
4607 // sub-struct to find a natural end point.
4608 if (SL->getElementOffset(EndIndex) != EndOffset)
4609 return nullptr;
4610
4611 assert(Index < EndIndex);
4612 EE = STy->element_begin() + EndIndex;
4613 }
4614
4615 // Try to build up a sub-structure.
4616 StructType *SubTy =
4617 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4618 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4619 if (Size != SubSL->getSizeInBytes())
4620 return nullptr; // The sub-struct doesn't have quite the size needed.
4621
4622 return SubTy;
4623}
4624
4625/// Pre-split loads and stores to simplify rewriting.
4626///
4627/// We want to break up the splittable load+store pairs as much as
4628/// possible. This is important to do as a preprocessing step, as once we
4629/// start rewriting the accesses to partitions of the alloca we lose the
4630/// necessary information to correctly split apart paired loads and stores
4631/// which both point into this alloca. The case to consider is something like
4632/// the following:
4633///
4634/// %a = alloca [12 x i8]
4635/// %gep1 = getelementptr i8, ptr %a, i32 0
4636/// %gep2 = getelementptr i8, ptr %a, i32 4
4637/// %gep3 = getelementptr i8, ptr %a, i32 8
4638/// store float 0.0, ptr %gep1
4639/// store float 1.0, ptr %gep2
4640/// %v = load i64, ptr %gep1
4641/// store i64 %v, ptr %gep2
4642/// %f1 = load float, ptr %gep2
4643/// %f2 = load float, ptr %gep3
4644///
4645/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4646/// promote everything so we recover the 2 SSA values that should have been
4647/// there all along.
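///
/// After pre-splitting, the paired i64 load and store are rewritten into
/// i32-sized pieces that line up with the partition boundaries, roughly
/// (names are illustrative):
///
///   %v.0 = load i32, ptr %gep1
///   %v.4 = load i32, ptr %gep2
///   store i32 %v.0, ptr %gep2
///   store i32 %v.4, ptr %gep3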
4648///
4649/// \returns true if any changes are made.
4650bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4651 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4652
4653 // Track the loads and stores which are candidates for pre-splitting here, in
4654 // the order they first appear during the partition scan. These give stable
4655 // iteration order and a basis for tracking which loads and stores we
4656 // actually split.
4657   SmallVector<LoadInst *, 4> Loads;
4658   SmallVector<StoreInst *, 4> Stores;
4659
4660 // We need to accumulate the splits required of each load or store where we
4661 // can find them via a direct lookup. This is important to cross-check loads
4662 // and stores against each other. We also track the slice so that we can kill
4663 // all the slices that end up split.
4664 struct SplitOffsets {
4665 Slice *S;
4666 std::vector<uint64_t> Splits;
4667 };
4668 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
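  // For example, a splittable 12-byte slice that crosses partition boundaries
  // at offsets 4 and 8 (relative to the slice's begin offset) ends up with
  // Splits == {4, 8}; the final boundary at 12 is implied by the slice size.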
4669
4670 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4671 // This is important as we also cannot pre-split stores of those loads!
4672 // FIXME: This is all pretty gross. It means that we can be more aggressive
4673 // in pre-splitting when the load feeding the store happens to come from
4674 // a separate alloca. Put another way, the effectiveness of SROA would be
4675 // decreased by a frontend which just concatenated all of its local allocas
4676 // into one big flat alloca. But defeating such patterns is exactly the job
4677 // SROA is tasked with! Sadly, to not have this discrepancy we would have
4678   // SROA is tasked with! Sadly, to not have this discrepancy we would have to
4679   // change store pre-splitting to actually force pre-splitting of the load
4680 // maybe it would make it more principled?
4681 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4682
4683 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4684 for (auto &P : AS.partitions()) {
4685 for (Slice &S : P) {
4686 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4687 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4688 // If this is a load we have to track that it can't participate in any
4689 // pre-splitting. If this is a store of a load we have to track that
4690 // that load also can't participate in any pre-splitting.
4691 if (auto *LI = dyn_cast<LoadInst>(I))
4692 UnsplittableLoads.insert(LI);
4693 else if (auto *SI = dyn_cast<StoreInst>(I))
4694 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4695 UnsplittableLoads.insert(LI);
4696 continue;
4697 }
4698 assert(P.endOffset() > S.beginOffset() &&
4699 "Empty or backwards partition!");
4700
4701 // Determine if this is a pre-splittable slice.
4702 if (auto *LI = dyn_cast<LoadInst>(I)) {
4703 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4704
4705 // The load must be used exclusively to store into other pointers for
4706 // us to be able to arbitrarily pre-split it. The stores must also be
4707 // simple to avoid changing semantics.
4708 auto IsLoadSimplyStored = [](LoadInst *LI) {
4709 for (User *LU : LI->users()) {
4710 auto *SI = dyn_cast<StoreInst>(LU);
4711 if (!SI || !SI->isSimple())
4712 return false;
4713 }
4714 return true;
4715 };
4716 if (!IsLoadSimplyStored(LI)) {
4717 UnsplittableLoads.insert(LI);
4718 continue;
4719 }
4720
4721 Loads.push_back(LI);
4722 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4723 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4724 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4725 continue;
4726 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4727 if (!StoredLoad || !StoredLoad->isSimple())
4728 continue;
4729 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4730
4731 Stores.push_back(SI);
4732 } else {
4733 // Other uses cannot be pre-split.
4734 continue;
4735 }
4736
4737 // Record the initial split.
4738 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4739 auto &Offsets = SplitOffsetsMap[I];
4740 assert(Offsets.Splits.empty() &&
4741 "Should not have splits the first time we see an instruction!");
4742 Offsets.S = &S;
4743 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4744 }
4745
4746 // Now scan the already split slices, and add a split for any of them which
4747 // we're going to pre-split.
4748 for (Slice *S : P.splitSliceTails()) {
4749 auto SplitOffsetsMapI =
4750 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4751 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4752 continue;
4753 auto &Offsets = SplitOffsetsMapI->second;
4754
4755 assert(Offsets.S == S && "Found a mismatched slice!");
4756 assert(!Offsets.Splits.empty() &&
4757 "Cannot have an empty set of splits on the second partition!");
4758 assert(Offsets.Splits.back() ==
4759 P.beginOffset() - Offsets.S->beginOffset() &&
4760 "Previous split does not end where this one begins!");
4761
4762 // Record each split. The last partition's end isn't needed as the size
4763 // of the slice dictates that.
4764 if (S->endOffset() > P.endOffset())
4765 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4766 }
4767 }
4768
4769 // We may have split loads where some of their stores are split stores. For
4770 // such loads and stores, we can only pre-split them if their splits exactly
4771 // match relative to their starting offset. We have to verify this prior to
4772 // any rewriting.
4773 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4774 // Lookup the load we are storing in our map of split
4775 // offsets.
4776 auto *LI = cast<LoadInst>(SI->getValueOperand());
4777 // If it was completely unsplittable, then we're done,
4778 // and this store can't be pre-split.
4779 if (UnsplittableLoads.count(LI))
4780 return true;
4781
4782 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4783 if (LoadOffsetsI == SplitOffsetsMap.end())
4784 return false; // Unrelated loads are definitely safe.
4785 auto &LoadOffsets = LoadOffsetsI->second;
4786
4787 // Now lookup the store's offsets.
4788 auto &StoreOffsets = SplitOffsetsMap[SI];
4789
4790 // If the relative offsets of each split in the load and
4791 // store match exactly, then we can split them and we
4792 // don't need to remove them here.
4793 if (LoadOffsets.Splits == StoreOffsets.Splits)
4794 return false;
4795
4796 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4797 << " " << *LI << "\n"
4798 << " " << *SI << "\n");
4799
4800 // We've found a store and load that we need to split
4801 // with mismatched relative splits. Just give up on them
4802 // and remove both instructions from our list of
4803 // candidates.
4804 UnsplittableLoads.insert(LI);
4805 return true;
4806 });
4807 // Now we have to go *back* through all the stores, because a later store may
4808 // have caused an earlier store's load to become unsplittable and if it is
4809 // unsplittable for the later store, then we can't rely on it being split in
4810 // the earlier store either.
4811 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4812 auto *LI = cast<LoadInst>(SI->getValueOperand());
4813 return UnsplittableLoads.count(LI);
4814 });
4815 // Once we've established all the loads that can't be split for some reason,
4816 // filter any that made it into our list out.
4817 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4818 return UnsplittableLoads.count(LI);
4819 });
4820
4821 // If no loads or stores are left, there is no pre-splitting to be done for
4822 // this alloca.
4823 if (Loads.empty() && Stores.empty())
4824 return false;
4825
4826 // From here on, we can't fail and will be building new accesses, so rig up
4827 // an IR builder.
4828 IRBuilderTy IRB(&AI);
4829
4830 // Collect the new slices which we will merge into the alloca slices.
4831 SmallVector<Slice, 4> NewSlices;
4832
4833 // Track any allocas we end up splitting loads and stores for so we iterate
4834 // on them.
4835 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4836
4837 // At this point, we have collected all of the loads and stores we can
4838 // pre-split, and the specific splits needed for them. We actually do the
4839   // splitting in a specific order in order to handle the case when one of the
4840   // loads is the value operand of one of the stores.
4841 //
4842 // First, we rewrite all of the split loads, and just accumulate each split
4843 // load in a parallel structure. We also build the slices for them and append
4844 // them to the alloca slices.
4845 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4846 std::vector<LoadInst *> SplitLoads;
4847 const DataLayout &DL = AI.getDataLayout();
4848 for (LoadInst *LI : Loads) {
4849 SplitLoads.clear();
4850
4851 auto &Offsets = SplitOffsetsMap[LI];
4852 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4853 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4854 "Load must have type size equal to store size");
4855 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4856 "Load must be >= slice size");
4857
4858 uint64_t BaseOffset = Offsets.S->beginOffset();
4859 assert(BaseOffset + SliceSize > BaseOffset &&
4860 "Cannot represent alloca access size using 64-bit integers!");
4861
4862     Value *BasePtr = LI->getPointerOperand();
4863     IRB.SetInsertPoint(LI);
4864
4865 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4866
4867 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4868 int Idx = 0, Size = Offsets.Splits.size();
4869 for (;;) {
4870 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4871 auto AS = LI->getPointerAddressSpace();
4872 auto *PartPtrTy = LI->getPointerOperandType();
4873 LoadInst *PLoad = IRB.CreateAlignedLoad(
4874 PartTy,
4875 getAdjustedPtr(IRB, DL, BasePtr,
4876 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4877 PartPtrTy, BasePtr->getName() + "."),
4878 getAdjustedAlignment(LI, PartOffset),
4879 /*IsVolatile*/ false, LI->getName());
4880 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4881 LLVMContext::MD_access_group});
4882
4883 // Append this load onto the list of split loads so we can find it later
4884 // to rewrite the stores.
4885 SplitLoads.push_back(PLoad);
4886
4887 // Now build a new slice for the alloca.
4888 NewSlices.push_back(
4889 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4890 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4891 /*IsSplittable*/ false, nullptr));
4892 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4893 << ", " << NewSlices.back().endOffset()
4894 << "): " << *PLoad << "\n");
4895
4896 // See if we've handled all the splits.
4897 if (Idx >= Size)
4898 break;
4899
4900 // Setup the next partition.
4901 PartOffset = Offsets.Splits[Idx];
4902 ++Idx;
4903 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4904 }
4905
4906 // Now that we have the split loads, do the slow walk over all uses of the
4907 // load and rewrite them as split stores, or save the split loads to use
4908 // below if the store is going to be split there anyways.
4909 bool DeferredStores = false;
4910 for (User *LU : LI->users()) {
4911 StoreInst *SI = cast<StoreInst>(LU);
4912 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4913 DeferredStores = true;
4914 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4915 << "\n");
4916 continue;
4917 }
4918
4919 Value *StoreBasePtr = SI->getPointerOperand();
4920 IRB.SetInsertPoint(SI);
4921 AAMDNodes AATags = SI->getAAMetadata();
4922
4923 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4924
4925 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4926 LoadInst *PLoad = SplitLoads[Idx];
4927 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4928 auto *PartPtrTy = SI->getPointerOperandType();
4929
4930 auto AS = SI->getPointerAddressSpace();
4931 StoreInst *PStore = IRB.CreateAlignedStore(
4932 PLoad,
4933 getAdjustedPtr(IRB, DL, StoreBasePtr,
4934 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4935 PartPtrTy, StoreBasePtr->getName() + "."),
4936 getAdjustedAlignment(SI, PartOffset),
4937 /*IsVolatile*/ false);
4938 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4939 LLVMContext::MD_access_group,
4940 LLVMContext::MD_DIAssignID});
4941
4942 if (AATags)
4943 PStore->setAAMetadata(
4944 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
4945 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4946 }
4947
4948 // We want to immediately iterate on any allocas impacted by splitting
4949 // this store, and we have to track any promotable alloca (indicated by
4950 // a direct store) as needing to be resplit because it is no longer
4951 // promotable.
4952 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4953 ResplitPromotableAllocas.insert(OtherAI);
4954 Worklist.insert(OtherAI);
4955 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4956 StoreBasePtr->stripInBoundsOffsets())) {
4957 Worklist.insert(OtherAI);
4958 }
4959
4960 // Mark the original store as dead.
4961 DeadInsts.push_back(SI);
4962 }
4963
4964 // Save the split loads if there are deferred stores among the users.
4965 if (DeferredStores)
4966 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4967
4968 // Mark the original load as dead and kill the original slice.
4969 DeadInsts.push_back(LI);
4970 Offsets.S->kill();
4971 }
4972
4973 // Second, we rewrite all of the split stores. At this point, we know that
4974 // all loads from this alloca have been split already. For stores of such
4975 // loads, we can simply look up the pre-existing split loads. For stores of
4976 // other loads, we split those loads first and then write split stores of
4977 // them.
4978 for (StoreInst *SI : Stores) {
4979 auto *LI = cast<LoadInst>(SI->getValueOperand());
4980 IntegerType *Ty = cast<IntegerType>(LI->getType());
4981 assert(Ty->getBitWidth() % 8 == 0);
4982 uint64_t StoreSize = Ty->getBitWidth() / 8;
4983 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4984
4985 auto &Offsets = SplitOffsetsMap[SI];
4986 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4987 "Slice size should always match load size exactly!");
4988 uint64_t BaseOffset = Offsets.S->beginOffset();
4989 assert(BaseOffset + StoreSize > BaseOffset &&
4990 "Cannot represent alloca access size using 64-bit integers!");
4991
4992 Value *LoadBasePtr = LI->getPointerOperand();
4993 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4994
4995 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4996
4997 // Check whether we have an already split load.
4998 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4999 std::vector<LoadInst *> *SplitLoads = nullptr;
5000 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5001 SplitLoads = &SplitLoadsMapI->second;
5002 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5003 "Too few split loads for the number of splits in the store!");
5004 } else {
5005 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5006 }
5007
5008 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5009 int Idx = 0, Size = Offsets.Splits.size();
5010 for (;;) {
5011 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
5012 auto *LoadPartPtrTy = LI->getPointerOperandType();
5013 auto *StorePartPtrTy = SI->getPointerOperandType();
5014
5015 // Either lookup a split load or create one.
5016 LoadInst *PLoad;
5017 if (SplitLoads) {
5018 PLoad = (*SplitLoads)[Idx];
5019 } else {
5020 IRB.SetInsertPoint(LI);
5021 auto AS = LI->getPointerAddressSpace();
5022 PLoad = IRB.CreateAlignedLoad(
5023 PartTy,
5024 getAdjustedPtr(IRB, DL, LoadBasePtr,
5025 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5026 LoadPartPtrTy, LoadBasePtr->getName() + "."),
5027 getAdjustedAlignment(LI, PartOffset),
5028 /*IsVolatile*/ false, LI->getName());
5029 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5030 LLVMContext::MD_access_group});
5031 }
5032
5033 // And store this partition.
5034 IRB.SetInsertPoint(SI);
5035 auto AS = SI->getPointerAddressSpace();
5036 StoreInst *PStore = IRB.CreateAlignedStore(
5037 PLoad,
5038 getAdjustedPtr(IRB, DL, StoreBasePtr,
5039 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5040 StorePartPtrTy, StoreBasePtr->getName() + "."),
5041 getAdjustedAlignment(SI, PartOffset),
5042 /*IsVolatile*/ false);
5043 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5044 LLVMContext::MD_access_group});
5045
5046 // Now build a new slice for the alloca.
5047 // ProtectedFieldDisc==nullptr is a lie, but it doesn't matter because we
5048 // already determined that all accesses are consistent.
5049 NewSlices.push_back(
5050 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5051 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5052 /*IsSplittable*/ false, nullptr));
5053 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5054 << ", " << NewSlices.back().endOffset()
5055 << "): " << *PStore << "\n");
5056 if (!SplitLoads) {
5057 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5058 }
5059
5060 // See if we've finished all the splits.
5061 if (Idx >= Size)
5062 break;
5063
5064 // Setup the next partition.
5065 PartOffset = Offsets.Splits[Idx];
5066 ++Idx;
5067 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5068 }
5069
5070 // We want to immediately iterate on any allocas impacted by splitting
5071 // this load, which is only relevant if it isn't a load of this alloca and
5072 // thus we didn't already split the loads above. We also have to keep track
5073 // of any promotable allocas we split loads on as they can no longer be
5074 // promoted.
5075 if (!SplitLoads) {
5076 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5077 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5078 ResplitPromotableAllocas.insert(OtherAI);
5079 Worklist.insert(OtherAI);
5080 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5081 LoadBasePtr->stripInBoundsOffsets())) {
5082 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5083 Worklist.insert(OtherAI);
5084 }
5085 }
5086
5087 // Mark the original store as dead now that we've split it up and kill its
5088 // slice. Note that we leave the original load in place unless this store
5089 // was its only use. It may in turn be split up if it is an alloca load
5090 // for some other alloca, but it may be a normal load. This may introduce
5091 // redundant loads, but where those can be merged the rest of the optimizer
5092 // should handle the merging, and this uncovers SSA splits which is more
5093 // important. In practice, the original loads will almost always be fully
5094 // split and removed eventually, and the splits will be merged by any
5095 // trivial CSE, including instcombine.
5096 if (LI->hasOneUse()) {
5097 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5098 DeadInsts.push_back(LI);
5099 }
5100 DeadInsts.push_back(SI);
5101 Offsets.S->kill();
5102 }
5103
5104   // Remove the killed slices that have been pre-split.
5105 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5106
5107 // Insert our new slices. This will sort and merge them into the sorted
5108 // sequence.
5109 AS.insert(NewSlices);
5110
5111 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5112#ifndef NDEBUG
5113 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5114 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5115#endif
5116
5117   // Finally, don't try to promote any allocas that now require re-splitting.
5118 // They have already been added to the worklist above.
5119 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5120
5121 return true;
5122}
5123
5124/// Select a partition type for an alloca partition.
5125///
5126/// Try to compute a friendly type for this partition of the alloca. This
5127/// won't always succeed, in which case we fall back to a legal integer type
5128/// or an i8 array of an appropriate size.
5129///
5130/// \returns A tuple with the following elements:
5131/// - PartitionType: The computed type for this partition.
5132/// - IsIntegerWideningViable: True if integer widening promotion is used.
5133/// - VectorType: The vector type if vector promotion is used, otherwise
5134/// nullptr.
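///
/// For instance (illustrative and target dependent): a partition whose slices
/// are all <4 x float> accesses would typically come back as
/// {<4 x float>, false, <4 x float>}, while an 8-byte partition of an i32
/// array accessed only through integer loads and stores is usually widened to
/// {i64, true, nullptr} on targets where i64 is a legal integer type.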
5135static std::tuple<Type *, bool, VectorType *>
5136 selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI,
5137                     LLVMContext &C) {
5138 // First check if the partition is viable for vector promotion.
5139 //
5140 // We prefer vector promotion over integer widening promotion when:
5141 // - The vector element type is a floating-point type.
5142 // - All the loads/stores to the alloca are vector loads/stores to the
5143 // entire alloca or load/store a single element of the vector.
5144 //
5145 // Otherwise when there is an integer vector with mixed type loads/stores we
5146 // prefer integer widening promotion because it's more likely the user is
5147 // doing bitwise arithmetic and we generate better code.
5148   VectorType *VecTy =
5149       isVectorPromotionViable(P, DL);
5150 // If the vector element type is a floating-point type, we prefer vector
5151 // promotion. If the vector has one element, let the below code select
5152 // whether we promote with the vector or scalar.
5153 if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
5154 VecTy->getElementCount().getFixedValue() > 1)
5155 return {VecTy, false, VecTy};
5156
5157 // Check if there is a common type that all slices of the partition use that
5158 // spans the partition.
5159 auto [CommonUseTy, LargestIntTy] =
5160 findCommonType(P.begin(), P.end(), P.endOffset());
5161 if (CommonUseTy) {
5162 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy);
5163 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5164 // We prefer vector promotion here because if vector promotion is viable
5165 // and there is a common type used, then it implies the second listed
5166 // condition for preferring vector promotion is true.
5167 if (VecTy)
5168 return {VecTy, false, VecTy};
5169 return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL),
5170 nullptr};
5171 }
5172 }
5173
5174 // Can we find an appropriate subtype in the original allocated
5175 // type?
5176 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5177 P.beginOffset(), P.size())) {
5178 // If the partition is an integer array that can be spanned by a legal
5179 // integer type, prefer to represent it as a legal integer type because
5180 // it's more likely to be promotable.
5181 if (TypePartitionTy->isArrayTy() &&
5182 TypePartitionTy->getArrayElementType()->isIntegerTy() &&
5183 DL.isLegalInteger(P.size() * 8))
5184 TypePartitionTy = Type::getIntNTy(C, P.size() * 8);
5185 // There was no common type used, so we prefer integer widening promotion.
5186 if (isIntegerWideningViable(P, TypePartitionTy, DL))
5187 return {TypePartitionTy, true, nullptr};
5188 if (VecTy)
5189 return {VecTy, false, VecTy};
5190 // If we couldn't promote with TypePartitionTy, try with the largest
5191 // integer type used.
5192 if (LargestIntTy &&
5193 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() &&
5194 isIntegerWideningViable(P, LargestIntTy, DL))
5195 return {LargestIntTy, true, nullptr};
5196
5197 // Fallback to TypePartitionTy and we probably won't promote.
5198 return {TypePartitionTy, false, nullptr};
5199 }
5200
5201 // Select the largest integer type used if it spans the partition.
5202 if (LargestIntTy &&
5203 DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size())
5204 return {LargestIntTy, false, nullptr};
5205
5206 // Select a legal integer type if it spans the partition.
5207 if (DL.isLegalInteger(P.size() * 8))
5208 return {Type::getIntNTy(C, P.size() * 8), false, nullptr};
5209
5210 // Fallback to an i8 array.
5211 return {ArrayType::get(Type::getInt8Ty(C), P.size()), false, nullptr};
5212}
5213
5214/// Rewrite an alloca partition's users.
5215///
5216/// This routine drives both of the rewriting goals of the SROA pass. It tries
5217/// to rewrite uses of an alloca partition to be conducive for SSA value
5218/// promotion. If the partition needs a new, more refined alloca, this will
5219/// build that new alloca, preserving as much type information as possible, and
5220/// rewrite the uses of the old alloca to point at the new one and have the
5221/// appropriate new offsets. It also evaluates how successful the rewrite was
5222/// at enabling promotion and if it was successful queues the alloca to be
5223/// promoted.
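///
/// For example (illustrative): the [12 x i8] alloca from the pre-splitting
/// comment above would typically be rewritten into three 4-byte allocas named
/// %a.sroa.<N>, each of which either lands on the promotable list directly or
/// is re-queued once its phi/select users have been speculated or rewritten.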
5224std::pair<AllocaInst *, uint64_t>
5225SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
5226 const DataLayout &DL = AI.getDataLayout();
5227 // Select the type for the new alloca that spans the partition.
5228 auto [PartitionTy, IsIntegerWideningViable, VecTy] =
5229 selectPartitionType(P, DL, AI, *C);
5230
5231 // Check for the case where we're going to rewrite to a new alloca of the
5232 // exact same type as the original, and with the same access offsets. In that
5233 // case, re-use the existing alloca, but still run through the rewriter to
5234 // perform phi and select speculation.
5235 // P.beginOffset() can be non-zero even with the same type in a case with
5236 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5237 AllocaInst *NewAI;
5238 if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5239 NewAI = &AI;
5240 // FIXME: We should be able to bail at this point with "nothing changed".
5241 // FIXME: We might want to defer PHI speculation until after here.
5242 // FIXME: return nullptr;
5243 } else {
5244 // Make sure the alignment is compatible with P.beginOffset().
5245 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5246 // If we will get at least this much alignment from the type alone, leave
5247 // the alloca's alignment unconstrained.
5248 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(PartitionTy);
5249 NewAI = new AllocaInst(
5250 PartitionTy, AI.getAddressSpace(), nullptr,
5251 IsUnconstrained ? DL.getPrefTypeAlign(PartitionTy) : Alignment,
5252 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5253 AI.getIterator());
5254 // Copy the old AI debug location over to the new one.
5255 NewAI->setDebugLoc(AI.getDebugLoc());
5256 ++NumNewAllocas;
5257 }
5258
5259 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5260 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5261
5262 // Track the high watermark on the worklist as it is only relevant for
5263 // promoted allocas. We will reset it to this point if the alloca is not in
5264 // fact scheduled for promotion.
5265 unsigned PPWOldSize = PostPromotionWorklist.size();
5266 unsigned NumUses = 0;
5267 SmallSetVector<PHINode *, 8> PHIUsers;
5268 SmallSetVector<SelectInst *, 8> SelectUsers;
5269
5270 AllocaSliceRewriter Rewriter(
5271 DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
5272 IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
5273 bool Promotable = true;
5274 // Check whether we can have tree-structured merge.
5275 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5276 NumUses += DeletedValues->size() + 1;
5277 for (Value *V : *DeletedValues)
5278 DeadInsts.push_back(V);
5279 } else {
5280 for (Slice *S : P.splitSliceTails()) {
5281 Promotable &= Rewriter.visit(S);
5282 ++NumUses;
5283 }
5284 for (Slice &S : P) {
5285 Promotable &= Rewriter.visit(&S);
5286 ++NumUses;
5287 }
5288 }
5289
5290 NumAllocaPartitionUses += NumUses;
5291 MaxUsesPerAllocaPartition.updateMax(NumUses);
5292
5293 // Now that we've processed all the slices in the new partition, check if any
5294 // PHIs or Selects would block promotion.
5295 for (PHINode *PHI : PHIUsers)
5296 if (!isSafePHIToSpeculate(*PHI)) {
5297 Promotable = false;
5298 PHIUsers.clear();
5299 SelectUsers.clear();
5300 break;
5301 }
5302
5303   SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5304       NewSelectsToRewrite;
5305 NewSelectsToRewrite.reserve(SelectUsers.size());
5306 for (SelectInst *Sel : SelectUsers) {
5307 std::optional<RewriteableMemOps> Ops =
5308 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5309 if (!Ops) {
5310 Promotable = false;
5311 PHIUsers.clear();
5312 SelectUsers.clear();
5313 NewSelectsToRewrite.clear();
5314 break;
5315 }
5316 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5317 }
5318
5319 if (Promotable) {
5320 for (Use *U : AS.getDeadUsesIfPromotable()) {
5321 auto *OldInst = dyn_cast<Instruction>(U->get());
5322 Value::dropDroppableUse(*U);
5323 if (OldInst)
5324 if (isInstructionTriviallyDead(OldInst))
5325 DeadInsts.push_back(OldInst);
5326 }
5327 if (PHIUsers.empty() && SelectUsers.empty()) {
5328 // Promote the alloca.
5329 PromotableAllocas.insert(NewAI);
5330 } else {
5331 // If we have either PHIs or Selects to speculate, add them to those
5332     // worklists and re-queue the new alloca so that we promote it on the
5333 // next iteration.
5334 SpeculatablePHIs.insert_range(PHIUsers);
5335 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5336 NewSelectsToRewrite.size());
5337 for (auto &&KV : llvm::make_range(
5338 std::make_move_iterator(NewSelectsToRewrite.begin()),
5339 std::make_move_iterator(NewSelectsToRewrite.end())))
5340 SelectsToRewrite.insert(std::move(KV));
5341 Worklist.insert(NewAI);
5342 }
5343 } else {
5344 // Drop any post-promotion work items if promotion didn't happen.
5345 while (PostPromotionWorklist.size() > PPWOldSize)
5346 PostPromotionWorklist.pop_back();
5347
5348 // We couldn't promote and we didn't create a new partition, nothing
5349 // happened.
5350 if (NewAI == &AI)
5351 return {nullptr, 0};
5352
5353 // If we can't promote the alloca, iterate on it to check for new
5354 // refinements exposed by splitting the current alloca. Don't iterate on an
5355 // alloca which didn't actually change and didn't get promoted.
5356 Worklist.insert(NewAI);
5357 }
5358
5359 return {NewAI, DL.getTypeSizeInBits(PartitionTy).getFixedValue()};
5360}
5361
5362// There isn't a shared interface to get the "address" parts out of a
5363// dbg.declare and dbg.assign, so provide some wrappers.
5364 static bool isKillAddress(const DbgVariableRecord *DVR) {
5365   if (DVR->isDbgAssign())
5366     return DVR->isKillAddress();
5367 return DVR->isKillLocation();
5368}
5369
5370 static DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5371   if (DVR->isDbgAssign())
5372     return DVR->getAddressExpression();
5373 return DVR->getExpression();
5374}
5375
5376/// Create or replace an existing fragment in a DIExpression with \p Frag.
5377/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5378/// operation, add \p BitExtractOffset to the offset part.
5379///
5380/// Returns the new expression, or nullptr if this fails (see details below).
5381///
5382/// This function is similar to DIExpression::createFragmentExpression except
5383/// for 3 important distinctions:
5384/// 1. The new fragment isn't relative to an existing fragment.
5385/// 2. It assumes the computed location is a memory location. This means we
5386/// don't need to perform checks that creating the fragment preserves the
5387/// expression semantics.
5388/// 3. Existing extract_bits are modified independently of fragment changes
5389/// using \p BitExtractOffset. A change to the fragment offset or size
5390/// may affect a bit extract. But a bit extract offset can change
5391/// independently of the fragment dimensions.
5392///
5393/// Returns the new expression, or nullptr if one couldn't be created.
5394/// Ideally this is only used to signal that a bit-extract has become
5395/// zero-sized (and thus the new debug record has no size and can be
5396/// dropped), however, it fails for other reasons too - see the FIXME below.
5397///
5398/// FIXME: To keep the change that introduces this function NFC it bails
5399 /// in some situations unnecessarily, e.g. when fragment and bit extract
5400/// sizes differ.
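///
/// As a small sketch: given the expression (DW_OP_plus_uconst, 8) and
/// Frag = {OffsetInBits=32, SizeInBits=32}, the result is
/// (DW_OP_plus_uconst, 8, DW_OP_LLVM_fragment, 32, 32). If the expression
/// instead contains a DW_OP_LLVM_extract_bits_[sz]ext, only that op's offset
/// is adjusted by \p BitExtractOffset and no fragment op is appended.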
5401 static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5402                                              DIExpression::FragmentInfo Frag,
5403                                              int64_t BitExtractOffset) {
5404   SmallVector<uint64_t, 8> Ops;
5405   bool HasFragment = false;
5406 bool HasBitExtract = false;
5407
5408 for (auto &Op : Expr->expr_ops()) {
5409 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5410 HasFragment = true;
5411 continue;
5412 }
5413 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5415 HasBitExtract = true;
5416 int64_t ExtractOffsetInBits = Op.getArg(0);
5417 int64_t ExtractSizeInBits = Op.getArg(1);
5418
5419 // DIExpression::createFragmentExpression doesn't know how to handle
5420 // a fragment that is smaller than the extract. Copy the behaviour
5421 // (bail) to avoid non-NFC changes.
5422 // FIXME: Don't do this.
5423 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5424 return nullptr;
5425
5426 assert(BitExtractOffset <= 0);
5427 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5428
5429 // DIExpression::createFragmentExpression doesn't know what to do
5430 // if the new extract starts "outside" the existing one. Copy the
5431 // behaviour (bail) to avoid non-NFC changes.
5432 // FIXME: Don't do this.
5433 if (AdjustedOffset < 0)
5434 return nullptr;
5435
5436 Ops.push_back(Op.getOp());
5437 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5438 Ops.push_back(ExtractSizeInBits);
5439 continue;
5440 }
5441 Op.appendToVector(Ops);
5442 }
5443
5444 // Unsupported by createFragmentExpression, so don't support it here yet to
5445 // preserve NFC-ness.
5446 if (HasFragment && HasBitExtract)
5447 return nullptr;
5448
5449 if (!HasBitExtract) {
5450     Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5451     Ops.push_back(Frag.OffsetInBits);
5452 Ops.push_back(Frag.SizeInBits);
5453 }
5454 return DIExpression::get(Expr->getContext(), Ops);
5455}
5456
5457/// Insert a new DbgRecord.
5458/// \p Orig Original to copy record type, debug loc and variable from, and
5459/// additionally value and value expression for dbg_assign records.
5460/// \p NewAddr Location's new base address.
5461/// \p NewAddrExpr New expression to apply to address.
5462/// \p BeforeInst Insert position.
5463/// \p NewFragment New fragment (absolute, non-relative).
5464/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5465static void
5466 insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5467                  DIExpression *NewAddrExpr, Instruction *BeforeInst,
5468 std::optional<DIExpression::FragmentInfo> NewFragment,
5469 int64_t BitExtractAdjustment) {
5470 (void)DIB;
5471
5472 // A dbg_assign puts fragment info in the value expression only. The address
5473 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5474 // new fragment info into NewAddrExpr (as it only has one expression).
5475 DIExpression *NewFragmentExpr =
5476 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5477 if (NewFragment)
5478 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5479 BitExtractAdjustment);
5480 if (!NewFragmentExpr)
5481 return;
5482
5483 if (Orig->isDbgDeclare()) {
5484     DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5485         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5486 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5487 BeforeInst->getIterator());
5488 return;
5489 }
5490
5491 if (Orig->isDbgValue()) {
5492     DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5493         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5494 // Drop debug information if the expression doesn't start with a
5495 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5496 // describes the address of alloca rather than the value inside the alloca.
5497 if (!NewFragmentExpr->startsWithDeref())
5498 DVR->setKillAddress();
5499 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5500 BeforeInst->getIterator());
5501 return;
5502 }
5503
5504 // Apply a DIAssignID to the store if it doesn't already have it.
5505 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5506     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5507                          DIAssignID::getDistinct(NewAddr->getContext()));
5508 }
5509
5511 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5512 NewAddrExpr, Orig->getDebugLoc());
5513 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5514 (void)NewAssign;
5515}
5516
5517 /// Walks the slices of an alloca and forms partitions based on them,
5518/// rewriting each of their uses.
5519bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5520 if (AS.begin() == AS.end())
5521 return false;
5522
5523 unsigned NumPartitions = 0;
5524 bool Changed = false;
5525 const DataLayout &DL = AI.getModule()->getDataLayout();
5526
5527 // First try to pre-split loads and stores.
5528 Changed |= presplitLoadsAndStores(AI, AS);
5529
5530 // Now that we have identified any pre-splitting opportunities,
5531 // mark loads and stores unsplittable except for the following case.
5532 // We leave a slice splittable if all other slices are disjoint or fully
5533 // included in the slice, such as whole-alloca loads and stores.
5534 // If we fail to split these during pre-splitting, we want to force them
5535 // to be rewritten into a partition.
5536 bool IsSorted = true;
5537
5538 uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
5539 const uint64_t MaxBitVectorSize = 1024;
5540 if (AllocaSize <= MaxBitVectorSize) {
5541 // If a byte boundary is included in any load or store, a slice starting or
5542 // ending at the boundary is not splittable.
5543 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5544 for (Slice &S : AS)
5545 for (unsigned O = S.beginOffset() + 1;
5546 O < S.endOffset() && O < AllocaSize; O++)
5547 SplittableOffset.reset(O);
5548
5549 for (Slice &S : AS) {
5550 if (!S.isSplittable())
5551 continue;
5552
5553 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5554 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5555 continue;
5556
5557 if (isa<LoadInst>(S.getUse()->getUser()) ||
5558 isa<StoreInst>(S.getUse()->getUser())) {
5559 S.makeUnsplittable();
5560 IsSorted = false;
5561 }
5562 }
5563 } else {
5564 // We only allow whole-alloca splittable loads and stores
5565     // for a large alloca to avoid creating an overly large BitVector.
5566 for (Slice &S : AS) {
5567 if (!S.isSplittable())
5568 continue;
5569
5570 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5571 continue;
5572
5573 if (isa<LoadInst>(S.getUse()->getUser()) ||
5574 isa<StoreInst>(S.getUse()->getUser())) {
5575 S.makeUnsplittable();
5576 IsSorted = false;
5577 }
5578 }
5579 }
5580
5581 if (!IsSorted)
5582     llvm::stable_sort(AS);
5583
5584 /// Describes the allocas introduced by rewritePartition in order to migrate
5585 /// the debug info.
5586 struct Fragment {
5587 AllocaInst *Alloca;
5588 uint64_t Offset;
5589 uint64_t Size;
5590 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5591 : Alloca(AI), Offset(O), Size(S) {}
5592 };
5593 SmallVector<Fragment, 4> Fragments;
5594
5595 // Rewrite each partition.
5596 for (auto &P : AS.partitions()) {
5597 auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
5598 if (NewAI) {
5599 Changed = true;
5600 if (NewAI != &AI) {
5601 uint64_t SizeOfByte = 8;
5602 // Don't include any padding.
5603 uint64_t Size = std::min(ActiveBits, P.size() * SizeOfByte);
5604 Fragments.push_back(
5605 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5606 }
5607 }
5608 ++NumPartitions;
5609 }
5610
5611 NumAllocaPartitions += NumPartitions;
5612 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5613
5614 // Migrate debug information from the old alloca to the new alloca(s)
5615 // and the individual partitions.
5616 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5617 // Can't overlap with undef memory.
5618 if (isKillAddress(DbgVariable))
5619 return;
5620
5621 const Value *DbgPtr = DbgVariable->getAddress();
5622     DIExpression::FragmentInfo VarFrag =
5623         DbgVariable->getFragmentOrEntireVariable();
5624 // Get the address expression constant offset if one exists and the ops
5625 // that come after it.
5626 int64_t CurrentExprOffsetInBytes = 0;
5627 SmallVector<uint64_t> PostOffsetOps;
5628 if (!getAddressExpression(DbgVariable)
5629 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5630 return; // Couldn't interpret this DIExpression - drop the var.
5631
5632 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5633 int64_t ExtractOffsetInBits = 0;
5634 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5635 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5637 ExtractOffsetInBits = Op.getArg(0);
5638 break;
5639 }
5640 }
5641
5642 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5643 for (auto Fragment : Fragments) {
5644 int64_t OffsetFromLocationInBits;
5645 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5646 // Find the variable fragment that the new alloca slice covers.
5647 // Drop debug info for this variable fragment if we can't compute an
5648 // intersect between it and the alloca slice.
5649       if (!calculateFragmentIntersect(
5650               DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5651 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5652 NewDbgFragment, OffsetFromLocationInBits))
5653 continue; // Do not migrate this fragment to this slice.
5654
5655 // Zero sized fragment indicates there's no intersect between the variable
5656 // fragment and the alloca slice. Skip this slice for this variable
5657 // fragment.
5658 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5659 continue; // Do not migrate this fragment to this slice.
5660
5661 // No fragment indicates DbgVariable's variable or fragment exactly
5662 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5663 if (!NewDbgFragment)
5664 NewDbgFragment = DbgVariable->getFragment();
5665
5666 // Reduce the new expression offset by the bit-extract offset since
5667 // we'll be keeping that.
5668 int64_t OffestFromNewAllocaInBits =
5669 OffsetFromLocationInBits - ExtractOffsetInBits;
5670 // We need to adjust an existing bit extract if the offset expression
5671 // can't eat the slack (i.e., if the new offset would be negative).
5672 int64_t BitExtractOffset =
5673 std::min<int64_t>(0, OffestFromNewAllocaInBits);
5674 // The magnitude of a negative value indicates the number of bits into
5675 // the existing variable fragment that the memory region begins. The new
5676 // variable fragment already excludes those bits - the new DbgPtr offset
5677 // only needs to be applied if it's positive.
5678 OffestFromNewAllocaInBits =
5679 std::max(int64_t(0), OffestFromNewAllocaInBits);
5680
5681 // Rebuild the expression:
5682 // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5683 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5684 // address expression but the value expression instead.
5685 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5686 if (OffestFromNewAllocaInBits > 0) {
5687 int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5688 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5689 }
5690
5691 // Remove any existing intrinsics on the new alloca describing
5692 // the variable fragment.
5693 auto RemoveOne = [DbgVariable](auto *OldDII) {
5694 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5695 return LHS->getVariable() == RHS->getVariable() &&
5696 LHS->getDebugLoc()->getInlinedAt() ==
5697 RHS->getDebugLoc()->getInlinedAt();
5698 };
5699 if (SameVariableFragment(OldDII, DbgVariable))
5700 OldDII->eraseFromParent();
5701 };
5702 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5703 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5704 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5705 NewDbgFragment, BitExtractOffset);
5706 }
5707 };
5708
5709 // Migrate debug information from the old alloca to the new alloca(s)
5710 // and the individual partitions.
5711 for_each(findDVRDeclares(&AI), MigrateOne);
5712 for_each(findDVRValues(&AI), MigrateOne);
5713 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5714
5715 return Changed;
5716}
5717
5718/// Clobber a use with poison, deleting the used value if it becomes dead.
5719void SROA::clobberUse(Use &U) {
5720 Value *OldV = U;
5721   // Replace the use with a poison value.
5722 U = PoisonValue::get(OldV->getType());
5723
5724 // Check for this making an instruction dead. We have to garbage collect
5725 // all the dead instructions to ensure the uses of any alloca end up being
5726 // minimal.
5727 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5728 if (isInstructionTriviallyDead(OldI)) {
5729 DeadInsts.push_back(OldI);
5730 }
5731}
5732
5733/// A basic LoadAndStorePromoter that does not remove store nodes.
5734 class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5735 public:
5736   BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5737                             Type *ZeroType)
5738 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5739 bool shouldDelete(Instruction *I) const override {
5740 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5741 }
5742
5743   Value *getValueToUseForAlloca(Instruction *I) const override {
5744     return UndefValue::get(ZeroType);
5745 }
5746
5747private:
5748 Type *ZeroType;
5749};
5750
5751bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5752 // Look through each "partition", looking for slices with the same start/end
5753 // that do not overlap with any before them. The slices are sorted by
5754 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5755 // sophisticated algorithm that takes splittable slices into account.
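  // As an illustrative example: for an alloca that escapes only into a
  // read-only call, a pattern like
  //   store i64 %x, ptr %a
  //   call void @use(ptr readonly %a)
  //   %y = load i64, ptr %a
  // can have the uses of %y rewritten to %x while the store and the alloca are
  // kept in place (the BasicLoadAndStorePromoter above never deletes them).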
5756 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5757 bool AllSameAndValid = true;
5758 Type *PartitionType = nullptr;
5759   SmallVector<Instruction *, 4> Insts;
5760   uint64_t BeginOffset = 0;
5761 uint64_t EndOffset = 0;
5762
5763 auto Flush = [&]() {
5764 if (AllSameAndValid && !Insts.empty()) {
5765 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5766 << EndOffset << ")\n");
5767       SmallVector<PHINode *, 4> NewPHIs;
5768       SSAUpdater SSA(&NewPHIs);
5769 Insts.push_back(&AI);
5770 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5771 Promoter.run(Insts);
5772 }
5773 AllSameAndValid = true;
5774 PartitionType = nullptr;
5775 Insts.clear();
5776 };
5777
5778 for (Slice &S : AS) {
5779 auto *User = cast<Instruction>(S.getUse()->getUser());
5780 if (isAssumeLikeIntrinsic(User)) {
5781 LLVM_DEBUG({
5782 dbgs() << "Ignoring slice: ";
5783 AS.print(dbgs(), &S);
5784 });
5785 continue;
5786 }
5787 if (S.beginOffset() >= EndOffset) {
5788 Flush();
5789 BeginOffset = S.beginOffset();
5790 EndOffset = S.endOffset();
5791 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5792 if (AllSameAndValid) {
5793 LLVM_DEBUG({
5794 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5795 << EndOffset << ")";
5796 AS.print(dbgs(), &S);
5797 });
5798 AllSameAndValid = false;
5799 }
5800 EndOffset = std::max(EndOffset, S.endOffset());
5801 continue;
5802 }
5803
5804 if (auto *LI = dyn_cast<LoadInst>(User)) {
5805 Type *UserTy = LI->getType();
5806 // LoadAndStorePromoter requires all the types to be the same.
5807 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5808 AllSameAndValid = false;
5809 PartitionType = UserTy;
5810 Insts.push_back(User);
5811 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5812 Type *UserTy = SI->getValueOperand()->getType();
5813 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5814 AllSameAndValid = false;
5815 PartitionType = UserTy;
5816 Insts.push_back(User);
5817 } else {
5818 AllSameAndValid = false;
5819 }
5820 }
5821
5822 Flush();
5823 return true;
5824}
5825
5826/// Analyze an alloca for SROA.
5827///
5828/// This analyzes the alloca to ensure we can reason about it, builds
5829/// the slices of the alloca, and then hands it off to be split and
5830/// rewritten as needed.
5831std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5832SROA::runOnAlloca(AllocaInst &AI) {
5833 bool Changed = false;
5834 bool CFGChanged = false;
5835
5836 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5837 ++NumAllocasAnalyzed;
5838
5839 // Special case dead allocas, as they're trivial.
5840 if (AI.use_empty()) {
5841 AI.eraseFromParent();
5842 Changed = true;
5843 return {Changed, CFGChanged};
5844 }
5845 const DataLayout &DL = AI.getDataLayout();
5846
5847 // Skip alloca forms that this analysis can't handle.
5848 std::optional<TypeSize> Size = AI.getAllocationSize(DL);
5849 if (AI.isArrayAllocation() || !Size || Size->isScalable() || Size->isZero())
5850 return {Changed, CFGChanged};
5851
5852 // First, split any FCA loads and stores touching this alloca to promote
5853 // better splitting and promotion opportunities.
5854 IRBuilderTy IRB(&AI);
5855 AggLoadStoreRewriter AggRewriter(DL, IRB);
5856 Changed |= AggRewriter.rewrite(AI);
5857
5858 // Build the slices using a recursive instruction-visiting builder.
5859 AllocaSlices AS(DL, AI);
5860 LLVM_DEBUG(AS.print(dbgs()));
5861 if (AS.isEscaped())
5862 return {Changed, CFGChanged};
5863
5864 if (AS.isEscapedReadOnly()) {
5865 Changed |= propagateStoredValuesToLoads(AI, AS);
5866 return {Changed, CFGChanged};
5867 }
5868
5869 for (auto &P : AS.partitions()) {
5870     // For now, we can't split if a field is accessed both as a protected field
5871     // and as a non-protected one, because that would require introducing sign
5872     // and auth operations to convert between the protected and non-protected
5873     // uses, and this pass doesn't know how to do that. This case is also
5874     // unlikely to occur in normal code.
5875 std::optional<Value *> ProtectedFieldDisc;
5876 auto SliceHasMismatch = [&](Slice &S) {
5877 if (auto *II = dyn_cast<IntrinsicInst>(S.getUse()->getUser()))
5878 if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
5879 II->getIntrinsicID() == Intrinsic::lifetime_end)
5880 return false;
5881 if (!ProtectedFieldDisc)
5882 ProtectedFieldDisc = S.ProtectedFieldDisc;
5883 return *ProtectedFieldDisc != S.ProtectedFieldDisc;
5884 };
5885 for (Slice &S : P)
5886 if (SliceHasMismatch(S))
5887 return {Changed, CFGChanged};
5888 for (Slice *S : P.splitSliceTails())
5889 if (SliceHasMismatch(*S))
5890 return {Changed, CFGChanged};
5891 }
5892
5893 // Delete all the dead users of this alloca before splitting and rewriting it.
5894 for (Instruction *DeadUser : AS.getDeadUsers()) {
5895 // Free up everything used by this instruction.
5896 for (Use &DeadOp : DeadUser->operands())
5897 clobberUse(DeadOp);
5898
5899 // Now replace the uses of this instruction.
5900 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5901
5902 // And mark it for deletion.
5903 DeadInsts.push_back(DeadUser);
5904 Changed = true;
5905 }
5906 for (Use *DeadOp : AS.getDeadOperands()) {
5907 clobberUse(*DeadOp);
5908 Changed = true;
5909 }
5910 for (IntrinsicInst *PFPUser : AS.getPFPUsers()) {
5911 PFPUser->replaceAllUsesWith(PFPUser->getArgOperand(0));
5912
5913 DeadInsts.push_back(PFPUser);
5914 Changed = true;
5915 }
5916
5917 // No slices to split. Leave the dead alloca for a later pass to clean up.
5918 if (AS.begin() == AS.end())
5919 return {Changed, CFGChanged};
5920
5921 Changed |= splitAlloca(AI, AS);
5922
5923 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5924 while (!SpeculatablePHIs.empty())
5925 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5926
5927 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5928 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5929 while (!RemainingSelectsToRewrite.empty()) {
5930 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5931 CFGChanged |=
5932 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5933 }
5934
5935 return {Changed, CFGChanged};
5936}
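// Worked example (hypothetical IR, not a test from the tree): for a two-field
// struct that is only accessed with simple scalar loads and stores,
//
//   %p = alloca { i32, i32 }
//   store i32 %a, ptr %p
//   %q = getelementptr inbounds { i32, i32 }, ptr %p, i32 0, i32 1
//   store i32 %b, ptr %q
//   %v = load i32, ptr %q
//
// runOnAlloca builds one slice per field, splitAlloca replaces %p with two
// scalar allocas, and the later promoteAllocas() call lets mem2reg turn both
// into SSA values so no memory traffic remains.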
5937
5938/// Delete the dead instructions accumulated in this run.
5939///
5940/// Recursively deletes the dead instructions we've accumulated. This is done
5941/// at the very end to maximize locality of the recursive delete and to
5942/// minimize the problems of invalidated instruction pointers as such pointers
5943/// are used heavily in the intermediate stages of the algorithm.
5944///
5945/// We also record the alloca instructions deleted here so that they aren't
5946/// subsequently handed to mem2reg to promote.
5947bool SROA::deleteDeadInstructions(
5948 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5949 bool Changed = false;
5950 while (!DeadInsts.empty()) {
5951 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5952 if (!I)
5953 continue;
5954 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5955
5956 // If the instruction is an alloca, find the possible dbg.declare connected
5957 // to it, and remove it too. We must do this before calling RAUW or we will
5958 // not be able to find it.
5959 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5960 DeletedAllocas.insert(AI);
5961 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5962 OldDII->eraseFromParent();
5963 }
5964
5965 at::deleteAssignmentMarkers(I);
5966 I->replaceAllUsesWith(UndefValue::get(I->getType()));
5967
5968 for (Use &Operand : I->operands())
5969 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5970 // Zero out the operand and see if it becomes trivially dead.
5971 Operand = nullptr;
5972 if (isInstructionTriviallyDead(U))
5973 DeadInsts.push_back(U);
5974 }
5975
5976 ++NumDeleted;
5977 I->eraseFromParent();
5978 Changed = true;
5979 }
5980 return Changed;
5981}
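// Cascade sketch (hypothetical example): erasing a dead load whose address is
// a GEP used only by that load first nulls the load's pointer operand; the
// GEP then has no remaining uses, isInstructionTriviallyDead(U) returns true,
// and the GEP is pushed onto DeadInsts to be erased on a later iteration of
// the loop above.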
5982/// Promote the allocas, using the best available technique.
5983///
5984/// This attempts to promote whatever allocas have been identified as viable in
5985/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5986/// This function returns whether any promotion occurred.
5987bool SROA::promoteAllocas() {
5988 if (PromotableAllocas.empty())
5989 return false;
5990
5991 if (SROASkipMem2Reg) {
5992 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5993 } else {
5994 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5995 NumPromoted += PromotableAllocas.size();
5996 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
5997 }
5998
5999 PromotableAllocas.clear();
6000 return true;
6001}
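// Testing note (usage sketch, not part of the file): because SROASkipMem2Reg
// is registered as the flag "sroa-skip-mem2reg", the slicing and rewriting
// half of the pass can be inspected without the final promotion, e.g.
//
//   opt -passes=sroa -sroa-skip-mem2reg -S input.ll
//
// which leaves the rewritten scalar allocas in place instead of handing them
// to PromoteMemToReg.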
6002
6003std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
6004 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
6005
6006 const DataLayout &DL = F.getDataLayout();
6007 BasicBlock &EntryBB = F.getEntryBlock();
6008 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
6009 I != E; ++I) {
6010 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
6011 std::optional<TypeSize> Size = AI->getAllocationSize(DL);
6012 if (Size && Size->isScalable() && isAllocaPromotable(AI))
6013 PromotableAllocas.insert(AI);
6014 else
6015 Worklist.insert(AI);
6016 }
6017 }
6018
6019 bool Changed = false;
6020 bool CFGChanged = false;
6021 // A set of deleted alloca instruction pointers which should be removed from
6022 // the list of promotable allocas.
6023 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
6024
6025 do {
6026 while (!Worklist.empty()) {
6027 auto [IterationChanged, IterationCFGChanged] =
6028 runOnAlloca(*Worklist.pop_back_val());
6029 Changed |= IterationChanged;
6030 CFGChanged |= IterationCFGChanged;
6031
6032 Changed |= deleteDeadInstructions(DeletedAllocas);
6033
6034 // Remove the deleted allocas from various lists so that we don't try to
6035 // continue processing them.
6036 if (!DeletedAllocas.empty()) {
6037 Worklist.set_subtract(DeletedAllocas);
6038 PostPromotionWorklist.set_subtract(DeletedAllocas);
6039 PromotableAllocas.set_subtract(DeletedAllocas);
6040 DeletedAllocas.clear();
6041 }
6042 }
6043
6044 Changed |= promoteAllocas();
6045
6046 Worklist = PostPromotionWorklist;
6047 PostPromotionWorklist.clear();
6048 } while (!Worklist.empty());
6049
6050 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6051 assert((!CFGChanged || !PreserveCFG) &&
6052 "Should not have modified the CFG when told to preserve it.");
6053
6054 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
6055 for (auto &BB : F) {
6056 RemoveRedundantDbgInstrs(&BB);
6057 }
6058 }
6059
6060 return {Changed, CFGChanged};
6061}
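// Driver note: whatever an iteration adds to PostPromotionWorklist becomes the
// Worklist of the next round, so the analyze/split/promote cycle above repeats
// until no alloca needs further attention.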
6062
6063PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6064 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
6065 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
6066 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6067 auto [Changed, CFGChanged] =
6068 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6069 if (!Changed)
6070 return PreservedAnalyses::all();
6071 PreservedAnalyses PA;
6072 if (!CFGChanged)
6073 PA.preserveSet<CFGAnalyses>();
6074 PA.preserve<DominatorTreeAnalysis>();
6075 return PA;
6076}
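// Usage sketch (assumed client code, not part of this file): scheduling the
// pass from C++ through the new pass manager:
//
//   FunctionPassManager FPM;
//   FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
//   FPM.run(F, FAM);  // FAM must provide DominatorTreeAnalysis and AssumptionAnalysis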
6077
6078void SROAPass::printPipeline(
6079 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6080 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6081 OS, MapClassName2PassName);
6082 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6083 : "<modify-cfg>");
6084}
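// The strings printed above correspond to the two parameterized pipeline
// spellings, so either form can be requested explicitly (illustrative
// invocations):
//
//   opt -passes='sroa<preserve-cfg>' -S input.ll
//   opt -passes='sroa<modify-cfg>' -S input.ll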
6085
6086SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6087
6088namespace {
6089
6090/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6091class SROALegacyPass : public FunctionPass {
6092 SROAOptions PreserveCFG;
6093
6094public:
6095 static char ID;
6096
6097 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6098 : FunctionPass(ID), PreserveCFG(PreserveCFG) {
6099 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6100 }
6101
6102 bool runOnFunction(Function &F) override {
6103 if (skipFunction(F))
6104 return false;
6105
6106 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6107 AssumptionCache &AC =
6108 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6109 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6110 auto [Changed, _] =
6111 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6112 return Changed;
6113 }
6114
6115 void getAnalysisUsage(AnalysisUsage &AU) const override {
6116 AU.addRequired<AssumptionCacheTracker>();
6117 AU.addRequired<DominatorTreeWrapperPass>();
6118 AU.addPreserved<GlobalsAAWrapperPass>();
6119 AU.addPreserved<DominatorTreeWrapperPass>();
6120 }
6121
6122 StringRef getPassName() const override { return "SROA"; }
6123};
6124
6125} // end anonymous namespace
6126
6127char SROALegacyPass::ID = 0;
6128
6129FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6130 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6131 : SROAOptions::ModifyCFG);
6132}
6133
6134INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6135 "Scalar Replacement Of Aggregates", false, false)
6136INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6137INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6138INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6139 false, false)