SROA.cpp
1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
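/// As a purely illustrative sketch (not taken from a specific test), a fully
/// analyzable aggregate alloca such as
///   %a = alloca { i32, i32 }
///   %f1 = getelementptr inbounds { i32, i32 }, ptr %a, i32 0, i32 1
///   store i32 7, ptr %f1
///   %v = load i32, ptr %f1
/// is split into per-element allocas, after which promotion replaces %v with
/// the SSA value 7 and the allocas disappear entirely.
///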
23//===----------------------------------------------------------------------===//
24
25#include "llvm/Transforms/Scalar/SROA.h"
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
44#include "llvm/Analysis/Loads.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/LLVMContext.h"
67#include "llvm/IR/Metadata.h"
68#include "llvm/IR/Module.h"
69#include "llvm/IR/Operator.h"
70#include "llvm/IR/PassManager.h"
71#include "llvm/IR/Type.h"
72#include "llvm/IR/Use.h"
73#include "llvm/IR/User.h"
74#include "llvm/IR/Value.h"
75#include "llvm/IR/ValueHandle.h"
77#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
89#include <algorithm>
90#include <cassert>
91#include <cstddef>
92#include <cstdint>
93#include <cstring>
94#include <iterator>
95#include <queue>
96#include <string>
97#include <tuple>
98#include <utility>
99#include <variant>
100#include <vector>
101
102using namespace llvm;
103
104#define DEBUG_TYPE "sroa"
105
106STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
107STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
108STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
109STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
110STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
111STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
112STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
113STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
114STATISTIC(NumLoadsPredicated,
115 "Number of loads rewritten into predicated loads to allow promotion");
116STATISTIC(
117    NumStoresPredicated,
118    "Number of stores rewritten into predicated stores to allow promotion");
119STATISTIC(NumDeleted, "Number of instructions deleted");
120STATISTIC(NumVectorized, "Number of vectorized aggregates");
121
122namespace llvm {
123/// Disable running mem2reg during SROA in order to test or debug SROA.
124static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
125 cl::Hidden);
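// Example usage (illustrative): `opt -passes=sroa -sroa-skip-mem2reg` leaves
// the rewritten allocas in place so the slicing and rewriting behaviour can be
// inspected without the final mem2reg-style promotion.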
127} // namespace llvm
128
129namespace {
130
131class AllocaSliceRewriter;
132class AllocaSlices;
133class Partition;
134
135class SelectHandSpeculativity {
136 unsigned char Storage = 0; // None are speculatable by default.
137 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
138 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
139public:
140 SelectHandSpeculativity() = default;
141 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
142 bool isSpeculatable(bool isTrueVal) const;
143 bool areAllSpeculatable() const;
144 bool areAnySpeculatable() const;
145 bool areNoneSpeculatable() const;
146 // For interop as int half of PointerIntPair.
147 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
148 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
149};
150static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
151
152using PossiblySpeculatableLoad =
153    PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
154using UnspeculatableStore = StoreInst *;
155using RewriteableMemOp =
156 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
157using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
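// Each RewriteableMemOp either wraps a load together with its per-hand
// speculatability bits (PossiblySpeculatableLoad) or records a store that can
// only be rewritten with a CFG change (UnspeculatableStore). A sketch of how a
// consumer could distinguish the two cases (illustrative only):
//   if (auto *Load = std::get_if<PossiblySpeculatableLoad>(&Op))
//     ...speculate or predicate the load...
//   else
//     ...rewrite std::get<UnspeculatableStore>(Op) using new control flow...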
158
159/// An optimization pass providing Scalar Replacement of Aggregates.
160///
161/// This pass takes allocations which can be completely analyzed (that is, they
162/// don't escape) and tries to turn them into scalar SSA values. There are
163/// a few steps to this process.
164///
165/// 1) It takes allocations of aggregates and analyzes the ways in which they
166/// are used to try to split them into smaller allocations, ideally of
167/// a single scalar data type. It will split up memcpy and memset accesses
168/// as necessary and try to isolate individual scalar accesses.
169/// 2) It will transform accesses into forms which are suitable for SSA value
170/// promotion. This can be replacing a memset with a scalar store of an
171/// integer value, or it can involve speculating operations on a PHI or
172/// select to be a PHI or select of the results.
173/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
174/// onto insert and extract operations on a vector value, and convert them to
175/// this form. By doing so, it will enable promotion of vector aggregates to
176/// SSA vector values.
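///
/// As a small illustrative example of step 2 (not drawn from a particular
/// test case), a 4-byte slice written by
///   call void @llvm.memset.p0.i64(ptr %slice, i8 0, i64 4, i1 false)
/// can be rewritten into the equivalent scalar store
///   store i32 0, ptr %slice
/// which the later promotion step can then turn into an SSA value.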
177class SROA {
178 LLVMContext *const C;
179 DomTreeUpdater *const DTU;
180 AssumptionCache *const AC;
181 const bool PreserveCFG;
182
183 /// Worklist of alloca instructions to simplify.
184 ///
185 /// Each alloca in the function is added to this. Each new alloca formed gets
186 /// added to it as well to recursively simplify unless that alloca can be
187/// directly promoted. Finally, each time we rewrite a use of an alloca other
188/// than the one being actively rewritten, we add it back onto the list if not
189 /// already present to ensure it is re-visited.
190 SmallSetVector<AllocaInst *, 16> Worklist;
191
192 /// A collection of instructions to delete.
193 /// We try to batch deletions to simplify code and make things a bit more
194 /// efficient. We also make sure there are no dangling pointers.
195 SmallVector<WeakVH, 8> DeadInsts;
196
197 /// Post-promotion worklist.
198 ///
199 /// Sometimes we discover an alloca which has a high probability of becoming
200 /// viable for SROA after a round of promotion takes place. In those cases,
201 /// the alloca is enqueued here for re-processing.
202 ///
203 /// Note that we have to be very careful to clear allocas out of this list in
204 /// the event they are deleted.
205 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
206
207 /// A collection of alloca instructions we can directly promote.
208 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
209 SmallPtrSet<AllocaInst *, 16>, 16>
210 PromotableAllocas;
211
212 /// A worklist of PHIs to speculate prior to promoting allocas.
213 ///
214 /// All of these PHIs have been checked for the safety of speculation and by
215 /// being speculated will allow promoting allocas currently in the promotable
216 /// queue.
217 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
218
219 /// A worklist of select instructions to rewrite prior to promoting
220 /// allocas.
221 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
222
223 /// Select instructions that use an alloca and are subsequently loaded can be
224 /// rewritten to load both input pointers and then select between the result,
225 /// allowing the load of the alloca to be promoted.
226 /// From this:
227 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
228 /// %V = load <type>, ptr %P2
229 /// to:
230 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
231 /// %V2 = load <type>, ptr %Other
232 /// %V = select i1 %cond, <type> %V1, <type> %V2
233 ///
234 /// We can do this to a select if its only uses are loads
235 /// and if either the operand to the select can be loaded unconditionally,
236 /// or if we are allowed to perform CFG modifications.
237 /// If we find an intervening bitcast with a single use of the load,
238 /// allow the promotion.
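  ///
  /// When a hand of the select cannot be speculated but CFG changes are
  /// permitted, the memory operation is instead rewritten into predicated
  /// form: branch on the select's condition, perform the access in the
  /// appropriate arm, and merge results with a PHI (reflected in the
  /// NumLoadsPredicated / NumStoresPredicated statistics above). This is only
  /// a summary; the actual rewrite happens later in this file.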
239 static std::optional<RewriteableMemOps>
240 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
241
242public:
243 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
244 SROAOptions PreserveCFG_)
245 : C(C), DTU(DTU), AC(AC),
246 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
247
248 /// Main run method used by both the SROAPass and by the legacy pass.
249 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
250
251private:
252 friend class AllocaSliceRewriter;
253
254 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
255 AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
256 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
257 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
258 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
259 void clobberUse(Use &U);
260 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
261 bool promoteAllocas();
262};
263
264} // end anonymous namespace
265
266/// Calculate the fragment of a variable to use when slicing a store
267/// based on the slice dimensions, existing fragment, and base storage
268/// fragment.
269/// Results:
270/// UseFrag - Use Target as the new fragment.
271/// UseNoFrag - The new slice already covers the whole variable.
272/// Skip - The new alloca slice doesn't include this variable.
273/// FIXME: Can we use calculateFragmentIntersect instead?
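/// Worked example with made-up numbers: for a 32-bit variable whose base
/// storage and current expression carry no fragment, a new slice covering bits
/// [0, 32) makes Target describe the whole variable, so the result is
/// UseNoFrag; a slice covering only bits [0, 16) instead yields UseFrag with a
/// 16-bit Target fragment.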
274namespace {
275enum FragCalcResult { UseFrag, UseNoFrag, Skip };
276}
277static FragCalcResult
278calculateFragment(DILocalVariable *Variable,
279 uint64_t NewStorageSliceOffsetInBits,
280 uint64_t NewStorageSliceSizeInBits,
281 std::optional<DIExpression::FragmentInfo> StorageFragment,
282 std::optional<DIExpression::FragmentInfo> CurrentFragment,
283 DIExpression::FragmentInfo &Target) {
284 // If the base storage describes part of the variable, apply the offset and
285 // the size constraint.
286 if (StorageFragment) {
287 Target.SizeInBits =
288 std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
289 Target.OffsetInBits =
290 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
291 } else {
292 Target.SizeInBits = NewStorageSliceSizeInBits;
293 Target.OffsetInBits = NewStorageSliceOffsetInBits;
294 }
295
296 // If this slice extracts the entirety of an independent variable from a
297 // larger alloca, do not produce a fragment expression, as the variable is
298 // not fragmented.
299 if (!CurrentFragment) {
300 if (auto Size = Variable->getSizeInBits()) {
301 // Treat the current fragment as covering the whole variable.
302 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
303 if (Target == CurrentFragment)
304 return UseNoFrag;
305 }
306 }
307
308 // No additional work to do if there isn't a fragment already, or there is
309 // but it already exactly describes the new assignment.
310 if (!CurrentFragment || *CurrentFragment == Target)
311 return UseFrag;
312
313 // Reject the target fragment if it doesn't fit wholly within the current
314 // fragment. TODO: We could instead chop up the target to fit in the case of
315 // a partial overlap.
316 if (Target.startInBits() < CurrentFragment->startInBits() ||
317 Target.endInBits() > CurrentFragment->endInBits())
318 return Skip;
319
320 // Target fits within the current fragment, return it.
321 return UseFrag;
322}
323
324static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
325 return DebugVariable(DVR->getVariable(), std::nullopt,
326 DVR->getDebugLoc().getInlinedAt());
327}
328
329/// Find linked dbg.assign and generate a new one with the correct
330/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
331/// value component is copied from the old dbg.assign to the new.
332/// \param OldAlloca Alloca for the variable before splitting.
333/// \param IsSplit True if the store (not necessarily the alloca)
334/// is being split.
335/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
336/// \param SliceSizeInBits New number of bits being written to.
337/// \param OldInst Instruction that is being split.
338/// \param Inst New instruction performing this part of the
339/// split store.
340/// \param Dest Store destination.
341/// \param Value Stored value.
342/// \param DL Datalayout.
343static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
344 uint64_t OldAllocaOffsetInBits,
345 uint64_t SliceSizeInBits, Instruction *OldInst,
346 Instruction *Inst, Value *Dest, Value *Value,
347 const DataLayout &DL) {
348 // If we want allocas to be migrated using this helper then we need to ensure
349 // that the BaseFragments map code still works. A simple solution would be
350 // to choose to always clone alloca dbg_assigns (rather than sometimes
351 // "stealing" them).
352 assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
353
354 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
355 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
356 if (DVRAssignMarkerRange.empty())
357 return;
358
359 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
360 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
361 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
362 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
363 << "\n");
364 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
365 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
366 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
367 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
368 if (Value)
369 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
370
371 /// Map of aggregate variables to their fragment associated with OldAlloca.
372 SmallDenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
373 BaseFragments;
374 for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
375 BaseFragments[getAggregateVariable(DVR)] =
376 DVR->getExpression()->getFragmentInfo();
377
378 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
379 // one). It shouldn't already have one: assert this assumption.
380 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
381 DIAssignID *NewID = nullptr;
382 auto &Ctx = Inst->getContext();
383 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
384 assert(OldAlloca->isStaticAlloca());
385
386 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
387 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
388 << "\n");
389 auto *Expr = DbgAssign->getExpression();
390 bool SetKillLocation = false;
391
392 if (IsSplit) {
393 std::optional<DIExpression::FragmentInfo> BaseFragment;
394 {
395 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
396 if (R == BaseFragments.end())
397 return;
398 BaseFragment = R->second;
399 }
400 std::optional<DIExpression::FragmentInfo> CurrentFragment =
401 Expr->getFragmentInfo();
402 DIExpression::FragmentInfo NewFragment;
403 FragCalcResult Result = calculateFragment(
404 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
405 BaseFragment, CurrentFragment, NewFragment);
406
407 if (Result == Skip)
408 return;
409 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
410 if (CurrentFragment) {
411 // Rewrite NewFragment to be relative to the existing one (this is
412 // what createFragmentExpression wants). CalculateFragment has
413 // already resolved the size for us. FIXME: Should it return the
414 // relative fragment too?
415 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
416 }
417 // Add the new fragment info to the existing expression if possible.
418 if (auto E = DIExpression::createFragmentExpression(
419 Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
420 Expr = *E;
421 } else {
422 // Otherwise, add the new fragment info to an empty expression and
423 // discard the value component of this dbg.assign as the value cannot
424 // be computed with the new fragment.
425 Expr = *DIExpression::createFragmentExpression(
426 DIExpression::get(Expr->getContext(), {}),
427 NewFragment.OffsetInBits, NewFragment.SizeInBits);
428 SetKillLocation = true;
429 }
430 }
431 }
432
433 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
434 if (!NewID) {
435 NewID = DIAssignID::getDistinct(Ctx);
436 Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
437 }
438
439 DbgVariableRecord *NewAssign;
440 if (IsSplit) {
441 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
442 NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
443 DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
444 Dest, DIExpression::get(Expr->getContext(), {}),
445 DbgAssign->getDebugLoc())));
446 } else {
447 // The store is not split, simply steal the existing dbg_assign.
448 NewAssign = DbgAssign;
449 NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
450 NewAssign->setAddress(Dest);
451 if (Value)
452 NewAssign->replaceVariableLocationOp(0u, Value);
453 assert(Expr == NewAssign->getExpression());
454 }
455
456 // If we've updated the value but the original dbg.assign has an arglist
457 // then kill it now - we can't use the requested new value.
458 // We can't replace the DIArgList with the new value as it'd leave
459 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
460 // an arglist). And we can't keep the DIArgList in case the linked store
461 // is being split - in which case the DIArgList + expression may no longer
462 // be computing the correct value.
463 // This should be a very rare situation as it requires the value being
464 // stored to differ from the dbg.assign (i.e., the value has been
465 // represented differently in the debug intrinsic for some reason).
466 SetKillLocation |=
467 Value && (DbgAssign->hasArgList() ||
468 !DbgAssign->getExpression()->isSingleLocationExpression());
469 if (SetKillLocation)
470 NewAssign->setKillLocation();
471
472 // We could use more precision here at the cost of some additional (code)
473 // complexity - if the original dbg.assign was adjacent to its store, we
474 // could position this new dbg.assign adjacent to its store rather than the
475 // old dbg.assign. That would result in interleaved dbg.assigns rather than
476 // what we get now:
477 // split store !1
478 // split store !2
479 // dbg.assign !1
480 // dbg.assign !2
481 // This (current behaviour) results in debug assignments being
482 // noted as slightly offset (in code) from the store. In practice this
483 // should have little effect on the debugging experience due to the fact
484 // that all the split stores should get the same line number.
485 if (NewAssign != DbgAssign) {
486 NewAssign->moveBefore(DbgAssign->getIterator());
487 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
488 }
489 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
490 };
491
492 for_each(DVRAssignMarkerRange, MigrateDbgAssign);
493}
494
495namespace {
496
497/// A custom IRBuilder inserter which prefixes all names, but only in
498/// Assert builds.
499class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
500 std::string Prefix;
501
502 Twine getNameWithPrefix(const Twine &Name) const {
503 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
504 }
505
506public:
507 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
508
509 void InsertHelper(Instruction *I, const Twine &Name,
510 BasicBlock::iterator InsertPt) const override {
511 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
512 InsertPt);
513 }
514};
515
516/// Provide a type for IRBuilder that drops names in release builds.
517using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
518
519/// A used slice of an alloca.
520///
521/// This structure represents a slice of an alloca used by some instruction. It
522/// stores both the begin and end offsets of this use, a pointer to the use
523/// itself, and a flag indicating whether we can classify the use as splittable
524/// or not when forming partitions of the alloca.
525class Slice {
526 /// The beginning offset of the range.
527 uint64_t BeginOffset = 0;
528
529 /// The ending offset, not included in the range.
530 uint64_t EndOffset = 0;
531
532 /// Storage for both the use of this slice and whether it can be
533 /// split.
534 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
535
536public:
537 Slice() = default;
538
539 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable,
540 Value *ProtectedFieldDisc)
541 : BeginOffset(BeginOffset), EndOffset(EndOffset),
542 UseAndIsSplittable(U, IsSplittable),
543 ProtectedFieldDisc(ProtectedFieldDisc) {}
544
545 uint64_t beginOffset() const { return BeginOffset; }
546 uint64_t endOffset() const { return EndOffset; }
547
548 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
549 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
550
551 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
552
553 bool isDead() const { return getUse() == nullptr; }
554 void kill() { UseAndIsSplittable.setPointer(nullptr); }
555
556 // When this access is via an llvm.protected.field.ptr intrinsic, contains
557 // the second argument to the intrinsic, the discriminator.
558 Value *ProtectedFieldDisc;
559
560 /// Support for ordering ranges.
561 ///
562 /// This provides an ordering over ranges such that start offsets are
563 /// always increasing, and within equal start offsets, the end offsets are
564 /// decreasing. Thus the spanning range comes first in a cluster with the
565 /// same start position.
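  ///
  /// For example (purely illustrative offsets), slices covering [0, 16),
  /// [0, 8) and [4, 8) sort in exactly that order when they share the same
  /// splittability: ties on the start offset are broken by the larger end
  /// offset coming first.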
566 bool operator<(const Slice &RHS) const {
567 if (beginOffset() < RHS.beginOffset())
568 return true;
569 if (beginOffset() > RHS.beginOffset())
570 return false;
571 if (isSplittable() != RHS.isSplittable())
572 return !isSplittable();
573 if (endOffset() > RHS.endOffset())
574 return true;
575 return false;
576 }
577
578 /// Support comparison with a single offset to allow binary searches.
579 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
580 return LHS.beginOffset() < RHSOffset;
581 }
582 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
583 return LHSOffset < RHS.beginOffset();
584 }
585
586 bool operator==(const Slice &RHS) const {
587 return isSplittable() == RHS.isSplittable() &&
588 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
589 }
590 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
591};
592
593/// Representation of the alloca slices.
594///
595/// This class represents the slices of an alloca which are formed by its
596/// various uses. If a pointer escapes, we can't fully build a representation
597/// for the slices used and we reflect that in this structure. The uses are
598/// stored, sorted by increasing beginning offset and with unsplittable slices
599/// starting at a particular offset before splittable slices.
600class AllocaSlices {
601public:
602 /// Construct the slices of a particular alloca.
603 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
604
605 /// Test whether a pointer to the allocation escapes our analysis.
606 ///
607 /// If this is true, the slices are never fully built and should be
608 /// ignored.
609 bool isEscaped() const { return PointerEscapingInstr; }
610 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
611
612 /// Support for iterating over the slices.
613 /// @{
614 using iterator = SmallVectorImpl<Slice>::iterator;
615 using range = iterator_range<iterator>;
616
617 iterator begin() { return Slices.begin(); }
618 iterator end() { return Slices.end(); }
619
620 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
621 using const_range = iterator_range<const_iterator>;
622
623 const_iterator begin() const { return Slices.begin(); }
624 const_iterator end() const { return Slices.end(); }
625 /// @}
626
627 /// Erase a range of slices.
628 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
629
630 /// Insert new slices for this alloca.
631 ///
632 /// This moves the slices into the alloca's slices collection, and re-sorts
633 /// everything so that the usual ordering properties of the alloca's slices
634 /// hold.
635 void insert(ArrayRef<Slice> NewSlices) {
636 int OldSize = Slices.size();
637 Slices.append(NewSlices.begin(), NewSlices.end());
638 auto SliceI = Slices.begin() + OldSize;
639 std::stable_sort(SliceI, Slices.end());
640 std::inplace_merge(Slices.begin(), SliceI, Slices.end());
641 }
642
643 // Forward declare the iterator and range accessor for walking the
644 // partitions.
645 class partition_iterator;
646 iterator_range<partition_iterator> partitions();
647
648 /// Access the dead users for this alloca.
649 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
650
651 /// Access the users for this alloca that are llvm.protected.field.ptr
652 /// intrinsics.
653 ArrayRef<IntrinsicInst *> getPFPUsers() const { return PFPUsers; }
654
655 /// Access Uses that should be dropped if the alloca is promotable.
656 ArrayRef<Use *> getDeadUsesIfPromotable() const {
657 return DeadUseIfPromotable;
658 }
659
660 /// Access the dead operands referring to this alloca.
661 ///
662 /// These are operands which cannot actually be used to refer to the
663 /// alloca as they are outside its range and the user doesn't correct for
664 /// that. These mostly consist of PHI node inputs and the like which we just
665 /// need to replace with undef.
666 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
667
668#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
669 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
670 void printSlice(raw_ostream &OS, const_iterator I,
671 StringRef Indent = " ") const;
672 void printUse(raw_ostream &OS, const_iterator I,
673 StringRef Indent = " ") const;
674 void print(raw_ostream &OS) const;
675 void dump(const_iterator I) const;
676 void dump() const;
677#endif
678
679private:
680 template <typename DerivedT, typename RetT = void> class BuilderBase;
681 class SliceBuilder;
682
683 friend class AllocaSlices::SliceBuilder;
684
685#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
686 /// Handle to alloca instruction to simplify method interfaces.
687 AllocaInst &AI;
688#endif
689
690 /// The instruction responsible for this alloca not having a known set
691 /// of slices.
692 ///
693 /// When an instruction (potentially) escapes the pointer to the alloca, we
694 /// store a pointer to that here and abort trying to form slices of the
695 /// alloca. This will be null if the alloca slices are analyzed successfully.
696 Instruction *PointerEscapingInstr;
697 Instruction *PointerEscapingInstrReadOnly;
698
699 /// The slices of the alloca.
700 ///
701 /// We store a vector of the slices formed by uses of the alloca here. This
702 /// vector is sorted by increasing begin offset, and then the unsplittable
703 /// slices before the splittable ones. See the Slice inner class for more
704 /// details.
705 SmallVector<Slice, 8> Slices;
706
707 /// Instructions which will become dead if we rewrite the alloca.
708 ///
709 /// Note that these are not separated by slice. This is because we expect an
710 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
711 /// all these instructions can simply be removed and replaced with poison as
712 /// they come from outside of the allocated space.
713 SmallVector<Instruction *, 8> DeadUsers;
714
715 /// Users that are llvm.protected.field.ptr intrinsics. These will be RAUW'd
716 /// to their first argument if we rewrite the alloca.
717 SmallVector<IntrinsicInst *, 8> PFPUsers;
718
719 /// Uses which will become dead if can promote the alloca.
720 SmallVector<Use *, 8> DeadUseIfPromotable;
721
722 /// Operands which will become dead if we rewrite the alloca.
723 ///
724 /// These are operands that in their particular use can be replaced with
725 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
726 /// to PHI nodes and the like. They aren't entirely dead (there might be
727 /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
728 /// want to swap this particular input for poison to simplify the use lists of
729 /// the alloca.
730 SmallVector<Use *, 8> DeadOperands;
731};
732
733/// A partition of the slices.
734///
735/// An ephemeral representation for a range of slices which can be viewed as
736/// a partition of the alloca. This range represents a span of the alloca's
737/// memory which cannot be split, and provides access to all of the slices
738/// overlapping some part of the partition.
739///
740/// Objects of this type are produced by traversing the alloca's slices, but
741/// are only ephemeral and not persistent.
742class Partition {
743private:
744 friend class AllocaSlices;
745 friend class AllocaSlices::partition_iterator;
746
747 using iterator = AllocaSlices::iterator;
748
749 /// The beginning and ending offsets of the alloca for this
750 /// partition.
751 uint64_t BeginOffset = 0, EndOffset = 0;
752
753 /// The start and end iterators of this partition.
754 iterator SI, SJ;
755
756 /// A collection of split slice tails overlapping the partition.
757 SmallVector<Slice *, 4> SplitTails;
758
759 /// Raw constructor builds an empty partition starting and ending at
760 /// the given iterator.
761 Partition(iterator SI) : SI(SI), SJ(SI) {}
762
763public:
764 /// The start offset of this partition.
765 ///
766 /// All of the contained slices start at or after this offset.
767 uint64_t beginOffset() const { return BeginOffset; }
768
769 /// The end offset of this partition.
770 ///
771 /// All of the contained slices end at or before this offset.
772 uint64_t endOffset() const { return EndOffset; }
773
774 /// The size of the partition.
775 ///
776 /// Note that this can never be zero.
777 uint64_t size() const {
778 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
779 return EndOffset - BeginOffset;
780 }
781
782 /// Test whether this partition contains no slices, and merely spans
783 /// a region occupied by split slices.
784 bool empty() const { return SI == SJ; }
785
786 /// \name Iterate slices that start within the partition.
787 /// These may be splittable or unsplittable. They have a begin offset >= the
788 /// partition begin offset.
789 /// @{
790 // FIXME: We should probably define a "concat_iterator" helper and use that
791 // to stitch together pointee_iterators over the split tails and the
792 // contiguous iterators of the partition. That would give a much nicer
793 // interface here. We could then additionally expose filtered iterators for
794 // split, unsplit, and unsplittable slices based on the usage patterns.
795 iterator begin() const { return SI; }
796 iterator end() const { return SJ; }
797 /// @}
798
799 /// Get the sequence of split slice tails.
800 ///
801 /// These tails are of slices which start before this partition but are
802 /// split and overlap into the partition. We accumulate these while forming
803 /// partitions.
804 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
805};
806
807} // end anonymous namespace
808
809/// An iterator over partitions of the alloca's slices.
810///
811/// This iterator implements the core algorithm for partitioning the alloca's
812/// slices. It is a forward iterator as we don't support backtracking for
813/// efficiency reasons, and re-use a single storage area to maintain the
814/// current set of split slices.
815///
816/// It is templated on the slice iterator type to use so that it can operate
817/// with either const or non-const slice iterators.
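///
/// For example (hypothetical offsets): given an unsplittable slice [0, 8) and
/// a splittable slice [0, 16), the iterator first yields the partition [0, 8)
/// containing both slices, and then an "empty" partition [8, 16) covered only
/// by the split tail of the splittable slice.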
818class AllocaSlices::partition_iterator
819 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
820 Partition> {
821 friend class AllocaSlices;
822
823 /// Most of the state for walking the partitions is held in a class
824 /// with a nice interface for examining them.
825 Partition P;
826
827 /// We need to keep the end of the slices to know when to stop.
828 AllocaSlices::iterator SE;
829
830 /// We also need to keep track of the maximum split end offset seen.
831 /// FIXME: Do we really?
832 uint64_t MaxSplitSliceEndOffset = 0;
833
834 /// Sets the partition to be empty at given iterator, and sets the
835 /// end iterator.
836 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
837 : P(SI), SE(SE) {
838 // If not already at the end, advance our state to form the initial
839 // partition.
840 if (SI != SE)
841 advance();
842 }
843
844 /// Advance the iterator to the next partition.
845 ///
846 /// Requires that the iterator not be at the end of the slices.
847 void advance() {
848 assert((P.SI != SE || !P.SplitTails.empty()) &&
849 "Cannot advance past the end of the slices!");
850
851 // Clear out any split uses which have ended.
852 if (!P.SplitTails.empty()) {
853 if (P.EndOffset >= MaxSplitSliceEndOffset) {
854 // If we've finished all splits, this is easy.
855 P.SplitTails.clear();
856 MaxSplitSliceEndOffset = 0;
857 } else {
858 // Remove the uses which have ended in the prior partition. This
859 // cannot change the max split slice end because we just checked that
860 // the prior partition ended prior to that max.
861 llvm::erase_if(P.SplitTails,
862 [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
863 assert(llvm::any_of(P.SplitTails,
864 [&](Slice *S) {
865 return S->endOffset() == MaxSplitSliceEndOffset;
866 }) &&
867 "Could not find the current max split slice offset!");
868 assert(llvm::all_of(P.SplitTails,
869 [&](Slice *S) {
870 return S->endOffset() <= MaxSplitSliceEndOffset;
871 }) &&
872 "Max split slice end offset is not actually the max!");
873 }
874 }
875
876 // If P.SI is already at the end, then we've cleared the split tail and
877 // now have an end iterator.
878 if (P.SI == SE) {
879 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
880 return;
881 }
882
883 // If we had a non-empty partition previously, set up the state for
884 // subsequent partitions.
885 if (P.SI != P.SJ) {
886 // Accumulate all the splittable slices which started in the old
887 // partition into the split list.
888 for (Slice &S : P)
889 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
890 P.SplitTails.push_back(&S);
891 MaxSplitSliceEndOffset =
892 std::max(S.endOffset(), MaxSplitSliceEndOffset);
893 }
894
895 // Start from the end of the previous partition.
896 P.SI = P.SJ;
897
898 // If P.SI is now at the end, we at most have a tail of split slices.
899 if (P.SI == SE) {
900 P.BeginOffset = P.EndOffset;
901 P.EndOffset = MaxSplitSliceEndOffset;
902 return;
903 }
904
905 // If we have split slices, and the next slice is after a gap and is
906 // not splittable, immediately form an empty partition for the split
907 // slices up until the next slice begins.
908 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
909 !P.SI->isSplittable()) {
910 P.BeginOffset = P.EndOffset;
911 P.EndOffset = P.SI->beginOffset();
912 return;
913 }
914 }
915
916 // OK, we need to consume new slices. Set the end offset based on the
917 // current slice, and step SJ past it. The beginning offset of the
918 // partition is the beginning offset of the next slice unless we have
919 // pre-existing split slices that are continuing, in which case we begin
920 // at the prior end offset.
921 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
922 P.EndOffset = P.SI->endOffset();
923 ++P.SJ;
924
925 // There are two strategies to form a partition based on whether the
926 // partition starts with an unsplittable slice or a splittable slice.
927 if (!P.SI->isSplittable()) {
928 // When we're forming an unsplittable region, it must always start at
929 // the first slice and will extend through its end.
930 assert(P.BeginOffset == P.SI->beginOffset());
931
932 // Form a partition including all of the overlapping slices with this
933 // unsplittable slice.
934 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
935 if (!P.SJ->isSplittable())
936 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
937 ++P.SJ;
938 }
939
940 // We have a partition across a set of overlapping unsplittable
941 // partitions.
942 return;
943 }
944
945 // If we're starting with a splittable slice, then we need to form
946 // a synthetic partition spanning it and any other overlapping splittable
947 // slices.
948 assert(P.SI->isSplittable() && "Forming a splittable partition!");
949
950 // Collect all of the overlapping splittable slices.
951 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
952 P.SJ->isSplittable()) {
953 P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
954 ++P.SJ;
955 }
956
957 // Back up P.EndOffset if we ended the span early when encountering an
958 // unsplittable slice. This synthesizes the early end offset of
959 // a partition spanning only splittable slices.
960 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
961 assert(!P.SJ->isSplittable());
962 P.EndOffset = P.SJ->beginOffset();
963 }
964 }
965
966public:
967 bool operator==(const partition_iterator &RHS) const {
968 assert(SE == RHS.SE &&
969 "End iterators don't match between compared partition iterators!");
970
971 // The observed positions of partitions are marked by the P.SI iterator and
972 // the emptiness of the split slices. The latter is only relevant when
973 // P.SI == SE, as the end iterator will additionally have an empty split
974 // slices list, but the prior may have the same P.SI and a tail of split
975 // slices.
976 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
977 assert(P.SJ == RHS.P.SJ &&
978 "Same set of slices formed two different sized partitions!");
979 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
980 "Same slice position with differently sized non-empty split "
981 "slice tails!");
982 return true;
983 }
984 return false;
985 }
986
987 partition_iterator &operator++() {
988 advance();
989 return *this;
990 }
991
992 Partition &operator*() { return P; }
993};
994
995/// A forward range over the partitions of the alloca's slices.
996///
997/// This accesses an iterator range over the partitions of the alloca's
998/// slices. It computes these partitions on the fly based on the overlapping
999/// offsets of the slices and the ability to split them. It will visit "empty"
1000/// partitions to cover regions of the alloca only accessed via split
1001/// slices.
1002iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
1003 return make_range(partition_iterator(begin(), end()),
1004 partition_iterator(end(), end()));
1005}
1006
1007static Value *foldSelectInst(SelectInst &SI) {
1008 // If the condition being selected on is a constant or the same value is
1009 // being selected between, fold the select. Yes this does (rarely) happen
1010 // early on.
1011 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
1012 return SI.getOperand(1 + CI->isZero());
1013 if (SI.getOperand(1) == SI.getOperand(2))
1014 return SI.getOperand(1);
1015
1016 return nullptr;
1017}
1018
1019/// A helper that folds a PHI node or a select.
1020static Value *foldPHINodeOrSelectInst(Instruction &I) {
1021 if (PHINode *PN = dyn_cast<PHINode>(&I)) {
1022 // If PN merges together the same value, return that value.
1023 return PN->hasConstantValue();
1024 }
1025 return foldSelectInst(cast<SelectInst>(I));
1026}
1027
1028/// Builder for the alloca slices.
1029///
1030/// This class builds a set of alloca slices by recursively visiting the uses
1031/// of an alloca and making a slice for each load and store at each offset.
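///
/// As a sketch with made-up values, visiting
///   %a = alloca [8 x i8]
///   %p = getelementptr inbounds [8 x i8], ptr %a, i64 0, i64 4
///   store i32 0, ptr %p
/// records a slice covering bytes [4, 8) of the alloca; because the store is a
/// non-volatile integer store, the slice is marked splittable.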
1032class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1033 friend class PtrUseVisitor<SliceBuilder>;
1034 friend class InstVisitor<SliceBuilder>;
1035
1036 using Base = PtrUseVisitor<SliceBuilder>;
1037
1038 const uint64_t AllocSize;
1039 AllocaSlices &AS;
1040
1041 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1042 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1043
1044 /// Set to de-duplicate dead instructions found in the use walk.
1045 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1046
1047 // When this access is via an llvm.protected.field.ptr intrinsic, contains
1048 // the second argument to the intrinsic, the discriminator.
1049 Value *ProtectedFieldDisc = nullptr;
1050
1051public:
1052 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1053 : PtrUseVisitor<SliceBuilder>(DL),
1054 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
1055 AS(AS) {}
1056
1057private:
1058 void markAsDead(Instruction &I) {
1059 if (VisitedDeadInsts.insert(&I).second)
1060 AS.DeadUsers.push_back(&I);
1061 }
1062
1063 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1064 bool IsSplittable = false) {
1065 // Completely skip uses which have a zero size or start either before or
1066 // past the end of the allocation.
1067 if (Size == 0 || Offset.uge(AllocSize)) {
1068 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1069 << Offset
1070 << " which has zero size or starts outside of the "
1071 << AllocSize << " byte alloca:\n"
1072 << " alloca: " << AS.AI << "\n"
1073 << " use: " << I << "\n");
1074 return markAsDead(I);
1075 }
1076
1077 uint64_t BeginOffset = Offset.getZExtValue();
1078 uint64_t EndOffset = BeginOffset + Size;
1079
1080 // Clamp the end offset to the end of the allocation. Note that this is
1081 // formulated to handle even the case where "BeginOffset + Size" overflows.
1082 // This may appear superficially to be something we could ignore entirely,
1083 // but that is not so! There may be widened loads or PHI-node uses where
1084 // some instructions are dead but not others. We can't completely ignore
1085 // them, and so have to record at least the information here.
1086 assert(AllocSize >= BeginOffset); // Established above.
1087 if (Size > AllocSize - BeginOffset) {
1088 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1089 << Offset << " to remain within the " << AllocSize
1090 << " byte alloca:\n"
1091 << " alloca: " << AS.AI << "\n"
1092 << " use: " << I << "\n");
1093 EndOffset = AllocSize;
1094 }
1095
1096 AS.Slices.push_back(
1097 Slice(BeginOffset, EndOffset, U, IsSplittable, ProtectedFieldDisc));
1098 }
1099
1100 void visitBitCastInst(BitCastInst &BC) {
1101 if (BC.use_empty())
1102 return markAsDead(BC);
1103
1104 return Base::visitBitCastInst(BC);
1105 }
1106
1107 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1108 if (ASC.use_empty())
1109 return markAsDead(ASC);
1110
1111 return Base::visitAddrSpaceCastInst(ASC);
1112 }
1113
1114 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1115 if (GEPI.use_empty())
1116 return markAsDead(GEPI);
1117
1118 return Base::visitGetElementPtrInst(GEPI);
1119 }
1120
1121 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1122 uint64_t Size, bool IsVolatile) {
1123 // We allow splitting of non-volatile loads and stores where the type is an
1124 // integer type. These may be used to implement 'memcpy' or other "transfer
1125 // of bits" patterns.
1126 bool IsSplittable =
1127 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1128
1129 insertUse(I, Offset, Size, IsSplittable);
1130 }
1131
1132 void visitLoadInst(LoadInst &LI) {
1133 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1134 "All simple FCA loads should have been pre-split");
1135
1136 // If there is a load with an unknown offset, we can still perform store
1137 // to load forwarding for other known-offset loads.
1138 if (!IsOffsetKnown)
1139 return PI.setEscapedReadOnly(&LI);
1140
1141 TypeSize Size = DL.getTypeStoreSize(LI.getType());
1142 if (Size.isScalable()) {
1143 unsigned VScale = LI.getFunction()->getVScaleValue();
1144 if (!VScale)
1145 return PI.setAborted(&LI);
1146
1147 Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
1148 }
1149
1150 return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
1151 LI.isVolatile());
1152 }
1153
1154 void visitStoreInst(StoreInst &SI) {
1155 Value *ValOp = SI.getValueOperand();
1156 if (ValOp == *U)
1157 return PI.setEscapedAndAborted(&SI);
1158 if (!IsOffsetKnown)
1159 return PI.setAborted(&SI);
1160
1161 TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
1162 if (StoreSize.isScalable()) {
1163 unsigned VScale = SI.getFunction()->getVScaleValue();
1164 if (!VScale)
1165 return PI.setAborted(&SI);
1166
1167 StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
1168 }
1169
1170 uint64_t Size = StoreSize.getFixedValue();
1171
1172 // If this memory access can be shown to *statically* extend outside the
1173 // bounds of the allocation, its behavior is undefined, so simply
1174 // ignore it. Note that this is more strict than the generic clamping
1175 // behavior of insertUse. We also try to handle cases which might run the
1176 // risk of overflow.
1177 // FIXME: We should instead consider the pointer to have escaped if this
1178 // function is being instrumented for addressing bugs or race conditions.
1179 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
1180 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1181 << Offset << " which extends past the end of the "
1182 << AllocSize << " byte alloca:\n"
1183 << " alloca: " << AS.AI << "\n"
1184 << " use: " << SI << "\n");
1185 return markAsDead(SI);
1186 }
1187
1188 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1189 "All simple FCA stores should have been pre-split");
1190 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
1191 }
1192
1193 void visitMemSetInst(MemSetInst &II) {
1194 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1195 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1196 if ((Length && Length->getValue() == 0) ||
1197 (IsOffsetKnown && Offset.uge(AllocSize)))
1198 // Zero-length memset intrinsics can be ignored entirely.
1199 return markAsDead(II);
1200
1201 if (!IsOffsetKnown)
1202 return PI.setAborted(&II);
1203
1204 insertUse(II, Offset,
1205 Length ? Length->getLimitedValue()
1206 : AllocSize - Offset.getLimitedValue(),
1207 (bool)Length);
1208 }
1209
1210 void visitMemTransferInst(MemTransferInst &II) {
1211 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1212 if (Length && Length->getValue() == 0)
1213 // Zero-length mem transfer intrinsics can be ignored entirely.
1214 return markAsDead(II);
1215
1216 // Because we can visit these intrinsics twice, also check to see if the
1217 // first time marked this instruction as dead. If so, skip it.
1218 if (VisitedDeadInsts.count(&II))
1219 return;
1220
1221 if (!IsOffsetKnown)
1222 return PI.setAborted(&II);
1223
1224 // This side of the transfer is completely out-of-bounds, and so we can
1225 // nuke the entire transfer. However, we also need to nuke the other side
1226 // if already added to our partitions.
1227 // FIXME: Yet another place we really should bypass this when
1228 // instrumenting for ASan.
1229 if (Offset.uge(AllocSize)) {
1230 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1231 MemTransferSliceMap.find(&II);
1232 if (MTPI != MemTransferSliceMap.end())
1233 AS.Slices[MTPI->second].kill();
1234 return markAsDead(II);
1235 }
1236
1237 uint64_t RawOffset = Offset.getLimitedValue();
1238 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1239
1240 // Check for the special case where the same exact value is used for both
1241 // source and dest.
1242 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1243 // For non-volatile transfers this is a no-op.
1244 if (!II.isVolatile())
1245 return markAsDead(II);
1246
1247 return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1248 }
1249
1250 // If we have seen both source and destination for a mem transfer, then
1251 // they both point to the same alloca.
1252 bool Inserted;
1253 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1254 std::tie(MTPI, Inserted) =
1255 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1256 unsigned PrevIdx = MTPI->second;
1257 if (!Inserted) {
1258 Slice &PrevP = AS.Slices[PrevIdx];
1259
1260 // Check if the begin offsets match and this is a non-volatile transfer.
1261 // In that case, we can completely elide the transfer.
1262 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1263 PrevP.kill();
1264 return markAsDead(II);
1265 }
1266
1267 // Otherwise we have an offset transfer within the same alloca. We can't
1268 // split those.
1269 PrevP.makeUnsplittable();
1270 }
1271
1272 // Insert the use now that we've fixed up the splittable nature.
1273 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1274
1275 // Check that we ended up with a valid index in the map.
1276 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1277 "Map index doesn't point back to a slice with this user.");
1278 }
1279
1280 // Disable SROA for any intrinsics except for lifetime invariants.
1281 // FIXME: What about debug intrinsics? This matches old behavior, but
1282 // doesn't make sense.
1283 void visitIntrinsicInst(IntrinsicInst &II) {
1284 if (II.isDroppable()) {
1285 AS.DeadUseIfPromotable.push_back(U);
1286 return;
1287 }
1288
1289 if (!IsOffsetKnown)
1290 return PI.setAborted(&II);
1291
1292 if (II.isLifetimeStartOrEnd()) {
1293 insertUse(II, Offset, AllocSize, true);
1294 return;
1295 }
1296
1297 if (II.getIntrinsicID() == Intrinsic::protected_field_ptr) {
1298 // We only handle loads and stores as users of llvm.protected.field.ptr.
1299 // Other uses may add items to the worklist, which will cause
1300 // ProtectedFieldDisc to be tracked incorrectly.
1301 AS.PFPUsers.push_back(&II);
1302 ProtectedFieldDisc = II.getArgOperand(1);
1303 for (Use &U : II.uses()) {
1304 this->U = &U;
1305 if (auto *LI = dyn_cast<LoadInst>(U.getUser()))
1306 visitLoadInst(*LI);
1307 else if (auto *SI = dyn_cast<StoreInst>(U.getUser()))
1308 visitStoreInst(*SI);
1309 else
1310 PI.setAborted(&II);
1311 if (PI.isAborted())
1312 break;
1313 }
1314 ProtectedFieldDisc = nullptr;
1315 return;
1316 }
1317
1318 Base::visitIntrinsicInst(II);
1319 }
1320
1321 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1322 // We consider any PHI or select that results in a direct load or store of
1323 // the same offset to be a viable use for slicing purposes. These uses
1324 // are considered unsplittable and the size is the maximum loaded or stored
1325 // size.
1326 SmallPtrSet<Instruction *, 4> Visited;
1327 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1328 Visited.insert(Root);
1329 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1330 const DataLayout &DL = Root->getDataLayout();
1331 // If there are no loads or stores, the access is dead. We mark that as
1332 // a size zero access.
1333 Size = 0;
1334 do {
1335 Instruction *I, *UsedI;
1336 std::tie(UsedI, I) = Uses.pop_back_val();
1337
1338 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1339 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1340 if (LoadSize.isScalable()) {
1341 PI.setAborted(LI);
1342 return nullptr;
1343 }
1344 Size = std::max(Size, LoadSize.getFixedValue());
1345 continue;
1346 }
1347 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1348 Value *Op = SI->getOperand(0);
1349 if (Op == UsedI)
1350 return SI;
1351 TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1352 if (StoreSize.isScalable()) {
1353 PI.setAborted(SI);
1354 return nullptr;
1355 }
1356 Size = std::max(Size, StoreSize.getFixedValue());
1357 continue;
1358 }
1359
1360 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1361 if (!GEP->hasAllZeroIndices())
1362 return GEP;
1363 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1364 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1365 return I;
1366 }
1367
1368 for (User *U : I->users())
1369 if (Visited.insert(cast<Instruction>(U)).second)
1370 Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1371 } while (!Uses.empty());
1372
1373 return nullptr;
1374 }
1375
1376 void visitPHINodeOrSelectInst(Instruction &I) {
1377 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1378 if (I.use_empty())
1379 return markAsDead(I);
1380
1381 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1382 // instructions in this BB, which may be required during rewriting. Bail out
1383 // on these cases.
1384 if (isa<PHINode>(I) &&
1385 I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1386 return PI.setAborted(&I);
1387
1388 // TODO: We could use simplifyInstruction here to fold PHINodes and
1389 // SelectInsts. However, doing so requires to change the current
1390 // dead-operand-tracking mechanism. For instance, suppose neither loading
1391 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1392 // trap either. However, if we simply replace %U with undef using the
1393 // current dead-operand-tracking mechanism, "load (select undef, undef,
1394 // %other)" may trap because the select may return the first operand
1395 // "undef".
1396 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1397 if (Result == *U)
1398 // If the result of the constant fold will be the pointer, recurse
1399 // through the PHI/select as if we had RAUW'ed it.
1400 enqueueUsers(I);
1401 else
1402 // Otherwise the operand to the PHI/select is dead, and we can replace
1403 // it with poison.
1404 AS.DeadOperands.push_back(U);
1405
1406 return;
1407 }
1408
1409 if (!IsOffsetKnown)
1410 return PI.setAborted(&I);
1411
1412 // See if we already have computed info on this node.
1413 uint64_t &Size = PHIOrSelectSizes[&I];
1414 if (!Size) {
1415 // This is a new PHI/Select, check for an unsafe use of it.
1416 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1417 return PI.setAborted(UnsafeI);
1418 }
1419
1420 // For PHI and select operands outside the alloca, we can't nuke the entire
1421 // phi or select -- the other side might still be relevant, so we special
1422 // case them here and use a separate structure to track the operands
1423 // themselves which should be replaced with poison.
1424 // FIXME: This should instead be escaped in the event we're instrumenting
1425 // for address sanitization.
1426 if (Offset.uge(AllocSize)) {
1427 AS.DeadOperands.push_back(U);
1428 return;
1429 }
1430
1431 insertUse(I, Offset, Size);
1432 }
1433
1434 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1435
1436 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1437
1438 /// Disable SROA entirely if there are unhandled users of the alloca.
1439 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1440
1441 void visitCallBase(CallBase &CB) {
1442 // If the call operand is read-only and only does a read-only or address
1443 // capture, then we mark it as EscapedReadOnly.
1444 if (CB.isDataOperand(U) &&
1445 !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1446 CB.onlyReadsMemory(U->getOperandNo())) {
1447 PI.setEscapedReadOnly(&CB);
1448 return;
1449 }
1450
1451 Base::visitCallBase(CB);
1452 }
1453};
1454
1455AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1456 :
1457#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1458 AI(AI),
1459#endif
1460 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1461 SliceBuilder PB(DL, AI, *this);
1462 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1463 if (PtrI.isEscaped() || PtrI.isAborted()) {
1464 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1465 // possibly by just storing the PtrInfo in the AllocaSlices.
1466 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1467 : PtrI.getAbortingInst();
1468 assert(PointerEscapingInstr && "Did not track a bad instruction");
1469 return;
1470 }
1471 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1472
1473 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1474
1475 // Sort the uses. This arranges for the offsets to be in ascending order,
1476 // and the sizes to be in descending order.
1477 llvm::stable_sort(Slices);
1478}
1479
1480#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1481
1482void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1483 StringRef Indent) const {
1484 printSlice(OS, I, Indent);
1485 OS << "\n";
1486 printUse(OS, I, Indent);
1487}
1488
1489void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1490 StringRef Indent) const {
1491 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1492 << " slice #" << (I - begin())
1493 << (I->isSplittable() ? " (splittable)" : "");
1494}
1495
1496void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1497 StringRef Indent) const {
1498 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1499}
1500
1501void AllocaSlices::print(raw_ostream &OS) const {
1502 if (PointerEscapingInstr) {
1503 OS << "Can't analyze slices for alloca: " << AI << "\n"
1504 << " A pointer to this alloca escaped by:\n"
1505 << " " << *PointerEscapingInstr << "\n";
1506 return;
1507 }
1508
1509 if (PointerEscapingInstrReadOnly)
1510 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1511
1512 OS << "Slices of alloca: " << AI << "\n";
1513 for (const_iterator I = begin(), E = end(); I != E; ++I)
1514 print(OS, I);
1515}
1516
1517LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1518 print(dbgs(), I);
1519}
1520LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1521
1522#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1523
1524/// Walk the range of a partitioning looking for a common type to cover this
1525/// sequence of slices.
1526static std::pair<Type *, IntegerType *>
1527findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1528 uint64_t EndOffset) {
1529 Type *Ty = nullptr;
1530 bool TyIsCommon = true;
1531 IntegerType *ITy = nullptr;
1532
1533 // Note that we need to look at *every* alloca slice's Use to ensure we
1534 // always get consistent results regardless of the order of slices.
1535 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1536 Use *U = I->getUse();
1537 if (isa<IntrinsicInst>(*U->getUser()))
1538 continue;
1539 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1540 continue;
1541
1542 Type *UserTy = nullptr;
1543 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1544 UserTy = LI->getType();
1545 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1546 UserTy = SI->getValueOperand()->getType();
1547 }
1548
1549 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1550 // If the type is larger than the partition, skip it. We only encounter
1551 // this for split integer operations where we want to use the type of the
1552 // entity causing the split. Also skip if the type is not a byte width
1553 // multiple.
1554 if (UserITy->getBitWidth() % 8 != 0 ||
1555 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1556 continue;
1557
1558 // Track the largest bitwidth integer type used in this way in case there
1559 // is no common type.
1560 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1561 ITy = UserITy;
1562 }
1563
1564 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1565 // depend on types skipped above.
1566 if (!UserTy || (Ty && Ty != UserTy))
1567 TyIsCommon = false; // Give up on anything but an iN type.
1568 else
1569 Ty = UserTy;
1570 }
1571
1572 return {TyIsCommon ? Ty : nullptr, ITy};
1573}
1574
1575/// PHI instructions that use an alloca and are subsequently loaded can be
1576/// rewritten to load both input pointers in the pred blocks and then PHI the
1577/// results, allowing the load of the alloca to be promoted.
1578/// From this:
1579/// %P2 = phi [i32* %Alloca, i32* %Other]
1580/// %V = load i32* %P2
1581/// to:
1582/// %V1 = load i32* %Alloca -> will be mem2reg'd
1583/// ...
1584/// %V2 = load i32* %Other
1585/// ...
1586/// %V = phi [i32 %V1, i32 %V2]
1587///
1588/// We can do this to a select if its only uses are loads and if the operands
1589/// to the select can be loaded unconditionally.
1590///
1591/// FIXME: This should be hoisted into a generic utility, likely in
1592/// Transforms/Util/Local.h
1593 static bool isSafePHIToSpeculate(PHINode &PN) {
1594 const DataLayout &DL = PN.getDataLayout();
1595
1596 // For now, we can only do this promotion if the load is in the same block
1597 // as the PHI, and if there are no stores between the phi and load.
1598 // TODO: Allow recursive phi users.
1599 // TODO: Allow stores.
1600 BasicBlock *BB = PN.getParent();
1601 Align MaxAlign;
1602 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1603 Type *LoadType = nullptr;
1604 for (User *U : PN.users()) {
1605 LoadInst *LI = dyn_cast<LoadInst>(U);
1606 if (!LI || !LI->isSimple())
1607 return false;
1608
1609 // For now we only allow loads in the same block as the PHI. This is
1610 // a common case that happens when instcombine merges two loads through
1611 // a PHI.
1612 if (LI->getParent() != BB)
1613 return false;
1614
1615 if (LoadType) {
1616 if (LoadType != LI->getType())
1617 return false;
1618 } else {
1619 LoadType = LI->getType();
1620 }
1621
1622 // Ensure that there are no instructions between the PHI and the load that
1623 // could store.
1624 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1625 if (BBI->mayWriteToMemory())
1626 return false;
1627
1628 MaxAlign = std::max(MaxAlign, LI->getAlign());
1629 }
1630
1631 if (!LoadType)
1632 return false;
1633
1634 APInt LoadSize =
1635 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1636
1637 // We can only transform this if it is safe to push the loads into the
1638 // predecessor blocks. The only thing to watch out for is that we can't put
1639 // a possibly trapping load in the predecessor if it is a critical edge.
1640 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1641 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1642 Value *InVal = PN.getIncomingValue(Idx);
1643
1644 // If the value is produced by the terminator of the predecessor (an
1645 // invoke) or it has side-effects, there is no valid place to put a load
1646 // in the predecessor.
1647 if (TI == InVal || TI->mayHaveSideEffects())
1648 return false;
1649
1650 // If the predecessor has a single successor, then the edge isn't
1651 // critical.
1652 if (TI->getNumSuccessors() == 1)
1653 continue;
1654
1655 // If this pointer is always safe to load, or if we can prove that there
1656 // is already a load in the block, then we can move the load to the pred
1657 // block.
1658 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1659 continue;
1660
1661 return false;
1662 }
1663
1664 return true;
1665}
1666
1667static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1668 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1669
1670 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1671 Type *LoadTy = SomeLoad->getType();
1672 IRB.SetInsertPoint(&PN);
1673 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1674 PN.getName() + ".sroa.speculated");
1675
1676 // Get the AA tags and alignment to use from one of the loads. It does not
1677 // matter which one we get and if any differ.
1678 AAMDNodes AATags = SomeLoad->getAAMetadata();
1679 Align Alignment = SomeLoad->getAlign();
1680
1681 // Rewrite all loads of the PN to use the new PHI.
1682 while (!PN.use_empty()) {
1683 LoadInst *LI = cast<LoadInst>(PN.user_back());
1684 LI->replaceAllUsesWith(NewPN);
1685 LI->eraseFromParent();
1686 }
1687
1688 // Inject loads into all of the pred blocks.
1689 DenseMap<BasicBlock *, Value *> InjectedLoads;
1690 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1691 BasicBlock *Pred = PN.getIncomingBlock(Idx);
1692 Value *InVal = PN.getIncomingValue(Idx);
1693
1694 // A PHI node is allowed to have multiple (duplicated) entries for the same
1695 // basic block, as long as the value is the same. So if we already injected
1696 // a load in the predecessor, then we should reuse the same load for all
1697 // duplicated entries.
1698 if (Value *V = InjectedLoads.lookup(Pred)) {
1699 NewPN->addIncoming(V, Pred);
1700 continue;
1701 }
1702
1703 Instruction *TI = Pred->getTerminator();
1704 IRB.SetInsertPoint(TI);
1705
1706 LoadInst *Load = IRB.CreateAlignedLoad(
1707 LoadTy, InVal, Alignment,
1708 (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1709 ++NumLoadsSpeculated;
1710 if (AATags)
1711 Load->setAAMetadata(AATags);
1712 NewPN->addIncoming(Load, Pred);
1713 InjectedLoads[Pred] = Load;
1714 }
1715
1716 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1717 PN.eraseFromParent();
1718}
1719
1720SelectHandSpeculativity &
1721SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1722 if (isTrueVal)
1723 Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1724 else
1725 Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1726 return *this;
1727}
1728
1729bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1730 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1731 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1732}
1733
1734bool SelectHandSpeculativity::areAllSpeculatable() const {
1735 return isSpeculatable(/*isTrueVal=*/true) &&
1736 isSpeculatable(/*isTrueVal=*/false);
1737}
1738
1739bool SelectHandSpeculativity::areAnySpeculatable() const {
1740 return isSpeculatable(/*isTrueVal=*/true) ||
1741 isSpeculatable(/*isTrueVal=*/false);
1742}
1743bool SelectHandSpeculativity::areNoneSpeculatable() const {
1744 return !areAnySpeculatable();
1745}
1746
1747static SelectHandSpeculativity
1748 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1749 assert(LI.isSimple() && "Only for simple loads");
1750 SelectHandSpeculativity Spec;
1751
1752 const DataLayout &DL = SI.getDataLayout();
1753 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1754 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1755 &LI))
1756 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1757 else if (PreserveCFG)
1758 return Spec;
1759
1760 return Spec;
1761}
1762
1763std::optional<RewriteableMemOps>
1764SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1765 RewriteableMemOps Ops;
1766
1767 for (User *U : SI.users()) {
1768 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1769 U = *BC->user_begin();
1770
1771 if (auto *Store = dyn_cast<StoreInst>(U)) {
1772 // Note that atomic stores can be transformed; atomic semantics do not
1773 // have any meaning for a local alloca. Stores are not speculatable,
1774 // however, so if we can't turn it into a predicated store, we are done.
1775 if (Store->isVolatile() || PreserveCFG)
1776 return {}; // Give up on this `select`.
1777 Ops.emplace_back(Store);
1778 continue;
1779 }
1780
1781 auto *LI = dyn_cast<LoadInst>(U);
1782
1783 // Note that atomic loads can be transformed;
1784 // atomic semantics do not have any meaning for a local alloca.
1785 if (!LI || LI->isVolatile())
1786 return {}; // Give up on this `select`.
1787
1788 PossiblySpeculatableLoad Load(LI);
1789 if (!LI->isSimple()) {
1790 // If the `load` is not simple, we can't speculatively execute it,
1791 // but we could handle this via a CFG modification. But can we?
1792 if (PreserveCFG)
1793 return {}; // Give up on this `select`.
1794 Ops.emplace_back(Load);
1795 continue;
1796 }
1797
1798 SelectHandSpeculativity Spec =
1799 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1800 if (PreserveCFG && !Spec.areAllSpeculatable())
1801 return {}; // Give up on this `select`.
1802
1803 Load.setInt(Spec);
1804 Ops.emplace_back(Load);
1805 }
1806
1807 return Ops;
1808}
1809
1810 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1811 IRBuilderTy &IRB) {
1812 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1813
1814 Value *TV = SI.getTrueValue();
1815 Value *FV = SI.getFalseValue();
1816 // Replace the given load of the select with a select of two loads.
1817
1818 assert(LI.isSimple() && "We only speculate simple loads");
1819
1820 IRB.SetInsertPoint(&LI);
1821
1822 LoadInst *TL =
1823 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1824 LI.getName() + ".sroa.speculate.load.true");
1825 LoadInst *FL =
1826 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1827 LI.getName() + ".sroa.speculate.load.false");
1828 NumLoadsSpeculated += 2;
1829
1830 // Transfer alignment and AA info if present.
1831 TL->setAlignment(LI.getAlign());
1832 FL->setAlignment(LI.getAlign());
1833
1834 AAMDNodes Tags = LI.getAAMetadata();
1835 if (Tags) {
1836 TL->setAAMetadata(Tags);
1837 FL->setAAMetadata(Tags);
1838 }
1839
1840 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1841 LI.getName() + ".sroa.speculated",
1842 ProfcheckDisableMetadataFixes ? nullptr : &SI);
1843
1844 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1845 LI.replaceAllUsesWith(V);
1846}
1847
1848template <typename T>
1849 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1850 SelectHandSpeculativity Spec,
1851 DomTreeUpdater &DTU) {
1852 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1853 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1854 BasicBlock *Head = I.getParent();
1855 Instruction *ThenTerm = nullptr;
1856 Instruction *ElseTerm = nullptr;
1857 if (Spec.areNoneSpeculatable())
1858 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1859 SI.getMetadata(LLVMContext::MD_prof), &DTU);
1860 else {
1861 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1862 SI.getMetadata(LLVMContext::MD_prof), &DTU,
1863 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1864 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1865 cast<BranchInst>(Head->getTerminator())->swapSuccessors();
1866 }
1867 auto *HeadBI = cast<BranchInst>(Head->getTerminator());
1868 Spec = {}; // Do not use `Spec` beyond this point.
1869 BasicBlock *Tail = I.getParent();
1870 Tail->setName(Head->getName() + ".cont");
1871 PHINode *PN;
1872 if (isa<LoadInst>(I))
1873 PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1874 for (BasicBlock *SuccBB : successors(Head)) {
1875 bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1876 int SuccIdx = IsThen ? 0 : 1;
1877 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1878 auto &CondMemOp = cast<T>(*I.clone());
1879 if (NewMemOpBB != Head) {
1880 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1881 if (isa<LoadInst>(I))
1882 ++NumLoadsPredicated;
1883 else
1884 ++NumStoresPredicated;
1885 } else {
1886 CondMemOp.dropUBImplyingAttrsAndMetadata();
1887 ++NumLoadsSpeculated;
1888 }
1889 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1890 Value *Ptr = SI.getOperand(1 + SuccIdx);
1891 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1892 if (isa<LoadInst>(I)) {
1893 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1894 PN->addIncoming(&CondMemOp, NewMemOpBB);
1895 } else
1896 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1897 }
1898 if (isa<LoadInst>(I)) {
1899 PN->takeName(&I);
1900 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1901 I.replaceAllUsesWith(PN);
1902 }
1903}
1904
1905 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1906 SelectHandSpeculativity Spec,
1907 DomTreeUpdater &DTU) {
1908 if (auto *LI = dyn_cast<LoadInst>(&I))
1909 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1910 else if (auto *SI = dyn_cast<StoreInst>(&I))
1911 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1912 else
1913 llvm_unreachable_internal("Only for load and store.");
1914}
1915
1916 static bool rewriteSelectInstMemOps(SelectInst &SI,
1917 const RewriteableMemOps &Ops,
1918 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1919 bool CFGChanged = false;
1920 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1921
1922 for (const RewriteableMemOp &Op : Ops) {
1923 SelectHandSpeculativity Spec;
1924 Instruction *I;
1925 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1926 I = *US;
1927 } else {
1928 auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1929 I = PSL.getPointer();
1930 Spec = PSL.getInt();
1931 }
1932 if (Spec.areAllSpeculatable()) {
1933 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1934 } else {
1935 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1936 rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1937 CFGChanged = true;
1938 }
1939 I->eraseFromParent();
1940 }
1941
1942 for (User *U : make_early_inc_range(SI.users()))
1943 cast<BitCastInst>(U)->eraseFromParent();
1944 SI.eraseFromParent();
1945 return CFGChanged;
1946}
1947
1948/// Compute an adjusted pointer from Ptr by Offset bytes where the
1949/// resulting pointer has PointerTy.
1950static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1951 APInt Offset, Type *PointerTy,
1952 const Twine &NamePrefix) {
1953 if (Offset != 0)
1954 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1955 NamePrefix + "sroa_idx");
1956 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1957 NamePrefix + "sroa_cast");
1958}
1959
1960/// Compute the adjusted alignment for a load or store from an offset.
1961 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1962 return commonAlignment(getLoadStoreAlignment(I), Offset);
1963 }
1964
1965/// Test whether we can convert a value from the old to the new type.
1966///
1967/// This predicate should be used to guard calls to convertValue in order to
1968/// ensure that we only try to convert viable values. The strategy is that we
1969/// will peel off single element struct and array wrappings to get to an
1970/// underlying value, and convert that value.
1971static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1972 unsigned VScale = 0) {
1973 if (OldTy == NewTy)
1974 return true;
1975
1976 // For integer types, we can't handle any bit-width differences. This would
1977 // break both vector conversions with extension and introduce endianness
1978 // issues when in conjunction with loads and stores.
1979 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1980 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1981 cast<IntegerType>(NewTy)->getBitWidth() &&
1982 "We can't have the same bitwidth for different int types");
1983 return false;
1984 }
1985
1986 TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1987 TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1988
1989 if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1990 (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1991 // Conversion is only possible when the size of scalable vectors is known.
1992 if (!VScale)
1993 return false;
1994
1995 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1996 // a single domain (either fixed or scalable). Any additional conversion
1997 // between fixed and scalable types is handled through integer types.
1998 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1999 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
2000
2001 if (isa<ScalableVectorType>(NewTy)) {
2003 return false;
2004
2005 NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
2006 } else {
2008 return false;
2009
2010 OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
2011 }
2012 }
2013
2014 if (NewSize != OldSize)
2015 return false;
2016 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
2017 return false;
2018
2019 // We can convert pointers to integers and vice-versa. Same for vectors
2020 // of pointers and integers.
2021 OldTy = OldTy->getScalarType();
2022 NewTy = NewTy->getScalarType();
2023 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
2024 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
2025 unsigned OldAS = OldTy->getPointerAddressSpace();
2026 unsigned NewAS = NewTy->getPointerAddressSpace();
2027 // Convert pointers if they are pointers from the same address space or
2028 // different integral (not non-integral) address spaces with the same
2029 // pointer size.
2030 return OldAS == NewAS ||
2031 (!DL.isNonIntegralAddressSpace(OldAS) &&
2032 !DL.isNonIntegralAddressSpace(NewAS) &&
2033 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2034 }
2035
2036 // We can convert integers to integral pointers, but not to non-integral
2037 // pointers.
2038 if (OldTy->isIntegerTy())
2039 return !DL.isNonIntegralPointerType(NewTy);
2040
2041 // We can convert integral pointers to integers, but non-integral pointers
2042 // need to remain pointers.
2043 if (!DL.isNonIntegralPointerType(OldTy))
2044 return NewTy->isIntegerTy();
2045
2046 return false;
2047 }
2048
2049 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2050 return false;
2051
2052 return true;
2053}
2054
2055/// Generic routine to convert an SSA value to a value of a different
2056/// type.
2057///
2058/// This will try various different casting techniques, such as bitcasts,
2059/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
2060/// two types for viability with this routine.
2061static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2062 Type *NewTy) {
2063 Type *OldTy = V->getType();
2064
2065#ifndef NDEBUG
2066 BasicBlock *BB = IRB.GetInsertBlock();
2067 assert(BB && BB->getParent() && "VScale unknown!");
2068 unsigned VScale = BB->getParent()->getVScaleValue();
2069 assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
2070 "Value not convertable to type");
2071#endif
2072
2073 if (OldTy == NewTy)
2074 return V;
2075
2076 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2077 "Integer types must be the exact same to convert.");
2078
2079 // A variant of bitcast that supports a mixture of fixed and scalable types
2080 // that are known to have the same size.
2081 auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
2082 Type *InTy = In->getType();
2083 if (InTy == Ty)
2084 return In;
2085
2087 // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
2088 // <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
2090 return IRB.CreateBitCast(IRB.CreateInsertVector(VTy,
2091 PoisonValue::get(VTy), In,
2092 IRB.getInt64(0)),
2093 Ty);
2094 }
2095
2097 // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
2098 // <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
2100 return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy),
2101 IRB.getInt64(0));
2102 }
2103
2104 return IRB.CreateBitCast(In, Ty);
2105 };
2106
2107 // See if we need inttoptr for this type pair. May require additional bitcast.
2108 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2109 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2110 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2111 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2112 // Directly handle i64 to i8*
2113 return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
2114 NewTy);
2115 }
2116
2117 // See if we need ptrtoint for this type pair. May require additional bitcast.
2118 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2119 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2120 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2121 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2122 // Expand i8* to i64 --> i8* to i64 to i64
2123 return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2124 NewTy);
2125 }
2126
2127 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2128 unsigned OldAS = OldTy->getPointerAddressSpace();
2129 unsigned NewAS = NewTy->getPointerAddressSpace();
2130 // To convert pointers between different address spaces (they have already
2131 // been checked to be convertible, i.e. they have the same pointer size),
2132 // we cannot use `bitcast` (which requires the same address space) or
2133 // `addrspacecast` (which is not always a no-op cast). Instead, use a pair
2134 // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
2135 // size.
2136 if (OldAS != NewAS) {
2137 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2138 return IRB.CreateIntToPtr(
2139 CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2140 DL.getIntPtrType(NewTy)),
2141 NewTy);
2142 }
2143 }
2144
2145 return CreateBitCastLike(V, NewTy);
2146}
2147
2148/// Test whether the given slice use can be promoted to a vector.
2149///
2150/// This function is called to test each entry in a partition which is slated
2151/// for a single slice.
2152static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2153 VectorType *Ty,
2154 uint64_t ElementSize,
2155 const DataLayout &DL,
2156 unsigned VScale) {
2157 // First validate the slice offsets.
2158 uint64_t BeginOffset =
2159 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2160 uint64_t BeginIndex = BeginOffset / ElementSize;
2161 if (BeginIndex * ElementSize != BeginOffset ||
2162 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2163 return false;
2164 uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2165 uint64_t EndIndex = EndOffset / ElementSize;
2166 if (EndIndex * ElementSize != EndOffset ||
2167 EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2168 return false;
2169
2170 assert(EndIndex > BeginIndex && "Empty vector!");
2171 uint64_t NumElements = EndIndex - BeginIndex;
2172 Type *SliceTy = (NumElements == 1)
2173 ? Ty->getElementType()
2174 : FixedVectorType::get(Ty->getElementType(), NumElements);
2175
2176 Type *SplitIntTy =
2177 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2178
2179 Use *U = S.getUse();
2180
2181 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2182 if (MI->isVolatile())
2183 return false;
2184 if (!S.isSplittable())
2185 return false; // Skip any unsplittable intrinsics.
2186 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2187 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2188 return false;
2189 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2190 if (LI->isVolatile())
2191 return false;
2192 Type *LTy = LI->getType();
2193 // Disable vector promotion when there are loads or stores of an FCA.
2194 if (LTy->isStructTy())
2195 return false;
2196 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2197 assert(LTy->isIntegerTy());
2198 LTy = SplitIntTy;
2199 }
2200 if (!canConvertValue(DL, SliceTy, LTy, VScale))
2201 return false;
2202 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2203 if (SI->isVolatile())
2204 return false;
2205 Type *STy = SI->getValueOperand()->getType();
2206 // Disable vector promotion when there are loads or stores of an FCA.
2207 if (STy->isStructTy())
2208 return false;
2209 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2210 assert(STy->isIntegerTy());
2211 STy = SplitIntTy;
2212 }
2213 if (!canConvertValue(DL, STy, SliceTy, VScale))
2214 return false;
2215 } else {
2216 return false;
2217 }
2218
2219 return true;
2220}
2221
2222/// Test whether any vector type in \p CandidateTys is viable for promotion.
2223///
2224/// This implements the necessary checking for \c isVectorPromotionViable over
2225/// all slices of the alloca for the given VectorType.
2226static VectorType *
2227 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2228 SmallVectorImpl<VectorType *> &CandidateTys,
2229 bool HaveCommonEltTy, Type *CommonEltTy,
2230 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2231 VectorType *CommonVecPtrTy, unsigned VScale) {
2232 // If we didn't find a vector type, nothing to do here.
2233 if (CandidateTys.empty())
2234 return nullptr;
2235
2236 // Pointer-ness is sticky: if we had a vector-of-pointers candidate type,
2237 // then we should choose it, not some other alternative.
2238 // But, we can't perform a no-op pointer address space change via bitcast,
2239 // so if we didn't have a common pointer element type, bail.
2240 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2241 return nullptr;
2242
2243 // Try to pick the "best" element type out of the choices.
2244 if (!HaveCommonEltTy && HaveVecPtrTy) {
2245 // If there was a pointer element type, there's really only one choice.
2246 CandidateTys.clear();
2247 CandidateTys.push_back(CommonVecPtrTy);
2248 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2249 // Integer-ify vector types.
2250 for (VectorType *&VTy : CandidateTys) {
2251 if (!VTy->getElementType()->isIntegerTy())
2252 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2253 VTy->getContext(), VTy->getScalarSizeInBits())));
2254 }
2255
2256 // Rank the remaining candidate vector types. This is easy because we know
2257 // they're all integer vectors. We sort by ascending number of elements.
2258 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2259 (void)DL;
2260 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2261 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2262 "Cannot have vector types of different sizes!");
2263 assert(RHSTy->getElementType()->isIntegerTy() &&
2264 "All non-integer types eliminated!");
2265 assert(LHSTy->getElementType()->isIntegerTy() &&
2266 "All non-integer types eliminated!");
2267 return cast<FixedVectorType>(RHSTy)->getNumElements() <
2268 cast<FixedVectorType>(LHSTy)->getNumElements();
2269 };
2270 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2271 (void)DL;
2272 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2273 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2274 "Cannot have vector types of different sizes!");
2275 assert(RHSTy->getElementType()->isIntegerTy() &&
2276 "All non-integer types eliminated!");
2277 assert(LHSTy->getElementType()->isIntegerTy() &&
2278 "All non-integer types eliminated!");
2279 return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2280 cast<FixedVectorType>(LHSTy)->getNumElements();
2281 };
2282 llvm::sort(CandidateTys, RankVectorTypesComp);
2283 CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2284 CandidateTys.end());
2285 } else {
2286// The only way to have the same element type in every vector type is to
2287// have the same vector type. Check that and remove all but one.
2288#ifndef NDEBUG
2289 for (VectorType *VTy : CandidateTys) {
2290 assert(VTy->getElementType() == CommonEltTy &&
2291 "Unaccounted for element type!");
2292 assert(VTy == CandidateTys[0] &&
2293 "Different vector types with the same element type!");
2294 }
2295#endif
2296 CandidateTys.resize(1);
2297 }
2298
2299 // FIXME: hack. Do we have a named constant for this?
2300 // SDAG SDNode can't have more than 65535 operands.
2301 llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2302 return cast<FixedVectorType>(VTy)->getNumElements() >
2303 std::numeric_limits<unsigned short>::max();
2304 });
2305
2306 // Find a vector type viable for promotion by iterating over all slices.
2307 auto *VTy = llvm::find_if(CandidateTys, [&](VectorType *VTy) -> bool {
2308 uint64_t ElementSize =
2309 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2310
2311 // While LLVM vectors are bit-packed by definition, we don't support
2312 // element sizes that aren't byte sized.
2313 if (ElementSize % 8)
2314 return false;
2315 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2316 "vector size not a multiple of element size?");
2317 ElementSize /= 8;
2318
2319 for (const Slice &S : P)
2320 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2321 return false;
2322
2323 for (const Slice *S : P.splitSliceTails())
2324 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2325 return false;
2326
2327 return true;
2328 });
2329 return VTy != CandidateTys.end() ? *VTy : nullptr;
2330}
2331
2332 static VectorType *createAndCheckVectorTypesForPromotion(
2333 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2334 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2335 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2336 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2337 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2338 [[maybe_unused]] VectorType *OriginalElt =
2339 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2340 // Consider additional vector types where the element type size is a
2341 // multiple of load/store element size.
2342 for (Type *Ty : OtherTys) {
2343 if (!VectorType::isValidElementType(Ty))
2344 continue;
2345 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2346 // Make a copy of CandidateTys and iterate through it, because we
2347 // might append to CandidateTys in the loop.
2348 for (VectorType *const VTy : CandidateTysCopy) {
2349 // The elements in the copy should remain invariant throughout the loop
2350 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2351 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2352 unsigned ElementSize =
2353 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2354 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2355 VectorSize % TypeSize == 0) {
2356 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2357 CheckCandidateType(NewVTy);
2358 }
2359 }
2360 }
2361
2362 return checkVectorTypesForPromotion(
2363 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2364 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2365}
2366
2367/// Test whether the given alloca partitioning and range of slices can be
2368/// promoted to a vector.
2369///
2370/// This is a quick test to check whether we can rewrite a particular alloca
2371/// partition (and its newly formed alloca) into a vector alloca with only
2372/// whole-vector loads and stores such that it could be promoted to a vector
2373/// SSA value. We only can ensure this for a limited set of operations, and we
2374/// don't want to do the rewrites unless we are confident that the result will
2375/// be promotable, so we have an early test here.
2376 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2377 unsigned VScale) {
2378 // Collect the candidate types for vector-based promotion. Also track whether
2379 // we have different element types.
2380 SmallVector<VectorType *, 4> CandidateTys;
2381 SetVector<Type *> LoadStoreTys;
2382 SetVector<Type *> DeferredTys;
2383 Type *CommonEltTy = nullptr;
2384 VectorType *CommonVecPtrTy = nullptr;
2385 bool HaveVecPtrTy = false;
2386 bool HaveCommonEltTy = true;
2387 bool HaveCommonVecPtrTy = true;
2388 auto CheckCandidateType = [&](Type *Ty) {
2389 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2390 // Bail out if this vector type differs from the other candidates in its total size in bits.
2391 if (!CandidateTys.empty()) {
2392 VectorType *V = CandidateTys[0];
2393 if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2394 DL.getTypeSizeInBits(V).getFixedValue()) {
2395 CandidateTys.clear();
2396 return;
2397 }
2398 }
2399 CandidateTys.push_back(VTy);
2400 Type *EltTy = VTy->getElementType();
2401
2402 if (!CommonEltTy)
2403 CommonEltTy = EltTy;
2404 else if (CommonEltTy != EltTy)
2405 HaveCommonEltTy = false;
2406
2407 if (EltTy->isPointerTy()) {
2408 HaveVecPtrTy = true;
2409 if (!CommonVecPtrTy)
2410 CommonVecPtrTy = VTy;
2411 else if (CommonVecPtrTy != VTy)
2412 HaveCommonVecPtrTy = false;
2413 }
2414 }
2415 };
2416
2417 // Put load and store types into a set for de-duplication.
2418 for (const Slice &S : P) {
2419 Type *Ty;
2420 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2421 Ty = LI->getType();
2422 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2423 Ty = SI->getValueOperand()->getType();
2424 else
2425 continue;
2426
2427 auto CandTy = Ty->getScalarType();
2428 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2429 S.endOffset() != P.endOffset())) {
2430 DeferredTys.insert(Ty);
2431 continue;
2432 }
2433
2434 LoadStoreTys.insert(Ty);
2435 // Consider any loads or stores that are the exact size of the slice.
2436 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2437 CheckCandidateType(Ty);
2438 }
2439
2440 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2441 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2442 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2443 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2444 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2445 return VTy;
2446
2447 CandidateTys.clear();
2448 return createAndCheckVectorTypesForPromotion(
2449 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2450 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2451 CommonVecPtrTy, VScale);
2452}
2453
2454/// Test whether a slice of an alloca is valid for integer widening.
2455///
2456/// This implements the necessary checking for the \c isIntegerWideningViable
2457/// test below on a single slice of the alloca.
2458static bool isIntegerWideningViableForSlice(const Slice &S,
2459 uint64_t AllocBeginOffset,
2460 Type *AllocaTy,
2461 const DataLayout &DL,
2462 bool &WholeAllocaOp) {
2463 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2464
2465 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2466 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2467
2468 Use *U = S.getUse();
2469
2470 // Lifetime intrinsics operate over the whole alloca, whose size is usually
2471 // larger than other load/store slices (RelEnd > Size). But lifetime
2472 // intrinsics are always promotable and should not impact the promotability
2473 // of the partition's other slices.
2474 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2475 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2476 return true;
2477 }
2478
2479 // We can't reasonably handle cases where the load or store extends past
2480 // the end of the alloca's type and into its padding.
2481 if (RelEnd > Size)
2482 return false;
2483
2484 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2485 if (LI->isVolatile())
2486 return false;
2487 // We can't handle loads that extend past the allocated memory.
2488 TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2489 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2490 return false;
2491 // So far, AllocaSliceRewriter does not support widening split slice tails
2492 // in rewriteIntegerLoad.
2493 if (S.beginOffset() < AllocBeginOffset)
2494 return false;
2495 // Note that we don't count vector loads or stores as whole-alloca
2496 // operations which enable integer widening because we would prefer to use
2497 // vector widening instead.
2498 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2499 WholeAllocaOp = true;
2500 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2501 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2502 return false;
2503 } else if (RelBegin != 0 || RelEnd != Size ||
2504 !canConvertValue(DL, AllocaTy, LI->getType())) {
2505 // Non-integer loads need to be convertible from the alloca type so that
2506 // they are promotable.
2507 return false;
2508 }
2509 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2510 Type *ValueTy = SI->getValueOperand()->getType();
2511 if (SI->isVolatile())
2512 return false;
2513 // We can't handle stores that extend past the allocated memory.
2514 TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2515 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2516 return false;
2517 // So far, AllocaSliceRewriter does not support widening split slice tails
2518 // in rewriteIntegerStore.
2519 if (S.beginOffset() < AllocBeginOffset)
2520 return false;
2521 // Note that we don't count vector loads or stores as whole-alloca
2522 // operations which enable integer widening because we would prefer to use
2523 // vector widening instead.
2524 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2525 WholeAllocaOp = true;
2526 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2527 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2528 return false;
2529 } else if (RelBegin != 0 || RelEnd != Size ||
2530 !canConvertValue(DL, ValueTy, AllocaTy)) {
2531 // Non-integer stores need to be convertible to the alloca type so that
2532 // they are promotable.
2533 return false;
2534 }
2535 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2536 if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2537 return false;
2538 if (!S.isSplittable())
2539 return false; // Skip any unsplittable intrinsics.
2540 } else {
2541 return false;
2542 }
2543
2544 return true;
2545}
2546
2547/// Test whether the given alloca partition's integer operations can be
2548/// widened to promotable ones.
2549///
2550/// This is a quick test to check whether we can rewrite the integer loads and
2551/// stores to a particular alloca into wider loads and stores and be able to
2552/// promote the resulting alloca.
2553static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2554 const DataLayout &DL) {
2555 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2556 // Don't create integer types larger than the maximum bitwidth.
2557 if (SizeInBits > IntegerType::MAX_INT_BITS)
2558 return false;
2559
2560 // Don't try to handle allocas with bit-padding.
2561 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2562 return false;
2563
2564 // We need to ensure that an integer type with the appropriate bitwidth can
2565 // be converted to the alloca type, whatever that is. We don't want to force
2566 // the alloca itself to have an integer type if there is a more suitable one.
2567 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2568 if (!canConvertValue(DL, AllocaTy, IntTy) ||
2569 !canConvertValue(DL, IntTy, AllocaTy))
2570 return false;
2571
2572 // While examining uses, we ensure that the alloca has a covering load or
2573 // store. We don't want to widen the integer operations only to fail to
2574 // promote due to some other unsplittable entry (which we may make splittable
2575 // later). However, if there are only splittable uses, go ahead and assume
2576 // that we cover the alloca.
2577 // FIXME: We shouldn't consider split slices that happen to start in the
2578 // partition here...
2579 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2580
2581 for (const Slice &S : P)
2582 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2583 WholeAllocaOp))
2584 return false;
2585
2586 for (const Slice *S : P.splitSliceTails())
2587 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2588 WholeAllocaOp))
2589 return false;
2590
2591 return WholeAllocaOp;
2592}
2593
2594static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2595 IntegerType *Ty, uint64_t Offset,
2596 const Twine &Name) {
2597 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2598 IntegerType *IntTy = cast<IntegerType>(V->getType());
2599 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2600 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2601 "Element extends past full value");
2602 uint64_t ShAmt = 8 * Offset;
2603 if (DL.isBigEndian())
2604 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2605 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2606 if (ShAmt) {
2607 V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2608 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2609 }
2610 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2611 "Cannot extract to a larger integer!");
2612 if (Ty != IntTy) {
2613 V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2614 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2615 }
2616 return V;
2617}
2618
2619static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2620 Value *V, uint64_t Offset, const Twine &Name) {
2621 IntegerType *IntTy = cast<IntegerType>(Old->getType());
2622 IntegerType *Ty = cast<IntegerType>(V->getType());
2623 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2624 "Cannot insert a larger integer!");
2625 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2626 if (Ty != IntTy) {
2627 V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2628 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2629 }
2630 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2631 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2632 "Element store outside of alloca store");
2633 uint64_t ShAmt = 8 * Offset;
2634 if (DL.isBigEndian())
2635 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2636 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2637 if (ShAmt) {
2638 V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2639 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2640 }
2641
2642 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2643 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2644 Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2645 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2646 V = IRB.CreateOr(Old, V, Name + ".insert");
2647 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2648 }
2649 return V;
2650}
2651
2652static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2653 unsigned EndIndex, const Twine &Name) {
2654 auto *VecTy = cast<FixedVectorType>(V->getType());
2655 unsigned NumElements = EndIndex - BeginIndex;
2656 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2657
2658 if (NumElements == VecTy->getNumElements())
2659 return V;
2660
2661 if (NumElements == 1) {
2662 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2663 Name + ".extract");
2664 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2665 return V;
2666 }
2667
2668 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2669 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2670 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2671 return V;
2672}
2673
2674static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2675 unsigned BeginIndex, const Twine &Name) {
2676 VectorType *VecTy = cast<VectorType>(Old->getType());
2677 assert(VecTy && "Can only insert a vector into a vector");
2678
2679 VectorType *Ty = dyn_cast<VectorType>(V->getType());
2680 if (!Ty) {
2681 // Single element to insert.
2682 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2683 Name + ".insert");
2684 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2685 return V;
2686 }
2687
2690 "Too many elements!");
2693 assert(V->getType() == VecTy && "Vector type mismatch");
2694 return V;
2695 }
2696 unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2697
2698 // When inserting a smaller vector into the larger to store, we first
2699 // use a shuffle vector to widen it with undef elements, and then
2700 // a second shuffle vector to select between the loaded vector and the
2701 // incoming vector.
2702 SmallVector<int, 8> Mask;
2703 Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2704 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2705 if (i >= BeginIndex && i < EndIndex)
2706 Mask.push_back(i - BeginIndex);
2707 else
2708 Mask.push_back(-1);
2709 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2710 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2711
2712 SmallVector<Constant *, 8> Mask2;
2713 Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2714 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2715 Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2716
2717 // No profiling support for vector selects.
2718 V = IRB.CreateSelectWithUnknownProfile(ConstantVector::get(Mask2), V, Old,
2719 DEBUG_TYPE, Name + "blend");
2720
2721 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2722 return V;
2723}
2724
2725/// This function takes two vector values and combines them into a single vector
2726/// by concatenating their elements. The function handles:
2727///
2728/// 1. Element type mismatch: If either vector's element type differs from
2729/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2730/// preserving the total bit width (adjusting the number of elements
2731/// accordingly).
2732///
2733/// 2. Size mismatch: After transforming the vectors to have the desired element
2734/// type, if the two vectors have different numbers of elements, the smaller
2735/// vector is extended with poison values to match the size of the larger
2736/// vector before concatenation.
2737///
2738/// 3. Concatenation: The vectors are merged using a shuffle operation that
2739/// places all elements of V0 first, followed by all elements of V1.
2740///
2741/// \param V0 The first vector to merge (must be a vector type)
2742/// \param V1 The second vector to merge (must be a vector type)
2743/// \param DL The data layout for size calculations
2744/// \param NewAIEltTy The desired element type for the result vector
2745/// \param Builder IRBuilder for creating new instructions
2746/// \return A new vector containing all elements from V0 followed by all
2747/// elements from V1
2749 Type *NewAIEltTy, IRBuilder<> &Builder) {
2750 // V0 and V1 are vectors
2751 // Create a new vector type with combined elements
2752 // Use ShuffleVector to concatenate the vectors
2753 auto *VecType0 = cast<FixedVectorType>(V0->getType());
2754 auto *VecType1 = cast<FixedVectorType>(V1->getType());
2755
2756 // If V0/V1 element types are different from NewAllocaElementType,
2757 // we need to introduce bitcasts before merging them
2758 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2759 const char *DebugName) {
2760 Type *EltType = VecType->getElementType();
2761 if (EltType != NewAIEltTy) {
2762 // Calculate new number of elements to maintain same bit width
2763 unsigned TotalBits =
2764 VecType->getNumElements() * DL.getTypeSizeInBits(EltType);
2765 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(NewAIEltTy);
2766
2767 auto *NewVecType = FixedVectorType::get(NewAIEltTy, NewNumElts);
2768 V = Builder.CreateBitCast(V, NewVecType);
2769 VecType = NewVecType;
2770 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2771 }
2772 };
2773
2774 BitcastIfNeeded(V0, VecType0, "V0");
2775 BitcastIfNeeded(V1, VecType1, "V1");
2776
2777 unsigned NumElts0 = VecType0->getNumElements();
2778 unsigned NumElts1 = VecType1->getNumElements();
2779
2780 SmallVector<int, 16> ShuffleMask;
2781
2782 if (NumElts0 == NumElts1) {
2783 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2784 ShuffleMask.push_back(i);
2785 } else {
2786 // If two vectors have different sizes, we need to extend
2787 // the smaller vector to the size of the larger vector.
2788 unsigned SmallSize = std::min(NumElts0, NumElts1);
2789 unsigned LargeSize = std::max(NumElts0, NumElts1);
2790 bool IsV0Smaller = NumElts0 < NumElts1;
2791 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2792 SmallVector<int, 16> ExtendMask;
2793 for (unsigned i = 0; i < SmallSize; ++i)
2794 ExtendMask.push_back(i);
2795 for (unsigned i = SmallSize; i < LargeSize; ++i)
2796 ExtendMask.push_back(PoisonMaskElem);
2797 ExtendedVec = Builder.CreateShuffleVector(
2798 ExtendedVec, PoisonValue::get(ExtendedVec->getType()), ExtendMask);
2799 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2800 for (unsigned i = 0; i < NumElts0; ++i)
2801 ShuffleMask.push_back(i);
2802 for (unsigned i = 0; i < NumElts1; ++i)
2803 ShuffleMask.push_back(LargeSize + i);
2804 }
2805
2806 return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2807}
2808
2809namespace {
2810
2811 /// Visitor to rewrite instructions using a particular slice of an alloca
2812/// to use a new alloca.
2813///
2814/// Also implements the rewriting to vector-based accesses when the partition
2815/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2816/// lives here.
2817class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2818 // Befriend the base class so it can delegate to private visit methods.
2819 friend class InstVisitor<AllocaSliceRewriter, bool>;
2820
2821 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2822
2823 const DataLayout &DL;
2824 AllocaSlices &AS;
2825 SROA &Pass;
2826 AllocaInst &OldAI, &NewAI;
2827 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2828 Type *NewAllocaTy;
2829
2830 // This is a convenience and flag variable that will be null unless the new
2831 // alloca's integer operations should be widened to this integer type due to
2832 // passing isIntegerWideningViable above. If it is non-null, the desired
2833 // integer type will be stored here for easy access during rewriting.
2834 IntegerType *IntTy;
2835
2836 // If we are rewriting an alloca partition which can be written as pure
2837 // vector operations, we stash extra information here. When VecTy is
2838 // non-null, we have some strict guarantees about the rewritten alloca:
2839 // - The new alloca is exactly the size of the vector type here.
2840 // - The accesses all either map to the entire vector or to a single
2841 // element.
2842 // - The set of accessing instructions is only one of those handled above
2843 // in isVectorPromotionViable. Generally these are the same access kinds
2844 // which are promotable via mem2reg.
2845 VectorType *VecTy;
2846 Type *ElementTy;
2847 uint64_t ElementSize;
2848
2849 // The original offset of the slice currently being rewritten relative to
2850 // the original alloca.
2851 uint64_t BeginOffset = 0;
2852 uint64_t EndOffset = 0;
2853
2854 // The new offsets of the slice currently being rewritten relative to the
2855 // original alloca.
2856 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2857
2858 uint64_t SliceSize = 0;
2859 bool IsSplittable = false;
2860 bool IsSplit = false;
2861 Use *OldUse = nullptr;
2862 Instruction *OldPtr = nullptr;
2863
2864 // Track post-rewrite users which are PHI nodes and Selects.
2865 SmallSetVector<PHINode *, 8> &PHIUsers;
2866 SmallSetVector<SelectInst *, 8> &SelectUsers;
2867
2868 // Utility IR builder, whose name prefix is set up for each visited use, and
2869 // the insertion point is set to point to the user.
2870 IRBuilderTy IRB;
2871
2872 // Return the new alloca, addrspacecasted if required to avoid changing the
2873 // addrspace of a volatile access.
2874 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2875 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2876 return &NewAI;
2877
2878 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2879 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2880 }
2881
2882public:
2883 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2884 AllocaInst &OldAI, AllocaInst &NewAI,
2885 uint64_t NewAllocaBeginOffset,
2886 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2887 VectorType *PromotableVecTy,
2888 SmallSetVector<PHINode *, 8> &PHIUsers,
2889 SmallSetVector<SelectInst *, 8> &SelectUsers)
2890 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2891 NewAllocaBeginOffset(NewAllocaBeginOffset),
2892 NewAllocaEndOffset(NewAllocaEndOffset),
2893 NewAllocaTy(NewAI.getAllocatedType()),
2894 IntTy(
2895 IsIntegerPromotable
2896 ? Type::getIntNTy(NewAI.getContext(),
2897 DL.getTypeSizeInBits(NewAI.getAllocatedType())
2898 .getFixedValue())
2899 : nullptr),
2900 VecTy(PromotableVecTy),
2901 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2902 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2903 : 0),
2904 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2905 IRB(NewAI.getContext(), ConstantFolder()) {
2906 if (VecTy) {
2907 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2908 "Only multiple-of-8 sized vector elements are viable");
2909 ++NumVectorized;
2910 }
2911 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2912 }
2913
2914 bool visit(AllocaSlices::const_iterator I) {
2915 bool CanSROA = true;
2916 BeginOffset = I->beginOffset();
2917 EndOffset = I->endOffset();
2918 IsSplittable = I->isSplittable();
2919 IsSplit =
2920 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2921 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2922 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2923 LLVM_DEBUG(dbgs() << "\n");
2924
2925 // Compute the intersecting offset range.
2926 assert(BeginOffset < NewAllocaEndOffset);
2927 assert(EndOffset > NewAllocaBeginOffset);
2928 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2929 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2930
2931 SliceSize = NewEndOffset - NewBeginOffset;
2932 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2933 << ") NewBegin:(" << NewBeginOffset << ", "
2934 << NewEndOffset << ") NewAllocaBegin:("
2935 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2936 << ")\n");
2937 assert(IsSplit || NewBeginOffset == BeginOffset);
2938 OldUse = I->getUse();
2939 OldPtr = cast<Instruction>(OldUse->get());
2940
2941 Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2942 IRB.SetInsertPoint(OldUserI);
2943 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2944 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2945 Twine(BeginOffset) + ".");
2946
2947 CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2948 if (VecTy || IntTy)
2949 assert(CanSROA);
2950 return CanSROA;
2951 }
2952
2953 /// Attempts to rewrite a partition using tree-structured merge optimization.
2954 ///
2955 /// This function analyzes a partition to determine if it can be optimized
2956 /// using a tree-structured merge pattern, where multiple non-overlapping
2957 /// stores completely fill an alloca and no load from the alloca occurs in
2958 /// the middle of the stores. Such patterns can be optimized by eliminating
2959 /// the intermediate stores and directly constructing the final vector by
2960 /// using shufflevectors.
2961 ///
2962 /// Example transformation:
2963 /// Before: (stores do not have to be in order)
2964 /// %alloca = alloca <8 x float>
2965 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2966 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2967 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2968 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2969 ///
2970 /// After:
2971 /// %alloca = alloca <8 x float>
2972 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2973 /// i32 3>
2974 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2975 /// i32 3>
2976 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2977 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2978 /// store %shuffle2, ptr %alloca
2979 ///
2980 /// The optimization looks for partitions that:
2981 /// 1. Have no overlapping split slice tails
2982 /// 2. Contain non-overlapping stores that cover the entire alloca
2983 /// 3. Have exactly one load, which reads the complete alloca structure and
2984 ///    does not occur in the middle of the stores (TODO: maybe we can relax
2985 ///    the constraint about reading the entire alloca structure)
2986 ///
2987 /// \param P The partition to analyze and potentially rewrite
2988 /// \return An optional vector of values that were deleted during the rewrite
2989 /// process, or std::nullopt if the partition cannot be optimized
2990 /// using tree-structured merge
2991 std::optional<SmallVector<Value *, 4>>
2992 rewriteTreeStructuredMerge(Partition &P) {
2993 // No tail slices that overlap with the partition
2994 if (P.splitSliceTails().size() > 0)
2995 return std::nullopt;
2996
2997 SmallVector<Value *, 4> DeletedValues;
2998 LoadInst *TheLoad = nullptr;
2999
3000 // Structure to hold store information
3001 struct StoreInfo {
3002 StoreInst *Store;
3003 uint64_t BeginOffset;
3004 uint64_t EndOffset;
3005 Value *StoredValue;
3006 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
3007 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
3008 };
3009
3010 SmallVector<StoreInfo, 4> StoreInfos;
3011
3012 // If the new alloca is a fixed vector type, we use its element type as the
3013     // allocated element type; otherwise we use i8 as the allocated element type.
3014     Type *AllocatedEltTy =
3015         isa<FixedVectorType>(NewAI.getAllocatedType())
3016             ? cast<FixedVectorType>(NewAI.getAllocatedType())->getElementType()
3017             : Type::getInt8Ty(NewAI.getContext());
3018 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(AllocatedEltTy);
3019
3020     // Helper to check that a type is
3021     // 1. A fixed vector type
3022     // 2. A vector whose element type is not a pointer
3023     // 3. A vector whose element size is a whole number of bytes
3024     // We only handle loads and stores that meet these conditions.
3025 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
3026 auto *FixedVecTy = dyn_cast<FixedVectorType>(Ty);
3027 return FixedVecTy &&
3028 DL.getTypeSizeInBits(FixedVecTy->getElementType()) % 8 == 0 &&
3029 !FixedVecTy->getElementType()->isPointerTy();
3030 };
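    // For example: <4 x float> and <2 x i64> pass this check, while <4 x ptr>
    // (pointer elements) and <8 x i1> (element size not a multiple of 8 bits)
    // do not.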
3031
3032 for (Slice &S : P) {
3033 auto *User = cast<Instruction>(S.getUse()->getUser());
3034 if (auto *LI = dyn_cast<LoadInst>(User)) {
3035 // Do not handle the case if
3036 // 1. There is more than one load
3037 // 2. The load is volatile
3038 // 3. The load does not read the entire alloca structure
3039 // 4. The load does not meet the conditions in the helper function
3040 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
3041 S.beginOffset() != NewAllocaBeginOffset ||
3042 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
3043 return std::nullopt;
3044 TheLoad = LI;
3045 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
3046 // Do not handle the case if
3047 // 1. The store does not meet the conditions in the helper function
3048 // 2. The store is volatile
3049 // 3. The total store size is not a multiple of the allocated element
3050 // type size
3051 if (!IsTypeValidForTreeStructuredMerge(
3052 SI->getValueOperand()->getType()) ||
3053 SI->isVolatile())
3054 return std::nullopt;
3055 auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
3056 unsigned NumElts = VecTy->getNumElements();
3057 unsigned EltSize = DL.getTypeSizeInBits(VecTy->getElementType());
3058 if (NumElts * EltSize % AllocatedEltTySize != 0)
3059 return std::nullopt;
3060 StoreInfos.emplace_back(SI, S.beginOffset(), S.endOffset(),
3061 SI->getValueOperand());
3062 } else {
3063 // If we have instructions other than load and store, we cannot do the
3064 // tree structured merge
3065 return std::nullopt;
3066 }
3067 }
3068 // If we do not have any load, we cannot do the tree structured merge
3069 if (!TheLoad)
3070 return std::nullopt;
3071
3072 // If we do not have multiple stores, we cannot do the tree structured merge
3073 if (StoreInfos.size() < 2)
3074 return std::nullopt;
3075
3076 // Stores should not overlap and should cover the whole alloca
3077 // Sort by begin offset
3078 llvm::sort(StoreInfos, [](const StoreInfo &A, const StoreInfo &B) {
3079 return A.BeginOffset < B.BeginOffset;
3080 });
3081
3082 // Check for overlaps and coverage
3083 uint64_t ExpectedStart = NewAllocaBeginOffset;
3084 for (auto &StoreInfo : StoreInfos) {
3085 uint64_t BeginOff = StoreInfo.BeginOffset;
3086 uint64_t EndOff = StoreInfo.EndOffset;
3087
3088 // Check for gap or overlap
3089 if (BeginOff != ExpectedStart)
3090 return std::nullopt;
3091
3092 ExpectedStart = EndOff;
3093 }
3094 // Check that stores cover the entire alloca
3095 if (ExpectedStart != NewAllocaEndOffset)
3096 return std::nullopt;
3097
3098 // Stores should be in the same basic block
3099 // The load should not be in the middle of the stores
3100 // Note:
3101     // If the load is in a different basic block than the stores, we can still
3102     // do the tree-structured merge. This is because we do not perform
3103 // store->load forwarding here. The merged vector will be stored back to
3104 // NewAI and the new load will load from NewAI. The forwarding will be
3105 // handled later when we try to promote NewAI.
3106 BasicBlock *LoadBB = TheLoad->getParent();
3107 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
3108
3109 for (auto &StoreInfo : StoreInfos) {
3110 if (StoreInfo.Store->getParent() != StoreBB)
3111 return std::nullopt;
3112 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(TheLoad))
3113 return std::nullopt;
3114 }
3115
3116     // If we reach here, the partition can be rewritten with a tree-structured
3117     // merge.
3118 LLVM_DEBUG({
3119 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
3120 << "\n Ordered stores:\n";
3121 for (auto [i, Info] : enumerate(StoreInfos))
3122 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
3123 << Info.EndOffset << ") \tStore: " << *Info.Store
3124 << "\tValue: " << *Info.StoredValue << "\n";
3125 });
3126
3127 // Instead of having these stores, we merge all the stored values into a
3128 // vector and store the merged value into the alloca
3129 std::queue<Value *> VecElements;
3130 IRBuilder<> Builder(StoreInfos.back().Store);
3131 for (const auto &Info : StoreInfos) {
3132 DeletedValues.push_back(Info.Store);
3133 VecElements.push(Info.StoredValue);
3134 }
3135
3136 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3137 while (VecElements.size() > 1) {
3138 const auto NumElts = VecElements.size();
3139 for ([[maybe_unused]] const auto _ : llvm::seq(NumElts / 2)) {
3140 Value *V0 = VecElements.front();
3141 VecElements.pop();
3142 Value *V1 = VecElements.front();
3143 VecElements.pop();
3144 Value *Merged = mergeTwoVectors(V0, V1, DL, AllocatedEltTy, Builder);
3145 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3146 VecElements.push(Merged);
3147 }
3148 if (NumElts % 2 == 1) {
3149 Value *V = VecElements.front();
3150 VecElements.pop();
3151 VecElements.push(V);
3152 }
3153 }
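    // Illustrative trace (hypothetical values v0..v4, not from the source):
    // starting from [v0, v1, v2, v3, v4], round 1 produces m0 = merge(v0, v1)
    // and m1 = merge(v2, v3) and rotates v4 to the back; round 2 produces
    // m2 = merge(m0, m1) and rotates v4 again; round 3 merges (m2, v4) into
    // the single remaining value.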
3154
3155 // Store the merged value into the alloca
3156 Value *MergedValue = VecElements.front();
3157 Builder.CreateAlignedStore(MergedValue, &NewAI, getSliceAlign());
3158
3159 IRBuilder<> LoadBuilder(TheLoad);
3160 TheLoad->replaceAllUsesWith(LoadBuilder.CreateAlignedLoad(
3161 TheLoad->getType(), &NewAI, getSliceAlign(), TheLoad->isVolatile(),
3162 TheLoad->getName() + ".sroa.new.load"));
3163 DeletedValues.push_back(TheLoad);
3164
3165 return DeletedValues;
3166 }
3167
3168private:
3169 // Make sure the other visit overloads are visible.
3170 using Base::visit;
3171
3172 // Every instruction which can end up as a user must have a rewrite rule.
3173 bool visitInstruction(Instruction &I) {
3174 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3175 llvm_unreachable("No rewrite rule for this instruction!");
3176 }
3177
3178 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3179 // Note that the offset computation can use BeginOffset or NewBeginOffset
3180 // interchangeably for unsplit slices.
3181 assert(IsSplit || BeginOffset == NewBeginOffset);
3182 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3183
3184 StringRef OldName = OldPtr->getName();
3185 // Skip through the last '.sroa.' component of the name.
3186 size_t LastSROAPrefix = OldName.rfind(".sroa.");
3187 if (LastSROAPrefix != StringRef::npos) {
3188 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
3189 // Look for an SROA slice index.
3190 size_t IndexEnd = OldName.find_first_not_of("0123456789");
3191 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3192 // Strip the index and look for the offset.
3193 OldName = OldName.substr(IndexEnd + 1);
3194 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
3195 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3196 // Strip the offset.
3197 OldName = OldName.substr(OffsetEnd + 1);
3198 }
3199 }
3200 // Strip any SROA suffixes as well.
3201 OldName = OldName.substr(0, OldName.find(".sroa_"));
3202
3203 return getAdjustedPtr(IRB, DL, &NewAI,
3204 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
3205 PointerTy, Twine(OldName) + ".");
3206 }
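  // Illustrative example of the name stripping above (hypothetical names): an
  // old pointer named "x.sroa.3.16.copyload" is reduced to "copyload", so the
  // pointer built by getAdjustedPtr gets the prefix "copyload.".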
3207
3208 /// Compute suitable alignment to access this slice of the *new*
3209 /// alloca.
3210   ///
3211   /// The result is the alignment guaranteed at this slice's offset within the
3212   /// new alloca, given the new alloca's own alignment.
3213 Align getSliceAlign() {
3214 return commonAlignment(NewAI.getAlign(),
3215 NewBeginOffset - NewAllocaBeginOffset);
3216 }
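  // For example: a 16-byte-aligned new alloca accessed at slice offset 8
  // yields commonAlignment(Align(16), 8) == Align(8).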
3217
3218 unsigned getIndex(uint64_t Offset) {
3219 assert(VecTy && "Can only call getIndex when rewriting a vector");
3220 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3221 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3222 uint32_t Index = RelOffset / ElementSize;
3223 assert(Index * ElementSize == RelOffset);
3224 return Index;
3225 }
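  // For example: with VecTy == <8 x float> (ElementSize == 4) and
  // NewAllocaBeginOffset == 0, an offset of 8 maps to index 2, while an
  // offset of 10 would trip the Index * ElementSize == RelOffset assertion.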
3226
3227   void deleteIfTriviallyDead(Value *V) {
3228     Instruction *I = cast<Instruction>(V);
3229     if (isInstructionTriviallyDead(I))
3230       Pass.DeadInsts.push_back(I);
3231   }
3232
3233 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3234 unsigned BeginIndex = getIndex(NewBeginOffset);
3235 unsigned EndIndex = getIndex(NewEndOffset);
3236 assert(EndIndex > BeginIndex && "Empty vector!");
3237
3238 LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3239 NewAI.getAlign(), "load");
3240
3241 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3242 LLVMContext::MD_access_group});
3243 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
3244 }
3245
3246 Value *rewriteIntegerLoad(LoadInst &LI) {
3247 assert(IntTy && "We cannot insert an integer to the alloca");
3248 assert(!LI.isVolatile());
3249 Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3250 NewAI.getAlign(), "load");
3251 V = convertValue(DL, IRB, V, IntTy);
3252 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3253 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3254 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3255 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
3256 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
3257 }
3258 // It is possible that the extracted type is not the load type. This
3259 // happens if there is a load past the end of the alloca, and as
3260 // a consequence the slice is narrower but still a candidate for integer
3261 // lowering. To handle this case, we just zero extend the extracted
3262 // integer.
3263 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3264 "Can only handle an extract for an overly wide load");
3265 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
3266 V = IRB.CreateZExt(V, LI.getType());
3267 return V;
3268 }
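  // Worked example (hypothetical sizes): if the new alloca is an i16 (so IntTy
  // is i16) and the original load reads an i32 running past the end of the
  // alloca, SliceSize is 2, so the extracted i16 is zero-extended back to the
  // i32 the original load produced.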
3269
3270 bool visitLoadInst(LoadInst &LI) {
3271 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3272 Value *OldOp = LI.getOperand(0);
3273 assert(OldOp == OldPtr);
3274
3275 AAMDNodes AATags = LI.getAAMetadata();
3276
3277 unsigned AS = LI.getPointerAddressSpace();
3278
3279 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
3280 : LI.getType();
3281 bool IsPtrAdjusted = false;
3282 Value *V;
3283 if (VecTy) {
3284 V = rewriteVectorizedLoadInst(LI);
3285 } else if (IntTy && LI.getType()->isIntegerTy()) {
3286 V = rewriteIntegerLoad(LI);
3287 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3288 NewEndOffset == NewAllocaEndOffset &&
3289 (canConvertValue(DL, NewAllocaTy, TargetTy) ||
3290 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3291 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
3292 !LI.isVolatile()))) {
3293 Value *NewPtr =
3294 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
3295 LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
3296 NewAI.getAlign(), LI.isVolatile(),
3297 LI.getName());
3298 if (LI.isVolatile())
3299 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3300 if (NewLI->isAtomic())
3301 NewLI->setAlignment(LI.getAlign());
3302
3303 // Copy any metadata that is valid for the new load. This may require
3304 // conversion to a different kind of metadata, e.g. !nonnull might change
3305 // to !range or vice versa.
3306 copyMetadataForLoad(*NewLI, LI);
3307
3308 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3309 if (AATags)
3310 NewLI->setAAMetadata(AATags.adjustForAccess(
3311 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3312
3313 // Try to preserve nonnull metadata
3314 V = NewLI;
3315
3316 // If this is an integer load past the end of the slice (which means the
3317 // bytes outside the slice are undef or this load is dead) just forcibly
3318 // fix the integer size with correct handling of endianness.
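      // For example (illustrative): an i16 alloca read through an i32 load on
      // a big-endian target is zero-extended and then shifted left by 16 so
      // that its two defined bytes keep the lowest addresses of the result.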
3319 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
3320 if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3321 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3322 V = IRB.CreateZExt(V, TITy, "load.ext");
3323 if (DL.isBigEndian())
3324 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3325 "endian_shift");
3326 }
3327 } else {
3328 Type *LTy = IRB.getPtrTy(AS);
3329 LoadInst *NewLI =
3330 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3331 getSliceAlign(), LI.isVolatile(), LI.getName());
3332
3333 if (AATags)
3334 NewLI->setAAMetadata(AATags.adjustForAccess(
3335 NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3336
3337 if (LI.isVolatile())
3338 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3339 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3340 LLVMContext::MD_access_group});
3341
3342 V = NewLI;
3343 IsPtrAdjusted = true;
3344 }
3345 V = convertValue(DL, IRB, V, TargetTy);
3346
3347 if (IsSplit) {
3348 assert(!LI.isVolatile());
3349 assert(LI.getType()->isIntegerTy() &&
3350 "Only integer type loads and stores are split");
3351 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3352 "Split load isn't smaller than original load");
3353 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3354 "Non-byte-multiple bit width");
3355 // Move the insertion point just past the load so that we can refer to it.
3356 BasicBlock::iterator LIIt = std::next(LI.getIterator());
3357 // Ensure the insertion point comes before any debug-info immediately
3358 // after the load, so that variable values referring to the load are
3359 // dominated by it.
3360 LIIt.setHeadBit(true);
3361 IRB.SetInsertPoint(LI.getParent(), LIIt);
3362 // Create a placeholder value with the same type as LI to use as the
3363 // basis for the new value. This allows us to replace the uses of LI with
3364 // the computed value, and then replace the placeholder with LI, leaving
3365 // LI only used for this computation.
3366 Value *Placeholder =
3367 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3368 false, Align(1));
3369 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3370 "insert");
3371 LI.replaceAllUsesWith(V);
3372 Placeholder->replaceAllUsesWith(&LI);
3373 Placeholder->deleteValue();
3374 } else {
3375 LI.replaceAllUsesWith(V);
3376 }
3377
3378 Pass.DeadInsts.push_back(&LI);
3379 deleteIfTriviallyDead(OldOp);
3380 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3381 return !LI.isVolatile() && !IsPtrAdjusted;
3382 }
3383
3384 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3385 AAMDNodes AATags) {
3386 // Capture V for the purpose of debug-info accounting once it's converted
3387 // to a vector store.
3388 Value *OrigV = V;
3389 if (V->getType() != VecTy) {
3390 unsigned BeginIndex = getIndex(NewBeginOffset);
3391 unsigned EndIndex = getIndex(NewEndOffset);
3392 assert(EndIndex > BeginIndex && "Empty vector!");
3393 unsigned NumElements = EndIndex - BeginIndex;
3394 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3395 "Too many elements!");
3396 Type *SliceTy = (NumElements == 1)
3397 ? ElementTy
3398 : FixedVectorType::get(ElementTy, NumElements);
3399 if (V->getType() != SliceTy)
3400 V = convertValue(DL, IRB, V, SliceTy);
3401
3402 // Mix in the existing elements.
3403 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3404 NewAI.getAlign(), "load");
3405 V = insertVector(IRB, Old, V, BeginIndex, "vec");
3406 }
3407 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3408 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3409 LLVMContext::MD_access_group});
3410 if (AATags)
3411 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3412 V->getType(), DL));
3413 Pass.DeadInsts.push_back(&SI);
3414
3415 // NOTE: Careful to use OrigV rather than V.
3416 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3417 Store, Store->getPointerOperand(), OrigV, DL);
3418 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3419 return true;
3420 }
3421
3422 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3423 assert(IntTy && "We cannot extract an integer from the alloca");
3424 assert(!SI.isVolatile());
3425 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3426 IntTy->getBitWidth()) {
3427 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3428 NewAI.getAlign(), "oldload");
3429 Old = convertValue(DL, IRB, Old, IntTy);
3430 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3431 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3432 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3433 }
3434 V = convertValue(DL, IRB, V, NewAllocaTy);
3435 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3436 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3437 LLVMContext::MD_access_group});
3438 if (AATags)
3439 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3440 V->getType(), DL));
3441
3442 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3443 Store, Store->getPointerOperand(),
3444 Store->getValueOperand(), DL);
3445
3446 Pass.DeadInsts.push_back(&SI);
3447 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3448 return true;
3449 }
3450
3451 bool visitStoreInst(StoreInst &SI) {
3452 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3453 Value *OldOp = SI.getOperand(1);
3454 assert(OldOp == OldPtr);
3455
3456 AAMDNodes AATags = SI.getAAMetadata();
3457 Value *V = SI.getValueOperand();
3458
3459 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3460 // alloca that should be re-examined after promoting this alloca.
3461 if (V->getType()->isPointerTy())
3462 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3463 Pass.PostPromotionWorklist.insert(AI);
3464
3465 TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3466 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3467 assert(!SI.isVolatile());
3468 assert(V->getType()->isIntegerTy() &&
3469 "Only integer type loads and stores are split");
3470 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3471 "Non-byte-multiple bit width");
3472 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3473 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3474 "extract");
3475 }
3476
3477 if (VecTy)
3478 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3479 if (IntTy && V->getType()->isIntegerTy())
3480 return rewriteIntegerStore(V, SI, AATags);
3481
3482 StoreInst *NewSI;
3483 if (NewBeginOffset == NewAllocaBeginOffset &&
3484 NewEndOffset == NewAllocaEndOffset &&
3485 canConvertValue(DL, V->getType(), NewAllocaTy)) {
3486 V = convertValue(DL, IRB, V, NewAllocaTy);
3487 Value *NewPtr =
3488 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3489
3490 NewSI =
3491 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3492 } else {
3493 unsigned AS = SI.getPointerAddressSpace();
3494 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3495 NewSI =
3496 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3497 }
3498 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3499 LLVMContext::MD_access_group});
3500 if (AATags)
3501 NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3502 V->getType(), DL));
3503 if (SI.isVolatile())
3504 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3505 if (NewSI->isAtomic())
3506 NewSI->setAlignment(SI.getAlign());
3507
3508 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3509 NewSI, NewSI->getPointerOperand(),
3510 NewSI->getValueOperand(), DL);
3511
3512 Pass.DeadInsts.push_back(&SI);
3513 deleteIfTriviallyDead(OldOp);
3514
3515 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3516 return NewSI->getPointerOperand() == &NewAI &&
3517 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3518 !SI.isVolatile();
3519 }
3520
3521 /// Compute an integer value from splatting an i8 across the given
3522 /// number of bytes.
3523 ///
3524 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3525 /// call this routine.
3526 /// FIXME: Heed the advice above.
3527 ///
3528 /// \param V The i8 value to splat.
3529 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3530 Value *getIntegerSplat(Value *V, unsigned Size) {
3531 assert(Size > 0 && "Expected a positive number of bytes.");
3532 IntegerType *VTy = cast<IntegerType>(V->getType());
3533 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3534 if (Size == 1)
3535 return V;
3536
3537 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3538 V = IRB.CreateMul(
3539 IRB.CreateZExt(V, SplatIntTy, "zext"),
3540 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3541 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3542 SplatIntTy)),
3543 "isplat");
3544 return V;
3545 }
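  // Worked arithmetic for the splat above: for Size == 4 and V == 0xAB, the
  // multiplier is 0xFFFFFFFF udiv 0xFF == 0x01010101, and
  // 0xAB * 0x01010101 == 0xABABABAB.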
3546
3547 /// Compute a vector splat for a given element value.
3548 Value *getVectorSplat(Value *V, unsigned NumElements) {
3549 V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3550 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3551 return V;
3552 }
3553
3554 bool visitMemSetInst(MemSetInst &II) {
3555 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3556 assert(II.getRawDest() == OldPtr);
3557
3558 AAMDNodes AATags = II.getAAMetadata();
3559
3560 // If the memset has a variable size, it cannot be split, just adjust the
3561 // pointer to the new alloca.
3562 if (!isa<ConstantInt>(II.getLength())) {
3563 assert(!IsSplit);
3564 assert(NewBeginOffset == BeginOffset);
3565 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3566 II.setDestAlignment(getSliceAlign());
3567 // In theory we should call migrateDebugInfo here. However, we do not
3568 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3569 // constant geps, or storing a variable number of bytes.
3570       assert(at::getDVRAssignmentMarkers(&II).empty() &&
3571              "AT: Unexpected link to non-const GEP");
3572 deleteIfTriviallyDead(OldPtr);
3573 return false;
3574 }
3575
3576 // Record this instruction for deletion.
3577 Pass.DeadInsts.push_back(&II);
3578
3579 Type *AllocaTy = NewAI.getAllocatedType();
3580 Type *ScalarTy = AllocaTy->getScalarType();
3581
3582 const bool CanContinue = [&]() {
3583 if (VecTy || IntTy)
3584 return true;
3585 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3586 return false;
3587 // Length must be in range for FixedVectorType.
3588 auto *C = cast<ConstantInt>(II.getLength());
3589 const uint64_t Len = C->getLimitedValue();
3590 if (Len > std::numeric_limits<unsigned>::max())
3591 return false;
3592 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3593 auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3594 return canConvertValue(DL, SrcTy, AllocaTy) &&
3595 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3596 }();
3597
3598 // If this doesn't map cleanly onto the alloca type, and that type isn't
3599 // a single value type, just emit a memset.
3600 if (!CanContinue) {
3601 Type *SizeTy = II.getLength()->getType();
3602 unsigned Sz = NewEndOffset - NewBeginOffset;
3603 Constant *Size = ConstantInt::get(SizeTy, Sz);
3604 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3605 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3606 MaybeAlign(getSliceAlign()), II.isVolatile()));
3607 if (AATags)
3608 New->setAAMetadata(
3609 AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3610
3611 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3612 New, New->getRawDest(), nullptr, DL);
3613
3614 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3615 return false;
3616 }
3617
3618 // If we can represent this as a simple value, we have to build the actual
3619 // value to store, which requires expanding the byte present in memset to
3620 // a sensible representation for the alloca type. This is essentially
3621 // splatting the byte to a sufficiently wide integer, splatting it across
3622 // any desired vector width, and bitcasting to the final type.
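    // For instance (hypothetical values): a memset of the byte 0xAB over a
    // vector-promotable <4 x i32> alloca becomes the i32 splat 0xABABABAB,
    // which is then vector-splatted to <4 x i32> and written with one store.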
3623 Value *V;
3624
3625 if (VecTy) {
3626 // If this is a memset of a vectorized alloca, insert it.
3627 assert(ElementTy == ScalarTy);
3628
3629 unsigned BeginIndex = getIndex(NewBeginOffset);
3630 unsigned EndIndex = getIndex(NewEndOffset);
3631 assert(EndIndex > BeginIndex && "Empty vector!");
3632 unsigned NumElements = EndIndex - BeginIndex;
3633 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3634 "Too many elements!");
3635
3636 Value *Splat = getIntegerSplat(
3637 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3638 Splat = convertValue(DL, IRB, Splat, ElementTy);
3639 if (NumElements > 1)
3640 Splat = getVectorSplat(Splat, NumElements);
3641
3642 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3643 NewAI.getAlign(), "oldload");
3644 V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3645 } else if (IntTy) {
3646 // If this is a memset on an alloca where we can widen stores, insert the
3647 // set integer.
3648 assert(!II.isVolatile());
3649
3650 uint64_t Size = NewEndOffset - NewBeginOffset;
3651 V = getIntegerSplat(II.getValue(), Size);
3652
3653 if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3654                     EndOffset != NewAllocaEndOffset)) {
3655 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3656 NewAI.getAlign(), "oldload");
3657 Old = convertValue(DL, IRB, Old, IntTy);
3658 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3659 V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3660 } else {
3661 assert(V->getType() == IntTy &&
3662 "Wrong type for an alloca wide integer!");
3663 }
3664 V = convertValue(DL, IRB, V, AllocaTy);
3665 } else {
3666 // Established these invariants above.
3667 assert(NewBeginOffset == NewAllocaBeginOffset);
3668 assert(NewEndOffset == NewAllocaEndOffset);
3669
3670 V = getIntegerSplat(II.getValue(),
3671 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3672 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
3673 V = getVectorSplat(
3674 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3675
3676 V = convertValue(DL, IRB, V, AllocaTy);
3677 }
3678
3679 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3680 StoreInst *New =
3681 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3682 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3683 LLVMContext::MD_access_group});
3684 if (AATags)
3685 New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3686 V->getType(), DL));
3687
3688 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3689 New, New->getPointerOperand(), V, DL);
3690
3691 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3692 return !II.isVolatile();
3693 }
3694
3695 bool visitMemTransferInst(MemTransferInst &II) {
3696 // Rewriting of memory transfer instructions can be a bit tricky. We break
3697 // them into two categories: split intrinsics and unsplit intrinsics.
3698
3699 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3700
3701 AAMDNodes AATags = II.getAAMetadata();
3702
3703 bool IsDest = &II.getRawDestUse() == OldUse;
3704 assert((IsDest && II.getRawDest() == OldPtr) ||
3705 (!IsDest && II.getRawSource() == OldPtr));
3706
3707 Align SliceAlign = getSliceAlign();
3708 // For unsplit intrinsics, we simply modify the source and destination
3709 // pointers in place. This isn't just an optimization, it is a matter of
3710 // correctness. With unsplit intrinsics we may be dealing with transfers
3711 // within a single alloca before SROA ran, or with transfers that have
3712 // a variable length. We may also be dealing with memmove instead of
3713     // memcpy, and so simply updating the pointers is all that is needed to
3714 // update both source and dest of a single call.
3715 if (!IsSplittable) {
3716 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3717 if (IsDest) {
3718 // Update the address component of linked dbg.assigns.
3719 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(&II)) {
3720 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3721 DbgAssign->getAddress() == II.getDest())
3722 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3723 }
3724 II.setDest(AdjustedPtr);
3725 II.setDestAlignment(SliceAlign);
3726 } else {
3727 II.setSource(AdjustedPtr);
3728 II.setSourceAlignment(SliceAlign);
3729 }
3730
3731 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3732 deleteIfTriviallyDead(OldPtr);
3733 return false;
3734 }
3735 // For split transfer intrinsics we have an incredibly useful assurance:
3736 // the source and destination do not reside within the same alloca, and at
3737 // least one of them does not escape. This means that we can replace
3738 // memmove with memcpy, and we don't need to worry about all manner of
3739 // downsides to splitting and transforming the operations.
3740
3741 // If this doesn't map cleanly onto the alloca type, and that type isn't
3742 // a single value type, just emit a memcpy.
3743 bool EmitMemCpy =
3744 !VecTy && !IntTy &&
3745 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3746 SliceSize !=
3747 DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
3748 !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
3749          !NewAI.getAllocatedType()->isSingleValueType());
3750 
3751 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3752 // size hasn't been shrunk based on analysis of the viable range, this is
3753 // a no-op.
3754 if (EmitMemCpy && &OldAI == &NewAI) {
3755 // Ensure the start lines up.
3756 assert(NewBeginOffset == BeginOffset);
3757
3758 // Rewrite the size as needed.
3759 if (NewEndOffset != EndOffset)
3760 II.setLength(NewEndOffset - NewBeginOffset);
3761 return false;
3762 }
3763 // Record this instruction for deletion.
3764 Pass.DeadInsts.push_back(&II);
3765
3766 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3767 // alloca that should be re-examined after rewriting this instruction.
3768 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3769 if (AllocaInst *AI =
3770             dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3771       assert(AI != &OldAI && AI != &NewAI &&
3772 "Splittable transfers cannot reach the same alloca on both ends.");
3773 Pass.Worklist.insert(AI);
3774 }
3775
3776 Type *OtherPtrTy = OtherPtr->getType();
3777 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3778
3779 // Compute the relative offset for the other pointer within the transfer.
3780 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3781 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3782 Align OtherAlign =
3783 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3784 OtherAlign =
3785 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3786
3787 if (EmitMemCpy) {
3788 // Compute the other pointer, folding as much as possible to produce
3789 // a single, simple GEP in most cases.
3790 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3791 OtherPtr->getName() + ".");
3792
3793 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3794 Type *SizeTy = II.getLength()->getType();
3795 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3796
3797 Value *DestPtr, *SrcPtr;
3798 MaybeAlign DestAlign, SrcAlign;
3799 // Note: IsDest is true iff we're copying into the new alloca slice
3800 if (IsDest) {
3801 DestPtr = OurPtr;
3802 DestAlign = SliceAlign;
3803 SrcPtr = OtherPtr;
3804 SrcAlign = OtherAlign;
3805 } else {
3806 DestPtr = OtherPtr;
3807 DestAlign = OtherAlign;
3808 SrcPtr = OurPtr;
3809 SrcAlign = SliceAlign;
3810 }
3811 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3812 Size, II.isVolatile());
3813 if (AATags)
3814 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3815
3816 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3817 if (IsDest) {
3818 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3819 &II, New, DestPtr, nullptr, DL);
3820 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3821                    DestPtr->stripAndAccumulateConstantOffsets(
3822                        DL, Offset, /*AllowNonInbounds*/ true))) {
3823 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3824 SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3825 }
3826 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3827 return false;
3828 }
3829
3830 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3831 NewEndOffset == NewAllocaEndOffset;
3832 uint64_t Size = NewEndOffset - NewBeginOffset;
3833 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3834 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3835 unsigned NumElements = EndIndex - BeginIndex;
3836 IntegerType *SubIntTy =
3837 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3838
3839 // Reset the other pointer type to match the register type we're going to
3840 // use, but using the address space of the original other pointer.
3841 Type *OtherTy;
3842 if (VecTy && !IsWholeAlloca) {
3843 if (NumElements == 1)
3844 OtherTy = VecTy->getElementType();
3845 else
3846 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3847 } else if (IntTy && !IsWholeAlloca) {
3848 OtherTy = SubIntTy;
3849 } else {
3850 OtherTy = NewAllocaTy;
3851 }
3852
3853 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3854 OtherPtr->getName() + ".");
3855 MaybeAlign SrcAlign = OtherAlign;
3856 MaybeAlign DstAlign = SliceAlign;
3857 if (!IsDest)
3858 std::swap(SrcAlign, DstAlign);
3859
3860 Value *SrcPtr;
3861 Value *DstPtr;
3862
3863 if (IsDest) {
3864 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3865 SrcPtr = AdjPtr;
3866 } else {
3867 DstPtr = AdjPtr;
3868 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3869 }
3870
3871 Value *Src;
3872 if (VecTy && !IsWholeAlloca && !IsDest) {
3873 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3874 NewAI.getAlign(), "load");
3875 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3876 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3877 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3878 NewAI.getAlign(), "load");
3879 Src = convertValue(DL, IRB, Src, IntTy);
3880 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3881 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3882 } else {
3883 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3884 II.isVolatile(), "copyload");
3885 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3886 LLVMContext::MD_access_group});
3887 if (AATags)
3888 Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3889 Load->getType(), DL));
3890 Src = Load;
3891 }
3892
3893 if (VecTy && !IsWholeAlloca && IsDest) {
3894 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3895 NewAI.getAlign(), "oldload");
3896 Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3897 } else if (IntTy && !IsWholeAlloca && IsDest) {
3898 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3899 NewAI.getAlign(), "oldload");
3900 Old = convertValue(DL, IRB, Old, IntTy);
3901 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3902 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3903 Src = convertValue(DL, IRB, Src, NewAllocaTy);
3904 }
3905
3906 StoreInst *Store = cast<StoreInst>(
3907 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3908 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3909 LLVMContext::MD_access_group});
3910 if (AATags)
3911 Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3912 Src->getType(), DL));
3913
3914 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3915 if (IsDest) {
3916
3917 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3918 Store, DstPtr, Src, DL);
3919 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3920                    DstPtr->stripAndAccumulateConstantOffsets(
3921                        DL, Offset, /*AllowNonInbounds*/ true))) {
3922 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3923 &II, Store, DstPtr, Src, DL);
3924 }
3925
3926 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3927 return !II.isVolatile();
3928 }
3929
3930 bool visitIntrinsicInst(IntrinsicInst &II) {
3931 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3932 "Unexpected intrinsic!");
3933 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3934
3935 // Record this instruction for deletion.
3936 Pass.DeadInsts.push_back(&II);
3937
3938 if (II.isDroppable()) {
3939 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3940 // TODO For now we forget assumed information, this can be improved.
3941 OldPtr->dropDroppableUsesIn(II);
3942 return true;
3943 }
3944
3945 assert(II.getArgOperand(0) == OldPtr);
3946 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3947 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3948 Value *New;
3949 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3950 New = IRB.CreateLifetimeStart(Ptr);
3951 else
3952 New = IRB.CreateLifetimeEnd(Ptr);
3953
3954 (void)New;
3955 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3956
3957 return true;
3958 }
3959
3960 void fixLoadStoreAlign(Instruction &Root) {
3961 // This algorithm implements the same visitor loop as
3962 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3963 // or store found.
3964 SmallPtrSet<Instruction *, 4> Visited;
3965 SmallVector<Instruction *, 4> Uses;
3966 Visited.insert(&Root);
3967 Uses.push_back(&Root);
3968 do {
3969 Instruction *I = Uses.pop_back_val();
3970
3971 if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3972 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3973 continue;
3974 }
3975 if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3976 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3977 continue;
3978 }
3979
3983 for (User *U : I->users())
3984 if (Visited.insert(cast<Instruction>(U)).second)
3985 Uses.push_back(cast<Instruction>(U));
3986 } while (!Uses.empty());
3987 }
3988
3989 bool visitPHINode(PHINode &PN) {
3990 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3991 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3992 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3993
3994 // We would like to compute a new pointer in only one place, but have it be
3995 // as local as possible to the PHI. To do that, we re-use the location of
3996 // the old pointer, which necessarily must be in the right position to
3997 // dominate the PHI.
3998 IRBuilderBase::InsertPointGuard Guard(IRB);
3999 if (isa<PHINode>(OldPtr))
4000 IRB.SetInsertPoint(OldPtr->getParent(),
4001 OldPtr->getParent()->getFirstInsertionPt());
4002 else
4003 IRB.SetInsertPoint(OldPtr);
4004 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
4005
4006 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
4007 // Replace the operands which were using the old pointer.
4008 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
4009
4010 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
4011 deleteIfTriviallyDead(OldPtr);
4012
4013 // Fix the alignment of any loads or stores using this PHI node.
4014 fixLoadStoreAlign(PN);
4015
4016 // PHIs can't be promoted on their own, but often can be speculated. We
4017 // check the speculation outside of the rewriter so that we see the
4018 // fully-rewritten alloca.
4019 PHIUsers.insert(&PN);
4020 return true;
4021 }
4022
4023 bool visitSelectInst(SelectInst &SI) {
4024 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4025 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
4026 "Pointer isn't an operand!");
4027 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
4028 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
4029
4030 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
4031 // Replace the operands which were using the old pointer.
4032 if (SI.getOperand(1) == OldPtr)
4033 SI.setOperand(1, NewPtr);
4034 if (SI.getOperand(2) == OldPtr)
4035 SI.setOperand(2, NewPtr);
4036
4037 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
4038 deleteIfTriviallyDead(OldPtr);
4039
4040 // Fix the alignment of any loads or stores using this select.
4041 fixLoadStoreAlign(SI);
4042
4043 // Selects can't be promoted on their own, but often can be speculated. We
4044 // check the speculation outside of the rewriter so that we see the
4045 // fully-rewritten alloca.
4046 SelectUsers.insert(&SI);
4047 return true;
4048 }
4049};
4050
4051/// Visitor to rewrite aggregate loads and stores as scalar.
4052///
4053/// This pass aggressively rewrites all aggregate loads and stores on
4054/// a particular pointer (or any pointer derived from it which we can identify)
4055/// with scalar loads and stores.
4056class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
4057 // Befriend the base class so it can delegate to private visit methods.
4058 friend class InstVisitor<AggLoadStoreRewriter, bool>;
4059
4060 /// Queue of pointer uses to analyze and potentially rewrite.
4061   SmallVector<Use *, 8> Queue;
4062 
4063 /// Set to prevent us from cycling with phi nodes and loops.
4064 SmallPtrSet<User *, 8> Visited;
4065
4066 /// The current pointer use being rewritten. This is used to dig up the used
4067 /// value (as opposed to the user).
4068 Use *U = nullptr;
4069
4070 /// Used to calculate offsets, and hence alignment, of subobjects.
4071 const DataLayout &DL;
4072
4073 IRBuilderTy &IRB;
4074
4075public:
4076 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
4077 : DL(DL), IRB(IRB) {}
4078
4079 /// Rewrite loads and stores through a pointer and all pointers derived from
4080 /// it.
4081 bool rewrite(Instruction &I) {
4082 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
4083 enqueueUsers(I);
4084 bool Changed = false;
4085 while (!Queue.empty()) {
4086 U = Queue.pop_back_val();
4087 Changed |= visit(cast<Instruction>(U->getUser()));
4088 }
4089 return Changed;
4090 }
4091
4092private:
4093 /// Enqueue all the users of the given instruction for further processing.
4094 /// This uses a set to de-duplicate users.
4095 void enqueueUsers(Instruction &I) {
4096 for (Use &U : I.uses())
4097 if (Visited.insert(U.getUser()).second)
4098 Queue.push_back(&U);
4099 }
4100
4101 // Conservative default is to not rewrite anything.
4102 bool visitInstruction(Instruction &I) { return false; }
4103
4104 /// Generic recursive split emission class.
4105 template <typename Derived> class OpSplitter {
4106 protected:
4107 /// The builder used to form new instructions.
4108 IRBuilderTy &IRB;
4109
4110     /// The indices to be used with insert- or extractvalue to select the
4111 /// appropriate value within the aggregate.
4112 SmallVector<unsigned, 4> Indices;
4113
4114 /// The indices to a GEP instruction which will move Ptr to the correct slot
4115 /// within the aggregate.
4116 SmallVector<Value *, 4> GEPIndices;
4117
4118 /// The base pointer of the original op, used as a base for GEPing the
4119 /// split operations.
4120 Value *Ptr;
4121
4122 /// The base pointee type being GEPed into.
4123 Type *BaseTy;
4124
4125 /// Known alignment of the base pointer.
4126 Align BaseAlign;
4127
4128 /// To calculate offset of each component so we can correctly deduce
4129 /// alignments.
4130 const DataLayout &DL;
4131
4132 /// Initialize the splitter with an insertion point, Ptr and start with a
4133 /// single zero GEP index.
4134 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4135 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4136 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
4137 BaseAlign(BaseAlign), DL(DL) {
4138 IRB.SetInsertPoint(InsertionPoint);
4139 }
4140
4141 public:
4142 /// Generic recursive split emission routine.
4143 ///
4144 /// This method recursively splits an aggregate op (load or store) into
4145 /// scalar or vector ops. It splits recursively until it hits a single value
4146 /// and emits that single value operation via the template argument.
4147 ///
4148 /// The logic of this routine relies on GEPs and insertvalue and
4149 /// extractvalue all operating with the same fundamental index list, merely
4150 /// formatted differently (GEPs need actual values).
4151 ///
4152 /// \param Ty The type being split recursively into smaller ops.
4153 /// \param Agg The aggregate value being built up or stored, depending on
4154 /// whether this is splitting a load or a store respectively.
4155 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4156 if (Ty->isSingleValueType()) {
4157 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
4158 return static_cast<Derived *>(this)->emitFunc(
4159 Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
4160 }
4161
4162 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
4163 unsigned OldSize = Indices.size();
4164 (void)OldSize;
4165 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4166 ++Idx) {
4167 assert(Indices.size() == OldSize && "Did not return to the old size");
4168 Indices.push_back(Idx);
4169 GEPIndices.push_back(IRB.getInt32(Idx));
4170 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
4171 GEPIndices.pop_back();
4172 Indices.pop_back();
4173 }
4174 return;
4175 }
4176
4177 if (StructType *STy = dyn_cast<StructType>(Ty)) {
4178 unsigned OldSize = Indices.size();
4179 (void)OldSize;
4180 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4181 ++Idx) {
4182 assert(Indices.size() == OldSize && "Did not return to the old size");
4183 Indices.push_back(Idx);
4184 GEPIndices.push_back(IRB.getInt32(Idx));
4185 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
4186 GEPIndices.pop_back();
4187 Indices.pop_back();
4188 }
4189 return;
4190 }
4191
4192 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4193 }
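    // Illustrative expansion (hypothetical type): splitting an op on
    // {i32, [2 x float]} reaches leaves with Indices (0), (1,0), (1,1) and
    // GEPIndices (0,0), (0,1,0), (0,1,1), emitting one scalar op per leaf.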
4194 };
4195
4196 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4197 AAMDNodes AATags;
4198 // A vector to hold the split components that we want to emit
4199 // separate fake uses for.
4200 SmallVector<Value *, 4> Components;
4201 // A vector to hold all the fake uses of the struct that we are splitting.
4202 // Usually there should only be one, but we are handling the general case.
4203     SmallVector<Instruction *, 4> FakeUses;
4204 
4205 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4206 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4207 IRBuilderTy &IRB)
4208 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4209 IRB),
4210 AATags(AATags) {}
4211
4212 /// Emit a leaf load of a single value. This is called at the leaves of the
4213 /// recursive emission to actually load values.
4214 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4215       assert(Ty->isSingleValueType());
4216       // Load the single value and insert it using the indices.
4217 Value *GEP =
4218 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4219 LoadInst *Load =
4220 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
4221
4222 APInt Offset(
4223 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4224 if (AATags &&
4225 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
4226 Load->setAAMetadata(
4227 AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
4228 // Record the load so we can generate a fake use for this aggregate
4229 // component.
4230 Components.push_back(Load);
4231
4232 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
4233 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4234 }
4235
4236 // Stash the fake uses that use the value generated by this instruction.
4237 void recordFakeUses(LoadInst &LI) {
4238 for (Use &U : LI.uses())
4239 if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
4240 if (II->getIntrinsicID() == Intrinsic::fake_use)
4241 FakeUses.push_back(II);
4242 }
4243
4244 // Replace all fake uses of the aggregate with a series of fake uses, one
4245 // for each split component.
4246 void emitFakeUses() {
4247 for (Instruction *I : FakeUses) {
4248 IRB.SetInsertPoint(I);
4249 for (auto *V : Components)
4250 IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
4251 I->eraseFromParent();
4252 }
4253 }
4254 };
4255
4256 bool visitLoadInst(LoadInst &LI) {
4257 assert(LI.getPointerOperand() == *U);
4258 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4259 return false;
4260
4261 // We have an aggregate being loaded, split it apart.
4262 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4263 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4264 getAdjustedAlignment(&LI, 0), DL, IRB);
4265 Splitter.recordFakeUses(LI);
4266     Value *V = PoisonValue::get(LI.getType());
4267     Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
4268 Splitter.emitFakeUses();
4269 Visited.erase(&LI);
4270 LI.replaceAllUsesWith(V);
4271 LI.eraseFromParent();
4272 return true;
4273 }
4274
4275 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4276 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4277 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4278 const DataLayout &DL, IRBuilderTy &IRB)
4279 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4280 DL, IRB),
4281 AATags(AATags), AggStore(AggStore) {}
4282 AAMDNodes AATags;
4283 StoreInst *AggStore;
4284 /// Emit a leaf store of a single value. This is called at the leaves of the
4285 /// recursive emission to actually produce stores.
4286 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4287       assert(Ty->isSingleValueType());
4288       // Extract the single value and store it using the indices.
4289 //
4290 // The gep and extractvalue values are factored out of the CreateStore
4291 // call to make the output independent of the argument evaluation order.
4292 Value *ExtractValue =
4293 IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
4294 Value *InBoundsGEP =
4295 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
4296 StoreInst *Store =
4297 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
4298
4299 APInt Offset(
4300 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4301 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4302 if (AATags) {
4303 Store->setAAMetadata(AATags.adjustForAccess(
4304 Offset.getZExtValue(), ExtractValue->getType(), DL));
4305 }
4306
4307 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4308 // If we cannot (because there's an intervening non-const or unbounded
4309 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4310 // this instruction.
4311       Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4312       if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4313 uint64_t SizeInBits =
4314 DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4315 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4316 SizeInBits, AggStore, Store,
4317 Store->getPointerOperand(), Store->getValueOperand(),
4318 DL);
4319 } else {
4320         assert(at::getDVRAssignmentMarkers(AggStore).empty() &&
4321                "AT: unexpected debug.assign linked to store through "
4322                "unbounded GEP");
4323 }
4324 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4325 }
4326 };
4327
4328 bool visitStoreInst(StoreInst &SI) {
4329 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4330 return false;
4331 Value *V = SI.getValueOperand();
4332 if (V->getType()->isSingleValueType())
4333 return false;
4334
4335 // We have an aggregate being stored, split it apart.
4336 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4337 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4338 getAdjustedAlignment(&SI, 0), DL, IRB);
4339 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4340 Visited.erase(&SI);
4341 // The stores replacing SI each have markers describing fragments of the
4342 // assignment so delete the assignment markers linked to SI.
4343     at::deleteAssignmentMarkers(&SI);
4344     SI.eraseFromParent();
4345 return true;
4346 }
4347
4348 bool visitBitCastInst(BitCastInst &BC) {
4349 enqueueUsers(BC);
4350 return false;
4351 }
4352
4353 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4354 enqueueUsers(ASC);
4355 return false;
4356 }
4357
4358 // Unfold gep (select cond, ptr1, ptr2), idx
4359 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4360 // and gep ptr, (select cond, idx1, idx2)
4361 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4362 // We also allow for i1 zext indices, which are equivalent to selects.
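  // A sketch of the rewrite (hypothetical IR, names chosen for illustration):
  //   %idx = select i1 %c, i64 1, i64 2
  //   %g   = getelementptr i32, ptr %p, i64 %idx
  // becomes
  //   %g.t = getelementptr i32, ptr %p, i64 1
  //   %g.f = getelementptr i32, ptr %p, i64 2
  //   %g   = select i1 %c, ptr %g.t, ptr %g.f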
4363 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4364 // Check whether the GEP has exactly one select operand and all indices
4365 // will become constant after the transform.
4366     Instruction *Sel = nullptr;
4367     for (Value *Op : GEPI.indices()) {
4368 if (auto *SI = dyn_cast<SelectInst>(Op)) {
4369 if (Sel)
4370 return false;
4371
4372 Sel = SI;
4373 if (!isa<ConstantInt>(SI->getTrueValue()) ||
4374 !isa<ConstantInt>(SI->getFalseValue()))
4375 return false;
4376 continue;
4377 }
4378 if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4379 if (Sel)
4380 return false;
4381 Sel = ZI;
4382 if (!ZI->getSrcTy()->isIntegerTy(1))
4383 return false;
4384 continue;
4385 }
4386
4387 if (!isa<ConstantInt>(Op))
4388 return false;
4389 }
4390
4391 if (!Sel)
4392 return false;
4393
4394 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4395 dbgs() << " original: " << *Sel << "\n";
4396 dbgs() << " " << GEPI << "\n";);
4397
4398 auto GetNewOps = [&](Value *SelOp) {
4399 SmallVector<Value *> NewOps;
4400 for (Value *Op : GEPI.operands())
4401 if (Op == Sel)
4402 NewOps.push_back(SelOp);
4403 else
4404 NewOps.push_back(Op);
4405 return NewOps;
4406 };
4407
4408 Value *Cond, *True, *False;
4409 Instruction *MDFrom = nullptr;
4410 if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4411 Cond = SI->getCondition();
4412 True = SI->getTrueValue();
4413 False = SI->getFalseValue();
4415 MDFrom = SI;
4416 } else {
4417 Cond = Sel->getOperand(0);
4418 True = ConstantInt::get(Sel->getType(), 1);
4419 False = ConstantInt::get(Sel->getType(), 0);
4420 }
4421 SmallVector<Value *> TrueOps = GetNewOps(True);
4422 SmallVector<Value *> FalseOps = GetNewOps(False);
4423
4424 IRB.SetInsertPoint(&GEPI);
4425 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4426
4427 Type *Ty = GEPI.getSourceElementType();
4428 Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4429 True->getName() + ".sroa.gep", NW);
4430
4431 Value *NFalse =
4432 IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4433 False->getName() + ".sroa.gep", NW);
4434
4435 Value *NSel = MDFrom
4436 ? IRB.CreateSelect(Cond, NTrue, NFalse,
4437 Sel->getName() + ".sroa.sel", MDFrom)
4438 : IRB.CreateSelectWithUnknownProfile(
4439 Cond, NTrue, NFalse, DEBUG_TYPE,
4440 Sel->getName() + ".sroa.sel");
4441 Visited.erase(&GEPI);
4442 GEPI.replaceAllUsesWith(NSel);
4443 GEPI.eraseFromParent();
4444 Instruction *NSelI = cast<Instruction>(NSel);
4445 Visited.insert(NSelI);
4446 enqueueUsers(*NSelI);
4447
4448 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4449 dbgs() << " " << *NFalse << "\n";
4450 dbgs() << " " << *NSel << "\n";);
4451
4452 return true;
4453 }
4454
4455 // Unfold gep (phi ptr1, ptr2), idx
4456 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4457 // and gep ptr, (phi idx1, idx2)
4458 // => phi ((gep ptr, idx1), (gep ptr, idx2))
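// Editor's illustration (not from the original source; value names are
// hypothetical): assuming %a and %b are static allocas or arguments,
//   %ptr = phi ptr [ %a, %bb1 ], [ %b, %bb2 ]
//   %elt = getelementptr i32, ptr %ptr, i64 1
// is rewritten so the GEPs land in the entry block and the phi merges them:
//   %elt.a = getelementptr i32, ptr %a, i64 1
//   %elt.b = getelementptr i32, ptr %b, i64 1
//   %elt = phi ptr [ %elt.a, %bb1 ], [ %elt.b, %bb2 ]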
4459 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4460 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4461 // operand (looking through the phi if it is the phi we want to unfold) is
4462 // an instruction besides a static alloca.
4463 PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4464 auto IsInvalidPointerOperand = [](Value *V) {
4465 if (!isa<Instruction>(V))
4466 return false;
4467 if (auto *AI = dyn_cast<AllocaInst>(V))
4468 return !AI->isStaticAlloca();
4469 return true;
4470 };
4471 if (Phi) {
4472 if (any_of(Phi->operands(), IsInvalidPointerOperand))
4473 return false;
4474 } else {
4475 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4476 return false;
4477 }
4478 // Check whether the GEP has exactly one phi operand (including the pointer
4479 // operand) and all indices will become constant after the transform.
4480 for (Value *Op : GEPI.indices()) {
4481 if (auto *SI = dyn_cast<PHINode>(Op)) {
4482 if (Phi)
4483 return false;
4484
4485 Phi = SI;
4486 if (!all_of(Phi->incoming_values(),
4487 [](Value *V) { return isa<ConstantInt>(V); }))
4488 return false;
4489 continue;
4490 }
4491
4492 if (!isa<ConstantInt>(Op))
4493 return false;
4494 }
4495
4496 if (!Phi)
4497 return false;
4498
4499 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4500 dbgs() << " original: " << *Phi << "\n";
4501 dbgs() << " " << GEPI << "\n";);
4502
4503 auto GetNewOps = [&](Value *PhiOp) {
4504 SmallVector<Value *> NewOps;
4505 for (Value *Op : GEPI.operands())
4506 if (Op == Phi)
4507 NewOps.push_back(PhiOp);
4508 else
4509 NewOps.push_back(Op);
4510 return NewOps;
4511 };
4512
4513 IRB.SetInsertPoint(Phi);
4514 PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4515 Phi->getName() + ".sroa.phi");
4516
4517 Type *SourceTy = GEPI.getSourceElementType();
4518 // We only handle arguments, constants, and static allocas here, so we can
4519 // insert GEPs at the end of the entry block.
4520 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4521 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4522 Value *Op = Phi->getIncomingValue(I);
4523 BasicBlock *BB = Phi->getIncomingBlock(I);
4524 Value *NewGEP;
4525 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4526 NewGEP = NewPhi->getIncomingValue(NI);
4527 } else {
4528 SmallVector<Value *> NewOps = GetNewOps(Op);
4529 NewGEP =
4530 IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4531 Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4532 }
4533 NewPhi->addIncoming(NewGEP, BB);
4534 }
4535
4536 Visited.erase(&GEPI);
4537 GEPI.replaceAllUsesWith(NewPhi);
4538 GEPI.eraseFromParent();
4539 Visited.insert(NewPhi);
4540 enqueueUsers(*NewPhi);
4541
4542 LLVM_DEBUG(dbgs() << " to: ";
4543 for (Value *In
4544 : NewPhi->incoming_values()) dbgs()
4545 << "\n " << *In;
4546 dbgs() << "\n " << *NewPhi << '\n');
4547
4548 return true;
4549 }
4550
4551 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4552 if (unfoldGEPSelect(GEPI))
4553 return true;
4554
4555 if (unfoldGEPPhi(GEPI))
4556 return true;
4557
4558 enqueueUsers(GEPI);
4559 return false;
4560 }
4561
4562 bool visitPHINode(PHINode &PN) {
4563 enqueueUsers(PN);
4564 return false;
4565 }
4566
4567 bool visitSelectInst(SelectInst &SI) {
4568 enqueueUsers(SI);
4569 return false;
4570 }
4571};
4572
4573} // end anonymous namespace
4574
4575/// Strip aggregate type wrapping.
4576///
4577/// This removes no-op aggregate types wrapping an underlying type. It will
4578/// strip as many layers of types as it can without changing either the type
4579/// size or the allocated size.
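// Editor's illustration (not from the original source): under a typical data
// layout, { [1 x { float }] } is peeled all the way down to float because each
// wrapper has the same size and allocated size as its payload, whereas
// { i32, i32 } is returned unchanged since stripping the struct would shrink
// the size being described.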
4580static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4581 if (Ty->isSingleValueType())
4582 return Ty;
4583
4584 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4585 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4586
4587 Type *InnerTy;
4588 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4589 InnerTy = ArrTy->getElementType();
4590 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4591 const StructLayout *SL = DL.getStructLayout(STy);
4592 unsigned Index = SL->getElementContainingOffset(0);
4593 InnerTy = STy->getElementType(Index);
4594 } else {
4595 return Ty;
4596 }
4597
4598 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4599 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4600 return Ty;
4601
4602 return stripAggregateTypeWrapping(DL, InnerTy);
4603}
4604
4605/// Try to find a partition of the aggregate type passed in for a given
4606/// offset and size.
4607///
4608/// This recurses through the aggregate type and tries to compute a subtype
4609/// based on the offset and size. When the offset and size span a sub-section
4610/// of an array, it will even compute a new array type for that sub-section,
4611/// and the same for structs.
4612///
4613/// Note that this routine is very strict and tries to find a partition of the
4614/// type which produces the *exact* right offset and size. It is not forgiving
4615/// when the size or offset causes either end of the type-based partition to be off.
4616/// Also, this is a best-effort routine. It is reasonable to give up and not
4617/// return a type if necessary.
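// Editor's illustration (not from the original source): for { i32, i32, [8 x i8] }
// (16 bytes under a typical layout), the pair (Offset=8, Size=8) yields
// [8 x i8], (Offset=0, Size=8) yields the sub-struct { i32, i32 }, and
// (Offset=2, Size=4) yields nullptr because it straddles the first two elements.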
4618static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4619 uint64_t Size) {
4620 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4621 return stripAggregateTypeWrapping(DL, Ty);
4622 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4623 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4624 return nullptr;
4625
4626 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4627 Type *ElementTy;
4628 uint64_t TyNumElements;
4629 if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4630 ElementTy = AT->getElementType();
4631 TyNumElements = AT->getNumElements();
4632 } else {
4633 // FIXME: This isn't right for vectors with non-byte-sized or
4634 // non-power-of-two sized elements.
4635 auto *VT = cast<FixedVectorType>(Ty);
4636 ElementTy = VT->getElementType();
4637 TyNumElements = VT->getNumElements();
4638 }
4639 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4640 uint64_t NumSkippedElements = Offset / ElementSize;
4641 if (NumSkippedElements >= TyNumElements)
4642 return nullptr;
4643 Offset -= NumSkippedElements * ElementSize;
4644
4645 // First check if we need to recurse.
4646 if (Offset > 0 || Size < ElementSize) {
4647 // Bail if the partition ends in a different array element.
4648 if ((Offset + Size) > ElementSize)
4649 return nullptr;
4650 // Recurse through the element type trying to peel off offset bytes.
4651 return getTypePartition(DL, ElementTy, Offset, Size);
4652 }
4653 assert(Offset == 0);
4654
4655 if (Size == ElementSize)
4656 return stripAggregateTypeWrapping(DL, ElementTy);
4657 assert(Size > ElementSize);
4658 uint64_t NumElements = Size / ElementSize;
4659 if (NumElements * ElementSize != Size)
4660 return nullptr;
4661 return ArrayType::get(ElementTy, NumElements);
4662 }
4663
4664 StructType *STy = dyn_cast<StructType>(Ty);
4665 if (!STy)
4666 return nullptr;
4667
4668 const StructLayout *SL = DL.getStructLayout(STy);
4669
4670 if (SL->getSizeInBits().isScalable())
4671 return nullptr;
4672
4673 if (Offset >= SL->getSizeInBytes())
4674 return nullptr;
4675 uint64_t EndOffset = Offset + Size;
4676 if (EndOffset > SL->getSizeInBytes())
4677 return nullptr;
4678
4679 unsigned Index = SL->getElementContainingOffset(Offset);
4680 Offset -= SL->getElementOffset(Index);
4681
4682 Type *ElementTy = STy->getElementType(Index);
4683 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4684 if (Offset >= ElementSize)
4685 return nullptr; // The offset points into alignment padding.
4686
4687 // See if any partition must be contained by the element.
4688 if (Offset > 0 || Size < ElementSize) {
4689 if ((Offset + Size) > ElementSize)
4690 return nullptr;
4691 return getTypePartition(DL, ElementTy, Offset, Size);
4692 }
4693 assert(Offset == 0);
4694
4695 if (Size == ElementSize)
4696 return stripAggregateTypeWrapping(DL, ElementTy);
4697
4698 StructType::element_iterator EI = STy->element_begin() + Index,
4699 EE = STy->element_end();
4700 if (EndOffset < SL->getSizeInBytes()) {
4701 unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4702 if (Index == EndIndex)
4703 return nullptr; // Within a single element and its padding.
4704
4705 // Don't try to form "natural" types if the elements don't line up with the
4706 // expected size.
4707 // FIXME: We could potentially recurse down through the last element in the
4708 // sub-struct to find a natural end point.
4709 if (SL->getElementOffset(EndIndex) != EndOffset)
4710 return nullptr;
4711
4712 assert(Index < EndIndex);
4713 EE = STy->element_begin() + EndIndex;
4714 }
4715
4716 // Try to build up a sub-structure.
4717 StructType *SubTy =
4718 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4719 const StructLayout *SubSL = DL.getStructLayout(SubTy);
4720 if (Size != SubSL->getSizeInBytes())
4721 return nullptr; // The sub-struct doesn't have quite the size needed.
4722
4723 return SubTy;
4724}
4725
4726/// Pre-split loads and stores to simplify rewriting.
4727///
4728/// We want to break up the splittable load+store pairs as much as
4729/// possible. This is important to do as a preprocessing step, as once we
4730/// start rewriting the accesses to partitions of the alloca we lose the
4731/// necessary information to correctly split apart paired loads and stores
4732/// which both point into this alloca. The case to consider is something like
4733/// the following:
4734///
4735/// %a = alloca [12 x i8]
4736/// %gep1 = getelementptr i8, ptr %a, i32 0
4737/// %gep2 = getelementptr i8, ptr %a, i32 4
4738/// %gep3 = getelementptr i8, ptr %a, i32 8
4739/// store float 0.0, ptr %gep1
4740/// store float 1.0, ptr %gep2
4741/// %v = load i64, ptr %gep1
4742/// store i64 %v, ptr %gep2
4743/// %f1 = load float, ptr %gep2
4744/// %f2 = load float, ptr %gep3
4745///
4746/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4747/// promote everything so we recover the 2 SSA values that should have been
4748/// there all along.
4749///
4750/// \returns true if any changes are made.
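// Editor's illustration (not from the original source; value names are
// hypothetical): after pre-splitting, the paired i64 load/store in the example
// above becomes roughly
//   %v.0 = load i32, ptr %gep1        ; bytes [0,4)
//   %v.4 = load i32, ptr %gep2        ; bytes [4,8)
//   store i32 %v.0, ptr %gep2         ; bytes [4,8)
//   store i32 %v.4, ptr %gep3         ; bytes [8,12)
// so each 4-byte partition can be rewritten and promoted independently.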
4751bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4752 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4753
4754 // Track the loads and stores which are candidates for pre-splitting here, in
4755 // the order they first appear during the partition scan. These give stable
4756 // iteration order and a basis for tracking which loads and stores we
4757 // actually split.
4758 SmallVector<LoadInst *, 4> Loads;
4759 SmallVector<StoreInst *, 4> Stores;
4760
4761 // We need to accumulate the splits required of each load or store where we
4762 // can find them via a direct lookup. This is important to cross-check loads
4763 // and stores against each other. We also track the slice so that we can kill
4764 // all the slices that end up split.
4765 struct SplitOffsets {
4766 Slice *S;
4767 std::vector<uint64_t> Splits;
4768 };
4769 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4770
4771 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4772 // This is important as we also cannot pre-split stores of those loads!
4773 // FIXME: This is all pretty gross. It means that we can be more aggressive
4774 // in pre-splitting when the load feeding the store happens to come from
4775 // a separate alloca. Put another way, the effectiveness of SROA would be
4776 // decreased by a frontend which just concatenated all of its local allocas
4777 // into one big flat alloca. But defeating such patterns is exactly the job
4778 // SROA is tasked with! Sadly, to not have this discrepancy we would have to
4779 // change store pre-splitting to actually force pre-splitting of the load
4780 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4781 // maybe it would make it more principled?
4782 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4783
4784 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4785 for (auto &P : AS.partitions()) {
4786 for (Slice &S : P) {
4787 Instruction *I = cast<Instruction>(S.getUse()->getUser());
4788 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4789 // If this is a load we have to track that it can't participate in any
4790 // pre-splitting. If this is a store of a load we have to track that
4791 // that load also can't participate in any pre-splitting.
4792 if (auto *LI = dyn_cast<LoadInst>(I))
4793 UnsplittableLoads.insert(LI);
4794 else if (auto *SI = dyn_cast<StoreInst>(I))
4795 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4796 UnsplittableLoads.insert(LI);
4797 continue;
4798 }
4799 assert(P.endOffset() > S.beginOffset() &&
4800 "Empty or backwards partition!");
4801
4802 // Determine if this is a pre-splittable slice.
4803 if (auto *LI = dyn_cast<LoadInst>(I)) {
4804 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4805
4806 // The load must be used exclusively to store into other pointers for
4807 // us to be able to arbitrarily pre-split it. The stores must also be
4808 // simple to avoid changing semantics.
4809 auto IsLoadSimplyStored = [](LoadInst *LI) {
4810 for (User *LU : LI->users()) {
4811 auto *SI = dyn_cast<StoreInst>(LU);
4812 if (!SI || !SI->isSimple())
4813 return false;
4814 }
4815 return true;
4816 };
4817 if (!IsLoadSimplyStored(LI)) {
4818 UnsplittableLoads.insert(LI);
4819 continue;
4820 }
4821
4822 Loads.push_back(LI);
4823 } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4824 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4825 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4826 continue;
4827 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4828 if (!StoredLoad || !StoredLoad->isSimple())
4829 continue;
4830 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4831
4832 Stores.push_back(SI);
4833 } else {
4834 // Other uses cannot be pre-split.
4835 continue;
4836 }
4837
4838 // Record the initial split.
4839 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4840 auto &Offsets = SplitOffsetsMap[I];
4841 assert(Offsets.Splits.empty() &&
4842 "Should not have splits the first time we see an instruction!");
4843 Offsets.S = &S;
4844 Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4845 }
4846
4847 // Now scan the already split slices, and add a split for any of them which
4848 // we're going to pre-split.
4849 for (Slice *S : P.splitSliceTails()) {
4850 auto SplitOffsetsMapI =
4851 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4852 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4853 continue;
4854 auto &Offsets = SplitOffsetsMapI->second;
4855
4856 assert(Offsets.S == S && "Found a mismatched slice!");
4857 assert(!Offsets.Splits.empty() &&
4858 "Cannot have an empty set of splits on the second partition!");
4859 assert(Offsets.Splits.back() ==
4860 P.beginOffset() - Offsets.S->beginOffset() &&
4861 "Previous split does not end where this one begins!");
4862
4863 // Record each split. The last partition's end isn't needed as the size
4864 // of the slice dictates that.
4865 if (S->endOffset() > P.endOffset())
4866 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4867 }
4868 }
4869
4870 // We may have split loads where some of their stores are split stores. For
4871 // such loads and stores, we can only pre-split them if their splits exactly
4872 // match relative to their starting offset. We have to verify this prior to
4873 // any rewriting.
4874 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4875 // Lookup the load we are storing in our map of split
4876 // offsets.
4877 auto *LI = cast<LoadInst>(SI->getValueOperand());
4878 // If it was completely unsplittable, then we're done,
4879 // and this store can't be pre-split.
4880 if (UnsplittableLoads.count(LI))
4881 return true;
4882
4883 auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4884 if (LoadOffsetsI == SplitOffsetsMap.end())
4885 return false; // Unrelated loads are definitely safe.
4886 auto &LoadOffsets = LoadOffsetsI->second;
4887
4888 // Now lookup the store's offsets.
4889 auto &StoreOffsets = SplitOffsetsMap[SI];
4890
4891 // If the relative offsets of each split in the load and
4892 // store match exactly, then we can split them and we
4893 // don't need to remove them here.
4894 if (LoadOffsets.Splits == StoreOffsets.Splits)
4895 return false;
4896
4897 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4898 << " " << *LI << "\n"
4899 << " " << *SI << "\n");
4900
4901 // We've found a store and load that we need to split
4902 // with mismatched relative splits. Just give up on them
4903 // and remove both instructions from our list of
4904 // candidates.
4905 UnsplittableLoads.insert(LI);
4906 return true;
4907 });
4908 // Now we have to go *back* through all the stores, because a later store may
4909 // have caused an earlier store's load to become unsplittable and if it is
4910 // unsplittable for the later store, then we can't rely on it being split in
4911 // the earlier store either.
4912 llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4913 auto *LI = cast<LoadInst>(SI->getValueOperand());
4914 return UnsplittableLoads.count(LI);
4915 });
4916 // Once we've established all the loads that can't be split for some reason,
4917 // filter any that made it into our list out.
4918 llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4919 return UnsplittableLoads.count(LI);
4920 });
4921
4922 // If no loads or stores are left, there is no pre-splitting to be done for
4923 // this alloca.
4924 if (Loads.empty() && Stores.empty())
4925 return false;
4926
4927 // From here on, we can't fail and will be building new accesses, so rig up
4928 // an IR builder.
4929 IRBuilderTy IRB(&AI);
4930
4931 // Collect the new slices which we will merge into the alloca slices.
4932 SmallVector<Slice, 4> NewSlices;
4933
4934 // Track any allocas we end up splitting loads and stores for so we iterate
4935 // on them.
4936 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4937
4938 // At this point, we have collected all of the loads and stores we can
4939 // pre-split, and the specific splits needed for them. We actually do the
4940 // splitting in a specific order to handle the case where one of the loads
4941 // is the value operand of one of the stores.
4942 //
4943 // First, we rewrite all of the split loads, and just accumulate each split
4944 // load in a parallel structure. We also build the slices for them and append
4945 // them to the alloca slices.
4946 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4947 std::vector<LoadInst *> SplitLoads;
4948 const DataLayout &DL = AI.getDataLayout();
4949 for (LoadInst *LI : Loads) {
4950 SplitLoads.clear();
4951
4952 auto &Offsets = SplitOffsetsMap[LI];
4953 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4954 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4955 "Load must have type size equal to store size");
4956 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4957 "Load must be >= slice size");
4958
4959 uint64_t BaseOffset = Offsets.S->beginOffset();
4960 assert(BaseOffset + SliceSize > BaseOffset &&
4961 "Cannot represent alloca access size using 64-bit integers!");
4962
4963 Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4964 IRB.SetInsertPoint(LI);
4965
4966 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4967
4968 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4969 int Idx = 0, Size = Offsets.Splits.size();
4970 for (;;) {
4971 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4972 auto AS = LI->getPointerAddressSpace();
4973 auto *PartPtrTy = LI->getPointerOperandType();
4974 LoadInst *PLoad = IRB.CreateAlignedLoad(
4975 PartTy,
4976 getAdjustedPtr(IRB, DL, BasePtr,
4977 APInt(DL.getIndexSizeInBits(AS), PartOffset),
4978 PartPtrTy, BasePtr->getName() + "."),
4979 getAdjustedAlignment(LI, PartOffset),
4980 /*IsVolatile*/ false, LI->getName());
4981 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4982 LLVMContext::MD_access_group});
4983
4984 // Append this load onto the list of split loads so we can find it later
4985 // to rewrite the stores.
4986 SplitLoads.push_back(PLoad);
4987
4988 // Now build a new slice for the alloca.
4989 NewSlices.push_back(
4990 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4991 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4992 /*IsSplittable*/ false, nullptr));
4993 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4994 << ", " << NewSlices.back().endOffset()
4995 << "): " << *PLoad << "\n");
4996
4997 // See if we've handled all the splits.
4998 if (Idx >= Size)
4999 break;
5000
5001 // Setup the next partition.
5002 PartOffset = Offsets.Splits[Idx];
5003 ++Idx;
5004 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
5005 }
5006
5007 // Now that we have the split loads, do the slow walk over all uses of the
5008 // load and rewrite them as split stores, or save the split loads to use
5009 // below if the store is going to be split there anyways.
5010 bool DeferredStores = false;
5011 for (User *LU : LI->users()) {
5012 StoreInst *SI = cast<StoreInst>(LU);
5013 if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
5014 DeferredStores = true;
5015 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
5016 << "\n");
5017 continue;
5018 }
5019
5020 Value *StoreBasePtr = SI->getPointerOperand();
5021 IRB.SetInsertPoint(SI);
5022 AAMDNodes AATags = SI->getAAMetadata();
5023
5024 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
5025
5026 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
5027 LoadInst *PLoad = SplitLoads[Idx];
5028 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
5029 auto *PartPtrTy = SI->getPointerOperandType();
5030
5031 auto AS = SI->getPointerAddressSpace();
5032 StoreInst *PStore = IRB.CreateAlignedStore(
5033 PLoad,
5034 getAdjustedPtr(IRB, DL, StoreBasePtr,
5035 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5036 PartPtrTy, StoreBasePtr->getName() + "."),
5037 getAdjustedAlignment(SI, PartOffset),
5038 /*IsVolatile*/ false);
5039 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5040 LLVMContext::MD_access_group,
5041 LLVMContext::MD_DIAssignID});
5042
5043 if (AATags)
5044 PStore->setAAMetadata(
5045 AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
5046 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
5047 }
5048
5049 // We want to immediately iterate on any allocas impacted by splitting
5050 // this store, and we have to track any promotable alloca (indicated by
5051 // a direct store) as needing to be resplit because it is no longer
5052 // promotable.
5053 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
5054 ResplitPromotableAllocas.insert(OtherAI);
5055 Worklist.insert(OtherAI);
5056 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5057 StoreBasePtr->stripInBoundsOffsets())) {
5058 Worklist.insert(OtherAI);
5059 }
5060
5061 // Mark the original store as dead.
5062 DeadInsts.push_back(SI);
5063 }
5064
5065 // Save the split loads if there are deferred stores among the users.
5066 if (DeferredStores)
5067 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
5068
5069 // Mark the original load as dead and kill the original slice.
5070 DeadInsts.push_back(LI);
5071 Offsets.S->kill();
5072 }
5073
5074 // Second, we rewrite all of the split stores. At this point, we know that
5075 // all loads from this alloca have been split already. For stores of such
5076 // loads, we can simply look up the pre-existing split loads. For stores of
5077 // other loads, we split those loads first and then write split stores of
5078 // them.
5079 for (StoreInst *SI : Stores) {
5080 auto *LI = cast<LoadInst>(SI->getValueOperand());
5081 IntegerType *Ty = cast<IntegerType>(LI->getType());
5082 assert(Ty->getBitWidth() % 8 == 0);
5083 uint64_t StoreSize = Ty->getBitWidth() / 8;
5084 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
5085
5086 auto &Offsets = SplitOffsetsMap[SI];
5087 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
5088 "Slice size should always match load size exactly!");
5089 uint64_t BaseOffset = Offsets.S->beginOffset();
5090 assert(BaseOffset + StoreSize > BaseOffset &&
5091 "Cannot represent alloca access size using 64-bit integers!");
5092
5093 Value *LoadBasePtr = LI->getPointerOperand();
5094 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
5095
5096 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
5097
5098 // Check whether we have an already split load.
5099 auto SplitLoadsMapI = SplitLoadsMap.find(LI);
5100 std::vector<LoadInst *> *SplitLoads = nullptr;
5101 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5102 SplitLoads = &SplitLoadsMapI->second;
5103 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5104 "Too few split loads for the number of splits in the store!");
5105 } else {
5106 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5107 }
5108
5109 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5110 int Idx = 0, Size = Offsets.Splits.size();
5111 for (;;) {
5112 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
5113 auto *LoadPartPtrTy = LI->getPointerOperandType();
5114 auto *StorePartPtrTy = SI->getPointerOperandType();
5115
5116 // Either lookup a split load or create one.
5117 LoadInst *PLoad;
5118 if (SplitLoads) {
5119 PLoad = (*SplitLoads)[Idx];
5120 } else {
5121 IRB.SetInsertPoint(LI);
5122 auto AS = LI->getPointerAddressSpace();
5123 PLoad = IRB.CreateAlignedLoad(
5124 PartTy,
5125 getAdjustedPtr(IRB, DL, LoadBasePtr,
5126 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5127 LoadPartPtrTy, LoadBasePtr->getName() + "."),
5128 getAdjustedAlignment(LI, PartOffset),
5129 /*IsVolatile*/ false, LI->getName());
5130 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
5131 LLVMContext::MD_access_group});
5132 }
5133
5134 // And store this partition.
5135 IRB.SetInsertPoint(SI);
5136 auto AS = SI->getPointerAddressSpace();
5137 StoreInst *PStore = IRB.CreateAlignedStore(
5138 PLoad,
5139 getAdjustedPtr(IRB, DL, StoreBasePtr,
5140 APInt(DL.getIndexSizeInBits(AS), PartOffset),
5141 StorePartPtrTy, StoreBasePtr->getName() + "."),
5142 getAdjustedAlignment(SI, PartOffset),
5143 /*IsVolatile*/ false);
5144 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
5145 LLVMContext::MD_access_group});
5146
5147 // Now build a new slice for the alloca.
5148 // ProtectedFieldDisc==nullptr is a lie, but it doesn't matter because we
5149 // already determined that all accesses are consistent.
5150 NewSlices.push_back(
5151 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5152 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
5153 /*IsSplittable*/ false, nullptr));
5154 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5155 << ", " << NewSlices.back().endOffset()
5156 << "): " << *PStore << "\n");
5157 if (!SplitLoads) {
5158 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5159 }
5160
5161 // See if we've finished all the splits.
5162 if (Idx >= Size)
5163 break;
5164
5165 // Setup the next partition.
5166 PartOffset = Offsets.Splits[Idx];
5167 ++Idx;
5168 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5169 }
5170
5171 // We want to immediately iterate on any allocas impacted by splitting
5172 // this load, which is only relevant if it isn't a load of this alloca and
5173 // thus we didn't already split the loads above. We also have to keep track
5174 // of any promotable allocas we split loads on as they can no longer be
5175 // promoted.
5176 if (!SplitLoads) {
5177 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
5178 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5179 ResplitPromotableAllocas.insert(OtherAI);
5180 Worklist.insert(OtherAI);
5181 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5182 LoadBasePtr->stripInBoundsOffsets())) {
5183 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5184 Worklist.insert(OtherAI);
5185 }
5186 }
5187
5188 // Mark the original store as dead now that we've split it up and kill its
5189 // slice. Note that we leave the original load in place unless this store
5190 // was its only use. It may in turn be split up if it is an alloca load
5191 // for some other alloca, but it may be a normal load. This may introduce
5192 // redundant loads, but where those can be merged the rest of the optimizer
5193 // should handle the merging, and this uncovers SSA splits which is more
5194 // important. In practice, the original loads will almost always be fully
5195 // split and removed eventually, and the splits will be merged by any
5196 // trivial CSE, including instcombine.
5197 if (LI->hasOneUse()) {
5198 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5199 DeadInsts.push_back(LI);
5200 }
5201 DeadInsts.push_back(SI);
5202 Offsets.S->kill();
5203 }
5204
5205 // Remove the killed slices that have been pre-split.
5206 llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
5207
5208 // Insert our new slices. This will sort and merge them into the sorted
5209 // sequence.
5210 AS.insert(NewSlices);
5211
5212 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5213#ifndef NDEBUG
5214 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5215 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5216#endif
5217
5218 // Finally, don't try to promote any allocas that now require re-splitting.
5219 // They have already been added to the worklist above.
5220 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5221
5222 return true;
5223}
5224
5225/// Rewrite an alloca partition's users.
5226///
5227/// This routine drives both of the rewriting goals of the SROA pass. It tries
5228/// to rewrite uses of an alloca partition to be conducive for SSA value
5229/// promotion. If the partition needs a new, more refined alloca, this will
5230/// build that new alloca, preserving as much type information as possible, and
5231/// rewrite the uses of the old alloca to point at the new one and have the
5232/// appropriate new offsets. It also evaluates how successful the rewrite was
5233/// at enabling promotion and if it was successful queues the alloca to be
5234/// promoted.
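// Editor's illustration (not from the original source; names are hypothetical):
// for %a = alloca { i32, float } whose slices form the disjoint partitions
// [0,4) and [4,8), this builds two new allocas along the lines of
//   %a.sroa.0 = alloca i32
//   %a.sroa.1 = alloca float
// redirects each partition's loads and stores to them, and queues each new
// alloca for promotion unless PHI or select speculation blocks it.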
5235AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
5236 Partition &P) {
5237 // Try to compute a friendly type for this partition of the alloca. This
5238 // won't always succeed, in which case we fall back to a legal integer type
5239 // or an i8 array of an appropriate size.
5240 Type *SliceTy = nullptr;
5241 const DataLayout &DL = AI.getDataLayout();
5242 unsigned VScale = AI.getFunction()->getVScaleValue();
5243
5244 std::pair<Type *, IntegerType *> CommonUseTy =
5245 findCommonType(P.begin(), P.end(), P.endOffset());
5246 // Do all uses operate on the same type?
5247 if (CommonUseTy.first) {
5248 TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
5249 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size())
5250 SliceTy = CommonUseTy.first;
5251 }
5252 // If not, can we find an appropriate subtype in the original allocated type?
5253 if (!SliceTy)
5254 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
5255 P.beginOffset(), P.size()))
5256 SliceTy = TypePartitionTy;
5257
5258 // If still not, can we use the largest bitwidth integer type used?
5259 if (!SliceTy && CommonUseTy.second)
5260 if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size())
5261 SliceTy = CommonUseTy.second;
5262 if ((!SliceTy || (SliceTy->isArrayTy() &&
5263 SliceTy->getArrayElementType()->isIntegerTy())) &&
5264 DL.isLegalInteger(P.size() * 8)) {
5265 SliceTy = Type::getIntNTy(*C, P.size() * 8);
5266 }
5267
5268 if (!SliceTy)
5269 SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
5270 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
5271
5272 bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
5273
5274 VectorType *VecTy =
5275 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
5276 if (VecTy)
5277 SliceTy = VecTy;
5278
5279 // Check for the case where we're going to rewrite to a new alloca of the
5280 // exact same type as the original, and with the same access offsets. In that
5281 // case, re-use the existing alloca, but still run through the rewriter to
5282 // perform phi and select speculation.
5283 // P.beginOffset() can be non-zero even with the same type in a case with
5284 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5285 AllocaInst *NewAI;
5286 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5287 NewAI = &AI;
5288 // FIXME: We should be able to bail at this point with "nothing changed".
5289 // FIXME: We might want to defer PHI speculation until after here.
5290 // FIXME: return nullptr;
5291 } else {
5292 // Make sure the alignment is compatible with P.beginOffset().
5293 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
5294 // If we will get at least this much alignment from the type alone, leave
5295 // the alloca's alignment unconstrained.
5296 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
5297 NewAI = new AllocaInst(
5298 SliceTy, AI.getAddressSpace(), nullptr,
5299 IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
5300 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5301 AI.getIterator());
5302 // Copy the old AI debug location over to the new one.
5303 NewAI->setDebugLoc(AI.getDebugLoc());
5304 ++NumNewAllocas;
5305 }
5306
5307 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5308 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5309
5310 // Track the high watermark on the worklist as it is only relevant for
5311 // promoted allocas. We will reset it to this point if the alloca is not in
5312 // fact scheduled for promotion.
5313 unsigned PPWOldSize = PostPromotionWorklist.size();
5314 unsigned NumUses = 0;
5315 SmallSetVector<PHINode *, 8> PHIUsers;
5316 SmallSetVector<SelectInst *, 8> SelectUsers;
5317
5318 AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
5319 P.endOffset(), IsIntegerPromotable, VecTy,
5320 PHIUsers, SelectUsers);
5321 bool Promotable = true;
5322 // Check whether we can have tree-structured merge.
5323 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5324 NumUses += DeletedValues->size() + 1;
5325 for (Value *V : *DeletedValues)
5326 DeadInsts.push_back(V);
5327 } else {
5328 for (Slice *S : P.splitSliceTails()) {
5329 Promotable &= Rewriter.visit(S);
5330 ++NumUses;
5331 }
5332 for (Slice &S : P) {
5333 Promotable &= Rewriter.visit(&S);
5334 ++NumUses;
5335 }
5336 }
5337
5338 NumAllocaPartitionUses += NumUses;
5339 MaxUsesPerAllocaPartition.updateMax(NumUses);
5340
5341 // Now that we've processed all the slices in the new partition, check if any
5342 // PHIs or Selects would block promotion.
5343 for (PHINode *PHI : PHIUsers)
5344 if (!isSafePHIToSpeculate(*PHI)) {
5345 Promotable = false;
5346 PHIUsers.clear();
5347 SelectUsers.clear();
5348 break;
5349 }
5350
5351 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5352 NewSelectsToRewrite;
5353 NewSelectsToRewrite.reserve(SelectUsers.size());
5354 for (SelectInst *Sel : SelectUsers) {
5355 std::optional<RewriteableMemOps> Ops =
5356 isSafeSelectToSpeculate(*Sel, PreserveCFG);
5357 if (!Ops) {
5358 Promotable = false;
5359 PHIUsers.clear();
5360 SelectUsers.clear();
5361 NewSelectsToRewrite.clear();
5362 break;
5363 }
5364 NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5365 }
5366
5367 if (Promotable) {
5368 for (Use *U : AS.getDeadUsesIfPromotable()) {
5369 auto *OldInst = dyn_cast<Instruction>(U->get());
5370 Value::dropDroppableUse(*U);
5371 if (OldInst)
5372 if (isInstructionTriviallyDead(OldInst))
5373 DeadInsts.push_back(OldInst);
5374 }
5375 if (PHIUsers.empty() && SelectUsers.empty()) {
5376 // Promote the alloca.
5377 PromotableAllocas.insert(NewAI);
5378 } else {
5379 // If we have either PHIs or Selects to speculate, add them to those
5380 // worklists and re-queue the new alloca so that we promote it on the
5381 // next iteration.
5382 SpeculatablePHIs.insert_range(PHIUsers);
5383 SelectsToRewrite.reserve(SelectsToRewrite.size() +
5384 NewSelectsToRewrite.size());
5385 for (auto &&KV : llvm::make_range(
5386 std::make_move_iterator(NewSelectsToRewrite.begin()),
5387 std::make_move_iterator(NewSelectsToRewrite.end())))
5388 SelectsToRewrite.insert(std::move(KV));
5389 Worklist.insert(NewAI);
5390 }
5391 } else {
5392 // Drop any post-promotion work items if promotion didn't happen.
5393 while (PostPromotionWorklist.size() > PPWOldSize)
5394 PostPromotionWorklist.pop_back();
5395
5396 // We couldn't promote and we didn't create a new partition, nothing
5397 // happened.
5398 if (NewAI == &AI)
5399 return nullptr;
5400
5401 // If we can't promote the alloca, iterate on it to check for new
5402 // refinements exposed by splitting the current alloca. Don't iterate on an
5403 // alloca which didn't actually change and didn't get promoted.
5404 Worklist.insert(NewAI);
5405 }
5406
5407 return NewAI;
5408}
5409
5410// There isn't a shared interface to get the "address" parts out of a
5411// dbg.declare and dbg.assign, so provide some wrappers.
5412static bool isKillAddress(const DbgVariableRecord *DVR) {
5413 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5414 return DVR->isKillAddress();
5415 return DVR->isKillLocation();
5416}
5417
5418static DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5419 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5420 return DVR->getAddressExpression();
5421 return DVR->getExpression();
5422}
5423
5424/// Create or replace an existing fragment in a DIExpression with \p Frag.
5425/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5426/// operation, add \p BitExtractOffset to the offset part.
5427///
5428/// Returns the new expression, or nullptr if this fails (see details below).
5429///
5430/// This function is similar to DIExpression::createFragmentExpression except
5431/// for 3 important distinctions:
5432/// 1. The new fragment isn't relative to an existing fragment.
5433/// 2. It assumes the computed location is a memory location. This means we
5434/// don't need to perform checks that creating the fragment preserves the
5435/// expression semantics.
5436/// 3. Existing extract_bits are modified independently of fragment changes
5437/// using \p BitExtractOffset. A change to the fragment offset or size
5438/// may affect a bit extract. But a bit extract offset can change
5439/// independently of the fragment dimensions.
5440///
5441/// Returns the new expression, or nullptr if one couldn't be created.
5442/// Ideally this is only used to signal that a bit-extract has become
5443/// zero-sized (and thus the new debug record has no size and can be
5444/// dropped), however, it fails for other reasons too - see the FIXME below.
5445///
5446/// FIXME: To keep the change that introduces this function NFC it bails
5447/// in some situations unnecessarily, e.g. when fragment and bit extract
5448/// sizes differ.
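// Editor's illustration (not from the original source): rewriting
// !DIExpression(DW_OP_deref) with Frag = {OffsetInBits: 32, SizeInBits: 32}
// yields !DIExpression(DW_OP_deref, DW_OP_LLVM_fragment, 32, 32); an expression
// that already contains DW_OP_LLVM_extract_bits_[sz]ext instead has only its
// extract offset shifted by BitExtractOffset and no fragment appended (or the
// function bails, per the FIXME above).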
5449static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5450 DIExpression::FragmentInfo Frag,
5451 int64_t BitExtractOffset) {
5452 SmallVector<uint64_t, 8> Ops;
5453 bool HasFragment = false;
5454 bool HasBitExtract = false;
5455
5456 for (auto &Op : Expr->expr_ops()) {
5457 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5458 HasFragment = true;
5459 continue;
5460 }
5461 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5462 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5463 HasBitExtract = true;
5464 int64_t ExtractOffsetInBits = Op.getArg(0);
5465 int64_t ExtractSizeInBits = Op.getArg(1);
5466
5467 // DIExpression::createFragmentExpression doesn't know how to handle
5468 // a fragment that is smaller than the extract. Copy the behaviour
5469 // (bail) to avoid non-NFC changes.
5470 // FIXME: Don't do this.
5471 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5472 return nullptr;
5473
5474 assert(BitExtractOffset <= 0);
5475 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5476
5477 // DIExpression::createFragmentExpression doesn't know what to do
5478 // if the new extract starts "outside" the existing one. Copy the
5479 // behaviour (bail) to avoid non-NFC changes.
5480 // FIXME: Don't do this.
5481 if (AdjustedOffset < 0)
5482 return nullptr;
5483
5484 Ops.push_back(Op.getOp());
5485 Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5486 Ops.push_back(ExtractSizeInBits);
5487 continue;
5488 }
5489 Op.appendToVector(Ops);
5490 }
5491
5492 // Unsupported by createFragmentExpression, so don't support it here yet to
5493 // preserve NFC-ness.
5494 if (HasFragment && HasBitExtract)
5495 return nullptr;
5496
5497 if (!HasBitExtract) {
5498 Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5499 Ops.push_back(Frag.OffsetInBits);
5500 Ops.push_back(Frag.SizeInBits);
5501 }
5502 return DIExpression::get(Expr->getContext(), Ops);
5503}
5504
5505/// Insert a new DbgRecord.
5506/// \p Orig Original to copy record type, debug loc and variable from, and
5507/// additionally value and value expression for dbg_assign records.
5508/// \p NewAddr Location's new base address.
5509/// \p NewAddrExpr New expression to apply to address.
5510/// \p BeforeInst Insert position.
5511/// \p NewFragment New fragment (absolute, non-relative).
5512/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5513static void
5514insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5515 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5516 std::optional<DIExpression::FragmentInfo> NewFragment,
5517 int64_t BitExtractAdjustment) {
5518 (void)DIB;
5519
5520 // A dbg_assign puts fragment info in the value expression only. The address
5521 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5522 // new fragment info into NewAddrExpr (as it only has one expression).
5523 DIExpression *NewFragmentExpr =
5524 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5525 if (NewFragment)
5526 NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5527 BitExtractAdjustment);
5528 if (!NewFragmentExpr)
5529 return;
5530
5531 if (Orig->isDbgDeclare()) {
5532 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5533 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5534 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5535 BeforeInst->getIterator());
5536 return;
5537 }
5538
5539 if (Orig->isDbgValue()) {
5540 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5541 NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5542 // Drop debug information if the expression doesn't start with a
5543 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5544 // describes the address of the alloca rather than the value inside it.
5545 if (!NewFragmentExpr->startsWithDeref())
5546 DVR->setKillAddress();
5547 BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5548 BeforeInst->getIterator());
5549 return;
5550 }
5551
5552 // Apply a DIAssignID to the store if it doesn't already have it.
5553 if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5554 NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5555 DIAssignID::getDistinct(NewAddr->getContext()));
5556 }
5557
5558 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5559 NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5560 NewAddrExpr, Orig->getDebugLoc());
5561 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5562 (void)NewAssign;
5563}
5564
5565/// Walks the slices of an alloca and forms partitions based on them,
5566/// rewriting each of their uses.
5567bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5568 if (AS.begin() == AS.end())
5569 return false;
5570
5571 unsigned NumPartitions = 0;
5572 bool Changed = false;
5573 const DataLayout &DL = AI.getModule()->getDataLayout();
5574
5575 // First try to pre-split loads and stores.
5576 Changed |= presplitLoadsAndStores(AI, AS);
5577
5578 // Now that we have identified any pre-splitting opportunities,
5579 // mark loads and stores unsplittable except for the following case.
5580 // We leave a slice splittable if all other slices are disjoint or fully
5581 // included in the slice, such as whole-alloca loads and stores.
5582 // If we fail to split these during pre-splitting, we want to force them
5583 // to be rewritten into a partition.
5584 bool IsSorted = true;
5585
5586 uint64_t AllocaSize =
5587 DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
5588 const uint64_t MaxBitVectorSize = 1024;
5589 if (AllocaSize <= MaxBitVectorSize) {
5590 // If a byte boundary is included in any load or store, a slice starting or
5591 // ending at the boundary is not splittable.
5592 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5593 for (Slice &S : AS)
5594 for (unsigned O = S.beginOffset() + 1;
5595 O < S.endOffset() && O < AllocaSize; O++)
5596 SplittableOffset.reset(O);
5597
5598 for (Slice &S : AS) {
5599 if (!S.isSplittable())
5600 continue;
5601
5602 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5603 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5604 continue;
5605
5606 if (isa<LoadInst>(S.getUse()->getUser()) ||
5607 isa<StoreInst>(S.getUse()->getUser())) {
5608 S.makeUnsplittable();
5609 IsSorted = false;
5610 }
5611 }
5612 } else {
5613 // We only allow whole-alloca splittable loads and stores
5614 // for a large alloca to avoid creating too large BitVector.
5615 for (Slice &S : AS) {
5616 if (!S.isSplittable())
5617 continue;
5618
5619 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5620 continue;
5621
5622 if (isa<LoadInst>(S.getUse()->getUser()) ||
5623 isa<StoreInst>(S.getUse()->getUser())) {
5624 S.makeUnsplittable();
5625 IsSorted = false;
5626 }
5627 }
5628 }
5629
5630 if (!IsSorted)
5631 llvm::stable_sort(AS);
5632
5633 /// Describes the allocas introduced by rewritePartition in order to migrate
5634 /// the debug info.
5635 struct Fragment {
5636 AllocaInst *Alloca;
5637 uint64_t Offset;
5638 uint64_t Size;
5639 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5640 : Alloca(AI), Offset(O), Size(S) {}
5641 };
5642 SmallVector<Fragment, 4> Fragments;
5643
5644 // Rewrite each partition.
5645 for (auto &P : AS.partitions()) {
5646 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5647 Changed = true;
5648 if (NewAI != &AI) {
5649 uint64_t SizeOfByte = 8;
5650 uint64_t AllocaSize =
5651 DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
5652 // Don't include any padding.
5653 uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
5654 Fragments.push_back(
5655 Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5656 }
5657 }
5658 ++NumPartitions;
5659 }
5660
5661 NumAllocaPartitions += NumPartitions;
5662 MaxPartitionsPerAlloca.updateMax(NumPartitions);
5663
5664 // Migrate debug information from the old alloca to the new alloca(s)
5665 // and the individual partitions.
5666 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5667 // Can't overlap with undef memory.
5668 if (isKillAddress(DbgVariable))
5669 return;
5670
5671 const Value *DbgPtr = DbgVariable->getAddress();
5672 DIExpression::FragmentInfo VarFrag =
5673 DbgVariable->getFragmentOrEntireVariable();
5674 // Get the address expression constant offset if one exists and the ops
5675 // that come after it.
5676 int64_t CurrentExprOffsetInBytes = 0;
5677 SmallVector<uint64_t> PostOffsetOps;
5678 if (!getAddressExpression(DbgVariable)
5679 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5680 return; // Couldn't interpret this DIExpression - drop the var.
5681
5682 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5683 int64_t ExtractOffsetInBits = 0;
5684 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5685 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5686 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5687 ExtractOffsetInBits = Op.getArg(0);
5688 break;
5689 }
5690 }
5691
5692 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5693 for (auto Fragment : Fragments) {
5694 int64_t OffsetFromLocationInBits;
5695 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5696 // Find the variable fragment that the new alloca slice covers.
5697 // Drop debug info for this variable fragment if we can't compute an
5698 // intersect between it and the alloca slice.
5699 if (!DIExpression::calculateFragmentIntersect(
5700 DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5701 CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5702 NewDbgFragment, OffsetFromLocationInBits))
5703 continue; // Do not migrate this fragment to this slice.
5704
5705 // Zero sized fragment indicates there's no intersect between the variable
5706 // fragment and the alloca slice. Skip this slice for this variable
5707 // fragment.
5708 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5709 continue; // Do not migrate this fragment to this slice.
5710
5711 // No fragment indicates DbgVariable's variable or fragment exactly
5712 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5713 if (!NewDbgFragment)
5714 NewDbgFragment = DbgVariable->getFragment();
5715
5716 // Reduce the new expression offset by the bit-extract offset since
5717 // we'll be keeping that.
5718 int64_t OffestFromNewAllocaInBits =
5719 OffsetFromLocationInBits - ExtractOffsetInBits;
5720 // We need to adjust an existing bit extract if the offset expression
5721 // can't eat the slack (i.e., if the new offset would be negative).
5722 int64_t BitExtractOffset =
5723 std::min<int64_t>(0, OffestFromNewAllocaInBits);
5724 // The magnitude of a negative value indicates the number of bits into
5725 // the existing variable fragment that the memory region begins. The new
5726 // variable fragment already excludes those bits - the new DbgPtr offset
5727 // only needs to be applied if it's positive.
5728 OffestFromNewAllocaInBits =
5729 std::max(int64_t(0), OffestFromNewAllocaInBits);
5730
5731 // Rebuild the expression:
5732 // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5733 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5734 // address expression but the value expression instead.
5735 DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5736 if (OffestFromNewAllocaInBits > 0) {
5737 int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5738 NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5739 }
5740
5741 // Remove any existing intrinsics on the new alloca describing
5742 // the variable fragment.
5743 auto RemoveOne = [DbgVariable](auto *OldDII) {
5744 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5745 return LHS->getVariable() == RHS->getVariable() &&
5746 LHS->getDebugLoc()->getInlinedAt() ==
5747 RHS->getDebugLoc()->getInlinedAt();
5748 };
5749 if (SameVariableFragment(OldDII, DbgVariable))
5750 OldDII->eraseFromParent();
5751 };
5752 for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5753 for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5754 insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5755 NewDbgFragment, BitExtractOffset);
5756 }
5757 };
5758
5759 // Migrate debug information from the old alloca to the new alloca(s)
5760 // and the individual partitions.
5761 for_each(findDVRDeclares(&AI), MigrateOne);
5762 for_each(findDVRValues(&AI), MigrateOne);
5763 for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5764
5765 return Changed;
5766}
5767
5768/// Clobber a use with poison, deleting the used value if it becomes dead.
5769void SROA::clobberUse(Use &U) {
5770 Value *OldV = U;
5771 // Replace the use with a poison value.
5772 U = PoisonValue::get(OldV->getType());
5773
5774 // Check for this making an instruction dead. We have to garbage collect
5775 // all the dead instructions to ensure the uses of any alloca end up being
5776 // minimal.
5777 if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5778 if (isInstructionTriviallyDead(OldI)) {
5779 DeadInsts.push_back(OldI);
5780 }
5781}
5782
5783/// A basic LoadAndStorePromoter that does not remove store nodes.
5784class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5785 public:
5786 BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5787 Type *ZeroType)
5788 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5789 bool shouldDelete(Instruction *I) const override {
5790 return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5791 }
5792
5793 Value *getValueToUseForAlloca(Instruction *I) const override {
5794 return UndefValue::get(ZeroType);
5795 }
5796
5797private:
5798 Type *ZeroType;
5799};
5800
5801bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5802 // Look through each "partition", looking for slices with the same start/end
5803 // that do not overlap with any before them. The slices are sorted by
5804 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5805 // sophisticated algorithm that takes splittable slices into account.
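// Editor's illustration (not from the original source; names are hypothetical):
// this covers read-only-escaped allocas such as
//   %a = alloca i32
//   store i32 %x, ptr %a
//   call void @observe(ptr %a)   ; known not to write through %a
//   %y = load i32, ptr %a
// where the load is rewritten to use %x directly while the store and the
// escaping call stay in place.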
5806 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5807 bool AllSameAndValid = true;
5808 Type *PartitionType = nullptr;
5809 SmallVector<Instruction *, 8> Insts;
5810 uint64_t BeginOffset = 0;
5811 uint64_t EndOffset = 0;
5812
5813 auto Flush = [&]() {
5814 if (AllSameAndValid && !Insts.empty()) {
5815 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5816 << EndOffset << ")\n");
5817 SmallVector<PHINode *, 4> NewPHIs;
5818 SSAUpdater SSA(&NewPHIs);
5819 Insts.push_back(&AI);
5820 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5821 Promoter.run(Insts);
5822 }
5823 AllSameAndValid = true;
5824 PartitionType = nullptr;
5825 Insts.clear();
5826 };
5827
5828 for (Slice &S : AS) {
5829 auto *User = cast<Instruction>(S.getUse()->getUser());
5830 if (isAssumeLikeIntrinsic(User)) {
5831 LLVM_DEBUG({
5832 dbgs() << "Ignoring slice: ";
5833 AS.print(dbgs(), &S);
5834 });
5835 continue;
5836 }
5837 if (S.beginOffset() >= EndOffset) {
5838 Flush();
5839 BeginOffset = S.beginOffset();
5840 EndOffset = S.endOffset();
5841 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5842 if (AllSameAndValid) {
5843 LLVM_DEBUG({
5844 dbgs() << "Slice does not match range [" << BeginOffset << ", "
5845 << EndOffset << ")";
5846 AS.print(dbgs(), &S);
5847 });
5848 AllSameAndValid = false;
5849 }
5850 EndOffset = std::max(EndOffset, S.endOffset());
5851 continue;
5852 }
5853
5854 if (auto *LI = dyn_cast<LoadInst>(User)) {
5855 Type *UserTy = LI->getType();
5856 // LoadAndStorePromoter requires all the types to be the same.
5857 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5858 AllSameAndValid = false;
5859 PartitionType = UserTy;
5860 Insts.push_back(User);
5861 } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5862 Type *UserTy = SI->getValueOperand()->getType();
5863 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5864 AllSameAndValid = false;
5865 PartitionType = UserTy;
5866 Insts.push_back(User);
5867 } else {
5868 AllSameAndValid = false;
5869 }
5870 }
5871
5872 Flush();
5873 return true;
5874}
5875
5876/// Analyze an alloca for SROA.
5877///
5878/// This analyzes the alloca to ensure we can reason about it, builds
5879/// the slices of the alloca, and then hands it off to be split and
5880/// rewritten as needed.
5881std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5882SROA::runOnAlloca(AllocaInst &AI) {
5883 bool Changed = false;
5884 bool CFGChanged = false;
5885
5886 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5887 ++NumAllocasAnalyzed;
5888
5889 // Special case dead allocas, as they're trivial.
5890 if (AI.use_empty()) {
5891 AI.eraseFromParent();
5892 Changed = true;
5893 return {Changed, CFGChanged};
5894 }
5895 const DataLayout &DL = AI.getDataLayout();
5896
5897 // Skip alloca forms that this analysis can't handle.
5898 auto *AT = AI.getAllocatedType();
5899 TypeSize Size = DL.getTypeAllocSize(AT);
5900 if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
5901 Size.getFixedValue() == 0)
5902 return {Changed, CFGChanged};
5903
5904 // First, split any FCA loads and stores touching this alloca to promote
5905 // better splitting and promotion opportunities.
5906 IRBuilderTy IRB(&AI);
5907 AggLoadStoreRewriter AggRewriter(DL, IRB);
5908 Changed |= AggRewriter.rewrite(AI);
5909
5910 // Build the slices using a recursive instruction-visiting builder.
5911 AllocaSlices AS(DL, AI);
5912 LLVM_DEBUG(AS.print(dbgs()));
5913 if (AS.isEscaped())
5914 return {Changed, CFGChanged};
5915
5916 if (AS.isEscapedReadOnly()) {
5917 Changed |= propagateStoredValuesToLoads(AI, AS);
5918 return {Changed, CFGChanged};
5919 }
5920
5921 for (auto &P : AS.partitions()) {
5922 // For now, we can't split if a field is accessed both as a protected field
5923 // and as an unprotected one, because that would require introducing sign and
5924 // auth operations to convert between the protected and non-protected uses,
5925 // and this pass doesn't know how to do that. Also, this case is unlikely to
5926 // occur in normal code.
5927 std::optional<Value *> ProtectedFieldDisc;
5928 auto SliceHasMismatch = [&](Slice &S) {
5929 if (auto *II = dyn_cast<IntrinsicInst>(S.getUse()->getUser()))
5930 if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
5931 II->getIntrinsicID() == Intrinsic::lifetime_end)
5932 return false;
5933 if (!ProtectedFieldDisc)
5934 ProtectedFieldDisc = S.ProtectedFieldDisc;
5935 return *ProtectedFieldDisc != S.ProtectedFieldDisc;
5936 };
5937 for (Slice &S : P)
5938 if (SliceHasMismatch(S))
5939 return {Changed, CFGChanged};
5940 for (Slice *S : P.splitSliceTails())
5941 if (SliceHasMismatch(*S))
5942 return {Changed, CFGChanged};
5943 }
5944
5945 // Delete all the dead users of this alloca before splitting and rewriting it.
5946 for (Instruction *DeadUser : AS.getDeadUsers()) {
5947 // Free up everything used by this instruction.
5948 for (Use &DeadOp : DeadUser->operands())
5949 clobberUse(DeadOp);
5950
5951 // Now replace the uses of this instruction.
5952 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5953
5954 // And mark it for deletion.
5955 DeadInsts.push_back(DeadUser);
5956 Changed = true;
5957 }
5958 for (Use *DeadOp : AS.getDeadOperands()) {
5959 clobberUse(*DeadOp);
5960 Changed = true;
5961 }
5962 for (IntrinsicInst *PFPUser : AS.getPFPUsers()) {
5963 PFPUser->replaceAllUsesWith(PFPUser->getArgOperand(0));
5964
5965 DeadInsts.push_back(PFPUser);
5966 Changed = true;
5967 }
5968
5969 // No slices to split. Leave the dead alloca for a later pass to clean up.
5970 if (AS.begin() == AS.end())
5971 return {Changed, CFGChanged};
5972
5973 Changed |= splitAlloca(AI, AS);
5974
5975 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5976 while (!SpeculatablePHIs.empty())
5977 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5978
5979 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5980 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5981 while (!RemainingSelectsToRewrite.empty()) {
5982 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5983 CFGChanged |=
5984 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5985 }
5986
5987 return {Changed, CFGChanged};
5988}
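//===-- Editorial example (not part of SROA.cpp) ---------------------------===//
// A hedged sketch of the canonical input runOnAlloca() is aimed at: a struct
// alloca whose fields are only stored to and reloaded. After aggregate
// rewriting, slicing, and promotion, the alloca and all of its GEPs, stores,
// and loads disappear and the arguments feed the add directly. Every name in
// this sketch (buildSROAExampleIR, pair_sum) is invented for illustration.
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

static llvm::Function *buildSROAExampleIR(llvm::Module &M) {
  using namespace llvm;
  LLVMContext &C = M.getContext();
  Type *I32 = Type::getInt32Ty(C);
  auto *Pair = StructType::get(C, {I32, I32});
  auto *FTy = FunctionType::get(I32, {I32, I32}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "pair_sum", M);
  IRBuilder<> B(BasicBlock::Create(C, "entry", F));

  // %p = alloca {i32, i32}; store both arguments, reload them, and add.
  Value *P = B.CreateAlloca(Pair, nullptr, "p");
  B.CreateStore(F->getArg(0), B.CreateStructGEP(Pair, P, 0));
  B.CreateStore(F->getArg(1), B.CreateStructGEP(Pair, P, 1));
  Value *A = B.CreateLoad(I32, B.CreateStructGEP(Pair, P, 0), "a");
  Value *Bv = B.CreateLoad(I32, B.CreateStructGEP(Pair, P, 1), "b");
  B.CreateRet(B.CreateAdd(A, Bv, "sum"));
  return F;
}
//===----------------------------------------------------------------------===//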
5989
5990/// Delete the dead instructions accumulated in this run.
5991///
5992/// Recursively deletes the dead instructions we've accumulated. This is done
5993/// at the very end to maximize locality of the recursive delete and to
5994/// minimize the problems of invalidated instruction pointers as such pointers
5995/// are used heavily in the intermediate stages of the algorithm.
5996///
5997/// We also record the alloca instructions deleted here so that they aren't
5998/// subsequently handed to mem2reg to promote.
5999bool SROA::deleteDeadInstructions(
6000 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
6001 bool Changed = false;
6002 while (!DeadInsts.empty()) {
6003 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
6004 if (!I)
6005 continue;
6006 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
6007
6008 // If the instruction is an alloca, find the possible dbg.declare connected
6009 // to it, and remove it too. We must do this before calling RAUW or we will
6010 // not be able to find it.
6011 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
6012 DeletedAllocas.insert(AI);
6013 for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
6014 OldDII->eraseFromParent();
6015 }
6016
6017 at::deleteAssignmentMarkers(I);
6018 I->replaceAllUsesWith(UndefValue::get(I->getType()));
6019
6020 for (Use &Operand : I->operands())
6021 if (Instruction *U = dyn_cast<Instruction>(Operand)) {
6022 // Zero out the operand and see if it becomes trivially dead.
6023 Operand = nullptr;
6024 if (isInstructionTriviallyDead(U))
6025 DeadInsts.push_back(U);
6026 }
6027
6028 ++NumDeleted;
6029 I->eraseFromParent();
6030 Changed = true;
6031 }
6032 return Changed;
6033}
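//===-- Editorial example (not part of SROA.cpp) ---------------------------===//
// For contrast with the hand-rolled worklist above (which must also record
// deleted allocas and strip their dbg.declare records), the generic utility
// from llvm/Transforms/Utils/Local.h performs the same "zero an operand, then
// recursively delete whatever became trivially dead" cleanup. The helper name
// below is invented for this sketch.
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

static void deleteRecursivelyIfDeadSketch(llvm::Instruction *I) {
  // Deletes I, then any operands of I that become trivially dead in turn.
  if (llvm::isInstructionTriviallyDead(I))
    llvm::RecursivelyDeleteTriviallyDeadInstructions(I);
}
//===----------------------------------------------------------------------===//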
6034/// Promote the allocas, using the best available technique.
6035///
6036/// This attempts to promote whatever allocas have been identified as viable in
6037/// the PromotableAllocas list. If that list is empty, there is nothing to do.
6038/// This function returns whether any promotion occurred.
6039bool SROA::promoteAllocas() {
6040 if (PromotableAllocas.empty())
6041 return false;
6042
6043 if (SROASkipMem2Reg) {
6044 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
6045 } else {
6046 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
6047 NumPromoted += PromotableAllocas.size();
6048 PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
6049 }
6050
6051 PromotableAllocas.clear();
6052 return true;
6053}
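//===-- Editorial example (not part of SROA.cpp) ---------------------------===//
// A minimal sketch of using the same mem2reg primitive outside of SROA:
// collect entry-block allocas that isAllocaPromotable() accepts and hand them
// to PromoteMemToReg in one batch. The helper name is invented here; SROA
// itself reaches this point only for allocas it has already proven promotable.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"

static bool promoteEntryBlockAllocasSketch(llvm::Function &F,
                                           llvm::DominatorTree &DT,
                                           llvm::AssumptionCache *AC) {
  using namespace llvm;
  SmallVector<AllocaInst *, 8> Allocas;
  for (Instruction &I : F.getEntryBlock())
    if (auto *AI = dyn_cast<AllocaInst>(&I))
      if (isAllocaPromotable(AI))
        Allocas.push_back(AI);
  if (Allocas.empty())
    return false;
  PromoteMemToReg(Allocas, DT, AC);
  return true;
}
//===----------------------------------------------------------------------===//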
6054
6055std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
6056 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
6057
6058 const DataLayout &DL = F.getDataLayout();
6059 BasicBlock &EntryBB = F.getEntryBlock();
6060 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
6061 I != E; ++I) {
6062 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
6063 if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
6064 isAllocaPromotable(AI))
6065 PromotableAllocas.insert(AI);
6066 else
6067 Worklist.insert(AI);
6068 }
6069 }
6070
6071 bool Changed = false;
6072 bool CFGChanged = false;
6073 // A set of deleted alloca instruction pointers which should be removed from
6074 // the list of promotable allocas.
6075 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
6076
6077 do {
6078 while (!Worklist.empty()) {
6079 auto [IterationChanged, IterationCFGChanged] =
6080 runOnAlloca(*Worklist.pop_back_val());
6081 Changed |= IterationChanged;
6082 CFGChanged |= IterationCFGChanged;
6083
6084 Changed |= deleteDeadInstructions(DeletedAllocas);
6085
6086 // Remove the deleted allocas from various lists so that we don't try to
6087 // continue processing them.
6088 if (!DeletedAllocas.empty()) {
6089 Worklist.set_subtract(DeletedAllocas);
6090 PostPromotionWorklist.set_subtract(DeletedAllocas);
6091 PromotableAllocas.set_subtract(DeletedAllocas);
6092 DeletedAllocas.clear();
6093 }
6094 }
6095
6096 Changed |= promoteAllocas();
6097
6098 Worklist = PostPromotionWorklist;
6099 PostPromotionWorklist.clear();
6100 } while (!Worklist.empty());
6101
6102 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6103 assert((!CFGChanged || !PreserveCFG) &&
6104 "Should not have modified the CFG when told to preserve it.");
6105
6106 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
6107 for (auto &BB : F) {
6108 RemoveRedundantDbgInstrs(&BB);
6109 }
6110 }
6111
6112 return {Changed, CFGChanged};
6113}
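//===-- Editorial example (not part of SROA.cpp) ---------------------------===//
// A hedged sketch of the driver shape used by runSROA() above: drain one
// worklist, let each step defer follow-up work into a second list, then swap
// and repeat until no new work appears. All names are invented for this
// illustration.
#include "llvm/ADT/SetVector.h"
#include <utility>

template <typename T, typename StepFn>
static bool runToFixedPointSketch(llvm::SetVector<T *> Worklist, StepFn Step) {
  bool Changed = false;
  llvm::SetVector<T *> Deferred;
  do {
    while (!Worklist.empty())
      // Step may enqueue follow-up items (e.g. post-promotion work) into
      // Deferred and reports whether it changed anything.
      Changed |= Step(Worklist.pop_back_val(), Deferred);
    std::swap(Worklist, Deferred);
    Deferred.clear();
  } while (!Worklist.empty());
  return Changed;
}
//===----------------------------------------------------------------------===//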
6114
6115PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6116 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
6117 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
6118 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6119 auto [Changed, CFGChanged] =
6120 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6121 if (!Changed)
6122 return PreservedAnalyses::all();
6123 PreservedAnalyses PA;
6124 if (!CFGChanged)
6125 PA.preserveSet<CFGAnalyses>();
6126 PA.preserve<DominatorTreeAnalysis>();
6127 return PA;
6128}
6129
6130void SROAPass::printPipeline(
6131 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6132 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6133 OS, MapClassName2PassName);
6134 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6135 : "<modify-cfg>");
6136}
6137
6138SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
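//===-- Editorial example (not part of SROA.cpp) ---------------------------===//
// A minimal sketch of running this pass programmatically through the new pass
// manager; the helper name is invented here. The textual equivalent is
// `opt -passes='sroa<preserve-cfg>'` (or '<modify-cfg>'), matching the strings
// printed by printPipeline() above.
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/SROA.h"

static llvm::PreservedAnalyses runSROAOnFunctionSketch(llvm::Function &F) {
  using namespace llvm;
  FunctionAnalysisManager FAM;
  PassBuilder PB;
  PB.registerFunctionAnalyses(FAM);
  FunctionPassManager FPM;
  FPM.addPass(SROAPass(SROAOptions::PreserveCFG));
  return FPM.run(F, FAM);
}
//===----------------------------------------------------------------------===//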
6139
6140namespace {
6141
6142/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6143class SROALegacyPass : public FunctionPass {
6144 SROAOptions PreserveCFG;
6145
6146public:
6147 static char ID;
6148
6149 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
6150 : FunctionPass(ID), PreserveCFG(PreserveCFG) {
6151 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6152 }
6153
6154 bool runOnFunction(Function &F) override {
6155 if (skipFunction(F))
6156 return false;
6157
6158 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6159 AssumptionCache &AC =
6160 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6161 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6162 auto [Changed, _] =
6163 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6164 return Changed;
6165 }
6166
6167 void getAnalysisUsage(AnalysisUsage &AU) const override {
6168 AU.addRequired<AssumptionCacheTracker>();
6169 AU.addRequired<DominatorTreeWrapperPass>();
6170 AU.addPreserved<GlobalsAAWrapperPass>();
6171 AU.addPreserved<DominatorTreeWrapperPass>();
6172 }
6173
6174 StringRef getPassName() const override { return "SROA"; }
6175};
6176
6177} // end anonymous namespace
6178
6179char SROALegacyPass::ID = 0;
6180
6181FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6182 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6183 : SROAOptions::ModifyCFG);
6184}
6185
6186INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6187 "Scalar Replacement Of Aggregates", false, false)
6190INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",