LLVM 23.0.0git
LowerMemIntrinsics.cpp
Go to the documentation of this file.
1//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
12#include "llvm/IR/IRBuilder.h"
14#include "llvm/IR/MDBuilder.h"
17#include "llvm/Support/Debug.h"
21#include <limits>
22#include <optional>
23
24#define DEBUG_TYPE "lower-mem-intrinsics"
25
26using namespace llvm;
27
28namespace llvm {
30}
31
32/// \returns \p Len urem \p OpSize, checking for optimization opportunities.
33/// \p OpSizeVal must be the integer value of the \c ConstantInt \p OpSize.
35 Value *OpSize, unsigned OpSizeVal) {
36 // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
37 if (isPowerOf2_32(OpSizeVal))
38 return B.CreateAnd(Len, OpSizeVal - 1);
39 return B.CreateURem(Len, OpSize);
40}
41
42/// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
43/// opportunities.
44/// If \p RTLoopRemainder is provided, it must be the result of
45/// \c getRuntimeLoopRemainder() with the same arguments.
47 unsigned OpSizeVal,
48 Value *RTLoopRemainder = nullptr) {
49 if (!RTLoopRemainder)
50 RTLoopRemainder = getRuntimeLoopRemainder(B, Len, OpSize, OpSizeVal);
51 return B.CreateSub(Len, RTLoopRemainder);
52}
53
54namespace {
55/// Container for the return values of insertLoopExpansion.
56struct LoopExpansionInfo {
57 /// The instruction at the end of the main loop body.
58 Instruction *MainLoopIP = nullptr;
59
60 /// The unit index in the main loop body.
61 Value *MainLoopIndex = nullptr;
62
63 /// The instruction at the end of the residual loop body. Can be nullptr if no
64 /// residual is required.
65 Instruction *ResidualLoopIP = nullptr;
66
67 /// The unit index in the residual loop body. Can be nullptr if no residual is
68 /// required.
69 Value *ResidualLoopIndex = nullptr;
70};
71
/// Estimate the number of units a lowered mem intrinsic handles per call:
/// the constant length when available, otherwise a Count-weighted average of
/// the IPVK_MemOPSize value profile. Returns std::nullopt when no usable
/// profile information is present.
72std::optional<uint64_t> getAverageMemOpLoopTripCount(const MemIntrinsic &I) {
// NOTE(review): the guard condition for this early return (source line 73)
// is missing from this copy -- confirm against upstream.
74 return std::nullopt;
// Without a function entry count the value profile cannot be interpreted.
75 if (std::optional<Function::ProfileCount> EC =
76 I.getFunction()->getEntryCount();
77 !EC || !EC->getCount())
78 return std::nullopt;
// A constant length is exact; no profile data needed.
79 if (const auto Len = I.getLengthInBytes())
80 return Len->getZExtValue();
81 uint64_t Total = 0;
// NOTE(review): the declaration consuming this call result (source line 82,
// presumably the ProfData variable iterated below) is missing from this copy.
83 getValueProfDataFromInst(I, InstrProfValueKind::IPVK_MemOPSize,
84 std::numeric_limits<uint32_t>::max(), Total);
85 if (!Total)
86 return std::nullopt;
// Weighted average: sum(Count * Value) over all profiled sizes, rounded.
87 uint64_t TripCount = 0;
88 for (const auto &P : ProfData)
89 TripCount += P.Count * P.Value;
90 return std::round(1.0 * TripCount / Total);
91}
92
93} // namespace
94
95/// Insert the control flow and loop counters for a memcpy/memset loop
96/// expansion.
97///
98/// This function inserts IR corresponding to the following C code before
99/// \p InsertBefore:
100/// \code
101/// LoopUnits = (Len / MainLoopStep) * MainLoopStep;
102/// ResidualUnits = Len - LoopUnits;
103/// MainLoopIndex = 0;
104/// if (LoopUnits > 0) {
105/// do {
106/// // MainLoopIP
107/// MainLoopIndex += MainLoopStep;
108/// } while (MainLoopIndex < LoopUnits);
109/// }
110/// for (size_t i = 0; i < ResidualUnits; i += ResidualLoopStep) {
111/// ResidualLoopIndex = LoopUnits + i;
112/// // ResidualLoopIP
113/// }
114/// \endcode
115///
116/// \p MainLoopStep and \p ResidualLoopStep determine by how many "units" the
117/// loop index is increased in each iteration of the main and residual loops,
118/// respectively. In most cases, the "unit" will be bytes, but larger units are
119/// useful for lowering memset.pattern.
120///
121/// The computation of \c LoopUnits and \c ResidualUnits is performed at compile
122/// time if \p Len is a \c ConstantInt.
123/// The second (residual) loop is omitted if \p ResidualLoopStep is 0 or equal
124/// to \p MainLoopStep.
125/// The generated \c MainLoopIP, \c MainLoopIndex, \c ResidualLoopIP, and
126/// \c ResidualLoopIndex are returned in a \c LoopExpansionInfo object.
127///
128/// If provided, \p ExpectedUnits is used as the expected number of units
129/// handled by the loop expansion when computing branch weights.
130static LoopExpansionInfo
// NOTE(review): the first line of the parameter list (source line 131, the
// "insertLoopExpansion(Instruction *InsertBefore, Value *Len," part per the
// doc comment above) is missing from this copy -- confirm against upstream.
132 unsigned MainLoopStep, unsigned ResidualLoopStep,
133 StringRef BBNamePrefix,
134 std::optional<uint64_t> ExpectedUnits) {
135 assert((ResidualLoopStep == 0 || MainLoopStep % ResidualLoopStep == 0) &&
136 "ResidualLoopStep must divide MainLoopStep if specified");
137 assert(ResidualLoopStep <= MainLoopStep &&
138 "ResidualLoopStep cannot be larger than MainLoopStep");
139 assert(MainLoopStep > 0 && "MainLoopStep must be non-zero");
140 LoopExpansionInfo LEI;
141
142 // If the length is known to be zero, there is nothing to do.
143 if (auto *CLen = dyn_cast<ConstantInt>(Len))
144 if (CLen->isZero())
145 return LEI;
146
147 BasicBlock *PreLoopBB = InsertBefore->getParent();
148 BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(
149 InsertBefore, BBNamePrefix + "-post-expansion");
150 Function *ParentFunc = PreLoopBB->getParent();
151 LLVMContext &Ctx = PreLoopBB->getContext();
152 const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
153 IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
154 PreLoopBuilder.SetCurrentDebugLocation(DbgLoc);
155
156 // Calculate the main loop trip count and remaining units to cover after the
157 // loop.
158 Type *LenType = Len->getType();
159 IntegerType *ILenType = cast<IntegerType>(LenType);
160 ConstantInt *CIMainLoopStep = ConstantInt::get(ILenType, MainLoopStep);
161 ConstantInt *Zero = ConstantInt::get(ILenType, 0U);
162
163 // We can avoid conditional branches and/or entire loops if we know any of the
164 // following:
165 // - that the main loop must be executed at least once
166 // - that the main loop will not be executed at all
167 // - that the residual loop must be executed at least once
168 // - that the residual loop will not be executed at all
169 bool MustTakeMainLoop = false;
170 bool MayTakeMainLoop = true;
171 bool MustTakeResidualLoop = false;
172 bool MayTakeResidualLoop = true;
173
174 Value *LoopUnits = Len;
175 Value *ResidualUnits = nullptr;
176 if (MainLoopStep != 1) {
// For a constant length the split into main/residual units is folded at
// compile time; both "May" and "Must" flags then coincide.
177 if (auto *CLen = dyn_cast<ConstantInt>(Len)) {
178 uint64_t TotalUnits = CLen->getZExtValue();
179 uint64_t LoopEndCount = alignDown(TotalUnits, MainLoopStep);
180 uint64_t ResidualCount = TotalUnits - LoopEndCount;
181 LoopUnits = ConstantInt::get(LenType, LoopEndCount);
182 ResidualUnits = ConstantInt::get(LenType, ResidualCount);
183 MustTakeMainLoop = LoopEndCount > 0;
184 MayTakeMainLoop = MustTakeMainLoop;
185 MustTakeResidualLoop = ResidualCount > 0;
186 MayTakeResidualLoop = MustTakeResidualLoop;
187 // TODO: This could also use known bits to check if a non-constant loop
188 // count is guaranteed to be a multiple of MainLoopStep, in which case we
189 // could omit the residual loop. It's unclear if that is worthwhile.
190 } else {
191 ResidualUnits = getRuntimeLoopRemainder(PreLoopBuilder, Len,
192 CIMainLoopStep, MainLoopStep);
193 LoopUnits = getRuntimeLoopUnits(PreLoopBuilder, Len, CIMainLoopStep,
194 MainLoopStep, ResidualUnits);
195 }
196 } else if (auto *CLen = dyn_cast<ConstantInt>(Len)) {
197 MustTakeMainLoop = CLen->getZExtValue() > 0;
198 MayTakeMainLoop = MustTakeMainLoop;
199 }
200
201 // The case where both loops are omitted (i.e., the length is known zero) is
202 // already handled at the beginning of this function.
203 assert((MayTakeMainLoop || MayTakeResidualLoop) &&
204 "At least one of the loops must be generated");
205
206 BasicBlock *MainLoopBB = nullptr;
207 CondBrInst *MainLoopBr = nullptr;
208
209 // Construct the main loop unless we statically known that it is not taken.
210 if (MayTakeMainLoop) {
211 MainLoopBB = BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-main-body",
212 ParentFunc, PostLoopBB);
213 IRBuilder<> LoopBuilder(MainLoopBB);
214 LoopBuilder.SetCurrentDebugLocation(DbgLoc);
215
216 PHINode *LoopIndex = LoopBuilder.CreatePHI(LenType, 2, "loop-index");
217 LEI.MainLoopIndex = LoopIndex;
218 LoopIndex->addIncoming(ConstantInt::get(LenType, 0U), PreLoopBB);
219
220 Value *NewIndex = LoopBuilder.CreateAdd(
221 LoopIndex, ConstantInt::get(LenType, MainLoopStep));
222 LoopIndex->addIncoming(NewIndex, MainLoopBB);
223
224 // One argument of the addition is a loop-variant PHI, so it must be an
225 // Instruction (i.e., it cannot be a Constant).
226 LEI.MainLoopIP = cast<Instruction>(NewIndex);
227
228 // Stay in the MainLoop until we have handled all the LoopUnits. The False
229 // target is adjusted below if a residual is generated.
230 MainLoopBr = LoopBuilder.CreateCondBr(
231 LoopBuilder.CreateICmpULT(NewIndex, LoopUnits), MainLoopBB, PostLoopBB);
232
233 if (ExpectedUnits.has_value()) {
234 uint64_t BackedgeTakenCount = ExpectedUnits.value() / MainLoopStep;
235 if (BackedgeTakenCount > 0)
236 BackedgeTakenCount -= 1; // The last iteration goes to the False target.
237 MDBuilder MDB(ParentFunc->getContext());
238 setFittedBranchWeights(*MainLoopBr, {BackedgeTakenCount, 1},
239 /*IsExpected=*/false);
240 } else {
// NOTE(review): else-branch body (source line 241) is missing from this
// copy -- presumably the no-profile branch-weight fallback; confirm upstream.
242 }
243 }
244
245 // Construct the residual loop if it is requested from the caller unless we
246 // statically know that it won't be taken.
247 bool ResidualLoopRequested =
248 ResidualLoopStep > 0 && ResidualLoopStep < MainLoopStep;
249 BasicBlock *ResidualLoopBB = nullptr;
250 BasicBlock *ResidualCondBB = nullptr;
251 if (ResidualLoopRequested && MayTakeResidualLoop) {
252 ResidualLoopBB =
253 BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-body",
254 PreLoopBB->getParent(), PostLoopBB);
255
256 // The residual loop body is either reached from the ResidualCondBB (which
257 // checks if the residual loop needs to be executed), from the main loop
258 // body if we know statically that the residual must be executed, or from
259 // the pre-loop BB (conditionally or unconditionally) if the main loop is
260 // omitted.
261 BasicBlock *PredOfResLoopBody = PreLoopBB;
262 if (MainLoopBB) {
263 // If it's statically known that the residual must be executed, we don't
264 // need to create a preheader BB.
265 if (MustTakeResidualLoop) {
266 MainLoopBr->setSuccessor(1, ResidualLoopBB);
267 PredOfResLoopBody = MainLoopBB;
268 } else {
269 // Construct a preheader BB to check if the residual loop is executed.
270 ResidualCondBB =
271 BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-cond",
272 PreLoopBB->getParent(), ResidualLoopBB);
273
274 // Determine if we need to branch to the residual loop or bypass it.
275 IRBuilder<> RCBuilder(ResidualCondBB);
276 RCBuilder.SetCurrentDebugLocation(DbgLoc);
277 auto *BR =
278 RCBuilder.CreateCondBr(RCBuilder.CreateICmpNE(ResidualUnits, Zero),
279 ResidualLoopBB, PostLoopBB);
280 if (ExpectedUnits.has_value()) {
281 MDBuilder MDB(ParentFunc->getContext());
282 BR->setMetadata(LLVMContext::MD_prof,
// NOTE(review): the metadata argument (source line 283) and the else-branch
// body (line 285) are missing from this copy -- confirm upstream.
284 } else {
286 }
287
288 MainLoopBr->setSuccessor(1, ResidualCondBB);
289 PredOfResLoopBody = ResidualCondBB;
290 }
291 }
292
293 IRBuilder<> ResBuilder(ResidualLoopBB);
294 ResBuilder.SetCurrentDebugLocation(DbgLoc);
295 PHINode *ResidualIndex =
296 ResBuilder.CreatePHI(LenType, 2, "residual-loop-index");
297 ResidualIndex->addIncoming(Zero, PredOfResLoopBody);
298
299 // Add the offset at the end of the main loop to the loop counter of the
300 // residual loop to get the proper index. If the main loop was omitted, we
301 // can also omit the addition.
302 if (MainLoopBB)
303 LEI.ResidualLoopIndex = ResBuilder.CreateAdd(LoopUnits, ResidualIndex);
304 else
305 LEI.ResidualLoopIndex = ResidualIndex;
306
307 Value *ResNewIndex = ResBuilder.CreateAdd(
308 ResidualIndex, ConstantInt::get(LenType, ResidualLoopStep));
309 ResidualIndex->addIncoming(ResNewIndex, ResidualLoopBB);
310
311 // One argument of the addition is a loop-variant PHI, so it must be an
312 // Instruction (i.e., it cannot be a Constant).
313 LEI.ResidualLoopIP = cast<Instruction>(ResNewIndex);
314
315 // Stay in the residual loop until all ResidualUnits are handled.
316 CondBrInst *BR = ResBuilder.CreateCondBr(
317 ResBuilder.CreateICmpULT(ResNewIndex, ResidualUnits), ResidualLoopBB,
318 PostLoopBB);
319
320 if (ExpectedUnits.has_value()) {
321 uint64_t BackedgeTakenCount =
322 (ExpectedUnits.value() % MainLoopStep) / ResidualLoopStep;
323 if (BackedgeTakenCount > 0)
324 BackedgeTakenCount -= 1; // The last iteration goes to the False target.
325 MDBuilder MDB(ParentFunc->getContext());
326 setFittedBranchWeights(*BR, {BackedgeTakenCount, 1},
327 /*IsExpected=*/false);
328 } else {
// NOTE(review): else-branch body (source line 329) is missing from this
// copy -- confirm upstream.
330 }
331 }
332
333 // Create the branch in the pre-loop block.
334 if (MustTakeMainLoop) {
335 // Go unconditionally to the main loop if it's statically known that it must
336 // be executed.
337 assert(MainLoopBB);
338 PreLoopBuilder.CreateBr(MainLoopBB);
339 } else if (!MainLoopBB && ResidualLoopBB) {
340 if (MustTakeResidualLoop) {
341 // If the main loop is omitted and the residual loop is statically known
342 // to be executed, go there unconditionally.
343 PreLoopBuilder.CreateBr(ResidualLoopBB);
344 } else {
345 // If the main loop is omitted and we don't know if the residual loop is
346 // executed, go there if necessary. The PreLoopBB takes the role of the
347 // preheader for the residual loop in this case.
348 auto *BR = PreLoopBuilder.CreateCondBr(
349 PreLoopBuilder.CreateICmpNE(ResidualUnits, Zero), ResidualLoopBB,
350 PostLoopBB);
351 if (ExpectedUnits.has_value()) {
352 MDBuilder MDB(ParentFunc->getContext());
353 BR->setMetadata(LLVMContext::MD_prof, MDB.createLikelyBranchWeights());
354 } else {
// NOTE(review): else-branch body (source line 355) is missing from this
// copy -- confirm upstream.
356 }
357 }
358 } else {
359 // Otherwise, go conditionally to the main loop or its successor.
360 // If there is no residual loop, the successor is the post-loop BB.
361 BasicBlock *FalseBB = PostLoopBB;
362 if (ResidualCondBB) {
363 // If we constructed a pre-header for the residual loop, that is the
364 // successor.
365 FalseBB = ResidualCondBB;
366 } else if (ResidualLoopBB) {
367 // If there is a residual loop but the preheader is omitted (because the
368 // residual loop is statically known to be executed), the successor
369 // is the residual loop body.
370 assert(MustTakeResidualLoop);
371 FalseBB = ResidualLoopBB;
372 }
373
374 auto *BR = PreLoopBuilder.CreateCondBr(
375 PreLoopBuilder.CreateICmpNE(LoopUnits, Zero), MainLoopBB, FalseBB);
376
377 if (ExpectedUnits.has_value()) {
378 MDBuilder MDB(ParentFunc->getContext());
379 BR->setMetadata(LLVMContext::MD_prof, MDB.createLikelyBranchWeights());
380 } else {
// NOTE(review): else-branch body (source line 381) is missing from this
// copy -- confirm upstream.
382 }
383 }
384 // Delete the unconditional branch inserted by splitBasicBlock.
385 PreLoopBB->getTerminator()->eraseFromParent();
386
387 return LEI;
388}
389
// NOTE(review): the extraction dropped the first signature line (source line
// 390) and one parameter line (395, presumably "const TargetTransformInfo
// &TTI," since TTI is used below). Judging by the constant CopyLen and the
// "static-memcpy" prefix, this is createMemCpyLoopKnownSize -- confirm
// against upstream LowerMemIntrinsics.cpp.
391 Value *DstAddr, ConstantInt *CopyLen,
392 Align SrcAlign, Align DstAlign,
393 bool SrcIsVolatile, bool DstIsVolatile,
394 bool CanOverlap,
396 std::optional<uint32_t> AtomicElementSize,
397 std::optional<uint64_t> AverageTripCount) {
398 // No need to expand zero length copies.
399 if (CopyLen->isZero())
400 return;
401
402 BasicBlock *PreLoopBB = InsertBefore->getParent();
403 Function *ParentFunc = PreLoopBB->getParent();
404 LLVMContext &Ctx = PreLoopBB->getContext();
405 const DataLayout &DL = ParentFunc->getDataLayout();
406 MDBuilder MDB(Ctx);
// Fresh alias scope so loads and stores of a non-overlapping copy can be
// marked as independent.
407 MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
408 StringRef Name = "MemCopyAliasScope";
409 MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
410
411 unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
412 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
413
414 Type *TypeOfCopyLen = CopyLen->getType();
415 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
416 Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
417 assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
418 "Atomic memcpy lowering is not supported for vector operand type");
419
420 Type *Int8Type = Type::getInt8Ty(Ctx);
421 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
422 assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
423 assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
424 "Atomic memcpy lowering is not supported for selected operand size");
425
// The loop covers the largest multiple of the wide operation size; the tail
// is handled with straight-line code below instead of a residual loop.
426 uint64_t LoopEndCount =
427 alignDown(CopyLen->getZExtValue(), LoopOpSize.getFixedValue());
428
429 // Skip the loop expansion entirely if the loop would never be taken.
430 if (LoopEndCount != 0) {
431 LoopExpansionInfo LEI =
432 insertLoopExpansion(InsertBefore, CopyLen, LoopOpSize, 0,
433 "static-memcpy", AverageTripCount);
434 assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
435 "Main loop should be generated for non-zero loop count");
436
437 // Fill MainLoopBB
438 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
439 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
440 Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
441
442 // If we used LoopOpType as GEP element type, we would iterate over the
443 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
444 // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
445 // byte offsets computed from the TypeStoreSize.
446 Value *SrcGEP =
447 MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex);
448 LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
449 LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile);
450 if (!CanOverlap) {
451 // Set alias scope for loads.
452 Load->setMetadata(LLVMContext::MD_alias_scope,
453 MDNode::get(Ctx, NewScope));
454 }
455 Value *DstGEP =
456 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
457 StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
458 Load, DstGEP, PartDstAlign, DstIsVolatile);
459 if (!CanOverlap) {
460 // Indicate that stores don't overlap loads.
461 Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
462 }
463 if (AtomicElementSize) {
464 Load->setAtomic(AtomicOrdering::Unordered);
465 Store->setAtomic(AtomicOrdering::Unordered);
466 }
467 assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
468 "No residual loop was requested");
469 }
470
471 // Copy the remaining bytes with straight-line code.
472 uint64_t BytesCopied = LoopEndCount;
473 uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
474 if (RemainingBytes == 0)
475 return;
476
477 IRBuilder<> RBuilder(InsertBefore);
// Let the target pick a sequence of progressively narrower operand types
// that exactly covers the remaining bytes.
478 SmallVector<Type *, 5> RemainingOps;
479 TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
480 SrcAS, DstAS, SrcAlign, DstAlign,
481 AtomicElementSize);
482
483 for (auto *OpTy : RemainingOps) {
// Realign to the current byte offset for each straight-line access.
484 Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
485 Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
486
487 TypeSize OperandSize = DL.getTypeStoreSize(OpTy);
488 assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
489 "Atomic memcpy lowering is not supported for selected operand size");
490
491 Value *SrcGEP = RBuilder.CreateInBoundsGEP(
492 Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
493 LoadInst *Load =
494 RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
495 if (!CanOverlap) {
496 // Set alias scope for loads.
497 Load->setMetadata(LLVMContext::MD_alias_scope,
498 MDNode::get(Ctx, NewScope));
499 }
500 Value *DstGEP = RBuilder.CreateInBoundsGEP(
501 Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
502 StoreInst *Store =
503 RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
504 if (!CanOverlap) {
505 // Indicate that stores don't overlap loads.
506 Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
507 }
508 if (AtomicElementSize) {
509 Load->setAtomic(AtomicOrdering::Unordered);
510 Store->setAtomic(AtomicOrdering::Unordered);
511 }
512 BytesCopied += OperandSize;
513 }
514 assert(BytesCopied == CopyLen->getZExtValue() &&
515 "Bytes copied should match size in the call!");
516}
517
// NOTE(review): the extraction dropped the opening signature line (source
// line 518). Judging by the non-constant CopyLen and the "dynamic-memcpy"
// prefix, this is createMemCpyLoopUnknownSize -- confirm against upstream
// LowerMemIntrinsics.cpp.
519 Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
520 Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
521 bool CanOverlap, const TargetTransformInfo &TTI,
522 std::optional<uint32_t> AtomicElementSize,
523 std::optional<uint64_t> AverageTripCount) {
524 BasicBlock *PreLoopBB = InsertBefore->getParent();
525 Function *ParentFunc = PreLoopBB->getParent();
526 const DataLayout &DL = ParentFunc->getDataLayout();
527 LLVMContext &Ctx = PreLoopBB->getContext();
528 MDBuilder MDB(Ctx);
// Fresh alias scope so loads and stores of a non-overlapping copy can be
// marked as independent.
529 MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
530 StringRef Name = "MemCopyAliasScope";
531 MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
532
533 unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
534 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
535
536 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
537 Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
538 assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
539 "Atomic memcpy lowering is not supported for vector operand type");
540 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
541 assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
542 "Atomic memcpy lowering is not supported for selected operand size");
543
544 Type *Int8Type = Type::getInt8Ty(Ctx);
545
// The residual loop copies element-wise: whole atomic elements for atomic
// memcpy, single bytes otherwise.
546 Type *ResidualLoopOpType = AtomicElementSize
547 ? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
548 : Int8Type;
549 TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
550 assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
551 "Store size is expected to match type size");
552
553 LoopExpansionInfo LEI =
554 insertLoopExpansion(InsertBefore, CopyLen, LoopOpSize, ResidualLoopOpSize,
555 "dynamic-memcpy", AverageTripCount);
556 assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
557 "Main loop should be generated for unknown size copy");
558
559 // Fill MainLoopBB
560 IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
561 Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
562 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
563
564 // If we used LoopOpType as GEP element type, we would iterate over the
565 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
566 // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte
567 // offsets computed from the TypeStoreSize.
568 Value *SrcGEP =
569 MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex);
570 LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
571 LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile);
572 if (!CanOverlap) {
573 // Set alias scope for loads.
574 Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope));
575 }
576 Value *DstGEP =
577 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
578 StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
579 Load, DstGEP, PartDstAlign, DstIsVolatile);
580 if (!CanOverlap) {
581 // Indicate that stores don't overlap loads.
582 Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
583 }
584 if (AtomicElementSize) {
// NOTE(review): the if-body (source lines 585-586) is missing from this
// copy -- presumably Load/Store->setAtomic(AtomicOrdering::Unordered) as in
// the known-size lowering above; confirm upstream.
587 }
588
589 // Fill ResidualLoopBB.
590 if (!LEI.ResidualLoopIP)
591 return;
592
593 Align ResSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
594 Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
595
596 IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
597 Value *ResSrcGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
598 LEI.ResidualLoopIndex);
599 LoadInst *ResLoad = ResLoopBuilder.CreateAlignedLoad(
600 ResidualLoopOpType, ResSrcGEP, ResSrcAlign, SrcIsVolatile);
601 if (!CanOverlap) {
602 // Set alias scope for loads.
603 ResLoad->setMetadata(LLVMContext::MD_alias_scope,
604 MDNode::get(Ctx, NewScope));
605 }
606 Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
607 LEI.ResidualLoopIndex);
608 StoreInst *ResStore = ResLoopBuilder.CreateAlignedStore(
609 ResLoad, ResDstGEP, ResDstAlign, DstIsVolatile);
610 if (!CanOverlap) {
611 // Indicate that stores don't overlap loads.
612 ResStore->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
613 }
614 if (AtomicElementSize) {
// NOTE(review): the if-body (source lines 615-616) is missing from this
// copy -- confirm upstream.
617 }
618}
619
620// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
621// addresspacecast to obtain a pair of pointers in the same addressspace. The
622// caller needs to ensure that addrspacecasting is possible.
623// No-op if the pointers are in the same address space.
624static std::pair<Value *, Value *>
626 const TargetTransformInfo &TTI) {
627 Value *ResAddr1 = Addr1;
628 Value *ResAddr2 = Addr2;
629
630 unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
631 unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
632 if (AS1 != AS2) {
633 if (TTI.isValidAddrSpaceCast(AS2, AS1))
634 ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
635 else if (TTI.isValidAddrSpaceCast(AS1, AS2))
636 ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
637 else
638 llvm_unreachable("Can only lower memmove between address spaces if they "
639 "support addrspacecast");
640 }
641 return {ResAddr1, ResAddr2};
642}
643
644// Lower memmove to IR. memmove is required to correctly copy overlapping memory
645// regions; therefore, it has to check the relative positions of the source and
646// destination pointers and choose the copy direction accordingly.
647//
648// The code below is an IR rendition of this C function:
649//
650// void* memmove(void* dst, const void* src, size_t n) {
651// unsigned char* d = dst;
652// const unsigned char* s = src;
653// if (s < d) {
654// // copy backwards
655// while (n--) {
656// d[n] = s[n];
657// }
658// } else {
659// // copy forward
660// for (size_t i = 0; i < n; ++i) {
661// d[i] = s[i];
662// }
663// }
664// return dst;
665// }
666//
667// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
668// used for the memory accesses in the loops. Then, additional loops with
669// byte-wise accesses are added for the remaining bytes.
671 Value *SrcAddr, Value *DstAddr,
672 Value *CopyLen, Align SrcAlign,
673 Align DstAlign, bool SrcIsVolatile,
674 bool DstIsVolatile,
675 const TargetTransformInfo &TTI) {
676 Type *TypeOfCopyLen = CopyLen->getType();
677 BasicBlock *OrigBB = InsertBefore->getParent();
678 Function *F = OrigBB->getParent();
679 const DataLayout &DL = F->getDataLayout();
680 LLVMContext &Ctx = OrigBB->getContext();
681 unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
682 unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
683
684 Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
685 SrcAlign, DstAlign);
686 TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
687 Type *Int8Type = Type::getInt8Ty(Ctx);
688 bool LoopOpIsInt8 = LoopOpType == Int8Type;
689
690 // If the memory accesses are wider than one byte, residual loops with
691 // i8-accesses are required to move remaining bytes.
692 bool RequiresResidual = !LoopOpIsInt8;
693
694 Type *ResidualLoopOpType = Int8Type;
695 TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
696
697 // Calculate the loop trip count and remaining bytes to copy after the loop.
698 IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
699 ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
700 ConstantInt *CIResidualLoopOpSize =
701 ConstantInt::get(ILengthType, ResidualLoopOpSize);
702 ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
703
704 const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
705 IRBuilder<> PLBuilder(InsertBefore);
706 PLBuilder.SetCurrentDebugLocation(DbgLoc);
707
708 Value *RuntimeLoopBytes = CopyLen;
709 Value *RuntimeLoopRemainder = nullptr;
710 Value *SkipResidualCondition = nullptr;
711 if (RequiresResidual) {
712 RuntimeLoopRemainder =
713 getRuntimeLoopRemainder(PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
714 RuntimeLoopBytes = getRuntimeLoopUnits(PLBuilder, CopyLen, CILoopOpSize,
715 LoopOpSize, RuntimeLoopRemainder);
716 SkipResidualCondition =
717 PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
718 }
719 Value *SkipMainCondition =
720 PLBuilder.CreateICmpEQ(RuntimeLoopBytes, Zero, "skip_main");
721
722 // Create the a comparison of src and dst, based on which we jump to either
723 // the forward-copy part of the function (if src >= dst) or the backwards-copy
724 // part (if src < dst).
725 // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
726 // structure. Its block terminators (unconditional branches) are replaced by
727 // the appropriate conditional branches when the loop is built.
728 // If the pointers are in different address spaces, they need to be converted
729 // to a compatible one. Cases where memory ranges in the different address
730 // spaces cannot overlap are lowered as memcpy and not handled here.
731 auto [CmpSrcAddr, CmpDstAddr] =
732 tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
733 Value *PtrCompare =
734 PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
735 Instruction *ThenTerm, *ElseTerm;
736 SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
737 &ThenTerm, &ElseTerm);
738
739 // If the LoopOpSize is greater than 1, each part of the function consists of
740 // four blocks:
741 // memmove_copy_backwards:
742 // skip the residual loop when 0 iterations are required
743 // memmove_bwd_residual_loop:
744 // copy the last few bytes individually so that the remaining length is
745 // a multiple of the LoopOpSize
746 // memmove_bwd_middle: skip the main loop when 0 iterations are required
747 // memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
748 // memmove_copy_forward: skip the main loop when 0 iterations are required
749 // memmove_fwd_main_loop: the actual forward loop BB with wide accesses
750 // memmove_fwd_middle: skip the residual loop when 0 iterations are required
751 // memmove_fwd_residual_loop: copy the last few bytes individually
752 //
753 // The main and residual loop are switched between copying forward and
754 // backward so that the residual loop always operates on the end of the moved
755 // range. This is based on the assumption that buffers whose start is aligned
756 // with the LoopOpSize are more common than buffers whose end is.
757 //
758 // If the LoopOpSize is 1, each part of the function consists of two blocks:
759 // memmove_copy_backwards: skip the loop when 0 iterations are required
760 // memmove_bwd_main_loop: the actual backwards loop BB
761 // memmove_copy_forward: skip the loop when 0 iterations are required
762 // memmove_fwd_main_loop: the actual forward loop BB
763 BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
764 CopyBackwardsBB->setName("memmove_copy_backwards");
765 BasicBlock *CopyForwardBB = ElseTerm->getParent();
766 CopyForwardBB->setName("memmove_copy_forward");
767 BasicBlock *ExitBB = InsertBefore->getParent();
768 ExitBB->setName("memmove_done");
769
770 Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
771 Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
772
773 // Accesses in the residual loops do not share the same alignment as those in
774 // the main loops.
775 Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
776 Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
777
778 // Copying backwards.
779 {
780 BasicBlock *MainLoopBB = BasicBlock::Create(
781 F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);
782
783 // The predecessor of the memmove_bwd_main_loop. Updated in the
784 // following if a residual loop is emitted first.
785 BasicBlock *PredBB = CopyBackwardsBB;
786
787 if (RequiresResidual) {
788 // backwards residual loop
789 BasicBlock *ResidualLoopBB = BasicBlock::Create(
790 F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
791 IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
792 ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
793 PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
794 Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
795 ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
796 // If we used LoopOpType as GEP element type, we would iterate over the
797 // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
798 // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
799 // use byte offsets computed from the TypeStoreSize.
800 Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
801 ResidualIndex);
802 Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
803 ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
804 "element");
805 Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
806 ResidualIndex);
807 ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
808 ResidualDstAlign, DstIsVolatile);
809
810 // After the residual loop, go to an intermediate block.
811 BasicBlock *IntermediateBB = BasicBlock::Create(
812 F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
813 // Later code expects a terminator in the PredBB.
814 IRBuilder<> IntermediateBuilder(IntermediateBB);
815 IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
816 IntermediateBuilder.CreateUnreachable();
817 ResidualLoopBuilder.CreateCondBr(
818 ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
819 IntermediateBB, ResidualLoopBB);
820
821 ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
822 ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
823
824 // How to get to the residual:
825 CondBrInst *BrInst =
826 CondBrInst::Create(SkipResidualCondition, IntermediateBB,
827 ResidualLoopBB, ThenTerm->getIterator());
828 BrInst->setDebugLoc(DbgLoc);
829 ThenTerm->eraseFromParent();
830
831 PredBB = IntermediateBB;
832 }
833
834 // main loop
835 IRBuilder<> MainLoopBuilder(MainLoopBB);
836 MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
837 PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
838 Value *MainIndex =
839 MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
840 Value *LoadGEP =
841 MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainIndex);
842 Value *Element = MainLoopBuilder.CreateAlignedLoad(
843 LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
844 Value *StoreGEP =
845 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainIndex);
846 MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
847 DstIsVolatile);
848 MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
849 ExitBB, MainLoopBB);
850 MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
851 MainLoopPhi->addIncoming(RuntimeLoopBytes, PredBB);
852
853 // How to get to the main loop:
854 Instruction *PredBBTerm = PredBB->getTerminator();
856 SkipMainCondition, ExitBB, MainLoopBB, PredBBTerm->getIterator());
857 BrInst->setDebugLoc(DbgLoc);
858 PredBBTerm->eraseFromParent();
859 }
860
861 // Copying forward.
862 // main loop
863 {
864 BasicBlock *MainLoopBB =
865 BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
866 IRBuilder<> MainLoopBuilder(MainLoopBB);
867 MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
868 PHINode *MainLoopPhi =
869 MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
870 Value *LoadGEP =
871 MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainLoopPhi);
872 Value *Element = MainLoopBuilder.CreateAlignedLoad(
873 LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
874 Value *StoreGEP =
875 MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainLoopPhi);
876 MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
877 DstIsVolatile);
878 Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, CILoopOpSize);
879 MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
880 MainLoopPhi->addIncoming(Zero, CopyForwardBB);
881
882 Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
883 BasicBlock *SuccessorBB = ExitBB;
884 if (RequiresResidual)
885 SuccessorBB =
886 BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);
887
888 // leaving or staying in the main loop
889 MainLoopBuilder.CreateCondBr(
890 MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopBytes), SuccessorBB,
891 MainLoopBB);
892
893 // getting in or skipping the main loop
894 CondBrInst *BrInst =
895 CondBrInst::Create(SkipMainCondition, SuccessorBB, MainLoopBB,
896 CopyFwdBBTerm->getIterator());
897 BrInst->setDebugLoc(DbgLoc);
898 CopyFwdBBTerm->eraseFromParent();
899
900 if (RequiresResidual) {
901 BasicBlock *IntermediateBB = SuccessorBB;
902 IRBuilder<> IntermediateBuilder(IntermediateBB);
903 IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
904 BasicBlock *ResidualLoopBB = BasicBlock::Create(
905 F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
906 IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
907 ResidualLoopBB);
908
909 // Residual loop
910 IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
911 ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
912 PHINode *ResidualLoopPhi =
913 ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
914 Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
915 ResidualLoopPhi);
916 Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
917 ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
918 "element");
919 Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
920 ResidualLoopPhi);
921 ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
922 ResidualDstAlign, DstIsVolatile);
923 Value *ResidualIndex =
924 ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, CIResidualLoopOpSize);
925 ResidualLoopBuilder.CreateCondBr(
926 ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
927 ResidualLoopBB);
928 ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
929 ResidualLoopPhi->addIncoming(RuntimeLoopBytes, IntermediateBB);
930 }
931 }
932}
933
// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed at
// compile time, obsolete loops and branches are omitted, and the residual code
// is straight-line code instead of a loop.
static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
                                       Value *SrcAddr, Value *DstAddr,
                                       ConstantInt *CopyLen, Align SrcAlign,
                                       Align DstAlign, bool SrcIsVolatile,
                                       bool DstIsVolatile,
                                       const TargetTransformInfo &TTI) {
  // No need to expand zero length moves.
  if (CopyLen->isZero())
    return;

  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  // Let the target pick the widest profitable access type for the main loop.
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                   SrcAlign, DstAlign);
  TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
  Type *Int8Type = Type::getInt8Ty(Ctx);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  uint64_t BytesCopiedInLoop =
      alignDown(CopyLen->getZExtValue(), LoopOpSize.getFixedValue());
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;

  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
  ConstantInt *LoopBound = ConstantInt::get(ILengthType, BytesCopiedInLoop);
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);

  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
  IRBuilder<> PLBuilder(InsertBefore);
  PLBuilder.SetCurrentDebugLocation(DbgLoc);

  // Compare the (possibly addrspace-cast) source and destination addresses:
  // if SrcAddr < DstAddr, the buffers may overlap such that we must copy
  // backwards (then-branch); otherwise we copy forwards (else-branch).
  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                &ThenTerm, &ElseTerm);

  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // Helper function to generate a load/store pair of a given type in the
  // residual. Used in the forward and backward branches.
  auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
                                      uint64_t &BytesCopied) {
    Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
    Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));

    TypeSize OperandSize = DL.getTypeStoreSize(OpTy);

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
    // byte offsets computed from the TypeStoreSize.
    Value *SrcGEP = Builder.CreateInBoundsGEP(
        Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    LoadInst *Load =
        Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
    Value *DstGEP = Builder.CreateInBoundsGEP(
        Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
    BytesCopied += OperandSize;
  };

  // Copying backwards: the residual is emitted first (it covers the highest
  // addresses, which must be moved before the main loop overwrites them).
  if (RemainingBytes != 0) {
    CopyBackwardsBB->setName("memmove_bwd_residual");
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. We need the same
    // instructions as in the forward case, only in reverse. So we generate code
    // the same way, except that we change the IRBuilder insert point for each
    // load/store pair so that each one is inserted before the previous one
    // instead of after it.
    IRBuilder<> BwdResBuilder(CopyBackwardsBB,
                              CopyBackwardsBB->getFirstNonPHIIt());
    BwdResBuilder.SetCurrentDebugLocation(DbgLoc);
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, PartSrcAlign,
                                          PartDstAlign);
    for (auto *OpTy : RemainingOps) {
      // reverse the order of the emitted operations
      BwdResBuilder.SetInsertPoint(CopyBackwardsBB,
                                   CopyBackwardsBB->getFirstNonPHIIt());
      GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
    }
  }
  if (BytesCopiedInLoop != 0) {
    BasicBlock *LoopBB = CopyBackwardsBB;
    BasicBlock *PredBB = OrigBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      LoopBB = CopyBackwardsBB->splitBasicBlock(
          CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
      PredBB = CopyBackwardsBB;
    } else {
      CopyBackwardsBB->setName("memmove_bwd_loop");
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    LoopBuilder.SetCurrentDebugLocation(DbgLoc);
    // The phi starts at LoopBound and counts down by CILoopOpSize; the
    // decremented value is the byte offset of the element copied this
    // iteration.
    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
    Value *Index = LoopBuilder.CreateSub(LoopPhi, CILoopOpSize, "bwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, Index);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, Index);
    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                   DstIsVolatile);

    // Replace the unconditional branch introduced by
    // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
                             LoopBB);
    UncondTerm->eraseFromParent();

    LoopPhi->addIncoming(Index, LoopBB);
    LoopPhi->addIncoming(LoopBound, PredBB);
  }

  // Copying forward: main loop first, then (if needed) the residual for the
  // trailing bytes.
  BasicBlock *FwdResidualBB = CopyForwardBB;
  if (BytesCopiedInLoop != 0) {
    CopyForwardBB->setName("memmove_fwd_loop");
    BasicBlock *LoopBB = CopyForwardBB;
    BasicBlock *SuccBB = ExitBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
                                              "memmove_fwd_residual");
      FwdResidualBB = SuccBB;
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    LoopBuilder.SetCurrentDebugLocation(DbgLoc);
    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopPhi);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopPhi);
    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                   DstIsVolatile);
    Value *Index = LoopBuilder.CreateAdd(LoopPhi, CILoopOpSize);
    LoopPhi->addIncoming(Index, LoopBB);
    LoopPhi->addIncoming(Zero, OrigBB);

    // Replace the unconditional branch to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, LoopBound), SuccBB,
                             LoopBB);
    UncondTerm->eraseFromParent();
  }

  if (RemainingBytes != 0) {
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. In the forward
    // case, we emit it in the normal order.
    IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
    FwdResBuilder.SetCurrentDebugLocation(DbgLoc);
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, PartSrcAlign,
                                          PartDstAlign);
    for (auto *OpTy : RemainingOps)
      GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
  }
}
1118
1119/// Create a Value of \p DstType that consists of a sequence of copies of
1120/// \p SetValue, using bitcasts and a vector splat.
1122 Value *SetValue, Type *DstType) {
1123 TypeSize DstSize = DL.getTypeStoreSize(DstType);
1124 Type *SetValueType = SetValue->getType();
1125 TypeSize SetValueSize = DL.getTypeStoreSize(SetValueType);
1126 assert(SetValueSize == DL.getTypeAllocSize(SetValueType) &&
1127 "Store size and alloc size of SetValue's type must match");
1128 assert(SetValueSize != 0 && DstSize % SetValueSize == 0 &&
1129 "DstType size must be a multiple of SetValue size");
1130
1131 Value *Result = SetValue;
1132 if (DstSize != SetValueSize) {
1133 if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) {
1134 // If the type cannot be put into a vector, bitcast to iN first.
1135 LLVMContext &Ctx = SetValue->getContext();
1136 Result = B.CreateBitCast(Result, Type::getIntNTy(Ctx, SetValueSize * 8),
1137 "setvalue.toint");
1138 }
1139 // Form a sufficiently large vector consisting of SetValue, repeated.
1140 Result =
1141 B.CreateVectorSplat(DstSize / SetValueSize, Result, "setvalue.splat");
1142 }
1143
1144 // The value has the right size, but we might have to bitcast it to the right
1145 // type.
1146 Result = B.CreateBitCast(Result, DstType, "setvalue.splat.cast");
1147 return Result;
1148}
1149
/// Expand a memset whose length \p Len is a compile-time constant into a loop
/// of wide stores followed by straight-line stores for the remaining bytes
/// (no residual loop is generated).
static void
createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr,
                          ConstantInt *Len, Value *SetValue, Align DstAlign,
                          bool IsVolatile, const TargetTransformInfo *TTI,
                          std::optional<uint64_t> AverageTripCount) {
  // No need to expand zero length memsets.
  if (Len->isZero())
    return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *TypeOfLen = Len->getType();
  Type *Int8Type = Type::getInt8Ty(Ctx);
  assert(SetValue->getType() == Int8Type && "Can only set bytes");

  // Without TTI we fall back to byte-wise stores.
  Type *LoopOpType = Int8Type;
  if (TTI) {
    // Use the same memory access type as for a memcpy with the same Dst and Src
    // alignment and address space.
    LoopOpType = TTI->getMemcpyLoopLoweringType(
        Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
  }
  TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");

  // Number of bytes covered by the main loop; the rest is handled below.
  uint64_t LoopEndCount =
      alignDown(Len->getZExtValue(), LoopOpSize.getFixedValue());

  if (LoopEndCount != 0) {
    // Materialize the loop-invariant splat of SetValue before the loop.
    Value *SplatSetValue = nullptr;
    {
      IRBuilder<> PreLoopBuilder(InsertBefore);
      SplatSetValue =
          createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
    }

    // Don't generate a residual loop, the remaining bytes are set with
    // straight-line code.
    LoopExpansionInfo LEI = insertLoopExpansion(
        InsertBefore, Len, LoopOpSize, 0, "static-memset", AverageTripCount);
    assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
           "Main loop should be generated for non-zero loop count");

    // Fill MainLoopBB
    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
    Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

    Value *DstGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);

    MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
                                       IsVolatile);

    assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
           "No residual loop was requested");
  }

  uint64_t BytesSet = LoopEndCount;
  uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
  if (RemainingBytes == 0)
    return;

  IRBuilder<> RBuilder(InsertBefore);

  assert(TTI && "there cannot be a residual loop without TTI");
  SmallVector<Type *, 5> RemainingOps;
  TTI->getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                         DstAS, DstAS, DstAlign, DstAlign,
                                         std::nullopt);

  // Emit one store per residual operand type, at increasing byte offsets.
  Type *PreviousOpTy = nullptr;
  Value *SplatSetValue = nullptr;
  for (auto *OpTy : RemainingOps) {
    TypeSize OperandSize = DL.getTypeStoreSize(OpTy);
    assert(OperandSize.isFixed() &&
           "Operand types cannot be scalable vector types");
    Align PartDstAlign(commonAlignment(DstAlign, BytesSet));

    // Avoid recomputing the splat SetValue if it's the same as for the last
    // iteration.
    if (OpTy != PreviousOpTy)
      SplatSetValue = createMemSetSplat(DL, RBuilder, SetValue, OpTy);

    Value *DstGEP = RBuilder.CreateInBoundsGEP(
        Int8Type, DstAddr, ConstantInt::get(TypeOfLen, BytesSet));
    RBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
                                IsVolatile);
    BytesSet += OperandSize;
    PreviousOpTy = OpTy;
  }
  assert(BytesSet == Len->getZExtValue() &&
         "Bytes set should match size in the call!");
}
1248
/// Expand a memset whose length \p Len is only known at runtime into a main
/// loop of wide stores plus a byte-wise residual loop for the trailing bytes.
static void
createMemSetLoopUnknownSize(Instruction *InsertBefore, Value *DstAddr,
                            Value *Len, Value *SetValue, Align DstAlign,
                            bool IsVolatile, const TargetTransformInfo *TTI,
                            std::optional<uint64_t> AverageTripCount) {
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *Int8Type = Type::getInt8Ty(Ctx);
  assert(SetValue->getType() == Int8Type && "Can only set bytes");

  // Without TTI we fall back to byte-wise stores in the main loop.
  Type *LoopOpType = Int8Type;
  if (TTI) {
    LoopOpType = TTI->getMemcpyLoopLoweringType(
        Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
  }
  TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");

  // The residual loop always stores single bytes.
  Type *ResidualLoopOpType = Int8Type;
  TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);

  // Materialize the loop-invariant splat of SetValue before the loop.
  Value *SplatSetValue = SetValue;
  {
    IRBuilder<> PreLoopBuilder(InsertBefore);
    SplatSetValue = createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
  }

  LoopExpansionInfo LEI =
      insertLoopExpansion(InsertBefore, Len, LoopOpSize, ResidualLoopOpSize,
                          "dynamic-memset", AverageTripCount);
  assert(LEI.MainLoopIP && LEI.MainLoopIndex &&
         "Main loop should be generated for unknown size memset");

  // Fill MainLoopBB
  IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  Value *DstGEP =
      MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
  MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
                                     IsVolatile);

  // Fill ResidualLoopBB
  if (!LEI.ResidualLoopIP)
    return;

  Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));

  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);

  // The residual loop stores the original (unsplatted) byte value.
  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                      LEI.ResidualLoopIndex);
  ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
                                    IsVolatile);
}
1309
/// Expand a memset.pattern intrinsic into a loop of pattern stores, optionally
/// combining several pattern instances into one wider store per iteration when
/// the type sizes allow it (see the vectorization constraints below).
static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr,
                                    Value *Len, Value *SetValue, Align DstAlign,
                                    bool IsVolatile,
                                    const TargetTransformInfo *TTI,
                                    std::optional<uint64_t> AverageTripCount) {
  // No need to expand zero length memset.pattern.
  if (auto *CLen = dyn_cast<ConstantInt>(Len))
    if (CLen->isZero())
      return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();

  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  // Ask the target for its preferred access type; default to the pattern type.
  Type *PreferredLoopOpType = SetValue->getType();
  if (TTI) {
    PreferredLoopOpType = TTI->getMemcpyLoopLoweringType(
        Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
  }
  TypeSize PreferredLoopOpStoreSize = DL.getTypeStoreSize(PreferredLoopOpType);
  assert(PreferredLoopOpStoreSize.isFixed() &&
         "PreferredLoopOpType cannot be a scalable vector type");

  TypeSize PreferredLoopOpAllocSize = DL.getTypeAllocSize(PreferredLoopOpType);

  Type *OriginalType = SetValue->getType();
  TypeSize OriginalTypeStoreSize = DL.getTypeStoreSize(OriginalType);
  TypeSize OriginalTypeAllocSize = DL.getTypeAllocSize(OriginalType);

  // The semantics of memset.pattern restrict what vectorization we can do: It
  // has to behave like a series of stores of the SetValue type at offsets that
  // are spaced by the alloc size of the SetValue type. If store and alloc size
  // of the SetValue type don't match, the bytes that aren't covered by these
  // stores must not be overwritten. We therefore only vectorize memset.pattern
  // if the store and alloc sizes of the SetValue are equal and properly divide
  // the size of the preferred lowering type (and only if store and alloc size
  // for the preferred lowering type are also equal).

  // Defaults: one pattern instance per iteration, no residual loop.
  unsigned MainLoopStep = 1;
  Type *MainLoopType = OriginalType;
  TypeSize MainLoopAllocSize = OriginalTypeAllocSize;
  unsigned ResidualLoopStep = 0;
  Type *ResidualLoopType = nullptr;

  if (PreferredLoopOpStoreSize == PreferredLoopOpAllocSize &&
      OriginalTypeStoreSize == OriginalTypeAllocSize &&
      OriginalTypeStoreSize < PreferredLoopOpStoreSize &&
      PreferredLoopOpStoreSize % OriginalTypeStoreSize == 0) {
    // Multiple instances of SetValue can be combined to reach the preferred
    // loop op size.
    MainLoopStep = PreferredLoopOpStoreSize / OriginalTypeStoreSize;
    MainLoopType = PreferredLoopOpType;
    MainLoopAllocSize = PreferredLoopOpStoreSize;

    // A residual loop of single pattern stores covers the leftover instances.
    ResidualLoopStep = 1;
    ResidualLoopType = OriginalType;
  }

  // The step arguments here are in terms of the alloc size of the SetValue, not
  // in terms of bytes.
  LoopExpansionInfo LEI =
      insertLoopExpansion(InsertBefore, Len, MainLoopStep, ResidualLoopStep,
                          "memset.pattern", AverageTripCount);

  Align PartDstAlign(commonAlignment(DstAlign, MainLoopAllocSize));

  if (LEI.MainLoopIP) {
    // Create the loop-invariant splat value before the loop.
    IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
    Value *MainLoopSetValue = SetValue;
    if (MainLoopType != OriginalType)
      MainLoopSetValue =
          createMemSetSplat(DL, PreLoopBuilder, SetValue, MainLoopType);

    // Fill MainLoopBB
    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
    Value *DstGEP = MainLoopBuilder.CreateInBoundsGEP(MainLoopType, DstAddr,
                                                      LEI.MainLoopIndex);
    MainLoopBuilder.CreateAlignedStore(MainLoopSetValue, DstGEP, PartDstAlign,
                                       IsVolatile);
  }

  if (!LEI.ResidualLoopIP)
    return;

  // Fill ResidualLoopBB
  Align ResDstAlign(
      commonAlignment(PartDstAlign, DL.getTypeAllocSize(ResidualLoopType)));

  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(ResidualLoopType, DstAddr,
                                                      LEI.ResidualLoopIndex);
  ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
                                    IsVolatile);
}
1408
1409template <typename T>
1411 if (SE) {
1412 const SCEV *SrcSCEV = SE->getSCEV(Memcpy->getRawSource());
1413 const SCEV *DestSCEV = SE->getSCEV(Memcpy->getRawDest());
1414 if (SE->isKnownPredicateAt(CmpInst::ICMP_NE, SrcSCEV, DestSCEV, Memcpy))
1415 return false;
1416 }
1417 return true;
1418}
1419
/// Expand \p Memcpy into an explicit load/store loop, dispatching on whether
/// the length is a compile-time constant. \p SE (optional) is used to prove
/// that source and destination cannot overlap.
void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
                              const TargetTransformInfo &TTI,
                              ScalarEvolution *SE) {
  bool CanOverlap = canOverlap(Memcpy, SE);
  auto TripCount = getAverageMemOpLoopTripCount(*Memcpy);
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
    createMemCpyLoopKnownSize(
        /*InsertBefore=*/Memcpy,
        /*SrcAddr=*/Memcpy->getRawSource(),
        /*DstAddr=*/Memcpy->getRawDest(),
        /*CopyLen=*/CI,
        /*SrcAlign=*/Memcpy->getSourceAlign().valueOrOne(),
        /*DstAlign=*/Memcpy->getDestAlign().valueOrOne(),
        /*SrcIsVolatile=*/Memcpy->isVolatile(),
        /*DstIsVolatile=*/Memcpy->isVolatile(),
        /*CanOverlap=*/CanOverlap,
        /*TTI=*/TTI,
        /*AtomicElementSize=*/std::nullopt,
        /*AverageTripCount=*/TripCount);
  } else {
    createMemCpyLoopUnknownSize(
        /*InsertBefore=*/Memcpy,
        /*SrcAddr=*/Memcpy->getRawSource(),
        /*DstAddr=*/Memcpy->getRawDest(),
        /*CopyLen=*/Memcpy->getLength(),
        /*SrcAlign=*/Memcpy->getSourceAlign().valueOrOne(),
        /*DstAlign=*/Memcpy->getDestAlign().valueOrOne(),
        /*SrcIsVolatile=*/Memcpy->isVolatile(),
        /*DstIsVolatile=*/Memcpy->isVolatile(),
        /*CanOverlap=*/CanOverlap,
        /*TTI=*/TTI,
        /*AtomicElementSize=*/std::nullopt,
        /*AverageTripCount=*/TripCount);
  }
}
1455
/// Expand \p Memmove into explicit IR loops. Returns true on success; returns
/// false only when source and destination live in different address spaces
/// that may alias but cannot be cast to a common address space.
bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
                               const TargetTransformInfo &TTI) {
  Value *CopyLen = Memmove->getLength();
  Value *SrcAddr = Memmove->getRawSource();
  Value *DstAddr = Memmove->getRawDest();
  Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
  Align DstAlign = Memmove->getDestAlign().valueOrOne();
  bool SrcIsVolatile = Memmove->isVolatile();
  bool DstIsVolatile = SrcIsVolatile;
  IRBuilder<> CastBuilder(Memmove);
  CastBuilder.SetCurrentDebugLocation(Memmove->getStableDebugLoc());

  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
  if (SrcAS != DstAS) {
    if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
      // We may not be able to emit a pointer comparison, but we don't have
      // to. Expand as memcpy.
      auto AverageTripCount = getAverageMemOpLoopTripCount(*Memmove);
      if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
        createMemCpyLoopKnownSize(
            /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
            SrcIsVolatile, DstIsVolatile,
            /*CanOverlap=*/false, TTI, std::nullopt, AverageTripCount);
      } else {
        createMemCpyLoopUnknownSize(
            /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign,
            DstAlign, SrcIsVolatile, DstIsVolatile,
            /*CanOverlap=*/false, TTI, std::nullopt, AverageTripCount);
      }

      return true;
    }

    if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
          TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
      // We don't know generically if it's legal to introduce an
      // addrspacecast. We need to know either if it's legal to insert an
      // addrspacecast, or if the address spaces cannot alias.
      LLVM_DEBUG(
          dbgs() << "Do not know how to expand memmove between different "
                    "address spaces\n");
      return false;
    }
  }

  // The buffers may overlap, so emit the direction-checking memmove expansion.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
    createMemMoveLoopKnownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  } else {
    createMemMoveLoopUnknownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  }
  return true;
}
1513
/// Expand \p Memset into an explicit store loop, dispatching on whether the
/// length is a compile-time constant. \p TTI may be null; the helpers then
/// fall back to byte-wise stores.
void llvm::expandMemSetAsLoop(MemSetInst *Memset,
                              const TargetTransformInfo *TTI) {
  auto AverageTripCount = getAverageMemOpLoopTripCount(*Memset);
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Memset->getLength())) {
    createMemSetLoopKnownSize(
        /*InsertBefore=*/Memset,
        /*DstAddr=*/Memset->getRawDest(),
        /*Len=*/CI,
        /*SetValue=*/Memset->getValue(),
        /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
        /*IsVolatile=*/Memset->isVolatile(),
        /*TTI=*/TTI,
        /*AverageTripCount=*/AverageTripCount);
  } else {
    createMemSetLoopUnknownSize(
        /*InsertBefore=*/Memset,
        /*DstAddr=*/Memset->getRawDest(),
        /*Len=*/Memset->getLength(),
        /*SetValue=*/Memset->getValue(),
        /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
        /*IsVolatile=*/Memset->isVolatile(),
        /*TTI=*/TTI,
        /*AverageTripCount=*/AverageTripCount);
  }
}
1539
/// Convenience overload forwarding to the pointer-TTI variant.
void llvm::expandMemSetAsLoop(MemSetInst *MemSet,
                              const TargetTransformInfo &TTI) {
  expandMemSetAsLoop(MemSet, &TTI);
}
1544
/// Expand \p Memset (a memset.pattern intrinsic) into an explicit pattern
/// store loop; the length may be constant or runtime-dependent.
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset,
                                     const TargetTransformInfo *TTI) {
  createMemSetPatternLoop(
      /*InsertBefore=*/Memset,
      /*DstAddr=*/Memset->getRawDest(),
      /*Len=*/Memset->getLength(),
      /*SetValue=*/Memset->getValue(),
      /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
      /*IsVolatile=*/Memset->isVolatile(),
      /*TTI=*/TTI,
      /*AverageTripCount=*/getAverageMemOpLoopTripCount(*Memset));
}
1557
1562
/// Expand an atomic element-wise memcpy into an explicit loop. The element
/// size forwarded below enforces atomic accesses of that granularity.
/// NOTE(review): the parameter type on the reconstructed signature line and
/// the unused \p SE parameter follow the header declaration — confirm against
/// LowerMemIntrinsics.h.
void llvm::expandAtomicMemCpyAsLoop(MemCpyInst *AtomicMemcpy,
                                    const TargetTransformInfo &TTI,
                                    ScalarEvolution *SE) {
  assert(AtomicMemcpy->isAtomic());
  if (ConstantInt *CI = dyn_cast<ConstantInt>(AtomicMemcpy->getLength())) {
    createMemCpyLoopKnownSize(
        /*InsertBefore=*/AtomicMemcpy,
        /*SrcAddr=*/AtomicMemcpy->getRawSource(),
        /*DstAddr=*/AtomicMemcpy->getRawDest(),
        /*CopyLen=*/CI,
        /*SrcAlign=*/AtomicMemcpy->getSourceAlign().valueOrOne(),
        /*DstAlign=*/AtomicMemcpy->getDestAlign().valueOrOne(),
        /*SrcIsVolatile=*/AtomicMemcpy->isVolatile(),
        /*DstIsVolatile=*/AtomicMemcpy->isVolatile(),
        /*CanOverlap=*/false, // SrcAddr & DstAddr may not overlap by spec.
        /*TTI=*/TTI,
        /*AtomicElementSize=*/AtomicMemcpy->getElementSizeInBytes());
  } else {
    createMemCpyLoopUnknownSize(
        /*InsertBefore=*/AtomicMemcpy,
        /*SrcAddr=*/AtomicMemcpy->getRawSource(),
        /*DstAddr=*/AtomicMemcpy->getRawDest(),
        /*CopyLen=*/AtomicMemcpy->getLength(),
        /*SrcAlign=*/AtomicMemcpy->getSourceAlign().valueOrOne(),
        /*DstAlign=*/AtomicMemcpy->getDestAlign().valueOrOne(),
        /*SrcIsVolatile=*/AtomicMemcpy->isVolatile(),
        /*DstIsVolatile=*/AtomicMemcpy->isVolatile(),
        /*CanOverlap=*/false, // SrcAddr & DstAddr may not overlap by spec.
        /*TargetTransformInfo=*/TTI,
        /*AtomicElementSize=*/AtomicMemcpy->getElementSizeInBytes());
  }
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static void SetValue(Value *V, GenericValue Val, ExecutionContext &SF)
Definition Execution.cpp:41
#define DEBUG_TYPE
static Value * createMemSetSplat(const DataLayout &DL, IRBuilderBase &B, Value *SetValue, Type *DstType)
Create a Value of DstType that consists of a sequence of copies of SetValue, using bitcasts and a vec...
static std::pair< Value *, Value * > tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2, const TargetTransformInfo &TTI)
static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr, Value *Len, Value *SetValue, Align DstAlign, bool IsVolatile, const TargetTransformInfo *TTI, std::optional< uint64_t > AverageTripCount)
static bool canOverlap(MemTransferBase< T > *Memcpy, ScalarEvolution *SE)
static void createMemMoveLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI)
static void createMemSetLoopUnknownSize(Instruction *InsertBefore, Value *DstAddr, Value *Len, Value *SetValue, Align DstAlign, bool IsVolatile, const TargetTransformInfo *TTI, std::optional< uint64_t > AverageTripCount)
static Value * getRuntimeLoopRemainder(IRBuilderBase &B, Value *Len, Value *OpSize, unsigned OpSizeVal)
static void createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr, ConstantInt *Len, Value *SetValue, Align DstAlign, bool IsVolatile, const TargetTransformInfo *TTI, std::optional< uint64_t > AverageTripCount)
static Value * getRuntimeLoopUnits(IRBuilderBase &B, Value *Len, Value *OpSize, unsigned OpSizeVal, Value *RTLoopRemainder=nullptr)
static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore, Value *Len, unsigned MainLoopStep, unsigned ResidualLoopStep, StringRef BBNamePrefix, std::optional< uint64_t > ExpectedUnits)
Insert the control flow and loop counters for a memcpy/memset loop expansion.
static void createMemMoveLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define P(N)
This file contains the declarations for profiling metadata utility functions.
#define LLVM_DEBUG(...)
Definition Debug.h:114
This pass exposes codegen information to IR-level passes.
This class represents any memcpy intrinsic i.e.
uint32_t getElementSizeInBytes() const
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Conditional Branch instruction.
static CondBrInst * Create(Value *Cond, BasicBlock *IfTrue, BasicBlock *IfFalse, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2347
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1894
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1223
UnreachableInst * CreateUnreachable()
Definition IRBuilder.h:1365
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:1975
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2335
UncondBrInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition IRBuilder.h:1217
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2496
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2331
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1446
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1429
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition IRBuilder.h:1913
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
MDNode * createAnonymousAliasScope(MDNode *Domain, StringRef Name=StringRef())
Return metadata appropriate for an alias scope root node.
Definition MDBuilder.h:195
LLVM_ABI MDNode * createLikelyBranchWeights()
Return metadata containing two branch weights, with significant bias towards true destination.
Definition MDBuilder.cpp:43
MDNode * createAnonymousAliasScopeDomain(StringRef Name=StringRef())
Return metadata appropriate for an alias scope domain node.
Definition MDBuilder.h:188
Metadata node.
Definition Metadata.h:1080
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getRawDest() const
MaybeAlign getDestAlign() const
This is the common base class for memset/memcpy/memmove.
bool isVolatile() const
This class wraps the llvm.memmove intrinsic.
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
This class wraps the llvm.experimental.memset.pattern intrinsic.
Common base class for all memory transfer intrinsics.
Value * getRawSource() const
Return the arguments to the instruction.
MaybeAlign getSourceAlign() const
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI bool isKnownPredicateAt(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS, const Instruction *CtxI)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI void createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI, std::optional< uint32_t > AtomicCpySize=std::nullopt, std::optional< uint64_t > AverageTripCount=std::nullopt)
Emit a loop implementing the semantics of an llvm.memcpy whose size is a compile time constant.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
cl::opt< bool > ProfcheckDisableMetadataFixes
Definition Metadata.cpp:64
LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, const Function *F=nullptr)
Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch weights in the new instruct...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove, const TargetTransformInfo &TTI)
Expand MemMove as a loop.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SmallVector< InstrProfValueData, 4 > getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, uint64_t &TotalC, bool GetNoICPValue=false)
Extract the value profile data from Inst and returns them if Inst is annotated with value profile dat...
TargetTransformInfo TTI
LLVM_ABI void expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemCpy, const TargetTransformInfo &TTI, ScalarEvolution *SE)
Expand AtomicMemCpy as a loop. AtomicMemCpy is not deleted.
LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet, const TargetTransformInfo *TTI=nullptr)
Expand MemSet as a loop.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet, const TargetTransformInfo *TTI=nullptr)
Expand MemSetPattern as a loop.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI, ScalarEvolution *SE=nullptr)
Expand MemCpy as a loop. MemCpy is not deleted.
LLVM_ABI void setFittedBranchWeights(Instruction &I, ArrayRef< uint64_t > Weights, bool IsExpected, bool ElideAllZero=false)
Variant of setBranchWeights where the Weights will be fit first to uint32_t by shifting right.
LLVM_ABI void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI, std::optional< unsigned > AtomicSize=std::nullopt, std::optional< uint64_t > AverageTripCount=std::nullopt)
Emit a loop implementing the semantics of llvm.memcpy where the size is not a compile-time constant.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130