LowerMemIntrinsics.cpp
//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <optional>

#define DEBUG_TYPE "lower-mem-intrinsics"

using namespace llvm;

/// \returns \p Len urem \p OpSize, checking for optimization opportunities.
/// \p OpSizeVal must be the integer value of the \c ConstantInt \p OpSize.
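/// For example (a sketch of the emitted IR for an i64 \p Len, assuming
/// \p OpSizeVal == 8, a power of two):
/// \code
///   %rem = and i64 %Len, 7   ; instead of: urem i64 %Len, 8
/// \endcode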
static Value *getRuntimeLoopRemainder(IRBuilderBase &B, Value *Len,
                                      Value *OpSize, unsigned OpSizeVal) {
  // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
  if (isPowerOf2_32(OpSizeVal))
    return B.CreateAnd(Len, OpSizeVal - 1);
  return B.CreateURem(Len, OpSize);
}

/// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
/// opportunities.
/// If \p RTLoopRemainder is provided, it must be the result of
/// \c getRuntimeLoopRemainder() with the same arguments.
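/// For example, with \p Len == 17 and \p OpSizeVal == 8, the result is
/// 17 - (17 urem 8) == 16, the number of units the main loop covers.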
static Value *getRuntimeLoopUnits(IRBuilderBase &B, Value *Len, Value *OpSize,
                                  unsigned OpSizeVal,
                                  Value *RTLoopRemainder = nullptr) {
  if (!RTLoopRemainder)
    RTLoopRemainder = getRuntimeLoopRemainder(B, Len, OpSize, OpSizeVal);
  return B.CreateSub(Len, RTLoopRemainder);
}

namespace {
/// Container for the return values of insertLoopExpansion.
struct LoopExpansionInfo {
  /// The instruction at the end of the main loop body.
  Instruction *MainLoopIP = nullptr;

  /// The unit index in the main loop body.
  Value *MainLoopIndex = nullptr;

  /// The instruction at the end of the residual loop body. Can be nullptr if
  /// no residual is required.
  Instruction *ResidualLoopIP = nullptr;

  /// The unit index in the residual loop body. Can be nullptr if no residual
  /// is required.
  Value *ResidualLoopIndex = nullptr;
};
} // namespace

/// Insert the control flow and loop counters for a memcpy/memset loop
/// expansion.
///
/// This function inserts IR corresponding to the following C code before
/// \p InsertBefore:
/// \code
/// LoopUnits = (Len / MainLoopStep) * MainLoopStep;
/// ResidualUnits = Len - LoopUnits;
/// MainLoopIndex = 0;
/// if (LoopUnits > 0) {
///   do {
///     // MainLoopIP
///     MainLoopIndex += MainLoopStep;
///   } while (MainLoopIndex < LoopUnits);
/// }
/// for (size_t i = 0; i < ResidualUnits; i += ResidualLoopStep) {
///   ResidualLoopIndex = LoopUnits + i;
///   // ResidualLoopIP
/// }
/// \endcode
///
/// \p MainLoopStep and \p ResidualLoopStep determine by how many "units" the
/// loop index is increased in each iteration of the main and residual loops,
/// respectively. In most cases, the "unit" will be bytes, but larger units are
/// useful for lowering memset.pattern.
///
/// The computation of \c LoopUnits and \c ResidualUnits is performed at
/// compile time if \p Len is a \c ConstantInt.
/// The second (residual) loop is omitted if \p ResidualLoopStep is 0 or equal
/// to \p MainLoopStep.
/// The generated \c MainLoopIP, \c MainLoopIndex, \c ResidualLoopIP, and
/// \c ResidualLoopIndex are returned in a \c LoopExpansionInfo object.
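///
/// For example, expanding a 17-byte memcpy with \p MainLoopStep == 4 and
/// \p ResidualLoopStep == 1 yields LoopUnits == 16 and ResidualUnits == 1: the
/// main loop body runs with MainLoopIndex == 0, 4, 8, and 12, and the residual
/// loop body runs once with ResidualLoopIndex == 16.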
static LoopExpansionInfo insertLoopExpansion(Instruction *InsertBefore,
                                             Value *Len, unsigned MainLoopStep,
                                             unsigned ResidualLoopStep,
                                             StringRef BBNamePrefix) {
  assert((ResidualLoopStep == 0 || MainLoopStep % ResidualLoopStep == 0) &&
         "ResidualLoopStep must divide MainLoopStep if specified");
  assert(ResidualLoopStep <= MainLoopStep &&
         "ResidualLoopStep cannot be larger than MainLoopStep");
  assert(MainLoopStep > 0 && "MainLoopStep must be non-zero");
  LoopExpansionInfo LEI;
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  BasicBlock *PostLoopBB = PreLoopBB->splitBasicBlock(
      InsertBefore, BBNamePrefix + "-post-expansion");
  Function *ParentFunc = PreLoopBB->getParent();
  LLVMContext &Ctx = PreLoopBB->getContext();
  IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());

  // Calculate the main loop trip count and remaining units to cover after the
  // loop.
  Type *LenType = Len->getType();
  IntegerType *ILenType = cast<IntegerType>(LenType);
  ConstantInt *CIMainLoopStep = ConstantInt::get(ILenType, MainLoopStep);

  Value *LoopUnits = Len;
  Value *ResidualUnits = nullptr;
  // We can make a conditional branch unconditional if we know that the
  // MainLoop must be executed at least once.
  bool MustTakeMainLoop = false;
  if (MainLoopStep != 1) {
    if (auto *CLen = dyn_cast<ConstantInt>(Len)) {
      uint64_t TotalUnits = CLen->getZExtValue();
      uint64_t LoopEndCount = alignDown(TotalUnits, MainLoopStep);
      uint64_t ResidualCount = TotalUnits - LoopEndCount;
      LoopUnits = ConstantInt::get(LenType, LoopEndCount);
      ResidualUnits = ConstantInt::get(LenType, ResidualCount);
      MustTakeMainLoop = LoopEndCount > 0;
      // As an optimization, we could skip generating the residual loop if
      // ResidualCount is known to be 0. However, current uses of this function
      // don't request a residual loop if the length is constant (they generate
      // a (potentially empty) sequence of loads and stores instead), so this
      // optimization would have no effect here.
    } else {
      ResidualUnits = getRuntimeLoopRemainder(PreLoopBuilder, Len,
                                              CIMainLoopStep, MainLoopStep);
      LoopUnits = getRuntimeLoopUnits(PreLoopBuilder, Len, CIMainLoopStep,
                                      MainLoopStep, ResidualUnits);
    }
  } else if (auto *CLen = dyn_cast<ConstantInt>(Len)) {
    MustTakeMainLoop = CLen->getZExtValue() > 0;
  }

  BasicBlock *MainLoopBB = BasicBlock::Create(
      Ctx, BBNamePrefix + "-expansion-main-body", ParentFunc, PostLoopBB);
  IRBuilder<> LoopBuilder(MainLoopBB);

  PHINode *LoopIndex = LoopBuilder.CreatePHI(LenType, 2, "loop-index");
  LEI.MainLoopIndex = LoopIndex;
  LoopIndex->addIncoming(ConstantInt::get(LenType, 0U), PreLoopBB);

  Value *NewIndex =
      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(LenType, MainLoopStep));
  LoopIndex->addIncoming(NewIndex, MainLoopBB);

  // One argument of the addition is a loop-variant PHI, so it must be an
  // Instruction (i.e., it cannot be a Constant).
  LEI.MainLoopIP = cast<Instruction>(NewIndex);

  if (ResidualLoopStep > 0 && ResidualLoopStep < MainLoopStep) {
    // Loop body for the residual accesses.
    BasicBlock *ResLoopBB =
        BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-body",
                           PreLoopBB->getParent(), PostLoopBB);
    // BB to check if the residual loop is needed.
    BasicBlock *ResidualCondBB =
        BasicBlock::Create(Ctx, BBNamePrefix + "-expansion-residual-cond",
                           PreLoopBB->getParent(), ResLoopBB);

    // Enter the MainLoop unless no main loop iteration is required.
    ConstantInt *Zero = ConstantInt::get(ILenType, 0U);
    if (MustTakeMainLoop)
      PreLoopBuilder.CreateBr(MainLoopBB);
    else
      PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero),
                                  MainLoopBB, ResidualCondBB);
    PreLoopBB->getTerminator()->eraseFromParent();

    // Stay in the MainLoop until we have handled all the LoopUnits. Then go to
    // the residual condition BB.
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopUnits),
                             MainLoopBB, ResidualCondBB);

    // Determine if we need to branch to the residual loop or bypass it.
    IRBuilder<> RCBuilder(ResidualCondBB);
    RCBuilder.CreateCondBr(RCBuilder.CreateICmpNE(ResidualUnits, Zero),
                           ResLoopBB, PostLoopBB);

    IRBuilder<> ResBuilder(ResLoopBB);
    PHINode *ResidualIndex =
        ResBuilder.CreatePHI(LenType, 2, "residual-loop-index");
    ResidualIndex->addIncoming(Zero, ResidualCondBB);

    // Add the offset at the end of the main loop to the loop counter of the
    // residual loop to get the proper index.
    Value *FullOffset = ResBuilder.CreateAdd(LoopUnits, ResidualIndex);
    LEI.ResidualLoopIndex = FullOffset;

    Value *ResNewIndex = ResBuilder.CreateAdd(
        ResidualIndex, ConstantInt::get(LenType, ResidualLoopStep));
    ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);

    // One argument of the addition is a loop-variant PHI, so it must be an
    // Instruction (i.e., it cannot be a Constant).
    LEI.ResidualLoopIP = cast<Instruction>(ResNewIndex);

    // Stay in the residual loop until all ResidualUnits are handled.
    ResBuilder.CreateCondBr(
        ResBuilder.CreateICmpULT(ResNewIndex, ResidualUnits), ResLoopBB,
        PostLoopBB);
  } else {
    // There is no need for a residual loop after the main loop. We do however
    // need to patch up the control flow by creating the terminators for the
    // preloop block and the main loop.

    // Enter the MainLoop unless no main loop iteration is required.
    if (MustTakeMainLoop) {
      PreLoopBuilder.CreateBr(MainLoopBB);
    } else {
      ConstantInt *Zero = ConstantInt::get(ILenType, 0U);
      PreLoopBuilder.CreateCondBr(PreLoopBuilder.CreateICmpNE(LoopUnits, Zero),
                                  MainLoopBB, PostLoopBB);
    }
    PreLoopBB->getTerminator()->eraseFromParent();
    // Stay in the MainLoop until we have handled all the LoopUnits.
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopUnits),
                             MainLoopBB, PostLoopBB);
  }
  return LEI;
}

void llvm::createMemCpyLoopKnownSize(
    Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
    ConstantInt *CopyLen, Align SrcAlign, Align DstAlign, bool SrcIsVolatile,
    bool DstIsVolatile, bool CanOverlap, const TargetTransformInfo &TTI,
    std::optional<uint32_t> AtomicElementSize) {
  // No need to expand zero length copies.
  if (CopyLen->isZero())
    return;

  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  LLVMContext &Ctx = PreLoopBB->getContext();
  const DataLayout &DL = ParentFunc->getDataLayout();
  MDBuilder MDB(Ctx);
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  StringRef Name = "MemCopyAliasScope";
  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);

  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *TypeOfCopyLen = CopyLen->getType();
  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
  assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
         "Atomic memcpy lowering is not supported for vector operand type");

  Type *Int8Type = Type::getInt8Ty(Ctx);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
         "Atomic memcpy lowering is not supported for selected operand size");

  uint64_t LoopEndCount = alignDown(CopyLen->getZExtValue(), LoopOpSize);

  // Skip the loop expansion entirely if the loop would never be taken.
  if (LoopEndCount != 0) {
    LoopExpansionInfo LEI = insertLoopExpansion(InsertBefore, CopyLen,
                                                LoopOpSize, 0, "static-memcpy");

    // Fill MainLoopBB
    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
    Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
    Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
    // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
    // use byte offsets computed from the TypeStoreSize.
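    // (The two sizes differ for a type like x86_fp80: a 10-byte store size
    // but, on typical x86-64 data layouts, a 16-byte alloc size, so
    // LoopOpType-based GEPs would leave gaps between the copied chunks.)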
    Value *SrcGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex);
    LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
        LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile);
    if (!CanOverlap) {
      // Set alias scope for loads.
      Load->setMetadata(LLVMContext::MD_alias_scope,
                        MDNode::get(Ctx, NewScope));
    }
    Value *DstGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
    StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
        Load, DstGEP, PartDstAlign, DstIsVolatile);
    if (!CanOverlap) {
      // Indicate that stores don't overlap loads.
      Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
    }
    if (AtomicElementSize) {
      Load->setAtomic(AtomicOrdering::Unordered);
      Store->setAtomic(AtomicOrdering::Unordered);
    }
    assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
           "No residual loop was requested");
  }

  // Copy the remaining bytes with straight-line code.
  uint64_t BytesCopied = LoopEndCount;
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
  if (RemainingBytes == 0)
    return;

  IRBuilder<> RBuilder(InsertBefore);
  SmallVector<Type *, 5> RemainingOps;
  TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                        SrcAS, DstAS, SrcAlign, DstAlign,
                                        AtomicElementSize);

  for (auto *OpTy : RemainingOps) {
    Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
    Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));

    unsigned OperandSize = DL.getTypeStoreSize(OpTy);
    assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
           "Atomic memcpy lowering is not supported for selected operand size");

    Value *SrcGEP = RBuilder.CreateInBoundsGEP(
        Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    LoadInst *Load =
        RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
    if (!CanOverlap) {
      // Set alias scope for loads.
      Load->setMetadata(LLVMContext::MD_alias_scope,
                        MDNode::get(Ctx, NewScope));
    }
    Value *DstGEP = RBuilder.CreateInBoundsGEP(
        Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    StoreInst *Store =
        RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
    if (!CanOverlap) {
      // Indicate that stores don't overlap loads.
      Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
    }
    if (AtomicElementSize) {
      Load->setAtomic(AtomicOrdering::Unordered);
      Store->setAtomic(AtomicOrdering::Unordered);
    }
    BytesCopied += OperandSize;
  }
  assert(BytesCopied == CopyLen->getZExtValue() &&
         "Bytes copied should match size in the call!");
}

void llvm::createMemCpyLoopUnknownSize(
    Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
    Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
    bool CanOverlap, const TargetTransformInfo &TTI,
    std::optional<uint32_t> AtomicElementSize) {
  BasicBlock *PreLoopBB = InsertBefore->getParent();
  Function *ParentFunc = PreLoopBB->getParent();
  const DataLayout &DL = ParentFunc->getDataLayout();
  LLVMContext &Ctx = PreLoopBB->getContext();
  MDBuilder MDB(Ctx);
  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("MemCopyDomain");
  StringRef Name = "MemCopyAliasScope";
  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);

  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
      Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
  assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
         "Atomic memcpy lowering is not supported for vector operand type");
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
         "Atomic memcpy lowering is not supported for selected operand size");

  Type *Int8Type = Type::getInt8Ty(Ctx);

  Type *ResidualLoopOpType = AtomicElementSize
                                 ? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
                                 : Int8Type;
  unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
  assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
         "Store size is expected to match type size");

  LoopExpansionInfo LEI = insertLoopExpansion(
      InsertBefore, CopyLen, LoopOpSize, ResidualLoopOpSize, "dynamic-memcpy");

  // Fill MainLoopBB
  IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // If we used LoopOpType as GEP element type, we would iterate over the
  // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
  // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte
  // offsets computed from the TypeStoreSize.
  Value *SrcGEP =
      MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LEI.MainLoopIndex);
  LoadInst *Load = MainLoopBuilder.CreateAlignedLoad(
      LoopOpType, SrcGEP, PartSrcAlign, SrcIsVolatile);
  if (!CanOverlap) {
    // Set alias scope for loads.
    Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope));
  }
  Value *DstGEP =
      MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
  StoreInst *Store = MainLoopBuilder.CreateAlignedStore(
      Load, DstGEP, PartDstAlign, DstIsVolatile);
  if (!CanOverlap) {
    // Indicate that stores don't overlap loads.
    Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
  }
  if (AtomicElementSize) {
    Load->setAtomic(AtomicOrdering::Unordered);
    Store->setAtomic(AtomicOrdering::Unordered);
  }

  // Fill ResidualLoopBB.
  if (!LEI.ResidualLoopIP)
    return;

  Align ResSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
  Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));

  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
  Value *ResSrcGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
                                                      LEI.ResidualLoopIndex);
  LoadInst *ResLoad = ResLoopBuilder.CreateAlignedLoad(
      ResidualLoopOpType, ResSrcGEP, ResSrcAlign, SrcIsVolatile);
  if (!CanOverlap) {
    // Set alias scope for loads.
    ResLoad->setMetadata(LLVMContext::MD_alias_scope,
                         MDNode::get(Ctx, NewScope));
  }
  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                      LEI.ResidualLoopIndex);
  StoreInst *ResStore = ResLoopBuilder.CreateAlignedStore(
      ResLoad, ResDstGEP, ResDstAlign, DstIsVolatile);
  if (!CanOverlap) {
    // Indicate that stores don't overlap loads.
    ResStore->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));
  }
  if (AtomicElementSize) {
    ResLoad->setAtomic(AtomicOrdering::Unordered);
    ResStore->setAtomic(AtomicOrdering::Unordered);
  }
}

// If \p Addr1 and \p Addr2 are pointers to different address spaces, create an
// addrspacecast to obtain a pair of pointers in the same address space. The
// caller needs to ensure that addrspacecasting is possible.
// No-op if the pointers are in the same address space.
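// For example, if the two pointers disagree and TTI reports that a cast from
// Addr2's address space to Addr1's is valid, the returned pair is
// {Addr1, addrspacecast of Addr2 to Addr1's type}, so the results can be
// compared directly.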
static std::pair<Value *, Value *>
tryInsertCastToCommonAddrSpace(IRBuilderBase &B, Value *Addr1, Value *Addr2,
                               const TargetTransformInfo &TTI) {
  Value *ResAddr1 = Addr1;
  Value *ResAddr2 = Addr2;

  unsigned AS1 = cast<PointerType>(Addr1->getType())->getAddressSpace();
  unsigned AS2 = cast<PointerType>(Addr2->getType())->getAddressSpace();
  if (AS1 != AS2) {
    if (TTI.isValidAddrSpaceCast(AS2, AS1))
      ResAddr2 = B.CreateAddrSpaceCast(Addr2, Addr1->getType());
    else if (TTI.isValidAddrSpaceCast(AS1, AS2))
      ResAddr1 = B.CreateAddrSpaceCast(Addr1, Addr2->getType());
    else
      llvm_unreachable("Can only lower memmove between address spaces if they "
                       "support addrspacecast");
  }
  return {ResAddr1, ResAddr2};
}

// Lower memmove to IR. memmove is required to correctly copy overlapping
// memory regions; therefore, it has to check the relative positions of the
// source and destination pointers and choose the copy direction accordingly.
//
// The code below is an IR rendition of this C function:
//
// void* memmove(void* dst, const void* src, size_t n) {
//   unsigned char* d = dst;
//   const unsigned char* s = src;
//   if (s < d) {
//     // copy backwards
//     while (n--) {
//       d[n] = s[n];
//     }
//   } else {
//     // copy forward
//     for (size_t i = 0; i < n; ++i) {
//       d[i] = s[i];
//     }
//   }
//   return dst;
// }
//
// If the TargetTransformInfo specifies a wider MemcpyLoopLoweringType, it is
// used for the memory accesses in the loops. Then, additional loops with
// byte-wise accesses are added for the remaining bytes.
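//
// For example, if src == p, dst == p + 4, and n == 8, the regions overlap and
// a forward copy would overwrite s[4..7] before they are read; since s < d,
// the backwards loop is chosen and copies d[7] = s[7], ..., d[0] = s[0].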
static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
                                         Value *SrcAddr, Value *DstAddr,
                                         Value *CopyLen, Align SrcAlign,
                                         Align DstAlign, bool SrcIsVolatile,
                                         bool DstIsVolatile,
                                         const TargetTransformInfo &TTI) {
  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                   SrcAlign, DstAlign);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  Type *Int8Type = Type::getInt8Ty(Ctx);
  bool LoopOpIsInt8 = LoopOpType == Int8Type;

  // If the memory accesses are wider than one byte, residual loops with
  // i8-accesses are required to move remaining bytes.
  bool RequiresResidual = !LoopOpIsInt8;

  Type *ResidualLoopOpType = Int8Type;
  unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
  ConstantInt *CIResidualLoopOpSize =
      ConstantInt::get(ILengthType, ResidualLoopOpSize);
  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);

  IRBuilder<> PLBuilder(InsertBefore);

  Value *RuntimeLoopBytes = CopyLen;
  Value *RuntimeLoopRemainder = nullptr;
  Value *SkipResidualCondition = nullptr;
  if (RequiresResidual) {
    RuntimeLoopRemainder =
        getRuntimeLoopRemainder(PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
    RuntimeLoopBytes = getRuntimeLoopUnits(PLBuilder, CopyLen, CILoopOpSize,
                                           LoopOpSize, RuntimeLoopRemainder);
    SkipResidualCondition =
        PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
  }
  Value *SkipMainCondition =
      PLBuilder.CreateICmpEQ(RuntimeLoopBytes, Zero, "skip_main");
  // Create a comparison of src and dst, based on which we jump to either
  // the forward-copy part of the function (if src >= dst) or the
  // backwards-copy part (if src < dst).
  // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
  // structure. Its block terminators (unconditional branches) are replaced by
  // the appropriate conditional branches when the loop is built.
  // If the pointers are in different address spaces, they need to be converted
  // to a compatible one. Cases where memory ranges in the different address
  // spaces cannot overlap are lowered as memcpy and not handled here.
  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                &ThenTerm, &ElseTerm);

  // If the LoopOpSize is greater than 1, each part of the function consists of
  // four blocks:
  //   memmove_copy_backwards:
  //       skip the residual loop when 0 iterations are required
  //   memmove_bwd_residual_loop:
  //       copy the last few bytes individually so that the remaining length is
  //       a multiple of the LoopOpSize
  //   memmove_bwd_middle: skip the main loop when 0 iterations are required
  //   memmove_bwd_main_loop: the actual backwards loop BB with wide accesses
  //   memmove_copy_forward: skip the main loop when 0 iterations are required
  //   memmove_fwd_main_loop: the actual forward loop BB with wide accesses
  //   memmove_fwd_middle: skip the residual loop when 0 iterations are required
  //   memmove_fwd_residual_loop: copy the last few bytes individually
  //
  // The main and residual loop are switched between copying forward and
  // backward so that the residual loop always operates on the end of the moved
  // range. This is based on the assumption that buffers whose start is aligned
  // with the LoopOpSize are more common than buffers whose end is.
  //
  // If the LoopOpSize is 1, each part of the function consists of two blocks:
  //   memmove_copy_backwards: skip the loop when 0 iterations are required
  //   memmove_bwd_main_loop: the actual backwards loop BB
  //   memmove_copy_forward: skip the loop when 0 iterations are required
  //   memmove_fwd_main_loop: the actual forward loop BB
  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  CopyBackwardsBB->setName("memmove_copy_backwards");
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  CopyForwardBB->setName("memmove_copy_forward");
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // Accesses in the residual loops do not share the same alignment as those in
  // the main loops.
  Align ResidualSrcAlign(commonAlignment(PartSrcAlign, ResidualLoopOpSize));
  Align ResidualDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));

  // Copying backwards.
  {
    BasicBlock *MainLoopBB = BasicBlock::Create(
        F->getContext(), "memmove_bwd_main_loop", F, CopyForwardBB);

    // The predecessor of the memmove_bwd_main_loop. Updated in the
    // following if a residual loop is emitted first.
    BasicBlock *PredBB = CopyBackwardsBB;

    if (RequiresResidual) {
      // backwards residual loop
      BasicBlock *ResidualLoopBB = BasicBlock::Create(
          F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
      PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
      Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
          ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
      // If we used LoopOpType as GEP element type, we would iterate over the
      // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
      // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize.
      // Therefore, use byte offsets computed from the TypeStoreSize.
      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
                                                             ResidualIndex);
      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
          "element");
      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                              ResidualIndex);
      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                             ResidualDstAlign, DstIsVolatile);

      // After the residual loop, go to an intermediate block.
      BasicBlock *IntermediateBB = BasicBlock::Create(
          F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
      // Later code expects a terminator in the PredBB.
      IRBuilder<> IntermediateBuilder(IntermediateBB);
      IntermediateBuilder.CreateUnreachable();
      ResidualLoopBuilder.CreateCondBr(
          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
          IntermediateBB, ResidualLoopBB);

      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
      ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);

      // How to get to the residual:
      BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
                         ThenTerm->getIterator());
      ThenTerm->eraseFromParent();

      PredBB = IntermediateBB;
    }

    // main loop
    IRBuilder<> MainLoopBuilder(MainLoopBB);
    PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
    Value *MainIndex =
        MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
    Value *LoadGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainIndex);
    Value *Element = MainLoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainIndex);
    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                       DstIsVolatile);
    MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
                                 ExitBB, MainLoopBB);
    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
    MainLoopPhi->addIncoming(RuntimeLoopBytes, PredBB);

    // How to get to the main loop:
    Instruction *PredBBTerm = PredBB->getTerminator();
    BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
                       PredBBTerm->getIterator());
    PredBBTerm->eraseFromParent();
  }

  // Copying forward.
  // main loop
  {
    BasicBlock *MainLoopBB =
        BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
    IRBuilder<> MainLoopBuilder(MainLoopBB);
    PHINode *MainLoopPhi =
        MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
    Value *LoadGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainLoopPhi);
    Value *Element = MainLoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP =
        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainLoopPhi);
    MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                       DstIsVolatile);
    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, CILoopOpSize);
    MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
    MainLoopPhi->addIncoming(Zero, CopyForwardBB);

    Instruction *CopyFwdBBTerm = CopyForwardBB->getTerminator();
    BasicBlock *SuccessorBB = ExitBB;
    if (RequiresResidual)
      SuccessorBB =
          BasicBlock::Create(F->getContext(), "memmove_fwd_middle", F, ExitBB);

    // leaving or staying in the main loop
    MainLoopBuilder.CreateCondBr(
        MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopBytes), SuccessorBB,
        MainLoopBB);

    // getting in or skipping the main loop
    BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
                       CopyFwdBBTerm->getIterator());
    CopyFwdBBTerm->eraseFromParent();

    if (RequiresResidual) {
      BasicBlock *IntermediateBB = SuccessorBB;
      IRBuilder<> IntermediateBuilder(IntermediateBB);
      BasicBlock *ResidualLoopBB = BasicBlock::Create(
          F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
      IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
                                       ResidualLoopBB);

      // Residual loop
      IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
      PHINode *ResidualLoopPhi =
          ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
                                                             ResidualLoopPhi);
      Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
          ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
          "element");
      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
                                                              ResidualLoopPhi);
      ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                             ResidualDstAlign, DstIsVolatile);
      Value *ResidualIndex =
          ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, CIResidualLoopOpSize);
      ResidualLoopBuilder.CreateCondBr(
          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
          ResidualLoopBB);
      ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
      ResidualLoopPhi->addIncoming(RuntimeLoopBytes, IntermediateBB);
    }
  }
}

// Similar to createMemMoveLoopUnknownSize, only the trip counts are computed
// at compile time, obsolete loops and branches are omitted, and the residual
// code is straight-line code instead of a loop.
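//
// For example, an 18-byte memmove lowered with an 8-byte LoopOpType yields a
// two-iteration wide-access loop for bytes [0, 16) plus straight-line residual
// accesses (as selected by getMemcpyLoopResidualLoweringType) for the last 2
// bytes; no residual loop and no runtime trip-count checks are emitted.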
static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
                                       Value *SrcAddr, Value *DstAddr,
                                       ConstantInt *CopyLen, Align SrcAlign,
                                       Align DstAlign, bool SrcIsVolatile,
                                       bool DstIsVolatile,
                                       const TargetTransformInfo &TTI) {
  // No need to expand zero length moves.
  if (CopyLen->isZero())
    return;

  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  LLVMContext &Ctx = OrigBB->getContext();
  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();

  Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                   SrcAlign, DstAlign);
  unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
  Type *Int8Type = Type::getInt8Ty(Ctx);

  // Calculate the loop trip count and remaining bytes to copy after the loop.
  uint64_t BytesCopiedInLoop = alignDown(CopyLen->getZExtValue(), LoopOpSize);
  uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;

  IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
  ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
  ConstantInt *LoopBound = ConstantInt::get(ILengthType, BytesCopiedInLoop);
  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);

  IRBuilder<> PLBuilder(InsertBefore);

  auto [CmpSrcAddr, CmpDstAddr] =
      tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
  Value *PtrCompare =
      PLBuilder.CreateICmpULT(CmpSrcAddr, CmpDstAddr, "compare_src_dst");
  Instruction *ThenTerm, *ElseTerm;
  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                &ThenTerm, &ElseTerm);

  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
  BasicBlock *CopyForwardBB = ElseTerm->getParent();
  BasicBlock *ExitBB = InsertBefore->getParent();
  ExitBB->setName("memmove_done");

  Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
  Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));

  // Helper function to generate a load/store pair of a given type in the
  // residual. Used in the forward and backward branches.
  auto GenerateResidualLdStPair = [&](Type *OpTy, IRBuilderBase &Builder,
                                      uint64_t &BytesCopied) {
    Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
    Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));

    unsigned OperandSize = DL.getTypeStoreSize(OpTy);

    // If we used LoopOpType as GEP element type, we would iterate over the
    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
    // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
    // use byte offsets computed from the TypeStoreSize.
    Value *SrcGEP = Builder.CreateInBoundsGEP(
        Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    LoadInst *Load =
        Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
    Value *DstGEP = Builder.CreateInBoundsGEP(
        Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
    Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
    BytesCopied += OperandSize;
  };

  // Copying backwards.
  if (RemainingBytes != 0) {
    CopyBackwardsBB->setName("memmove_bwd_residual");
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. We need the same
    // instructions as in the forward case, only in reverse. So we generate
    // code the same way, except that we change the IRBuilder insert point for
    // each load/store pair so that each one is inserted before the previous
    // one instead of after it.
    IRBuilder<> BwdResBuilder(CopyBackwardsBB,
                              CopyBackwardsBB->getFirstNonPHIIt());
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, PartSrcAlign,
                                          PartDstAlign);
    for (auto *OpTy : RemainingOps) {
      // reverse the order of the emitted operations
      BwdResBuilder.SetInsertPoint(CopyBackwardsBB,
                                   CopyBackwardsBB->getFirstNonPHIIt());
      GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
    }
  }
  if (BytesCopiedInLoop != 0) {
    BasicBlock *LoopBB = CopyBackwardsBB;
    BasicBlock *PredBB = OrigBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      LoopBB = CopyBackwardsBB->splitBasicBlock(
          CopyBackwardsBB->getTerminator(), "memmove_bwd_loop");
      PredBB = CopyBackwardsBB;
    } else {
      CopyBackwardsBB->setName("memmove_bwd_loop");
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
    Value *Index = LoopBuilder.CreateSub(LoopPhi, CILoopOpSize, "bwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, Index);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, Index);
    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                   DstIsVolatile);

    // Replace the unconditional branch introduced by
    // SplitBlockAndInsertIfThenElse to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, Zero), ExitBB,
                             LoopBB);
    UncondTerm->eraseFromParent();

    LoopPhi->addIncoming(Index, LoopBB);
    LoopPhi->addIncoming(LoopBound, PredBB);
  }

  // Copying forward.
  BasicBlock *FwdResidualBB = CopyForwardBB;
  if (BytesCopiedInLoop != 0) {
    CopyForwardBB->setName("memmove_fwd_loop");
    BasicBlock *LoopBB = CopyForwardBB;
    BasicBlock *SuccBB = ExitBB;
    if (RemainingBytes != 0) {
      // if we introduce residual code, it needs its separate BB
      SuccBB = CopyForwardBB->splitBasicBlock(CopyForwardBB->getTerminator(),
                                              "memmove_fwd_residual");
      FwdResidualBB = SuccBB;
    }
    IRBuilder<> LoopBuilder(LoopBB->getTerminator());
    PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopPhi);
    Value *Element = LoopBuilder.CreateAlignedLoad(
        LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopPhi);
    LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                   DstIsVolatile);
    Value *Index = LoopBuilder.CreateAdd(LoopPhi, CILoopOpSize);
    LoopPhi->addIncoming(Index, LoopBB);
    LoopPhi->addIncoming(Zero, OrigBB);

    // Replace the unconditional branch to turn LoopBB into a loop.
    Instruction *UncondTerm = LoopBB->getTerminator();
    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, LoopBound), SuccBB,
                             LoopBB);
    UncondTerm->eraseFromParent();
  }

  if (RemainingBytes != 0) {
    uint64_t BytesCopied = BytesCopiedInLoop;

    // Residual code is required to move the remaining bytes. In the forward
    // case, we emit it in the normal order.
    IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
    SmallVector<Type *, 5> RemainingOps;
    TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                          SrcAS, DstAS, PartSrcAlign,
                                          PartDstAlign);
    for (auto *OpTy : RemainingOps)
      GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
  }
}

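// Emit a simple store loop for a memset-style intrinsic. A sketch of the
// emitted code in C, where T is SetValue's type (i8 for memset, possibly wider
// for memset.pattern) and CopyLen counts elements of type T:
//
//   for (size_t i = 0; i < CopyLen; ++i)
//     ((T *)DstAddr)[i] = SetValue;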
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
                             Value *CopyLen, Value *SetValue, Align DstAlign,
                             bool IsVolatile) {
  Type *TypeOfCopyLen = CopyLen->getType();
  BasicBlock *OrigBB = InsertBefore->getParent();
  Function *F = OrigBB->getParent();
  const DataLayout &DL = F->getDataLayout();
  BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split");
  BasicBlock *LoopBB =
      BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);

  IRBuilder<> Builder(OrigBB->getTerminator());

  Builder.CreateCondBr(
      Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
      LoopBB);
  OrigBB->getTerminator()->eraseFromParent();

  unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
  Align PartAlign(commonAlignment(DstAlign, PartSize));

  IRBuilder<> LoopBuilder(LoopBB);
  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);

  LoopBuilder.CreateAlignedStore(
      SetValue,
      LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
      PartAlign, IsVolatile);

  Value *NewIndex =
      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
  LoopIndex->addIncoming(NewIndex, LoopBB);

  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
                           NewBB);
}

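// \returns false if ScalarEvolution can prove that the raw source and
// destination pointers of \p Memcpy are distinct. Since LLVM requires the
// source and destination of a memcpy to be either exactly equal or
// non-overlapping, proving the pointers unequal rules out any partial overlap.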
template <typename T>
static bool canOverlap(MemTransferBase<T> *Memcpy, ScalarEvolution *SE) {
  if (SE) {
    const SCEV *SrcSCEV = SE->getSCEV(Memcpy->getRawSource());
    const SCEV *DestSCEV = SE->getSCEV(Memcpy->getRawDest());
    if (SE->isKnownPredicateAt(CmpInst::ICMP_NE, SrcSCEV, DestSCEV, Memcpy))
      return false;
  }
  return true;
}

void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
                              const TargetTransformInfo &TTI,
                              ScalarEvolution *SE) {
  bool CanOverlap = canOverlap(Memcpy, SE);
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
    createMemCpyLoopKnownSize(
        /* InsertBefore */ Memcpy,
        /* SrcAddr */ Memcpy->getRawSource(),
        /* DstAddr */ Memcpy->getRawDest(),
        /* CopyLen */ CI,
        /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
        /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ Memcpy->isVolatile(),
        /* DstIsVolatile */ Memcpy->isVolatile(),
        /* CanOverlap */ CanOverlap,
        /* TargetTransformInfo */ TTI);
  } else {
    createMemCpyLoopUnknownSize(
        /* InsertBefore */ Memcpy,
        /* SrcAddr */ Memcpy->getRawSource(),
        /* DstAddr */ Memcpy->getRawDest(),
        /* CopyLen */ Memcpy->getLength(),
        /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
        /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ Memcpy->isVolatile(),
        /* DstIsVolatile */ Memcpy->isVolatile(),
        /* CanOverlap */ CanOverlap,
        /* TargetTransformInfo */ TTI);
  }
}

bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
                               const TargetTransformInfo &TTI) {
  Value *CopyLen = Memmove->getLength();
  Value *SrcAddr = Memmove->getRawSource();
  Value *DstAddr = Memmove->getRawDest();
  Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
  Align DstAlign = Memmove->getDestAlign().valueOrOne();
  bool SrcIsVolatile = Memmove->isVolatile();
  bool DstIsVolatile = SrcIsVolatile;
  IRBuilder<> CastBuilder(Memmove);

  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
  if (SrcAS != DstAS) {
    if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
      // We may not be able to emit a pointer comparison, but we don't have
      // to. Expand as memcpy.
      if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
        createMemCpyLoopKnownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
                                  CI, SrcAlign, DstAlign, SrcIsVolatile,
                                  DstIsVolatile,
                                  /*CanOverlap=*/false, TTI);
      } else {
        createMemCpyLoopUnknownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
                                    CopyLen, SrcAlign, DstAlign, SrcIsVolatile,
                                    DstIsVolatile,
                                    /*CanOverlap=*/false, TTI);
      }

      return true;
    }

    if (!(TTI.isValidAddrSpaceCast(DstAS, SrcAS) ||
          TTI.isValidAddrSpaceCast(SrcAS, DstAS))) {
      // We don't know generically if it's legal to introduce an
      // addrspacecast. We need to know either if it's legal to insert an
      // addrspacecast, or if the address spaces cannot alias.
      LLVM_DEBUG(
          dbgs() << "Do not know how to expand memmove between different "
                    "address spaces\n");
      return false;
    }
  }

  if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
    createMemMoveLoopKnownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CI, SrcAlign, DstAlign,
        SrcIsVolatile, DstIsVolatile, TTI);
  } else {
    createMemMoveLoopUnknownSize(
        /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign,
        DstAlign, SrcIsVolatile, DstIsVolatile, TTI);
  }
  return true;
}

void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
  createMemSetLoop(/* InsertBefore */ Memset,
                   /* DstAddr */ Memset->getRawDest(),
                   /* CopyLen */ Memset->getLength(),
                   /* SetValue */ Memset->getValue(),
                   /* Alignment */ Memset->getDestAlign().valueOrOne(),
                   Memset->isVolatile());
}

void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
  createMemSetLoop(/* InsertBefore=*/Memset,
                   /* DstAddr=*/Memset->getRawDest(),
                   /* CopyLen=*/Memset->getLength(),
                   /* SetValue=*/Memset->getValue(),
                   /* Alignment=*/Memset->getDestAlign().valueOrOne(),
                   Memset->isVolatile());
}

void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,
                                    const TargetTransformInfo &TTI,
                                    ScalarEvolution *SE) {
  assert(AtomicMemcpy->isAtomic());
  if (ConstantInt *CI = dyn_cast<ConstantInt>(AtomicMemcpy->getLength())) {
    createMemCpyLoopKnownSize(
        /* InsertBefore */ AtomicMemcpy,
        /* SrcAddr */ AtomicMemcpy->getRawSource(),
        /* DstAddr */ AtomicMemcpy->getRawDest(),
        /* CopyLen */ CI,
        /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(),
        /* DestAlign */ AtomicMemcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ AtomicMemcpy->isVolatile(),
        /* DstIsVolatile */ AtomicMemcpy->isVolatile(),
        /* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec.
        /* TargetTransformInfo */ TTI,
        /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes());
  } else {
    createMemCpyLoopUnknownSize(
        /* InsertBefore */ AtomicMemcpy,
        /* SrcAddr */ AtomicMemcpy->getRawSource(),
        /* DstAddr */ AtomicMemcpy->getRawDest(),
        /* CopyLen */ AtomicMemcpy->getLength(),
        /* SrcAlign */ AtomicMemcpy->getSourceAlign().valueOrOne(),
        /* DestAlign */ AtomicMemcpy->getDestAlign().valueOrOne(),
        /* SrcIsVolatile */ AtomicMemcpy->isVolatile(),
        /* DstIsVolatile */ AtomicMemcpy->isVolatile(),
        /* CanOverlap */ false, // SrcAddr & DstAddr may not overlap by spec.
        /* TargetTransformInfo */ TTI,
        /* AtomicCpySize */ AtomicMemcpy->getElementSizeInBytes());
  }
}