LLVM 23.0.0git
AggressiveInstCombine.cpp
Go to the documentation of this file.
1//===- AggressiveInstCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the aggressive expression pattern combiner classes.
10// Currently, it handles expression patterns for:
11// * Truncate instruction
12//
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/Statistic.h"
27#include "llvm/IR/DataLayout.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/Function.h"
30#include "llvm/IR/IRBuilder.h"
31#include "llvm/IR/Instruction.h"
32#include "llvm/IR/MDBuilder.h"
40
41using namespace llvm;
42using namespace PatternMatch;
43
44#define DEBUG_TYPE "aggressive-instcombine"
45
46namespace llvm {
48}
49
50STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
51STATISTIC(NumGuardedRotates,
52 "Number of guarded rotates transformed into funnel shifts");
53STATISTIC(NumGuardedFunnelShifts,
54 "Number of guarded funnel shifts transformed into funnel shifts");
55STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
56
58 "aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden,
59 cl::desc("Max number of instructions to scan for aggressive instcombine."));
60
62 "strncmp-inline-threshold", cl::init(3), cl::Hidden,
63 cl::desc("The maximum length of a constant string for a builtin string cmp "
64 "call eligible for inlining. The default value is 3."));
65
67 MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden,
68 cl::desc("The maximum length of a constant string to "
69 "inline a memchr call."));
70
71/// Match a pattern for a bitwise funnel/rotate operation that partially guards
72/// against undefined behavior by branching around the funnel-shift/rotation
73/// when the shift amount is 0.
75 if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
76 return false;
77
78 // As with the one-use checks below, this is not strictly necessary, but we
79 // are being cautious to avoid potential perf regressions on targets that
80 // do not actually have a funnel/rotate instruction (where the funnel shift
81 // would be expanded back into math/shift/logic ops).
82 if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
83 return false;
84
85 // Match V to funnel shift left/right and capture the source operands and
86 // shift amount.
87 auto matchFunnelShift = [](Value *V, Value *&ShVal0, Value *&ShVal1,
88 Value *&ShAmt) {
89 unsigned Width = V->getType()->getScalarSizeInBits();
90
91 // fshl(ShVal0, ShVal1, ShAmt)
92 // == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt))
93 if (match(V, m_OneUse(m_c_Or(
94 m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
95 m_LShr(m_Value(ShVal1), m_Sub(m_SpecificInt(Width),
96 m_Deferred(ShAmt))))))) {
97 return Intrinsic::fshl;
98 }
99
100 // fshr(ShVal0, ShVal1, ShAmt)
101 // == (ShVal0 >> ShAmt) | (ShVal1 << (Width - ShAmt))
102 if (match(V,
104 m_Value(ShAmt))),
105 m_LShr(m_Value(ShVal1), m_Deferred(ShAmt)))))) {
106 return Intrinsic::fshr;
107 }
108
110 };
111
112 // One phi operand must be a funnel/rotate operation, and the other phi
113 // operand must be the source value of that funnel/rotate operation:
114 // phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
115 // phi [ fshl(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal0, GuardBB ]
116 // phi [ fshr(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal1, GuardBB ]
117 PHINode &Phi = cast<PHINode>(I);
118 unsigned FunnelOp = 0, GuardOp = 1;
119 Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
120 Value *ShVal0, *ShVal1, *ShAmt;
121 Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
122 if (IID == Intrinsic::not_intrinsic ||
123 (IID == Intrinsic::fshl && ShVal0 != P1) ||
124 (IID == Intrinsic::fshr && ShVal1 != P1)) {
125 IID = matchFunnelShift(P1, ShVal0, ShVal1, ShAmt);
126 if (IID == Intrinsic::not_intrinsic ||
127 (IID == Intrinsic::fshl && ShVal0 != P0) ||
128 (IID == Intrinsic::fshr && ShVal1 != P0))
129 return false;
130 assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
131 "Pattern must match funnel shift left or right");
132 std::swap(FunnelOp, GuardOp);
133 }
134
135 // The incoming block with our source operand must be the "guard" block.
136 // That must contain a cmp+branch to avoid the funnel/rotate when the shift
137 // amount is equal to 0. The other incoming block is the block with the
138 // funnel/rotate.
139 BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
140 BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
141 Instruction *TermI = GuardBB->getTerminator();
142
143 // Ensure that the shift values dominate each block.
144 if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
145 return false;
146
147 BasicBlock *PhiBB = Phi.getParent();
149 m_ZeroInt()),
150 m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
151 return false;
152
153 IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
154
155 if (ShVal0 == ShVal1)
156 ++NumGuardedRotates;
157 else
158 ++NumGuardedFunnelShifts;
159
160 // If this is not a rotate then the select was blocking poison from the
161 // 'shift-by-zero' non-TVal, but a funnel shift won't - so freeze it.
162 bool IsFshl = IID == Intrinsic::fshl;
163 if (ShVal0 != ShVal1) {
164 if (IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal1))
165 ShVal1 = Builder.CreateFreeze(ShVal1);
166 else if (!IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal0))
167 ShVal0 = Builder.CreateFreeze(ShVal0);
168 }
169
170 // We matched a variation of this IR pattern:
171 // GuardBB:
172 // %cmp = icmp eq i32 %ShAmt, 0
173 // br i1 %cmp, label %PhiBB, label %FunnelBB
174 // FunnelBB:
175 // %sub = sub i32 32, %ShAmt
176 // %shr = lshr i32 %ShVal1, %sub
177 // %shl = shl i32 %ShVal0, %ShAmt
178 // %fsh = or i32 %shr, %shl
179 // br label %PhiBB
180 // PhiBB:
181 // %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
182 // -->
183 // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
184 Phi.replaceAllUsesWith(
185 Builder.CreateIntrinsic(IID, Phi.getType(), {ShVal0, ShVal1, ShAmt}));
186 return true;
187}
188
189/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
190/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
191/// of 'and' ops, then we also need to capture the fact that we saw an
192/// "and X, 1", so that's an extra return value for that case.
namespace {
struct MaskOps {
  // Common source value feeding every shift in the matched chain. Set by the
  // first leaf visited in matchAndOrChain(); every later leaf must match it.
  Value *Root = nullptr;
  // Accumulated bit indexes: bit N is set when an (X >> N) leaf — or the bare
  // X for N == 0 — was seen in the chain.
  APInt Mask;
  // True when matching a chain of 'and' ops; false for an 'or' chain.
  bool MatchAndChain;
  // Only meaningful for an 'and' chain: records that an "and X, 1" was seen,
  // which proves the high bits of the chain's result are cleared.
  bool FoundAnd1 = false;

  // BitWidth is the scalar width of the value being matched; the mask starts
  // with all bits clear.
  MaskOps(unsigned BitWidth, bool MatchAnds)
      : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {}
};
} // namespace
204
205/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
206/// chain of 'and' or 'or' instructions looking for shift ops of a common source
207/// value. Examples:
208/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
209/// returns { X, 0x129 }
210/// and (and (X >> 1), 1), (X >> 4)
211/// returns { X, 0x12 }
212static bool matchAndOrChain(Value *V, MaskOps &MOps) {
213 Value *Op0, *Op1;
214 if (MOps.MatchAndChain) {
215 // Recurse through a chain of 'and' operands. This requires an extra check
216 // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
217 // in the chain to know that all of the high bits are cleared.
218 if (match(V, m_And(m_Value(Op0), m_One()))) {
219 MOps.FoundAnd1 = true;
220 return matchAndOrChain(Op0, MOps);
221 }
222 if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
223 return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
224 } else {
225 // Recurse through a chain of 'or' operands.
226 if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
227 return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
228 }
229
230 // We need a shift-right or a bare value representing a compare of bit 0 of
231 // the original source operand.
232 Value *Candidate;
233 const APInt *BitIndex = nullptr;
234 if (!match(V, m_LShr(m_Value(Candidate), m_APInt(BitIndex))))
235 Candidate = V;
236
237 // Initialize result source operand.
238 if (!MOps.Root)
239 MOps.Root = Candidate;
240
241 // The shift constant is out-of-range? This code hasn't been simplified.
242 if (BitIndex && BitIndex->uge(MOps.Mask.getBitWidth()))
243 return false;
244
245 // Fill in the mask bit derived from the shift constant.
246 MOps.Mask.setBit(BitIndex ? BitIndex->getZExtValue() : 0);
247 return MOps.Root == Candidate;
248}
249
250/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
251/// These will include a chain of 'or' or 'and'-shifted bits from a
252/// common source value:
253/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
254/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
255/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
256/// that differ only with a final 'not' of the result. We expect that final
257/// 'not' to be folded with the compare that we create here (invert predicate).
259 // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
260 // final "and X, 1" instruction must be the final op in the sequence.
261 bool MatchAllBitsSet;
262 bool MatchTrunc;
263 Value *X;
264 if (I.getType()->isIntOrIntVectorTy(1)) {
265 if (match(&I, m_Trunc(m_OneUse(m_And(m_Value(), m_Value())))))
266 MatchAllBitsSet = true;
267 else if (match(&I, m_Trunc(m_OneUse(m_Or(m_Value(), m_Value())))))
268 MatchAllBitsSet = false;
269 else
270 return false;
271 MatchTrunc = true;
272 X = I.getOperand(0);
273 } else {
274 if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value()))) {
275 X = &I;
276 MatchAllBitsSet = true;
277 } else if (match(&I,
278 m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One()))) {
279 X = I.getOperand(0);
280 MatchAllBitsSet = false;
281 } else
282 return false;
283 MatchTrunc = false;
284 }
285 Type *Ty = X->getType();
286
287 MaskOps MOps(Ty->getScalarSizeInBits(), MatchAllBitsSet);
288 if (!matchAndOrChain(X, MOps) ||
289 (MatchAllBitsSet && !MatchTrunc && !MOps.FoundAnd1))
290 return false;
291
292 // The pattern was found. Create a masked compare that replaces all of the
293 // shift and logic ops.
294 IRBuilder<> Builder(&I);
295 Constant *Mask = ConstantInt::get(Ty, MOps.Mask);
296 Value *And = Builder.CreateAnd(MOps.Root, Mask);
297 Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
298 : Builder.CreateIsNotNull(And);
299 Value *Zext = MatchTrunc ? Cmp : Builder.CreateZExt(Cmp, Ty);
300 I.replaceAllUsesWith(Zext);
301 ++NumAnyOrAllBitsSet;
302 return true;
303}
304
305// Try to recognize below function as popcount intrinsic.
306// This is the "best" algorithm from
307// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
308// Also used in TargetLowering::expandCTPOP().
309//
310// int popcount(unsigned int i) {
311// i = i - ((i >> 1) & 0x55555555);
312// i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
313// i = ((i + (i >> 4)) & 0x0F0F0F0F);
314// return (i * 0x01010101) >> 24;
315// }
317 if (I.getOpcode() != Instruction::LShr)
318 return false;
319
320 Type *Ty = I.getType();
321 if (!Ty->isIntOrIntVectorTy())
322 return false;
323
324 unsigned Len = Ty->getScalarSizeInBits();
325 // FIXME: fix Len == 8 and other irregular type lengths.
326 if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
327 return false;
328
329 APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
330 APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
331 APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
332 APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
333 APInt MaskShift = APInt(Len, Len - 8);
334
335 Value *Op0 = I.getOperand(0);
336 Value *Op1 = I.getOperand(1);
337 Value *MulOp0;
338 // Matching "(i * 0x01010101...) >> 24".
339 if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
341 Value *ShiftOp0;
342 // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
343 if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
344 m_Deferred(ShiftOp0)),
345 m_SpecificInt(Mask0F)))) {
346 Value *AndOp0;
347 // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
348 if (match(ShiftOp0,
349 m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
351 m_SpecificInt(Mask33))))) {
352 Value *Root, *SubOp1;
353 // Matching "i - ((i >> 1) & 0x55555555...)".
354 const APInt *AndMask;
355 if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
356 match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
357 m_APInt(AndMask)))) {
358 auto CheckAndMask = [&]() {
359 if (*AndMask == Mask55)
360 return true;
361
362 // Exact match failed, see if any bits are known to be 0 where we
363 // expect a 1 in the mask.
364 if (!AndMask->isSubsetOf(Mask55))
365 return false;
366
367 APInt NeededMask = Mask55 & ~*AndMask;
368 return MaskedValueIsZero(cast<Instruction>(SubOp1)->getOperand(0),
369 NeededMask,
370 SimplifyQuery(I.getDataLayout()));
371 };
372
373 if (CheckAndMask()) {
374 LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
375 IRBuilder<> Builder(&I);
376 I.replaceAllUsesWith(
377 Builder.CreateIntrinsic(Intrinsic::ctpop, I.getType(), {Root}));
378 ++NumPopCountRecognized;
379 return true;
380 }
381 }
382 }
383 }
384 }
385
386 return false;
387}
388
389/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), providing C1 and
390/// C2 saturate the value of the fp conversion. The transform is not reversable
391/// as the fptosi.sat is more defined than the input - all values produce a
392/// valid value for the fptosi.sat, where as some produce poison for original
393/// that were out of range of the integer conversion. The reversed pattern may
394/// use fmax and fmin instead. As we cannot directly reverse the transform, and
395/// it is not always profitable, we make it conditional on the cost being
396/// reported as lower by TTI.
398 // Look for min(max(fptosi, converting to fptosi_sat.
399 Value *In;
400 const APInt *MinC, *MaxC;
402 m_APInt(MinC))),
403 m_APInt(MaxC))) &&
405 m_APInt(MaxC))),
406 m_APInt(MinC))))
407 return false;
408
409 // Check that the constants clamp a saturate.
410 if (!(*MinC + 1).isPowerOf2() || -*MaxC != *MinC + 1)
411 return false;
412
413 Type *IntTy = I.getType();
414 Type *FpTy = In->getType();
415 Type *SatTy =
416 IntegerType::get(IntTy->getContext(), (*MinC + 1).exactLogBase2() + 1);
417 if (auto *VecTy = dyn_cast<VectorType>(IntTy))
418 SatTy = VectorType::get(SatTy, VecTy->getElementCount());
419
420 // Get the cost of the intrinsic, and check that against the cost of
421 // fptosi+smin+smax
422 InstructionCost SatCost = TTI.getIntrinsicInstrCost(
423 IntrinsicCostAttributes(Intrinsic::fptosi_sat, SatTy, {In}, {FpTy}),
425 SatCost += TTI.getCastInstrCost(Instruction::SExt, IntTy, SatTy,
428
429 InstructionCost MinMaxCost = TTI.getCastInstrCost(
430 Instruction::FPToSI, IntTy, FpTy, TTI::CastContextHint::None,
432 MinMaxCost += TTI.getIntrinsicInstrCost(
433 IntrinsicCostAttributes(Intrinsic::smin, IntTy, {IntTy}),
435 MinMaxCost += TTI.getIntrinsicInstrCost(
436 IntrinsicCostAttributes(Intrinsic::smax, IntTy, {IntTy}),
438
439 if (SatCost >= MinMaxCost)
440 return false;
441
442 IRBuilder<> Builder(&I);
443 Value *Sat =
444 Builder.CreateIntrinsic(Intrinsic::fptosi_sat, {SatTy, FpTy}, In);
445 I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy));
446 return true;
447}
448
449/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
450/// pessimistic codegen that has to account for setting errno and can enable
451/// vectorization.
452static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
454 DominatorTree &DT) {
455 // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created
456 // (because NNAN or the operand arg must not be less than -0.0) and (2) we
457 // would not end up lowering to a libcall anyway (which could change the value
458 // of errno), then:
459 // (1) errno won't be set.
460 // (2) it is safe to convert this to an intrinsic call.
461 Type *Ty = Call->getType();
462 Value *Arg = Call->getArgOperand(0);
463 if (TTI.haveFastSqrt(Ty) &&
464 (Call->hasNoNaNs() ||
466 Arg, SimplifyQuery(Call->getDataLayout(), &TLI, &DT, &AC, Call)))) {
467 IRBuilder<> Builder(Call);
468 Value *NewSqrt =
469 Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg, Call, "sqrt");
470 Call->replaceAllUsesWith(NewSqrt);
471
472 // Explicitly erase the old call because a call with side effects is not
473 // trivially dead.
474 Call->eraseFromParent();
475 return true;
476 }
477
478 return false;
479}
480
481// Check if this array of constants represents a cttz table.
482// Iterate over the elements from \p Table by trying to find/match all
483// the numbers from 0 to \p InputBits that should represent cttz results.
484static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
485 const APInt &AndMask, Type *AccessTy,
486 unsigned InputBits, const APInt &GEPIdxFactor,
487 const DataLayout &DL) {
488 for (unsigned Idx = 0; Idx < InputBits; Idx++) {
489 APInt Index = (APInt(InputBits, 1).shl(Idx) * Mul).lshr(Shift) & AndMask;
491 ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
492 if (!C || C->getValue() != Idx)
493 return false;
494 }
495
496 return true;
497}
498
499// Try to recognize table-based ctz implementation.
500// E.g., an example in C (for more cases please see the llvm/tests):
501// int f(unsigned x) {
502// static const char table[32] =
503// {0, 1, 28, 2, 29, 14, 24, 3, 30,
504// 22, 20, 15, 25, 17, 4, 8, 31, 27,
505// 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
506// return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
507// }
508// this can be lowered to `cttz` instruction.
509// There is also a special case when the element is 0.
510//
511// The (x & -x) sets the lowest non-zero bit to 1. The multiply is a de-bruijn
512// sequence that contains each pattern of bits in it. The shift extracts
513// the top bits after the multiply, and that index into the table should
514// represent the number of trailing zeros in the original number.
515//
516// Here are some examples or LLVM IR for a 64-bit target:
517//
518// CASE 1:
519// %sub = sub i32 0, %x
520// %and = and i32 %sub, %x
521// %mul = mul i32 %and, 125613361
522// %shr = lshr i32 %mul, 27
523// %idxprom = zext i32 %shr to i64
524// %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table, i64 0,
525// i64 %idxprom
526// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
527//
528// CASE 2:
529// %sub = sub i32 0, %x
530// %and = and i32 %sub, %x
531// %mul = mul i32 %and, 72416175
532// %shr = lshr i32 %mul, 26
533// %idxprom = zext i32 %shr to i64
534// %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
535// i64 0, i64 %idxprom
536// %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
537//
538// CASE 3:
539// %sub = sub i32 0, %x
540// %and = and i32 %sub, %x
541// %mul = mul i32 %and, 81224991
542// %shr = lshr i32 %mul, 27
543// %idxprom = zext i32 %shr to i64
544// %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
545// i64 0, i64 %idxprom
546// %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
547//
548// CASE 4:
549// %sub = sub i64 0, %x
550// %and = and i64 %sub, %x
551// %mul = mul i64 %and, 283881067100198605
552// %shr = lshr i64 %mul, 58
553// %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
554// i64 %shr
555// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
556//
557// All these can be lowered to @llvm.cttz.i32/64 intrinsics.
560 if (!LI)
561 return false;
562
563 Type *AccessType = LI->getType();
564 if (!AccessType->isIntegerTy())
565 return false;
566
568 if (!GEP || !GEP->hasNoUnsignedSignedWrap())
569 return false;
570
571 GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
572 if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
573 return false;
574
575 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
576 APInt ModOffset(BW, 0);
578 if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
579 VarOffsets.size() != 1 || ModOffset != 0)
580 return false;
581 auto [GepIdx, GEPScale] = VarOffsets.front();
582
583 Value *X1;
584 const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
585 // Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
586 // This might be extended to the pointer index type, and if the gep index type
587 // has been replaced with an i8 then a new And (and different ShiftConst) will
588 // be present.
589 auto MatchInner = m_LShr(
590 m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)), m_APInt(MulConst)),
591 m_APInt(ShiftConst));
592 if (!match(GepIdx, m_CastOrSelf(MatchInner)) &&
593 !match(GepIdx, m_CastOrSelf(m_And(MatchInner, m_APInt(AndCst)))))
594 return false;
595
596 unsigned InputBits = X1->getType()->getScalarSizeInBits();
597 if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
598 return false;
599
600 if (!GEPScale.isIntN(InputBits) ||
601 !isCTTZTable(GVTable->getInitializer(), *MulConst, *ShiftConst,
602 AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
603 InputBits, GEPScale.zextOrTrunc(InputBits), DL))
604 return false;
605
606 ConstantInt *ZeroTableElem = cast<ConstantInt>(
607 ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
608 bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;
609
610 IRBuilder<> B(LI);
611 ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
612 Type *XType = X1->getType();
613 auto Cttz = B.CreateIntrinsic(Intrinsic::cttz, {XType}, {X1, BoolConst});
614 Value *ZExtOrTrunc = nullptr;
615
616 if (DefinedForZero) {
617 ZExtOrTrunc = B.CreateZExtOrTrunc(Cttz, AccessType);
618 } else {
619 // If the value in elem 0 isn't the same as InputBits, we still want to
620 // produce the value from the table.
621 auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
622 auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);
623
624 // The true branch of select handles the cttz(0) case, which is rare.
627 SelectI->setMetadata(
628 LLVMContext::MD_prof,
629 MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
630 }
631
632 // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
633 // it should be handled as: `cttz(x) & (typeSize - 1)`.
634
635 ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
636 }
637
638 LI->replaceAllUsesWith(ZExtOrTrunc);
639
640 return true;
641}
642
643// Check if this array of constants represents a log2 table.
644// Iterate over the elements from \p Table by trying to find/match all
645// the numbers from 0 to \p InputBits that should represent log2 results.
646static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
647 Type *AccessTy, unsigned InputBits,
648 const APInt &GEPIdxFactor, const DataLayout &DL) {
649 for (unsigned Idx = 0; Idx < InputBits; Idx++) {
650 APInt Index = (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift);
652 ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
653 if (!C || C->getValue() != Idx)
654 return false;
655 }
656
657 // Verify that an input of zero will select table index 0.
658 APInt ZeroIndex = Mul.lshr(Shift);
659 if (!ZeroIndex.isZero())
660 return false;
661
662 return true;
663}
664
665// Try to recognize table-based log2 implementation.
666// E.g., an example in C (for more cases please see the llvm/tests):
667// int f(unsigned v) {
668// static const char table[32] =
669// {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
670// 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
671//
672// v |= v >> 1; // first round down to one less than a power of 2
673// v |= v >> 2;
674// v |= v >> 4;
675// v |= v >> 8;
676// v |= v >> 16;
677//
678// return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
679// }
680// this can be lowered to `ctlz` instruction.
681// There is also a special case when the element is 0.
682//
683// The >> and |= sequence sets all bits below the most significant set bit. The
684// multiply is a de-bruijn sequence that contains each pattern of bits in it.
685// The shift extracts the top bits after the multiply, and that index into the
686// table should represent the floor log base 2 of the original number.
687//
688// Here are some examples of LLVM IR for a 64-bit target.
689//
690// CASE 1:
691// %shr = lshr i32 %v, 1
692// %or = or i32 %shr, %v
693// %shr1 = lshr i32 %or, 2
694// %or2 = or i32 %shr1, %or
695// %shr3 = lshr i32 %or2, 4
696// %or4 = or i32 %shr3, %or2
697// %shr5 = lshr i32 %or4, 8
698// %or6 = or i32 %shr5, %or4
699// %shr7 = lshr i32 %or6, 16
700// %or8 = or i32 %shr7, %or6
701// %mul = mul i32 %or8, 130329821
702// %shr9 = lshr i32 %mul, 27
703// %idxprom = zext nneg i32 %shr9 to i64
704// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %idxprom
705// %0 = load i8, ptr %arrayidx, align 1
706//
707// CASE 2:
708// %shr = lshr i64 %v, 1
709// %or = or i64 %shr, %v
710// %shr1 = lshr i64 %or, 2
711// %or2 = or i64 %shr1, %or
712// %shr3 = lshr i64 %or2, 4
713// %or4 = or i64 %shr3, %or2
714// %shr5 = lshr i64 %or4, 8
715// %or6 = or i64 %shr5, %or4
716// %shr7 = lshr i64 %or6, 16
717// %or8 = or i64 %shr7, %or6
718// %shr9 = lshr i64 %or8, 32
719// %or10 = or i64 %shr9, %or8
720// %mul = mul i64 %or10, 285870213051386505
721// %shr11 = lshr i64 %mul, 58
722// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
723// %0 = load i8, ptr %arrayidx, align 1
724//
725// All these can be lowered to @llvm.ctlz.i32/64 intrinsics and a subtract.
729 if (!LI)
730 return false;
731
732 Type *AccessType = LI->getType();
733 if (!AccessType->isIntegerTy())
734 return false;
735
737 if (!GEP || !GEP->hasNoUnsignedSignedWrap())
738 return false;
739
740 GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
741 if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
742 return false;
743
744 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
745 APInt ModOffset(BW, 0);
747 if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
748 VarOffsets.size() != 1 || ModOffset != 0)
749 return false;
750 auto [GepIdx, GEPScale] = VarOffsets.front();
751
752 Value *X;
753 const APInt *MulConst, *ShiftConst;
754 // Check that the gep variable index is (x * MulConst) >> ShiftConst.
755 auto MatchInner =
756 m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
757 if (!match(GepIdx, m_CastOrSelf(MatchInner)))
758 return false;
759
760 unsigned InputBits = X->getType()->getScalarSizeInBits();
761 if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
762 return false;
763
764 // Verify shift amount.
765 // TODO: Allow other shift amounts when we have proper test coverage.
766 if (*ShiftConst != InputBits - Log2_32(InputBits))
767 return false;
768
769 // Match the sequence of OR operations with right shifts by powers of 2.
770 for (unsigned ShiftAmt = InputBits / 2; ShiftAmt != 0; ShiftAmt /= 2) {
771 Value *Y;
772 if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(ShiftAmt)),
773 m_Deferred(Y))))
774 return false;
775 X = Y;
776 }
777
778 if (!GEPScale.isIntN(InputBits) ||
779 !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
780 AccessType, InputBits, GEPScale.zextOrTrunc(InputBits), DL))
781 return false;
782
783 ConstantInt *ZeroTableElem = cast<ConstantInt>(
784 ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
785
786 // Use InputBits - 1 - ctlz(X) to compute log2(X).
787 IRBuilder<> B(LI);
788 ConstantInt *BoolConst = B.getTrue();
789 Type *XType = X->getType();
790
791 // Check that the backend has an efficient ctlz instruction.
792 // FIXME: Teach the backend to emit the original code when ctlz isn't
793 // supported like we do for cttz.
795 Intrinsic::ctlz, XType,
796 {PoisonValue::get(XType), /*is_zero_poison=*/BoolConst});
797 InstructionCost Cost =
798 TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
800 return false;
801
802 Value *Ctlz = B.CreateIntrinsic(Intrinsic::ctlz, {XType}, {X, BoolConst});
803
804 Constant *InputBitsM1 = ConstantInt::get(XType, InputBits - 1);
805 Value *Sub = B.CreateSub(InputBitsM1, Ctlz);
806
807 // The table won't produce a sensible result for 0.
808 Value *Cmp = B.CreateICmpEQ(X, ConstantInt::get(XType, 0));
809 Value *Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Sub);
810
811 // The true branch of select handles the log2(0) case, which is rare.
814 SelectI->setMetadata(
815 LLVMContext::MD_prof,
816 MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
817 }
818
819 Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
820
821 LI->replaceAllUsesWith(ZExtOrTrunc);
822
823 return true;
824}
825
826/// This is used by foldLoadsRecursive() to capture a Root Load node which is
827/// of type or(load, load) and recursively build the wide load. Also capture the
828/// shift amount, zero extend type and loadSize.
838
839// Identify and Merge consecutive loads recursively which is of the form
840// (ZExt(L1) << shift1) | (ZExt(L2) << shift2) -> ZExt(L3) << shift1
841// (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3)
842static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
843 AliasAnalysis &AA, bool IsRoot = false) {
844 uint64_t ShAmt2;
845 Value *X;
846 Instruction *L1, *L2;
847
848 // For the root instruction, allow multiple uses since the final result
849 // may legitimately be used in multiple places. For intermediate values,
850 // require single use to avoid creating duplicate loads.
851 if (!IsRoot && !V->hasOneUse())
852 return false;
853
854 if (!match(V, m_c_Or(m_Value(X),
856 ShAmt2)))))
857 return false;
858
859 if (!foldLoadsRecursive(X, LOps, DL, AA, /*IsRoot=*/false) && LOps.FoundRoot)
860 // Avoid Partial chain merge.
861 return false;
862
863 // Check if the pattern has loads
864 LoadInst *LI1 = LOps.Root;
865 uint64_t ShAmt1 = LOps.Shift;
866 if (LOps.FoundRoot == false &&
868 m_ShlOrSelf(m_OneUse(m_ZExt(m_Instruction(L1))), ShAmt1)))) {
869 LI1 = dyn_cast<LoadInst>(L1);
870 }
871 LoadInst *LI2 = dyn_cast<LoadInst>(L2);
872
873 // Check if loads are same, atomic, volatile and having same address space.
874 if (LI1 == LI2 || !LI1 || !LI2 || !LI1->isSimple() || !LI2->isSimple() ||
876 return false;
877
878 // Check if Loads come from same BB.
879 if (LI1->getParent() != LI2->getParent())
880 return false;
881
882 // Find the data layout
883 bool IsBigEndian = DL.isBigEndian();
884
885 // Check if loads are consecutive and same size.
886 Value *Load1Ptr = LI1->getPointerOperand();
887 APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
888 Load1Ptr =
889 Load1Ptr->stripAndAccumulateConstantOffsets(DL, Offset1,
890 /* AllowNonInbounds */ true);
891
892 Value *Load2Ptr = LI2->getPointerOperand();
893 APInt Offset2(DL.getIndexTypeSizeInBits(Load2Ptr->getType()), 0);
894 Load2Ptr =
895 Load2Ptr->stripAndAccumulateConstantOffsets(DL, Offset2,
896 /* AllowNonInbounds */ true);
897
898 // Verify if both loads have same base pointers
899 uint64_t LoadSize1 = LI1->getType()->getPrimitiveSizeInBits();
900 uint64_t LoadSize2 = LI2->getType()->getPrimitiveSizeInBits();
901 if (Load1Ptr != Load2Ptr)
902 return false;
903
904 // Make sure that there are no padding bits.
905 if (!DL.typeSizeEqualsStoreSize(LI1->getType()) ||
906 !DL.typeSizeEqualsStoreSize(LI2->getType()))
907 return false;
908
909 // Alias Analysis to check for stores b/w the loads.
910 LoadInst *Start = LOps.FoundRoot ? LOps.RootInsert : LI1, *End = LI2;
912 if (!Start->comesBefore(End)) {
913 std::swap(Start, End);
914 // If LOps.RootInsert comes after LI2, since we use LI2 as the new insert
915 // point, we should make sure whether the memory region accessed by LOps
916 // isn't modified.
917 if (LOps.FoundRoot)
919 LOps.Root->getPointerOperand(),
920 LocationSize::precise(DL.getTypeStoreSize(
921 IntegerType::get(LI1->getContext(), LOps.LoadSize))),
922 LOps.AATags);
923 else
925 } else
927 unsigned NumScanned = 0;
928 for (Instruction &Inst :
929 make_range(Start->getIterator(), End->getIterator())) {
930 if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc)))
931 return false;
932
933 if (++NumScanned > MaxInstrsToScan)
934 return false;
935 }
936
937 // Make sure Load with lower Offset is at LI1
938 bool Reverse = false;
939 if (Offset2.slt(Offset1)) {
940 std::swap(LI1, LI2);
941 std::swap(ShAmt1, ShAmt2);
942 std::swap(Offset1, Offset2);
943 std::swap(Load1Ptr, Load2Ptr);
944 std::swap(LoadSize1, LoadSize2);
945 Reverse = true;
946 }
947
948 // Big endian swap the shifts
949 if (IsBigEndian)
950 std::swap(ShAmt1, ShAmt2);
951
952 // First load is always LI1. This is where we put the new load.
953 // Use the merged load size available from LI1 for forward loads.
954 if (LOps.FoundRoot) {
955 if (!Reverse)
956 LoadSize1 = LOps.LoadSize;
957 else
958 LoadSize2 = LOps.LoadSize;
959 }
960
961 // Verify if shift amount and load index aligns and verifies that loads
962 // are consecutive.
963 uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
964 uint64_t PrevSize =
965 DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1));
966 if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
967 return false;
968
969 // Update LOps
970 AAMDNodes AATags1 = LOps.AATags;
971 AAMDNodes AATags2 = LI2->getAAMetadata();
972 if (LOps.FoundRoot == false) {
973 LOps.FoundRoot = true;
974 AATags1 = LI1->getAAMetadata();
975 }
976 LOps.LoadSize = LoadSize1 + LoadSize2;
977 LOps.RootInsert = Start;
978
979 // Concatenate the AATags of the Merged Loads.
980 LOps.AATags = AATags1.concat(AATags2);
981
982 LOps.Root = LI1;
983 LOps.Shift = ShAmt1;
984 LOps.ZextType = X->getType();
985 return true;
986}
987
 988// For a given BB instruction, evaluate all loads in the chain that form a
 989// pattern which suggests that the loads can be combined. The one and only use
 990// of the loads is to form a wider load.
// NOTE(review): doc-extraction gap — original lines 991-992 (the function
// signature, `static bool foldConsecutiveLoads(...)`) are missing here.
// Returns true if the chain rooted at I was replaced by a single wider load.
 993                                 const DominatorTree &DT) {
 994  // Only consider load chains of scalar values.
 995  if (isa<VectorType>(I.getType()))
 996    return false;
 997
// Recursively match the or/shl/zext chain; bail unless a full load chain
// (FoundRoot) was discovered.
 998  LoadOps LOps;
 999  if (!foldLoadsRecursive(&I, LOps, DL, AA, /*IsRoot=*/true) || !LOps.FoundRoot)
1000    return false;
1001
1002  IRBuilder<> Builder(&I);
1003  LoadInst *NewLoad = nullptr, *LI1 = LOps.Root;
1004
1005  IntegerType *WiderType = IntegerType::get(I.getContext(), LOps.LoadSize);
1006  // TTI based checks if we want to proceed with wider load
1007  bool Allowed = TTI.isTypeLegal(WiderType);
1008  if (!Allowed)
1009    return false;
1010
// Only fold when the target supports a fast misaligned access of the merged
// width in this address space.
1011  unsigned AS = LI1->getPointerAddressSpace();
1012  unsigned Fast = 0;
1013  Allowed = TTI.allowsMisalignedMemoryAccesses(I.getContext(), LOps.LoadSize,
1014                                               AS, LI1->getAlign(), &Fast);
1015  if (!Allowed || !Fast)
1016    return false;
1017
1018  // Get the Index and Ptr for the new GEP.
// If the root pointer does not dominate the insert point, rebase onto the
// stripped base pointer plus the accumulated constant offset instead.
1019  Value *Load1Ptr = LI1->getPointerOperand();
1020  Builder.SetInsertPoint(LOps.RootInsert);
1021  if (!DT.dominates(Load1Ptr, LOps.RootInsert)) {
1022    APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
1023    Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
1024        DL, Offset1, /* AllowNonInbounds */ true);
1025    Load1Ptr = Builder.CreatePtrAdd(Load1Ptr, Builder.getInt(Offset1));
1026  }
1027  // Generate wider load.
1028  NewLoad = Builder.CreateAlignedLoad(WiderType, Load1Ptr, LI1->getAlign(),
1029                                      LI1->isVolatile(), "");
1030  NewLoad->takeName(LI1);
1031  // Set the New Load AATags Metadata.
1032  if (LOps.AATags)
1033    NewLoad->setAAMetadata(LOps.AATags);
1034
1035  Value *NewOp = NewLoad;
1036  // Check if zero extend needed.
1037  if (LOps.ZextType)
1038    NewOp = Builder.CreateZExt(NewOp, LOps.ZextType);
1039
1040  // Check if shift needed. We need to shift with the amount of load1
1041  // shift if not zero.
1042  if (LOps.Shift)
1043    NewOp = Builder.CreateShl(NewOp, LOps.Shift);
1044  I.replaceAllUsesWith(NewOp);
1045
1046  return true;
1047}
1048
1049/// ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
// NOTE(review): doc-extraction gap — original lines 1050-1056 (the `struct
// PartStore {` header and its data members: PtrBase, PtrOffset, Val,
// ValOffset, ValWidth, Store — inferred from the brace-init in matchPartStore
// below; confirm against the real source) are missing here.
1057
// Two part-stores may be merged only if they store pieces of the same wide
// value through the same base pointer.
1058  bool isCompatibleWith(const PartStore &Other) const {
1059    return PtrBase == Other.PtrBase && Val == Other.Val;
1060  }
1061
// Order part-stores by (signed) pointer offset so a sorted run is laid out
// in increasing memory order.
1062  bool operator<(const PartStore &Other) const {
1063    return PtrOffset.slt(Other.PtrOffset);
1064  }
1065};
1066
// Match a simple integer store of `trunc(Val >> ValOffset)` (shift optional)
// and describe it as a PartStore: which bits of which value land at which
// pointer offset. Returns std::nullopt if I is not such a store.
1067static std::optional<PartStore> matchPartStore(Instruction &I,
1068                                               const DataLayout &DL) {
1069  auto *Store = dyn_cast<StoreInst>(&I);
1070  if (!Store || !Store->isSimple())
1071    return std::nullopt;
1072
// Only integer stores without padding bits are candidates.
1073  Value *StoredVal = Store->getValueOperand();
1074  Type *StoredTy = StoredVal->getType();
1075  if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(StoredTy))
1076    return std::nullopt;
1077
1078  uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
1079  uint64_t ValOffset;
1080  Value *Val;
1081  if (!match(StoredVal, m_Trunc(m_LShrOrSelf(m_Value(Val), ValOffset))))
1082    return std::nullopt;
1083
1084  Value *Ptr = Store->getPointerOperand();
1085  APInt PtrOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
// NOTE(review): doc-extraction gap — original line 1086 (presumably
// `Value *PtrBase = Ptr->stripAndAccumulateConstantOffsets(` — confirm) is
// missing here; line 1087 is its continuation.
1087      DL, PtrOffset, /*AllowNonInbounds=*/true);
1088  return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
1089}
1090
// Merge a run of consecutive, consistent part-stores into one store of
// `Width` bits, provided the target considers the wide type legal and fast.
// NOTE(review): doc-extraction gap — original lines 1091 and 1093 (the rest of
// this overload's signature, taking the Parts range and TargetTransformInfo)
// are missing here.
1092                            unsigned Width, const DataLayout &DL,
1094  if (Parts.size() < 2)
1095    return false;
1096
1097  // Check whether combining the stores is profitable.
1098  // FIXME: We could generate smaller stores if we can't produce a large one.
1099  const PartStore &First = Parts.front();
1100  LLVMContext &Ctx = First.Store->getContext();
1101  Type *NewTy = Type::getIntNTy(Ctx, Width);
1102  unsigned Fast = 0;
1103  if (!TTI.isTypeLegal(NewTy) ||
1104      !TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
1105                                          First.Store->getPointerAddressSpace(),
1106                                          First.Store->getAlign(), &Fast) ||
1107      !Fast)
1108    return false;
1109
1110  // Generate the combined store.
// The combined value is the source value shifted down to the first part's
// bit offset, then resized to the merged width.
1111  IRBuilder<> Builder(First.Store);
1112  Value *Val = First.Val;
1113  if (First.ValOffset != 0)
1114    Val = Builder.CreateLShr(Val, First.ValOffset);
1115  Val = Builder.CreateZExtOrTrunc(Val, NewTy);
1116  StoreInst *Store = Builder.CreateAlignedStore(
1117      Val, First.Store->getPointerOperand(), First.Store->getAlign());
1118
1119  // Merge various metadata onto the new store.
// AA tags are intersected via concat; debug locations and DIAssignID are
// merged so the combined store remains attributable to all originals.
1120  AAMDNodes AATags = First.Store->getAAMetadata();
1121  SmallVector<Instruction *> Stores = {First.Store};
1122  Stores.reserve(Parts.size());
1123  SmallVector<DebugLoc> DbgLocs = {First.Store->getDebugLoc()};
1124  DbgLocs.reserve(Parts.size());
1125  for (const PartStore &Part : drop_begin(Parts)) {
1126    AATags = AATags.concat(Part.Store->getAAMetadata());
1127    Stores.push_back(Part.Store);
1128    DbgLocs.push_back(Part.Store->getDebugLoc());
1129  }
1130  Store->setAAMetadata(AATags);
1131  Store->mergeDIAssignID(Stores);
1132  Store->setDebugLoc(DebugLoc::getMergedLocations(DbgLocs));
1133
1134  // Remove the old stores.
1135  for (const PartStore &Part : Parts)
1136    Part.Store->eraseFromParent();
1137
1138  return true;
1139}
1140
// Partition a collection of part-stores of the same value/base into maximal
// runs whose pointer offsets and value offsets line up contiguously, and
// merge each run via the range overload above.
// NOTE(review): doc-extraction gap — original lines 1141-1142 (this overload's
// signature, taking the Parts vector, DataLayout and TargetTransformInfo) are
// missing here.
1143  if (Parts.size() < 2)
1144    return false;
1145
1146  // We now have multiple parts of the same value stored to the same pointer.
1147  // Sort the parts by pointer offset, and make sure they are consistent with
1148  // the value offsets. Also check that the value is fully covered without
1149  // overlaps.
1150  bool Changed = false;
1151  llvm::sort(Parts);
1152  int64_t LastEndOffsetFromFirst = 0;
1153  const PartStore *First = &Parts[0];
1154  for (const PartStore &Part : Parts) {
1155    APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
1156    int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
// A part breaks the current run if its byte distance from the run head does
// not equal its bit distance / 8, or if it does not start exactly where the
// previous part ended (gap or overlap). Flush the run and start a new one.
1157    if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
1158        LastEndOffsetFromFirst != ValOffsetFromFirst) {
// NOTE(review): doc-extraction gap — original line 1159 (presumably
// `Changed |= mergePartStoresImpl/mergePartStores(<run>,` — confirm) is
// missing here; line 1160 is its continuation.
1160                                LastEndOffsetFromFirst, DL, TTI);
1161      First = &Part;
1162      LastEndOffsetFromFirst = Part.ValWidth;
1163      continue;
1164    }
1165
1166    LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
1167  }
1168
// Flush the final run.
// NOTE(review): doc-extraction gap — original line 1169 (the matching merge
// call for the trailing run) is missing here; line 1170 is its continuation.
1170                              LastEndOffsetFromFirst, DL, TTI);
1171  return Changed;
1172}
1173
// Scan a basic block for compatible part-stores and merge each batch.
// NOTE(review): doc-extraction gap — original lines 1174-1175 (this function's
// signature, `static bool foldConsecutiveStores(BasicBlock &BB, ...)`) are
// missing here.
1176  // FIXME: Add big endian support.
1177  if (DL.isBigEndian())
1178    return false;
1179
1180  BatchAAResults BatchAA(AA);
// NOTE(review): doc-extraction gap — original line 1181 (presumably the
// `SmallVector<PartStore, N> Parts;` declaration — confirm) is missing here.
1182  bool MadeChange = false;
1183  for (Instruction &I : make_early_inc_range(BB)) {
1184    if (std::optional<PartStore> Part = matchPartStore(I, DL)) {
// Accumulate while the new part targets the same value/base as the batch;
// otherwise flush the batch and start a fresh one with this part.
1185      if (Parts.empty() || Part->isCompatibleWith(Parts[0])) {
1186        Parts.push_back(std::move(*Part));
1187        continue;
1188      }
1189
1190      MadeChange |= mergePartStores(Parts, DL, TTI);
1191      Parts.clear();
1192      Parts.push_back(std::move(*Part));
1193      continue;
1194    }
1195
1196    if (Parts.empty())
1197      continue;
1198
// Any instruction that may throw, or may alias the batch's memory, acts as a
// barrier: flush what we have so merged stores never cross it.
1199    if (I.mayThrow() ||
1200        (I.mayReadOrWriteMemory() &&
// NOTE(review): doc-extraction gap — original line 1201 (presumably
// `isModOrRefSet(BatchAA.getModRefInfo(` — confirm) is missing here.
1202             &I, MemoryLocation::getBeforeOrAfter(Parts[0].PtrBase))))) {
1203      MadeChange |= mergePartStores(Parts, DL, TTI);
1204      Parts.clear();
1205      continue;
1206    }
1207  }
1208
// Flush any batch still pending at the end of the block.
1209  MadeChange |= mergePartStores(Parts, DL, TTI);
1210  return MadeChange;
1211}
1212
1213/// Combine away instructions providing they are still equivalent when compared
1214/// against 0. i.e do they have any bits set.
// Walks a one-use `or` chain and strips nuw/nsw shl operands (a nuw/nsw shift
// cannot discard set bits relative to a comparison with zero). Returns the
// simplified replacement value, or nullptr if nothing changed.
// NOTE(review): doc-extraction gap — original line 1215 (this function's
// signature, `static Value *optimizeShiftInOrChain(Value *V, IRBuilder<> &...)`)
// is missing here.
1216  auto *I = dyn_cast<Instruction>(V);
1217  if (!I || I->getOpcode() != Instruction::Or || !I->hasOneUse())
1218    return nullptr;
1219
1220  Value *A;
1221
1222  // Look deeper into the chain of or's, combining away shl (so long as they are
1223  // nuw or nsw).
1224  Value *Op0 = I->getOperand(0);
1225  if (match(Op0, m_CombineOr(m_NSWShl(m_Value(A), m_Value()),
1226                             m_NUWShl(m_Value(A), m_Value()))))
1227    Op0 = A;
1228  else if (auto *NOp = optimizeShiftInOrChain(Op0, Builder))
1229    Op0 = NOp;
1230
// Same treatment for the second operand.
1231  Value *Op1 = I->getOperand(1);
1232  if (match(Op1, m_CombineOr(m_NSWShl(m_Value(A), m_Value()),
1233                             m_NUWShl(m_Value(A), m_Value()))))
1234    Op1 = A;
1235  else if (auto *NOp = optimizeShiftInOrChain(Op1, Builder))
1236    Op1 = NOp;
1237
// Only materialize a new `or` if at least one operand was simplified.
1238  if (Op0 != I->getOperand(0) || Op1 != I->getOperand(1))
1239    return Builder.CreateOr(Op0, Op1);
1240  return nullptr;
1241}
1242
// Fold `icmp eq/ne (or-chain), 0`: first try merging a load chain under the
// or, then strip nuw/nsw shifts from the chain (safe under a zero compare).
// NOTE(review): doc-extraction gap — original lines 1243-1244 (this function's
// signature, `static bool foldICmpOrChain(Instruction &I, ...)`) are missing
// here.
1245                            const DominatorTree &DT) {
1246  CmpPredicate Pred;
1247  Value *Op0;
1248  if (!match(&I, m_ICmp(Pred, m_Value(Op0), m_Zero())) ||
1249      !ICmpInst::isEquality(Pred))
1250    return false;
1251
1252  // If the chain or or's matches a load, combine to that before attempting to
1253  // remove shifts.
1254  if (auto OpI = dyn_cast<Instruction>(Op0))
1255    if (OpI->getOpcode() == Instruction::Or)
1256      if (foldConsecutiveLoads(*OpI, DL, TTI, AA, DT))
1257        return true;
1258
1259  IRBuilder<> Builder(&I);
1260  // icmp eq/ne or(shl(a), b), 0 -> icmp eq/ne or(a, b), 0
1261  if (auto *Res = optimizeShiftInOrChain(Op0, Builder)) {
1262    I.replaceAllUsesWith(Builder.CreateICmp(Pred, Res, I.getOperand(1)));
1263    return true;
1264  }
1265
1266  return false;
1267}
1268
1269// Calculate GEP Stride and accumulated const ModOffset. Return Stride and
1270// ModOffset
// Returns {1, 0} (i.e. "no usable stride") unless the pointer chain bottoms
// out at a GlobalVariable through at least one GEP with variable indices.
1271static std::pair<APInt, APInt>
// NOTE(review): doc-extraction gap — original line 1272 (the parameter list,
// taking the pointer operand and DataLayout) is missing here.
1273  unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
1274  std::optional<APInt> Stride;
1275  APInt ModOffset(BW, 0);
1276  // Return a minimum gep stride, greatest common divisor of consective gep
1277  // index scales(c.f. Bézout's identity).
1278  while (auto *GEP = dyn_cast<GEPOperator>(PtrOp)) {
// NOTE(review): doc-extraction gap — original line 1279 (presumably the
// `SmallMapVector<Value *, APInt, N> VarOffsets;` declaration — confirm) is
// missing here.
1280    if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset))
1281      break;
1282
1283    for (auto [V, Scale] : VarOffsets) {
1284      // Only keep a power of two factor for non-inbounds
1285      if (!GEP->hasNoUnsignedSignedWrap())
1286        Scale = APInt::getOneBitSet(Scale.getBitWidth(), Scale.countr_zero());
1287
1288      if (!Stride)
1289        Stride = Scale;
1290      else
1291        Stride = APIntOps::GreatestCommonDivisor(*Stride, Scale);
1292    }
1293
1294    PtrOp = GEP->getPointerOperand();
1295  }
1296
1297  // Check whether pointer arrives back at Global Variable via at least one GEP.
1298  // Even if it doesn't, we can check by alignment.
1299  if (!isa<GlobalVariable>(PtrOp) || !Stride)
1300    return {APInt(BW, 1), APInt(BW, 0)};
1301
1302  // In consideration of signed GEP indices, non-negligible offset become
1303  // remainder of division by minimum GEP stride.
// srem can yield a negative remainder; normalize into [0, Stride).
1304  ModOffset = ModOffset.srem(*Stride);
1305  if (ModOffset.isNegative())
1306    ModOffset += *Stride;
1307
1308  return {*Stride, ModOffset};
1309}
1310
1311/// If C is a constant patterned array and all valid loaded results for given
1312/// alignment are same to a constant, return that constant.
// NOTE(review): doc-extraction gap — original line 1313 (this function's
// signature, `static bool foldPatternedLoads(Instruction &I, const DataLayout
// &DL)` — confirm name against the real source) is missing here.
1314  auto *LI = dyn_cast<LoadInst>(&I);
1315  if (!LI || LI->isVolatile())
1316    return false;
1317
1318  // We can only fold the load if it is from a constant global with definitive
1319  // initializer. Skip expensive logic if this is not the case.
1320  auto *PtrOp = LI->getPointerOperand();
// NOTE(review): doc-extraction gap — original line 1321 (presumably
// `auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(PtrOp));` —
// confirm) is missing here.
1322  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
1323    return false;
1324
1325  // Bail for large initializers in excess of 4K to avoid too many scans.
1326  Constant *C = GV->getInitializer();
1327  uint64_t GVSize = DL.getTypeAllocSize(C->getType());
1328  if (!GVSize || 4096 < GVSize)
1329    return false;
1330
1331  Type *LoadTy = LI->getType();
1332  unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
1333  auto [Stride, ConstOffset] = getStrideAndModOffsetOfGEP(PtrOp, DL);
1334
1335  // Any possible offset could be multiple of GEP stride. And any valid
1336  // offset is multiple of load alignment, so checking only multiples of bigger
1337  // one is sufficient to say results' equality.
1338  if (auto LA = LI->getAlign();
1339      LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value()) {
1340    ConstOffset = APInt(BW, 0);
1341    Stride = APInt(BW, LA.value());
1342  }
1343
// Fold the load at the first candidate offset, then verify every other
// reachable offset folds to the same constant; any mismatch aborts the fold.
1344  Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL);
1345  if (!Ca)
1346    return false;
1347
1348  unsigned E = GVSize - DL.getTypeStoreSize(LoadTy);
1349  for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride)
1350    if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL))
1351      return false;
1352
1353  I.replaceAllUsesWith(Ca);
1354
1355  return true;
1356}
1357
1358namespace {
// Helper that inlines a strcmp/strncmp call against a constant string into a
// chain of byte compares (see optimizeStrNCmp / inlineCompare below).
1359class StrNCmpInliner {
1360public:
1361  StrNCmpInliner(CallInst *CI, LibFunc Func, DomTreeUpdater *DTU,
1362                 const DataLayout &DL)
1363      : CI(CI), Func(Func), DTU(DTU), DL(DL) {}
1364
// Attempt the transform; returns true (and rewrites the CFG) on success.
1365  bool optimizeStrNCmp();
1366
1367private:
1368  void inlineCompare(Value *LHS, StringRef RHS, uint64_t N, bool Swapped);
1369
// CI is the strcmp/strncmp call being replaced; Func identifies which of the
// two library functions it is.
1370  CallInst *CI;
1371  LibFunc Func;
1372  DomTreeUpdater *DTU;
1373  const DataLayout &DL;
1374};
1375
1376} // namespace
1377
1378/// First we normalize calls to strncmp/strcmp to the form of
1379/// compare(s1, s2, N), which means comparing first N bytes of s1 and s2
1380/// (without considering '\0').
1381///
1382/// Examples:
1383///
1384/// \code
1385///   strncmp(s, "a", 3) -> compare(s, "a", 2)
1386///   strncmp(s, "abc", 3) -> compare(s, "abc", 3)
1387///   strncmp(s, "a\0b", 3) -> compare(s, "a\0b", 2)
1388///   strcmp(s, "a") -> compare(s, "a", 2)
1389///
1390///   char s2[] = {'a'}
1391///   strncmp(s, s2, 3) -> compare(s, s2, 3)
1392///
1393///   char s2[] = {'a', 'b', 'c', 'd'}
1394///   strncmp(s, s2, 3) -> compare(s, s2, 3)
1395/// \endcode
1396///
1397/// We only handle cases where N and exactly one of s1 and s2 are constant.
1398/// Cases that s1 and s2 are both constant are already handled by the
1399/// instcombine pass.
1400///
1401/// We do not handle cases where N > StrNCmpInlineThreshold.
1402///
1403/// We also do not handles cases where N < 2, which are already
1404/// handled by the instcombine pass.
1405///
1406bool StrNCmpInliner::optimizeStrNCmp() {
1407  if (StrNCmpInlineThreshold < 2)
1408    return false;
1409
// NOTE(review): doc-extraction gap — original line 1410 (an early-bail
// condition whose guard text is elided; line 1411 is its `return false;`
// body — confirm against the real source) is missing here.
1411    return false;
1412
1413  Value *Str1P = CI->getArgOperand(0);
1414  Value *Str2P = CI->getArgOperand(1);
1415  // Should be handled elsewhere.
1416  if (Str1P == Str2P)
1417    return false;
1418
// Exactly one side must be a compile-time constant string.
1419  StringRef Str1, Str2;
1420  bool HasStr1 = getConstantStringInfo(Str1P, Str1, /*TrimAtNul=*/false);
1421  bool HasStr2 = getConstantStringInfo(Str2P, Str2, /*TrimAtNul=*/false);
1422  if (HasStr1 == HasStr2)
1423    return false;
1424
1425  // Note that '\0' and characters after it are not trimmed.
1426  StringRef Str = HasStr1 ? Str1 : Str2;
1427  Value *StrP = HasStr1 ? Str2P : Str1P;
1428
// Comparison stops after the constant string's NUL (inclusive); for strncmp,
// the explicit constant length further caps N.
1429  size_t Idx = Str.find('\0');
1430  uint64_t N = Idx == StringRef::npos ? UINT64_MAX : Idx + 1;
1431  if (Func == LibFunc_strncmp) {
1432    if (auto *ConstInt = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
1433      N = std::min(N, ConstInt->getZExtValue());
1434    else
1435      return false;
1436  }
1437  // Now N means how many bytes we need to compare at most.
1438  if (N > Str.size() || N < 2 || N > StrNCmpInlineThreshold)
1439    return false;
1440
1441  // Cases where StrP has two or more dereferenceable bytes might be better
1442  // optimized elsewhere.
1443  bool CanBeNull = false, CanBeFreed = false;
1444  if (StrP->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed) > 1)
1445    return false;
// Swapped==HasStr1: if the constant was the first argument the subtraction
// order must be reversed to preserve the sign of the result.
1446  inlineCompare(StrP, Str, N, HasStr1);
1447  return true;
1448}
1449
1450/// Convert
1451///
1452/// \code
1453///   ret = compare(s1, s2, N)
1454/// \endcode
1455///
1456/// into
1457///
1458/// \code
1459///   ret = (int)s1[0] - (int)s2[0]
1460///   if (ret != 0)
1461///     goto NE
1462///   ...
1463///   ret = (int)s1[N-2] - (int)s2[N-2]
1464///   if (ret != 0)
1465///     goto NE
1466///   ret = (int)s1[N-1] - (int)s2[N-1]
1467/// NE:
1468/// \endcode
1469///
1470/// CFG before and after the transformation:
1471///
1472/// (before)
1473/// BBCI
1474///
1475/// (after)
1476/// BBCI -> BBSubs[0] (sub,icmp) --NE-> BBNE -> BBTail
1477///                 |                    ^
1478///                 E                    |
1479///                 |                    |
1480///        BBSubs[1] (sub,icmp) --NE-----+
1481///                ...                   |
1482///        BBSubs[N-1]    (sub)  ---------+
1483///
1484void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
1485                                   bool Swapped) {
1486  auto &Ctx = CI->getContext();
1487  IRBuilder<> B(Ctx);
1488  // We want these instructions to be recognized as inlined instructions for the
1489  // compare call, but we don't have a source location for the definition of
1490  // that function, since we're generating that code now. Because the generated
1491  // code is a viable point for a memory access error, we make the pragmatic
1492  // choice here to directly use CI's location so that we have useful
1493  // attribution for the generated code.
1494  B.SetCurrentDebugLocation(CI->getDebugLoc());
1495
1496  BasicBlock *BBCI = CI->getParent();
1497  BasicBlock *BBTail =
1498      SplitBlock(BBCI, CI, DTU, nullptr, nullptr, BBCI->getName() + ".tail");
1499
// One block per compared byte, plus a shared "not equal" join block.
// NOTE(review): doc-extraction gap — original line 1500 (presumably the
// `SmallVector<BasicBlock *> BBSubs;` declaration — confirm) is missing here.
1501  for (uint64_t I = 0; I < N; ++I)
1502    BBSubs.push_back(
1503        BasicBlock::Create(Ctx, "sub_" + Twine(I), BBCI->getParent(), BBTail));
1504  BasicBlock *BBNE = BasicBlock::Create(Ctx, "ne", BBCI->getParent(), BBTail);
1505
1506  cast<UncondBrInst>(BBCI->getTerminator())->setSuccessor(BBSubs[0]);
1507
// The PHI in BBNE collects the first non-zero byte difference.
1508  B.SetInsertPoint(BBNE);
1509  PHINode *Phi = B.CreatePHI(CI->getType(), N);
1510  B.CreateBr(BBTail);
1511
1512  Value *Base = LHS;
1513  for (uint64_t i = 0; i < N; ++i) {
1514    B.SetInsertPoint(BBSubs[i]);
1515    Value *VL =
1516        B.CreateZExt(B.CreateLoad(B.getInt8Ty(),
1517                                  B.CreateInBoundsPtrAdd(Base, B.getInt64(i))),
1518                     CI->getType());
1519    Value *VR =
1520        ConstantInt::get(CI->getType(), static_cast<unsigned char>(RHS[i]));
// Swapped reverses the subtraction so the sign matches the original operand
// order of the library call.
1521    Value *Sub = Swapped ? B.CreateSub(VR, VL) : B.CreateSub(VL, VR);
1522    if (i < N - 1) {
1523      CondBrInst *CondBrInst = B.CreateCondBr(
1524          B.CreateICmpNE(Sub, ConstantInt::get(CI->getType(), 0)), BBNE,
1525          BBSubs[i + 1]);
1526
1527      Function *F = CI->getFunction();
1528      assert(F && "Instruction does not belong to a function!");
1529      std::optional<Function::ProfileCount> EC = F->getEntryCount();
1530      if (EC && EC->getCount() > 0)
// NOTE(review): doc-extraction gap — original line 1531 (the body of this
// `if`, presumably attaching unknown branch weights to CondBrInst — confirm)
// is missing here.
1532    } else {
1533      B.CreateBr(BBNE);
1534    }
1535
1536    Phi->addIncoming(Sub, BBSubs[i]);
1537  }
1538
1539  CI->replaceAllUsesWith(Phi);
1540  CI->eraseFromParent();
1541
// Keep the (lazy) dominator tree in sync with the new CFG edges.
1542  if (DTU) {
// NOTE(review): doc-extraction gap — original line 1543 (presumably the
// `SmallVector<DominatorTree::UpdateType, N> Updates;` declaration — confirm)
// is missing here.
1544    Updates.push_back({DominatorTree::Insert, BBCI, BBSubs[0]});
1545    for (uint64_t i = 0; i < N; ++i) {
1546      if (i < N - 1)
1547        Updates.push_back({DominatorTree::Insert, BBSubs[i], BBSubs[i + 1]});
1548      Updates.push_back({DominatorTree::Insert, BBSubs[i], BBNE});
1549    }
1550    Updates.push_back({DominatorTree::Insert, BBNE, BBTail});
1551    Updates.push_back({DominatorTree::Delete, BBCI, BBTail});
1552    DTU->applyUpdates(Updates);
1553  }
1554}
1555
1556/// Convert memchr with a small constant string into a switch
// NOTE(review): doc-extraction gap — original line 1557 (this function's
// signature, `static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,`)
// is missing here.
1558                       const DataLayout &DL) {
// A constant needle is instcombine's job; this fold needs a variable needle
// over a constant haystack.
1559  if (isa<Constant>(Call->getArgOperand(1)))
1560    return false;
1561
1562  StringRef Str;
1563  Value *Base = Call->getArgOperand(0);
1564  if (!getConstantStringInfo(Base, Str, /*TrimAtNul=*/false))
1565    return false;
1566
// The scan length must be a constant no larger than the string.
1567  uint64_t N = Str.size();
1568  if (auto *ConstInt = dyn_cast<ConstantInt>(Call->getArgOperand(2))) {
1569    uint64_t Val = ConstInt->getZExtValue();
1570    // Ignore the case that n is larger than the size of string.
1571    if (Val > N)
1572      return false;
1573    N = Val;
1574  } else
1575    return false;
1576
// NOTE(review): doc-extraction gap — original line 1577 (the threshold check,
// presumably `if (N > MemChrInlineThreshold)` — confirm) is missing here.
1578    return false;
1579
1580  BasicBlock *BB = Call->getParent();
1581  BasicBlock *BBNext = SplitBlock(BB, Call, DTU);
1582  IRBuilder<> IRB(BB);
1583  IRB.SetCurrentDebugLocation(Call->getDebugLoc());
1584  IntegerType *ByteTy = IRB.getInt8Ty();
// Switch on the (truncated) needle byte; the default edge means "not found".
// NOTE(review): doc-extraction gap — original line 1585 (presumably removing
// BB's old terminator before creating the switch — confirm) is missing here.
1586  SwitchInst *SI = IRB.CreateSwitch(
1587      IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
1588  // We can't know the precise weights here, as they would depend on the value
1589  // distribution of Call->getArgOperand(1). So we just mark it as "unknown".
// NOTE(review): doc-extraction gap — original lines 1590 and 1592 (branch-
// weight annotation for SI, and the DT-updates vector declaration) are
// missing here.
1591  Type *IndexTy = DL.getIndexType(Call->getType());
1593
// On a match, BBSuccess computes the pointer to the first occurrence from the
// matched index.
1594  BasicBlock *BBSuccess = BasicBlock::Create(
1595      Call->getContext(), "memchr.success", BB->getParent(), BBNext);
1596  IRB.SetInsertPoint(BBSuccess);
1597  PHINode *IndexPHI = IRB.CreatePHI(IndexTy, N, "memchr.idx");
1598  Value *FirstOccursLocation = IRB.CreateInBoundsPtrAdd(Base, IndexPHI);
1599  IRB.CreateBr(BBNext);
1600  if (DTU)
1601    Updates.push_back({DominatorTree::Insert, BBSuccess, BBNext});
1602
// One case per distinct byte value; duplicate bytes keep only their first
// (lowest) index, matching memchr's first-occurrence semantics.
// NOTE(review): doc-extraction gap — original line 1603 (presumably the
// `SmallPtrSet<ConstantInt *, N> Cases;` declaration — confirm) is missing
// here.
1604  for (uint64_t I = 0; I < N; ++I) {
1605    ConstantInt *CaseVal =
1606        ConstantInt::get(ByteTy, static_cast<unsigned char>(Str[I]));
1607    if (!Cases.insert(CaseVal).second)
1608      continue;
1609
1610    BasicBlock *BBCase = BasicBlock::Create(Call->getContext(), "memchr.case",
1611                                            BB->getParent(), BBSuccess);
1612    SI->addCase(CaseVal, BBCase);
1613    IRB.SetInsertPoint(BBCase);
1614    IndexPHI->addIncoming(ConstantInt::get(IndexTy, I), BBCase);
1615    IRB.CreateBr(BBSuccess);
1616    if (DTU) {
1617      Updates.push_back({DominatorTree::Insert, BB, BBCase});
1618      Updates.push_back({DominatorTree::Insert, BBCase, BBSuccess});
1619    }
1620  }
1621
// Merge "found pointer" and "null" into the call's replacement value.
1622  PHINode *PHI =
1623      PHINode::Create(Call->getType(), 2, Call->getName(), BBNext->begin());
1624  PHI->addIncoming(Constant::getNullValue(Call->getType()), BB);
1625  PHI->addIncoming(FirstOccursLocation, BBSuccess);
1626
1627  Call->replaceAllUsesWith(PHI);
1628  Call->eraseFromParent();
1629
1630  if (DTU)
1631    DTU->applyUpdates(Updates);
1632
1633  return true;
1634}
1635
// Dispatch recognized library calls (sqrt family, strcmp/strncmp, memchr) to
// their specialized folds. Sets MadeCFGChange when a fold rewrote the CFG.
// NOTE(review): doc-extraction gap — original lines 1636-1637 (this function's
// signature, `static bool foldLibCalls(Instruction &I, ...)`) are missing
// here.
1638                         DominatorTree &DT, const DataLayout &DL,
1639                         bool &MadeCFGChange) {
1640
1641  auto *CI = dyn_cast<CallInst>(&I);
1642  if (!CI || CI->isNoBuiltin())
1643    return false;
1644
1645  Function *CalledFunc = CI->getCalledFunction();
1646  if (!CalledFunc)
1647    return false;
1648
// Only proceed for calls TLI recognizes as emittable library functions.
1649  LibFunc LF;
1650  if (!TLI.getLibFunc(*CalledFunc, LF) ||
1651      !isLibFuncEmittable(CI->getModule(), &TLI, LF))
1652    return false;
1653
// Lazy updates: CFG edits from the folds below are batched into DTU.
1654  DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
1655
1656  switch (LF) {
1657  case LibFunc_sqrt:
1658  case LibFunc_sqrtf:
1659  case LibFunc_sqrtl:
1660    return foldSqrt(CI, LF, TTI, TLI, AC, DT);
1661  case LibFunc_strcmp:
1662  case LibFunc_strncmp:
1663    if (StrNCmpInliner(CI, LF, &DTU, DL).optimizeStrNCmp()) {
1664      MadeCFGChange = true;
1665      return true;
1666    }
1667    break;
1668  case LibFunc_memchr:
1669    if (foldMemChr(CI, &DTU, DL)) {
1670      MadeCFGChange = true;
1671      return true;
1672    }
1673    break;
1674  default:;
1675  }
1676  return false;
1677}
1678
1679/// Match high part of long multiplication.
1680///
1681/// Considering a multiply made up of high and low parts, we can split the
1682/// multiply into:
1683/// x * y == (xh*T + xl) * (yh*T + yl)
1684/// where xh == x>>32 and xl == x & 0xffffffff. T = 2^32.
1685/// This expands to
1686/// xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
1687/// which can be drawn as
1688/// [ xh*yh ]
1689/// [ xh*yl ]
1690/// [ xl*yh ]
1691/// [ xl*yl ]
1692/// We are looking for the "high" half, which is xh*yh + xh*yl>>32 + xl*yh>>32 +
1693/// some carrys. The carry makes this difficult and there are multiple ways of
1694/// representing it. The ones we attempt to support here are:
1695/// Carry: xh*yh + carry + lowsum
1696/// carry = lowsum < xh*yl ? 0x1000000 : 0
1697/// lowsum = xh*yl + xl*yh + (xl*yl>>32)
1698/// Ladder: xh*yh + c2>>32 + c3>>32
1699/// c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
1700/// or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh
1701/// Carry4: xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
1702/// crosssum = xh*yl + xl*yh
1703/// carry = crosssum < xh*yl ? 0x1000000 : 0
1704/// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
1705/// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
1706///
1707/// They all start by matching xh*yh + 2 or 3 other operands. The bottom of the
1708/// tree is xh*yh, xh*yl, xl*yh and xl*yl.
1710 Type *Ty = I.getType();
1711 if (!Ty->isIntOrIntVectorTy())
1712 return false;
1713
1714 unsigned BitWidth = Ty->getScalarSizeInBits();
1716 if (BitWidth % 2 != 0)
1717 return false;
1718
1719 auto CreateMulHigh = [&](Value *X, Value *Y) {
1720 IRBuilder<> Builder(&I);
1721 Type *NTy = Ty->getWithNewBitWidth(BitWidth * 2);
1722 Value *XExt = Builder.CreateZExt(X, NTy);
1723 Value *YExt = Builder.CreateZExt(Y, NTy);
1724 Value *Mul = Builder.CreateMul(XExt, YExt, "", /*HasNUW=*/true);
1725 Value *High = Builder.CreateLShr(Mul, BitWidth);
1726 Value *Res = Builder.CreateTrunc(High, Ty, "", /*HasNUW=*/true);
1727 Res->takeName(&I);
1728 I.replaceAllUsesWith(Res);
1729 LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
1730 << *Y << "\n");
1731 return true;
1732 };
1733
1734 // Common check routines for X_lo*Y_lo and X_hi*Y_lo
1735 auto CheckLoLo = [&](Value *XlYl, Value *X, Value *Y) {
1736 return match(XlYl, m_c_Mul(m_And(m_Specific(X), m_SpecificInt(LowMask)),
1737 m_And(m_Specific(Y), m_SpecificInt(LowMask))));
1738 };
1739 auto CheckHiLo = [&](Value *XhYl, Value *X, Value *Y) {
1740 return match(XhYl,
1742 m_And(m_Specific(Y), m_SpecificInt(LowMask))));
1743 };
1744
1745 auto FoldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
1746 Instruction *B) {
1747 // Looking for LowSum >> 32 and carry (select)
1748 if (Carry->getOpcode() != Instruction::Select)
1749 std::swap(Carry, B);
1750
1751 // Carry = LowSum < XhYl ? 0x100000000 : 0
1752 Value *LowSum, *XhYl;
1753 if (!match(Carry,
1756 m_Value(XhYl))),
1758 m_Zero()))))
1759 return false;
1760
1761 // XhYl can be Xh*Yl or Xl*Yh
1762 if (!CheckHiLo(XhYl, X, Y)) {
1763 if (CheckHiLo(XhYl, Y, X))
1764 std::swap(X, Y);
1765 else
1766 return false;
1767 }
1768 if (XhYl->hasNUsesOrMore(3))
1769 return false;
1770
1771 // B = LowSum >> 32
1772 if (!match(B, m_OneUse(m_LShr(m_Specific(LowSum),
1773 m_SpecificInt(BitWidth / 2)))) ||
1774 LowSum->hasNUsesOrMore(3))
1775 return false;
1776
1777 // LowSum = XhYl + XlYh + XlYl>>32
1778 Value *XlYh, *XlYl;
1779 auto XlYlHi = m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2));
1780 if (!match(LowSum,
1781 m_c_Add(m_Specific(XhYl),
1782 m_OneUse(m_c_Add(m_OneUse(m_Value(XlYh)), XlYlHi)))) &&
1783 !match(LowSum, m_c_Add(m_OneUse(m_Value(XlYh)),
1784 m_OneUse(m_c_Add(m_Specific(XhYl), XlYlHi)))) &&
1785 !match(LowSum,
1786 m_c_Add(XlYlHi, m_OneUse(m_c_Add(m_Specific(XhYl),
1787 m_OneUse(m_Value(XlYh)))))))
1788 return false;
1789
1790 // Check XlYl and XlYh
1791 if (!CheckLoLo(XlYl, X, Y))
1792 return false;
1793 if (!CheckHiLo(XlYh, Y, X))
1794 return false;
1795
1796 return CreateMulHigh(X, Y);
1797 };
1798
1799 auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
1800 Instruction *B) {
1801 // xh*yh + c2>>32 + c3>>32
1802 // c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
1803 // or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xh*yl
1804 Value *XlYh, *XhYl, *XlYl, *C2, *C3;
1805 // Strip off the two expected shifts.
1806 if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BitWidth / 2))) ||
1808 return false;
1809
1810 if (match(C3, m_c_Add(m_Add(m_Value(), m_Value()), m_Value())))
1811 std::swap(C2, C3);
1812 // Try to match c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32)
1813 if (match(C2,
1815 m_Value(XlYh)),
1816 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)))) ||
1818 m_LShr(m_Value(XlYl),
1819 m_SpecificInt(BitWidth / 2))),
1820 m_Value(XlYh))) ||
1822 m_SpecificInt(BitWidth / 2)),
1823 m_Value(XlYh)),
1824 m_And(m_Specific(C3), m_SpecificInt(LowMask))))) {
1825 XhYl = C3;
1826 } else {
1827 // Match c3 = c2&0xffffffff + xl*yh
1828 if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)),
1829 m_Value(XlYh))))
1830 std::swap(C2, C3);
1831 if (!match(C3, m_c_Add(m_OneUse(
1832 m_And(m_Specific(C2), m_SpecificInt(LowMask))),
1833 m_Value(XlYh))) ||
1834 !C3->hasOneUse() || C2->hasNUsesOrMore(3))
1835 return false;
1836
1837 // Match c2 = xh*yl + (xl*yl >> 32)
1838 if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)),
1839 m_Value(XhYl))))
1840 return false;
1841 }
1842
1843 // Match XhYl and XlYh - they can appear either way around.
1844 if (!CheckHiLo(XlYh, Y, X))
1845 std::swap(XlYh, XhYl);
1846 if (!CheckHiLo(XlYh, Y, X))
1847 return false;
1848 if (!CheckHiLo(XhYl, X, Y))
1849 return false;
1850 if (!CheckLoLo(XlYl, X, Y))
1851 return false;
1852
1853 return CreateMulHigh(X, Y);
1854 };
1855
1856 auto FoldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
1858 /// Ladder4: xh*yh + (xl*yh)>>32 + (xh+yl)>>32 + low>>32;
1859 /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
1860
1861 // Find A = Low >> 32 and B/C = XhYl>>32, XlYh>>32.
1862 auto ShiftAdd =
1864 if (!match(A, ShiftAdd))
1865 std::swap(A, B);
1866 if (!match(A, ShiftAdd))
1867 std::swap(A, C);
1868 Value *Low;
1870 return false;
1871
1872 // Match B == XhYl>>32 and C == XlYh>>32
1873 Value *XhYl, *XlYh;
1874 if (!match(B, m_LShr(m_Value(XhYl), m_SpecificInt(BitWidth / 2))) ||
1875 !match(C, m_LShr(m_Value(XlYh), m_SpecificInt(BitWidth / 2))))
1876 return false;
1877 if (!CheckHiLo(XhYl, X, Y))
1878 std::swap(XhYl, XlYh);
1879 if (!CheckHiLo(XhYl, X, Y) || XhYl->hasNUsesOrMore(3))
1880 return false;
1881 if (!CheckHiLo(XlYh, Y, X) || XlYh->hasNUsesOrMore(3))
1882 return false;
1883
1884 // Match Low as XlYl>>32 + XhYl&0xffffffff + XlYh&0xffffffff
1885 Value *XlYl;
1886 if (!match(
1887 Low,
1888 m_c_Add(
1890 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
1891 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))),
1892 m_OneUse(
1893 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))) &&
1894 !match(
1895 Low,
1896 m_c_Add(
1898 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
1899 m_OneUse(
1900 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))),
1901 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))) &&
1902 !match(
1903 Low,
1904 m_c_Add(
1906 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))),
1907 m_OneUse(
1908 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))),
1909 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))))))
1910 return false;
1911 if (!CheckLoLo(XlYl, X, Y))
1912 return false;
1913
1914 return CreateMulHigh(X, Y);
1915 };
1916
1917 auto FoldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
1919 // xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
1920 // crosssum = xh*yl+xl*yh
1921 // carry = crosssum < xh*yl ? 0x1000000 : 0
1922 if (Carry->getOpcode() != Instruction::Select)
1923 std::swap(Carry, B);
1924 if (Carry->getOpcode() != Instruction::Select)
1925 std::swap(Carry, C);
1926
1927 // Carry = CrossSum < XhYl ? 0x100000000 : 0
1928 Value *CrossSum, *XhYl;
1929 if (!match(Carry,
1932 m_Value(CrossSum), m_Value(XhYl))),
1934 m_Zero()))))
1935 return false;
1936
1937 if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BitWidth / 2))))
1938 std::swap(B, C);
1939 if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BitWidth / 2))))
1940 return false;
1941
1942 Value *XlYl, *LowAccum;
1943 if (!match(C, m_LShr(m_Value(LowAccum), m_SpecificInt(BitWidth / 2))) ||
1944 !match(LowAccum, m_c_Add(m_OneUse(m_LShr(m_Value(XlYl),
1945 m_SpecificInt(BitWidth / 2))),
1946 m_OneUse(m_And(m_Specific(CrossSum),
1947 m_SpecificInt(LowMask))))) ||
1948 LowAccum->hasNUsesOrMore(3))
1949 return false;
1950 if (!CheckLoLo(XlYl, X, Y))
1951 return false;
1952
1953 if (!CheckHiLo(XhYl, X, Y))
1954 std::swap(X, Y);
1955 if (!CheckHiLo(XhYl, X, Y))
1956 return false;
1957 Value *XlYh;
1958 if (!match(CrossSum, m_c_Add(m_Specific(XhYl), m_OneUse(m_Value(XlYh)))) ||
1959 !CheckHiLo(XlYh, Y, X) || CrossSum->hasNUsesOrMore(4) ||
1960 XhYl->hasNUsesOrMore(3))
1961 return false;
1962
1963 return CreateMulHigh(X, Y);
1964 };
1965
1966 // X and Y are the two inputs, A, B and C are other parts of the pattern
1967 // (crosssum>>32, carry, etc).
1968 Value *X, *Y;
1969 Instruction *A, *B, *C;
1970 auto HiHi = m_OneUse(m_Mul(m_LShr(m_Value(X), m_SpecificInt(BitWidth / 2)),
1972 if ((match(&I, m_c_Add(HiHi, m_OneUse(m_Add(m_Instruction(A),
1973 m_Instruction(B))))) ||
1975 m_OneUse(m_c_Add(HiHi, m_Instruction(B)))))) &&
1976 A->hasOneUse() && B->hasOneUse())
1977 if (FoldMulHighCarry(X, Y, A, B) || FoldMulHighLadder(X, Y, A, B))
1978 return true;
1979
1980 if ((match(&I, m_c_Add(HiHi, m_OneUse(m_c_Add(
1983 m_Instruction(C))))))) ||
1987 m_Instruction(C))))))) ||
1991 m_OneUse(m_c_Add(HiHi, m_Instruction(C))))))) ||
1992 match(&I,
1995 A->hasOneUse() && B->hasOneUse() && C->hasOneUse())
1996 return FoldMulHighCarry4(X, Y, A, B, C) ||
1997 FoldMulHighLadder4(X, Y, A, B, C);
1998
1999 return false;
2000}
2001
2002/// This is the entry point for folds that could be implemented in regular
2003/// InstCombine, but they are separated because they are not expected to
2004/// occur frequently and/or have more than a constant-length pattern match.
2008 AssumptionCache &AC, bool &MadeCFGChange) {
2009 bool MadeChange = false;
2010 for (BasicBlock &BB : F) {
2011 // Ignore unreachable basic blocks.
2012 if (!DT.isReachableFromEntry(&BB))
2013 continue;
2014
2015 const DataLayout &DL = F.getDataLayout();
2016
2017 // Walk the block backwards for efficiency. We're matching a chain of
2018 // use->defs, so we're more likely to succeed by starting from the bottom.
2019 // Also, we want to avoid matching partial patterns.
2020 // TODO: It would be more efficient if we removed dead instructions
2021 // iteratively in this loop rather than waiting until the end.
2023 MadeChange |= foldAnyOrAllBitsSet(I);
2024 MadeChange |= foldGuardedFunnelShift(I, DT);
2025 MadeChange |= tryToRecognizePopCount(I);
2026 MadeChange |= tryToFPToSat(I, TTI);
2027 MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
2028 MadeChange |= tryToRecognizeTableBasedLog2(I, DL, TTI);
2029 MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
2030 MadeChange |= foldPatternedLoads(I, DL);
2031 MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
2032 MadeChange |= foldMulHigh(I);
2033 // NOTE: This function introduces erasing of the instruction `I`, so it
2034 // needs to be called at the end of this sequence, otherwise we may make
2035 // bugs.
2036 MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
2037 }
2038
2039 // Do this separately to avoid redundantly scanning stores multiple times.
2040 MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA);
2041 }
2042
2043 // We're done with transforms, so remove dead instructions.
2044 if (MadeChange)
2045 for (BasicBlock &BB : F)
2047
2048 return MadeChange;
2049}
2050
2051/// This is the entry point for all transforms. Pass manager differences are
2052/// handled in the callers of this function.
2055 AliasAnalysis &AA, bool &MadeCFGChange) {
2056 bool MadeChange = false;
2057 const DataLayout &DL = F.getDataLayout();
2058 TruncInstCombine TIC(AC, TLI, DL, DT);
2059 MadeChange |= TIC.run(F);
2060 MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, MadeCFGChange);
2061 return MadeChange;
2062}
2063
2066 auto &AC = AM.getResult<AssumptionAnalysis>(F);
2067 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
2068 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
2069 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
2070 auto &AA = AM.getResult<AAManager>(F);
2071 bool MadeCFGChange = false;
2072 if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) {
2073 // No changes, all analyses are preserved.
2074 return PreservedAnalyses::all();
2075 }
2076 // Mark all the analyses that instcombine updates as preserved.
2078 if (MadeCFGChange)
2080 else
2082 return PA;
2083}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool tryToRecognizePopCount(Instruction &I)
static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT)
Try to replace a mathlib call to sqrt with the LLVM intrinsic.
static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift, Type *AccessTy, unsigned InputBits, const APInt &GEPIdxFactor, const DataLayout &DL)
static bool foldAnyOrAllBitsSet(Instruction &I)
Match patterns that correspond to "any-bits-set" and "all-bits-set".
static cl::opt< unsigned > MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden, cl::desc("The maximum length of a constant string to " "inline a memchr call."))
static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI)
Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), providing C1 and C2 saturate the value of t...
static cl::opt< unsigned > StrNCmpInlineThreshold("strncmp-inline-threshold", cl::init(3), cl::Hidden, cl::desc("The maximum length of a constant string for a builtin string cmp " "call eligible for inlining. The default value is 3."))
static bool matchAndOrChain(Value *V, MaskOps &MOps)
This is a recursive helper for foldAnyOrAllBitsSet() that walks through a chain of 'and' or 'or' inst...
static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, const DataLayout &DL)
Convert memchr with a small constant string into a switch.
static Value * optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder)
Combine away instructions providing they are still equivalent when compared against 0.
static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA, const DominatorTree &DT)
static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT)
Match a pattern for a bitwise funnel/rotate operation that partially guards against undefined behavio...
static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL)
static bool mergePartStores(SmallVectorImpl< PartStore > &Parts, const DataLayout &DL, TargetTransformInfo &TTI)
static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, AliasAnalysis &AA, bool IsRoot=false)
static bool mergeConsecutivePartStores(ArrayRef< PartStore > Parts, unsigned Width, const DataLayout &DL, TargetTransformInfo &TTI)
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
static bool foldICmpOrChain(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA, const DominatorTree &DT)
static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift, const APInt &AndMask, Type *AccessTy, unsigned InputBits, const APInt &GEPIdxFactor, const DataLayout &DL)
static std::optional< PartStore > matchPartStore(Instruction &I, const DataLayout &DL)
static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA)
static std::pair< APInt, APInt > getStrideAndModOffsetOfGEP(Value *PtrOp, const DataLayout &DL)
static bool foldPatternedLoads(Instruction &I, const DataLayout &DL)
If C is a constant patterned array and all valid loaded results for given alignment are same to a con...
static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI)
static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT, const DataLayout &DL, bool &MadeCFGChange)
static bool foldMulHigh(Instruction &I)
Match high part of long multiplication.
static bool foldUnusualPatterns(Function &F, DominatorTree &DT, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AliasAnalysis &AA, AssumptionCache &AC, bool &MadeCFGChange)
This is the entry point for folds that could be implemented in regular InstCombine,...
AggressiveInstCombiner - Combine expression patterns to form expressions with fewer,...
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
static MaybeAlign getAlign(Value *Ptr)
static Instruction * matchFunnelShift(Instruction &Or, InstCombinerImpl &IC)
Match UB-safe variants of the funnel shift intrinsic.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t High
This file contains the declarations for profiling metadata utility functions.
static const MCExpr * MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
BinaryOperator * Mul
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1747
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:880
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1137
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:449
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static LLVM_ABI DebugLoc getMergedLocations(ArrayRef< DebugLoc > Locs)
Try to combine the vector of locations passed as input in a single one.
Definition DebugLoc.cpp:166
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
bool isConstant() const
If the value is a global constant, its value is immutable throughout the runtime execution of the pro...
static bool isEquality(Predicate P)
Return true if this predicate is either EQ or NE.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
UncondBrInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition IRBuilder.h:1217
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2496
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition IRBuilder.h:1246
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2063
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateInBoundsPtrAdd(Value *Ptr, Value *Offset, const Twine &Name="")
Definition IRBuilder.h:2053
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:569
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void setAAMetadata(const AAMDNodes &N)
Sets the AA metadata on this instruction from the AAMDNodes structure.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
bool isSimple() const
static LocationSize precise(uint64_t Value)
LLVM_ABI MDNode * createUnlikelyBranchWeights()
Return metadata containing two branch weights, with significant bias towards false destination.
Definition MDBuilder.cpp:48
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static constexpr size_t npos
Definition StringRef.h:57
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
bool getLibFunc(StringRef funcName, LibFunc &F) const
Searches for a particular function name.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ None
The insert/extract is not used with a load/store.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCC_Basic
The cost of a typical 'add' instruction.
bool run(Function &F)
Perform TruncInst pattern optimization on given function.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI const Value * stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, bool AllowInvariantGroup=false, function_ref< bool(Value &Value, APInt &Offset)> ExternalAnalysis=nullptr, bool LookThroughIntToPtr=false) const
Accumulate the constant offset this value has compared to a base pointer.
LLVM_ABI uint64_t getPointerDereferenceableBytes(const DataLayout &DL, bool &CanBeNull, bool &CanBeFreed) const
Returns the number of bytes known to be dereferenceable for the pointer value.
Definition Value.cpp:893
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define UINT64_MAX
Definition DataTypes.h:77
Abstract Attribute helper functions.
Definition Attributor.h:165
LLVM_ABI APInt GreatestCommonDivisor(APInt A, APInt B)
Compute GCD of two unsigned APInt values.
Definition APInt.cpp:798
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
ShiftLike_match< LHS, Instruction::LShr > m_LShrOrSelf(const LHS &L, uint64_t &R)
Matches lshr L, ConstShAmt or L itself (R will be set to zero in this case).
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, CastInst >, OpTy > m_CastOrSelf(const OpTy &Op)
Matches any cast or self. Used to ignore casts.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
ShiftLike_match< LHS, Instruction::Shl > m_ShlOrSelf(const LHS &L, uint64_t &R)
Matches shl L, ConstShAmt or L itself (R will be set to zero in this case).
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specific_bbval m_SpecificBB(BasicBlock *BB)
Match a specific basic block value.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoSignedWrap > m_NSWShl(const LHS &L, const RHS &R)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastInst_match< OpTy, FPToSIInst > m_FPToSI(const OpTy &Op)
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
cl::opt< bool > ProfcheckDisableMetadataFixes
Definition Metadata.cpp:64
LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, const Function *F=nullptr)
Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch weights in the new instruct...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool isOnlyUsedInZeroComparison(const Instruction *CxtI)
LLVM_ABI bool getConstantStringInfo(const Value *V, StringRef &Str, bool TrimAtNul=true)
This function computes the length of a null-terminated C string pointed to by V.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
LLVM_ABI bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr)
Scan the specified basic block and try to simplify any instructions in it and recursively delete dead...
Definition Local.cpp:723
LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I, StringRef PassName)
Specify that the branch weights for this terminator cannot be known at compile time.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
LLVM_ABI bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that it in the current Module is a...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
LLVM_ABI Constant * ConstantFoldLoadFromConst(Constant *C, Type *Ty, const APInt &Offset, const DataLayout &DL)
Extract value of C at the given Offset reinterpreted as Ty.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Sub
Subtraction of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI bool cannotBeOrderedLessThanZero(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if we can prove that the specified FP value is either NaN or never less than -0....
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
LoadInst * RootInsert
ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
bool operator<(const PartStore &Other) const
bool isCompatibleWith(const PartStore &Other) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
LLVM_ABI AAMDNodes concat(const AAMDNodes &Other) const
Determine the best AAMDNodes after concatenating two different locations together.
Matching combinators.
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276