//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"
/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
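/// For example (illustrative): <4 x i32> <i32 -1, i32 42, i32 -17, i32 0>
/// maps to <4 x i1> <i1 true, i1 false, i1 true, i1 false>.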
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
    // the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad =
        IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}
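
// For example (an illustrative sketch, not from the original source): given a
// constant sign-bit mask such as <i32 -1, i32 -1, i32 0, i32 0>, a call to
// @llvm.x86.avx.maskload.ps becomes an @llvm.masked.load with bool mask
// <i1 true, i1 true, i1 false, i1 false>, align 1, and a zeroinitializer
// pass-through.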

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<FixedVectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  auto AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
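  // For example (sketch): @llvm.x86.sse2.psrai.d(<4 x i32> %v, i32 3) becomes
  // ashr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>, while a constant amount
  // of 32 or more folds to an ashr by the clamped amount 31 (sign splat).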
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all of the first 64 bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
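// For example (sketch): @llvm.x86.avx2.psrav.d(<4 x i32> %v, <4 x i32>
// <i32 1, i32 33, i32 8, i32 34>) becomes
// ashr <4 x i32> %v, <i32 1, i32 31, i32 8, i32 31>.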
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<FixedVectorType>(II.getType());
  auto SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  APInt UpperBits =
      APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
  if (llvm::MaskedValueIsZero(Amt, UpperBits,
                              II.getModule()->getDataLayout())) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }
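
  // Worked example (sketch) for PACKSSWB (i16 -> i8): MinValue = -128 and
  // MaxValue = 127 sign-extended to 16 bits, so 300 clamps to 127 and -200
  // clamps to -128 before the truncating shuffle below.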

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerVecTy = VectorType::getInteger(ArgTy);
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}
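
// For example (sketch): addcarry.u32 with a zero carry-in becomes
// @llvm.uadd.with.overflow.i32(%a, %b); the i1 overflow bit is zext'ed to i8
// and the pair is re-packed to match the x86 {i8, i32} return layout.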

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  // [3:0] - zero mask for each 32-bit lane
  // [5:4] - select one 32-bit destination lane
  // [7:6] - select one 32-bit source lane

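  // For example (sketch): an immediate of 0x10 (ZMask = 0, DestLane = 1,
  // SourceLane = 0) copies element 0 of the second source into element 1 of
  // the first, i.e. a shufflevector mask of <0, 4, 2, 3>.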
  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}
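
// For example (sketch): extrqi(%x, 16, 8) extracts bits [23:8]. Both values
// are byte-aligned, so it becomes a <16 x i8> shuffle that takes bytes 1-2 of
// %x, zeroes the rest of the low half, and leaves the upper 64 bits undef.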

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}
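
// For example (sketch): a control byte with bit 7 set selects the zero
// operand, so a <16 x i8> mask of <0x00, 0x80, 0x02, ...> yields shuffle
// indices <0, 16, 2, ...> with the zero vector as the second operand.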

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select a per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
}

Optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
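      // Worked example (sketch): bextr(0x12345678, (16 << 8) | 8) shifts
      // right by 8 and masks to 16 bits: (0x12345678 >> 8) & 0xFFFF = 0x3456.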
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
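      // Worked example (sketch): bzhi(0xFF00FF00, 12) clears bits 12 and
      // above, leaving 0x00000F00.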
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      if (MaskC->getValue().isShiftedMask()) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
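        // e.g. (sketch) pext(x, 0x00FF0000) == (x & 0x00FF0000) >> 16.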
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *Shifted = IC.Builder.CreateLShr(Masked,
                                               ConstantInt::get(II.getType(),
                                                                ShiftAmount));
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (MaskC->getValue().isShiftedMask()) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
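        // e.g. (sketch) pdep(x, 0x00FF0000) == (x << 16) & 0x00FF0000.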
        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
        Value *Input = II.getArgOperand(0);
        Value *Shifted = IC.Builder.CreateShl(Input,
                                              ConstantInt::get(II.getType(),
                                                               ShiftAmount));
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the element as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();
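      // Imm bit 0 selects the high or low qword of Arg0 and bit 4 selects
      // the qword of Arg1; only the selected halves are demanded below.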

      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If either input's demanded elements are undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }
1458 
1459  case Intrinsic::x86_sse41_insertps:
1460  if (Value *V = simplifyX86insertps(II, IC.Builder)) {
1461  return IC.replaceInstUsesWith(II, V);
1462  }
1463  break;
1464 
1465  case Intrinsic::x86_sse4a_extrq: {
1466  Value *Op0 = II.getArgOperand(0);
1467  Value *Op1 = II.getArgOperand(1);
1468  unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1469  unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1470  assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1471  Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1472  VWidth1 == 16 && "Unexpected operand sizes");
1473 
1474  // See if we're dealing with constant values.
1475  Constant *C1 = dyn_cast<Constant>(Op1);
1476  ConstantInt *CILength =
1477  C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1478  : nullptr;
1479  ConstantInt *CIIndex =
1480  C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1481  : nullptr;
1482 
1483  // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
1484  if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1485  return IC.replaceInstUsesWith(II, V);
1486  }
1487 
1488  // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
1489  // operands and the lowest 16-bits of the second.
1490  bool MadeChange = false;
1491  if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1492  IC.replaceOperand(II, 0, V);
1493  MadeChange = true;
1494  }
1495  if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
1496  IC.replaceOperand(II, 1, V);
1497  MadeChange = true;
1498  }
1499  if (MadeChange) {
1500  return &II;
1501  }
1502  break;
1503  }
1504 
1505  case Intrinsic::x86_sse4a_extrqi: {
1506  // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
1507  // bits of the lower 64-bits. The upper 64-bits are undefined.
1508  Value *Op0 = II.getArgOperand(0);
1509  unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1510  assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1511  "Unexpected operand size");
1512 
1513  // See if we're dealing with constant values.
1514  ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
1515  ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
1516 
1517  // Attempt to simplify to a constant or shuffle vector.
1518  if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
1519  return IC.replaceInstUsesWith(II, V);
1520  }
1521 
1522  // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
1523  // operand.
1524  if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1525  return IC.replaceOperand(II, 0, V);
1526  }
1527  break;
1528  }
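
A minimal standalone model of the EXTRQI bit arithmetic in plain C++ (the function name is illustrative; per the AMD SSE4A definition a length field of zero means 64 bits, and index/length combinations that run past bit 63 are undefined and not modeled here):

  #include <cassert>
  #include <cstdint>

  static uint64_t extrqi(uint64_t Src, unsigned Len, unsigned Idx) {
    Len &= 0x3F;                        // both fields are 6 bits wide
    Idx &= 0x3F;
    unsigned EffLen = Len ? Len : 64;   // length 0 encodes a full 64-bit copy
    uint64_t Mask = (EffLen == 64) ? ~0ULL : ((1ULL << EffLen) - 1);
    return (Src >> Idx) & Mask;         // remaining low bits are zero-padded
  }

  int main() {
    assert(extrqi(0xAABBCCDD11223344ULL, 8, 8) == 0x33);             // byte 1
    assert(extrqi(0x0123456789ABCDEFULL, 0, 0) == 0x0123456789ABCDEFULL);
  }
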
1529 
1530  case Intrinsic::x86_sse4a_insertq: {
1531  Value *Op0 = II.getArgOperand(0);
1532  Value *Op1 = II.getArgOperand(1);
1533  unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
1534  assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1535  Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
1536  cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
1537  "Unexpected operand size");
1538 
1539  // See if we're dealing with constant values.
1540  Constant *C1 = dyn_cast<Constant>(Op1);
1541  ConstantInt *CI11 =
1542  C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
1543  : nullptr;
1544 
1545  // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
1546  if (CI11) {
1547  const APInt &V11 = CI11->getValue();
1548  APInt Len = V11.zextOrTrunc(6);
1549  APInt Idx = V11.lshr(8).zextOrTrunc(6);
1550  if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1551  return IC.replaceInstUsesWith(II, V);
1552  }
1553  }
1554 
1555  // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
1556  // operand.
1557  if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
1558  return IC.replaceOperand(II, 0, V);
1559  }
1560  break;
1561  }
1562 
1563  case Intrinsic::x86_sse4a_insertqi: {
1564  // INSERTQI: Extract lowest Length bits from lower half of second source and
1565  // insert over first source starting at Index bit. The upper 64-bits are
1566  // undefined.
1567  Value *Op0 = II.getArgOperand(0);
1568  Value *Op1 = II.getArgOperand(1);
1569  unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
1570  unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
1571  assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
1572  Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
1573  VWidth1 == 2 && "Unexpected operand sizes");
1574 
1575  // See if we're dealing with constant values.
1576  ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
1577  ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
1578 
1579  // Attempt to simplify to a constant or shuffle vector.
1580  if (CILength && CIIndex) {
1581  APInt Len = CILength->getValue().zextOrTrunc(6);
1582  APInt Idx = CIIndex->getValue().zextOrTrunc(6);
1583  if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
1584  return IC.replaceInstUsesWith(II, V);
1585  }
1586  }
1587 
1588  // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
1589  // operands.
1590  bool MadeChange = false;
1591  if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
1592  IC.replaceOperand(II, 0, V);
1593  MadeChange = true;
1594  }
1595  if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
1596  IC.replaceOperand(II, 1, V);
1597  MadeChange = true;
1598  }
1599  if (MadeChange) {
1600  return &II;
1601  }
1602  break;
1603  }
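
The INSERTQ/INSERTQI bit arithmetic can likewise be modeled standalone in plain C++ (function name illustrative): the low Len bits of the source are written over the destination starting at bit Idx, matching the decoding above where Len sits in bits [5:0] and, for the register form, Idx in bits [13:8] of the control value:

  #include <cassert>
  #include <cstdint>

  static uint64_t insertqi(uint64_t Dst, uint64_t Src, unsigned Len,
                           unsigned Idx) {
    Len &= 0x3F;                        // 6-bit fields, as with EXTRQI
    Idx &= 0x3F;
    unsigned EffLen = Len ? Len : 64;   // length 0 encodes 64 bits
    uint64_t Mask = (EffLen == 64) ? ~0ULL : ((1ULL << EffLen) - 1);
    return (Dst & ~(Mask << Idx)) | ((Src & Mask) << Idx);
  }

  int main() {
    // Insert the low byte of the source over byte 1 of the destination.
    assert(insertqi(0x1111111111111111ULL, 0xAB, 8, 8) ==
           0x111111111111AB11ULL);
  }
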
1604 
1605  case Intrinsic::x86_sse41_pblendvb:
1606  case Intrinsic::x86_sse41_blendvps:
1607  case Intrinsic::x86_sse41_blendvpd:
1608  case Intrinsic::x86_avx_blendv_ps_256:
1609  case Intrinsic::x86_avx_blendv_pd_256:
1610  case Intrinsic::x86_avx2_pblendvb: {
1611  // fold (blend A, A, Mask) -> A
1612  Value *Op0 = II.getArgOperand(0);
1613  Value *Op1 = II.getArgOperand(1);
1614  Value *Mask = II.getArgOperand(2);
1615  if (Op0 == Op1) {
1616  return IC.replaceInstUsesWith(II, Op0);
1617  }
1618 
1619  // Zero Mask - select 1st argument.
1620  if (isa<ConstantAggregateZero>(Mask)) {
1621  return IC.replaceInstUsesWith(II, Op0);
1622  }
1623 
1624  // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
1625  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
1626  Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
1627  return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
1628  }
1629 
1630  // Convert to a vector select if we can bypass casts and find a boolean
1631  // vector condition value.
1632  Value *BoolVec;
1633  Mask = InstCombiner::peekThroughBitcast(Mask);
1634  if (match(Mask, m_SExt(m_Value(BoolVec))) &&
1635  BoolVec->getType()->isVectorTy() &&
1636  BoolVec->getType()->getScalarSizeInBits() == 1) {
1637  assert(Mask->getType()->getPrimitiveSizeInBits() ==
1638  II.getType()->getPrimitiveSizeInBits() &&
1639  "Not expecting mask and operands with different sizes");
1640 
1641  unsigned NumMaskElts =
1642  cast<FixedVectorType>(Mask->getType())->getNumElements();
1643  unsigned NumOperandElts =
1644  cast<FixedVectorType>(II.getType())->getNumElements();
1645  if (NumMaskElts == NumOperandElts) {
1646  return SelectInst::Create(BoolVec, Op1, Op0);
1647  }
1648 
1649  // If the mask has fewer elements than the operands, each mask bit maps to
1650  // multiple elements of the operands. Bitcast back and forth.
1651  if (NumMaskElts < NumOperandElts) {
1652  Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
1653  Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
1654  Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
1655  return new BitCastInst(Sel, II.getType());
1656  }
1657  }
1658 
1659  break;
1660  }
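
The lane rule these blend folds rely on can be shown with a small standalone C++ model (the blendv helper is illustrative): each result lane takes operand 1 when the mask lane's sign bit is set, which is exactly a vector select on "mask < 0" — hence the conversions to SelectInst above:

  #include <array>
  #include <cassert>
  #include <cstddef>
  #include <cstdint>

  template <std::size_t N>
  static std::array<int32_t, N> blendv(const std::array<int32_t, N> &Op0,
                                       const std::array<int32_t, N> &Op1,
                                       const std::array<int32_t, N> &Mask) {
    std::array<int32_t, N> R{};
    for (std::size_t I = 0; I != N; ++I)
      R[I] = (Mask[I] < 0) ? Op1[I] : Op0[I];  // sign bit set -> take Op1
    return R;
  }

  int main() {
    std::array<int32_t, 4> A{1, 2, 3, 4}, B{5, 6, 7, 8}, M{-1, 0, -2, 42};
    assert((blendv(A, B, M) == std::array<int32_t, 4>{5, 2, 7, 4}));
  }
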
1661 
1662  case Intrinsic::x86_ssse3_pshuf_b_128:
1663  case Intrinsic::x86_avx2_pshuf_b:
1664  case Intrinsic::x86_avx512_pshuf_b_512:
1665  if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
1666  return IC.replaceInstUsesWith(II, V);
1667  }
1668  break;
1669 
1670  case Intrinsic::x86_avx_vpermilvar_ps:
1671  case Intrinsic::x86_avx_vpermilvar_ps_256:
1672  case Intrinsic::x86_avx512_vpermilvar_ps_512:
1673  case Intrinsic::x86_avx_vpermilvar_pd:
1674  case Intrinsic::x86_avx_vpermilvar_pd_256:
1675  case Intrinsic::x86_avx512_vpermilvar_pd_512:
1676  if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
1677  return IC.replaceInstUsesWith(II, V);
1678  }
1679  break;
1680 
1681  case Intrinsic::x86_avx2_permd:
1682  case Intrinsic::x86_avx2_permps:
1683  case Intrinsic::x86_avx512_permvar_df_256:
1684  case Intrinsic::x86_avx512_permvar_df_512:
1685  case Intrinsic::x86_avx512_permvar_di_256:
1686  case Intrinsic::x86_avx512_permvar_di_512:
1687  case Intrinsic::x86_avx512_permvar_hi_128:
1688  case Intrinsic::x86_avx512_permvar_hi_256:
1689  case Intrinsic::x86_avx512_permvar_hi_512:
1690  case Intrinsic::x86_avx512_permvar_qi_128:
1691  case Intrinsic::x86_avx512_permvar_qi_256:
1692  case Intrinsic::x86_avx512_permvar_qi_512:
1693  case Intrinsic::x86_avx512_permvar_sf_512:
1694  case Intrinsic::x86_avx512_permvar_si_512:
1695  if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
1696  return IC.replaceInstUsesWith(II, V);
1697  }
1698  break;
1699 
1700  case Intrinsic::x86_avx_maskload_ps:
1701  case Intrinsic::x86_avx_maskload_pd:
1702  case Intrinsic::x86_avx_maskload_ps_256:
1703  case Intrinsic::x86_avx_maskload_pd_256:
1704  case Intrinsic::x86_avx2_maskload_d:
1705  case Intrinsic::x86_avx2_maskload_q:
1706  case Intrinsic::x86_avx2_maskload_d_256:
1707  case Intrinsic::x86_avx2_maskload_q_256:
1708  if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
1709  return I;
1710  }
1711  break;
1712 
1713  case Intrinsic::x86_sse2_maskmov_dqu:
1714  case Intrinsic::x86_avx_maskstore_ps:
1715  case Intrinsic::x86_avx_maskstore_pd:
1716  case Intrinsic::x86_avx_maskstore_ps_256:
1717  case Intrinsic::x86_avx_maskstore_pd_256:
1718  case Intrinsic::x86_avx2_maskstore_d:
1719  case Intrinsic::x86_avx2_maskstore_q:
1720  case Intrinsic::x86_avx2_maskstore_d_256:
1721  case Intrinsic::x86_avx2_maskstore_q_256:
1722  if (simplifyX86MaskedStore(II, IC)) {
1723  return nullptr;
1724  }
1725  break;
1726 
1727  case Intrinsic::x86_addcarry_32:
1728  case Intrinsic::x86_addcarry_64:
1729  if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
1730  return IC.replaceInstUsesWith(II, V);
1731  }
1732  break;
1733 
1734  default:
1735  break;
1736  }
1737  return None;
1738 }
1739 
1740 Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
1741  InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
1742  bool &KnownBitsComputed) const {
1743  switch (II.getIntrinsicID()) {
1744  default:
1745  break;
1746  case Intrinsic::x86_mmx_pmovmskb:
1747  case Intrinsic::x86_sse_movmsk_ps:
1748  case Intrinsic::x86_sse2_movmsk_pd:
1749  case Intrinsic::x86_sse2_pmovmskb_128:
1750  case Intrinsic::x86_avx_movmsk_ps_256:
1751  case Intrinsic::x86_avx_movmsk_pd_256:
1752  case Intrinsic::x86_avx2_pmovmskb: {
1753  // MOVMSK copies the vector elements' sign bits to the low bits
1754  // and zeros the high bits.
1755  unsigned ArgWidth;
1756  if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
1757  ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
1758  } else {
1759  auto Arg = II.getArgOperand(0);
1760  auto ArgType = cast<FixedVectorType>(Arg->getType());
1761  ArgWidth = ArgType->getNumElements();
1762  }
1763 
1764  // If we don't need any of the low bits then return zero;
1765  // we know that DemandedMask is non-zero already.
1766  APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
1767  Type *VTy = II.getType();
1768  if (DemandedElts.isNullValue()) {
1769  return ConstantInt::getNullValue(VTy);
1770  }
1771 
1772  // We know that the upper bits are set to zero.
1773  Known.Zero.setBitsFrom(ArgWidth);
1774  KnownBitsComputed = true;
1775  break;
1776  }
1777  }
1778  return None;
1779 }
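
A standalone C++ model of MOVMSK (illustrative, shown for the 4 x float form) makes the known-bits claim concrete: one sign bit per element lands in the low bits and everything above ArgWidth is zero:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static uint32_t movmskps(const float (&V)[4]) {
    uint32_t R = 0;
    for (unsigned I = 0; I != 4; ++I) {
      uint32_t Bits;
      std::memcpy(&Bits, &V[I], sizeof(Bits));  // bit pattern of the lane
      R |= (Bits >> 31) << I;                   // copy its sign bit to bit I
    }
    return R;  // bits [31:4] are always zero, cf. Known.Zero.setBitsFrom(4)
  }

  int main() {
    const float V[4] = {-1.0f, 2.0f, -0.0f, 4.0f};  // -0.0f has the sign bit set
    assert(movmskps(V) == 0x5);
  }
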
1780 
1781 Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1782  InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1783  APInt &UndefElts2, APInt &UndefElts3,
1784  std::function<void(Instruction *, unsigned, APInt, APInt &)>
1785  simplifyAndSetOp) const {
1786  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
1787  switch (II.getIntrinsicID()) {
1788  default:
1789  break;
1790  case Intrinsic::x86_xop_vfrcz_ss:
1791  case Intrinsic::x86_xop_vfrcz_sd:
1792  // The instructions for these intrinsics are spec'd to zero the upper bits,
1793  // not pass them through like other scalar intrinsics. So we shouldn't just
1794  // use Arg0 if DemandedElts[0] is clear, as we do for other intrinsics.
1795  // Instead we should return a zero vector.
1796  if (!DemandedElts[0]) {
1797  IC.addToWorklist(&II);
1798  return ConstantAggregateZero::get(II.getType());
1799  }
1800 
1801  // Only the lower element is used.
1802  DemandedElts = 1;
1803  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1804 
1805  // Only the lower element is undefined. The high elements are zero.
1806  UndefElts = UndefElts[0];
1807  break;
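
A standalone sketch of that zero-upper behavior in plain C++ (illustrative; Frcz stands in for the actual fraction operation, which is not modeled):

  #include <array>
  #include <cassert>

  // Unlike e.g. rcp_ss, which passes lanes 1..3 through from its input,
  // vfrcz_ss zeroes them — so a dead low lane folds to a zero vector,
  // never to the first operand.
  static std::array<float, 4> vfrczModel(const std::array<float, 4> &X,
                                         float (*Frcz)(float)) {
    return {Frcz(X[0]), 0.0f, 0.0f, 0.0f};
  }

  int main() {
    auto R = vfrczModel({1.5f, 9.0f, 9.0f, 9.0f},
                        [](float V) { return V - static_cast<int>(V); });
    assert(R[0] == 0.5f && R[1] == 0.0f && R[2] == 0.0f && R[3] == 0.0f);
  }
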
1808 
1809  // Unary scalar-as-vector operations that work column-wise.
1810  case Intrinsic::x86_sse_rcp_ss:
1811  case Intrinsic::x86_sse_rsqrt_ss:
1812  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1813 
1814  // If lowest element of a scalar op isn't used then use Arg0.
1815  if (!DemandedElts[0]) {
1816  IC.addToWorklist(&II);
1817  return II.getArgOperand(0);
1818  }
1819  // TODO: If only the low element is used, lower SQRT to FSQRT (with
1820  // rounding/exception checks).
1821  break;
1822 
1823  // Binary scalar-as-vector operations that work column-wise. The high
1824  // elements come from operand 0. The low element is a function of both
1825  // operands.
1826  case Intrinsic::x86_sse_min_ss:
1827  case Intrinsic::x86_sse_max_ss:
1828  case Intrinsic::x86_sse_cmp_ss:
1829  case Intrinsic::x86_sse2_min_sd:
1830  case Intrinsic::x86_sse2_max_sd:
1831  case Intrinsic::x86_sse2_cmp_sd: {
1832  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1833 
1834  // If lowest element of a scalar op isn't used then use Arg0.
1835  if (!DemandedElts[0]) {
1836  IC.addToWorklist(&II);
1837  return II.getArgOperand(0);
1838  }
1839 
1840  // Only lower element is used for operand 1.
1841  DemandedElts = 1;
1842  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1843 
1844  // Lower element is undefined if both lower elements are undefined.
1845  // Consider things like undef&0. The result is known zero, not undef.
1846  if (!UndefElts2[0])
1847  UndefElts.clearBit(0);
1848 
1849  break;
1850  }
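
The column-wise rule can be modeled standalone in plain C++ (illustrative, using min as the combining function): lanes 1..3 pass through from operand 0, so only operand 1's low element is demanded, and a dead low lane lets the whole call fold to Op0:

  #include <algorithm>
  #include <array>
  #include <cassert>

  static std::array<float, 4> minSS(const std::array<float, 4> &A,
                                    const std::array<float, 4> &B) {
    return {std::min(A[0], B[0]), A[1], A[2], A[3]};  // lanes 1..3 from A only
  }

  int main() {
    auto R = minSS({3, 10, 20, 30}, {1, 99, 99, 99});
    assert(R[0] == 1 && R[1] == 10 && R[2] == 20 && R[3] == 30);
  }
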
1851 
1852  // Binary scalar-as-vector operations that work column-wise. The high
1853  // elements come from operand 0 and the low element comes from operand 1.
1854  case Intrinsic::x86_sse41_round_ss:
1855  case Intrinsic::x86_sse41_round_sd: {
1856  // Don't use the low element of operand 0.
1857  APInt DemandedElts2 = DemandedElts;
1858  DemandedElts2.clearBit(0);
1859  simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
1860 
1861  // If lowest element of a scalar op isn't used then use Arg0.
1862  if (!DemandedElts[0]) {
1863  IC.addToWorklist(&II);
1864  return II.getArgOperand(0);
1865  }
1866 
1867  // Only lower element is used for operand 1.
1868  DemandedElts = 1;
1869  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1870 
1871  // Take the high undef elements from operand 0 and take the lower element
1872  // from operand 1.
1873  UndefElts.clearBit(0);
1874  UndefElts |= UndefElts2[0];
1875  break;
1876  }
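
Modeled standalone in plain C++ (illustrative; the rounding-control immediate is ignored and the default rounding mode is used), the low result lane depends only on operand 1, which is why bit 0 is cleared from operand 0's demanded mask above:

  #include <array>
  #include <cassert>
  #include <cmath>

  static std::array<float, 4> roundSS(const std::array<float, 4> &A,
                                      const std::array<float, 4> &B) {
    return {std::nearbyint(B[0]), A[1], A[2], A[3]};  // A[0] never reaches R
  }

  int main() {
    auto R = roundSS({7.7f, 1, 2, 3}, {2.25f, 9, 9, 9});
    assert(R[0] == 2.0f && R[1] == 1 && R[2] == 2 && R[3] == 3);
  }
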
1877 
1878  // Three input scalar-as-vector operations that work column-wise. The high
1879  // elements come from operand 0 and the low element is a function of all
1880  // three inputs.
1881  case Intrinsic::x86_avx512_mask_add_ss_round:
1882  case Intrinsic::x86_avx512_mask_div_ss_round:
1883  case Intrinsic::x86_avx512_mask_mul_ss_round:
1884  case Intrinsic::x86_avx512_mask_sub_ss_round:
1885  case Intrinsic::x86_avx512_mask_max_ss_round:
1886  case Intrinsic::x86_avx512_mask_min_ss_round:
1887  case Intrinsic::x86_avx512_mask_add_sd_round:
1888  case Intrinsic::x86_avx512_mask_div_sd_round:
1889  case Intrinsic::x86_avx512_mask_mul_sd_round:
1890  case Intrinsic::x86_avx512_mask_sub_sd_round:
1891  case Intrinsic::x86_avx512_mask_max_sd_round:
1892  case Intrinsic::x86_avx512_mask_min_sd_round:
1893  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1894 
1895  // If lowest element of a scalar op isn't used then use Arg0.
1896  if (!DemandedElts[0]) {
1897  IC.addToWorklist(&II);
1898  return II.getArgOperand(0);
1899  }
1900 
1901  // Only the lower element is used for operands 1 and 2.
1902  DemandedElts = 1;
1903  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1904  simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
1905 
1906  // Lower element is undefined if all three lower elements are undefined.
1907  // Consider things like undef&0. The result is known zero, not undef.
1908  if (!UndefElts2[0] || !UndefElts3[0])
1909  UndefElts.clearBit(0);
1910  break;
1911 
1912  // TODO: Add fmaddsub support?
1913  case Intrinsic::x86_sse3_addsub_pd:
1914  case Intrinsic::x86_sse3_addsub_ps:
1915  case Intrinsic::x86_avx_addsub_pd_256:
1916  case Intrinsic::x86_avx_addsub_ps_256: {
1917  // If none of the even or none of the odd lanes are required, turn this
1918  // into a generic FP math instruction.
1919  APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
1920  APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
1921  bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
1922  bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
1923  if (IsSubOnly || IsAddOnly) {
1924  assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
1925  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1926  IC.Builder.SetInsertPoint(&II);
1927  Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
1928  return IC.Builder.CreateBinOp(
1929  IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
1930  }
1931 
1932  simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1933  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
1934  UndefElts &= UndefElts2;
1935  break;
1936  }
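
A standalone C++ model of ADDSUBPS (illustrative) shows why demanding only even lanes degenerates to a single FSub and only odd lanes to a single FAdd, as emitted above:

  #include <array>
  #include <cassert>

  static std::array<float, 4> addsubPS(const std::array<float, 4> &A,
                                       const std::array<float, 4> &B) {
    // Even lanes subtract, odd lanes add.
    return {A[0] - B[0], A[1] + B[1], A[2] - B[2], A[3] + B[3]};
  }

  int main() {
    auto R = addsubPS({10, 10, 10, 10}, {1, 2, 3, 4});
    assert(R[0] == 9 && R[1] == 12 && R[2] == 7 && R[3] == 14);
  }
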
1937 
1938  case Intrinsic::x86_sse2_packssdw_128:
1939  case Intrinsic::x86_sse2_packsswb_128:
1940  case Intrinsic::x86_sse2_packuswb_128:
1941  case Intrinsic::x86_sse41_packusdw:
1942  case Intrinsic::x86_avx2_packssdw:
1943  case Intrinsic::x86_avx2_packsswb:
1944  case Intrinsic::x86_avx2_packusdw:
1945  case Intrinsic::x86_avx2_packuswb:
1946  case Intrinsic::x86_avx512_packssdw_512:
1947  case Intrinsic::x86_avx512_packsswb_512:
1948  case Intrinsic::x86_avx512_packusdw_512:
1949  case Intrinsic::x86_avx512_packuswb_512: {
1950  auto *Ty0 = II.getArgOperand(0)->getType();
1951  unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
1952  assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
1953 
1954  unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
1955  unsigned VWidthPerLane = VWidth / NumLanes;
1956  unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
1957 
1958  // Per lane, pack the elements of the first input and then the second.
1959  // e.g.
1960  // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
1961  // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
1962  for (int OpNum = 0; OpNum != 2; ++OpNum) {
1963  APInt OpDemandedElts(InnerVWidth, 0);
1964  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1965  unsigned LaneIdx = Lane * VWidthPerLane;
1966  for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
1967  unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
1968  if (DemandedElts[Idx])
1969  OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
1970  }
1971  }
1972 
1973  // Demand elements from the operand.
1974  APInt OpUndefElts(InnerVWidth, 0);
1975  simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
1976 
1977  // Pack the operand's UNDEF elements, one lane at a time.
1978  OpUndefElts = OpUndefElts.zext(VWidth);
1979  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
1980  APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
1981  LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
1982  LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
1983  UndefElts |= LaneElts;
1984  }
1985  }
1986  break;
1987  }
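
The index mapping used by the loop above can be checked with a standalone C++ sketch (illustrative): within a 128-bit lane of width 2*InnerVWidthPerLane, result element Idx comes from operand InLane / InnerVWidthPerLane at source element Lane * InnerVWidthPerLane + (InLane % InnerVWidthPerLane):

  #include <cassert>

  // Maps a PACK result element index back to (operand, source element).
  static void packSource(unsigned Idx, unsigned InnerVWidthPerLane,
                         unsigned &OpNum, unsigned &SrcElt) {
    unsigned VWidthPerLane = 2 * InnerVWidthPerLane;
    unsigned Lane = Idx / VWidthPerLane;
    unsigned InLane = Idx % VWidthPerLane;
    OpNum = InLane / InnerVWidthPerLane;  // first or second operand
    SrcElt = Lane * InnerVWidthPerLane + InLane % InnerVWidthPerLane;
  }

  int main() {
    unsigned Op, Elt;
    packSource(12, 8, Op, Elt);    // v32i8 PACK(v16i16 X, v16i16 Y): elt 12
    assert(Op == 1 && Elt == 4);   // ... is Y[4] (lane 0)
    packSource(20, 8, Op, Elt);    // elt 20
    assert(Op == 0 && Elt == 12);  // ... is X[12] (lane 1)
  }
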
1988 
1989  // PSHUFB
1990  case Intrinsic::x86_ssse3_pshuf_b_128:
1991  case Intrinsic::x86_avx2_pshuf_b:
1992  case Intrinsic::x86_avx512_pshuf_b_512:
1993  // PERMILVAR
1994  case Intrinsic::x86_avx_vpermilvar_ps:
1995  case Intrinsic::x86_avx_vpermilvar_ps_256:
1996  case Intrinsic::x86_avx512_vpermilvar_ps_512:
1997  case Intrinsic::x86_avx_vpermilvar_pd:
1998  case Intrinsic::x86_avx_vpermilvar_pd_256:
1999  case Intrinsic::x86_avx512_vpermilvar_pd_512:
2000  // PERMV
2001  case Intrinsic::x86_avx2_permd:
2002  case Intrinsic::x86_avx2_permps: {
2003  simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
2004  break;
2005  }
2006 
2007  // SSE4A instructions leave the upper 64-bits of the 128-bit result
2008  // in an undefined state.
2009  case Intrinsic::x86_sse4a_extrq:
2010  case Intrinsic::x86_sse4a_extrqi:
2011  case Intrinsic::x86_sse4a_insertq:
2012  case Intrinsic::x86_sse4a_insertqi:
2013  UndefElts.setHighBits(VWidth / 2);
2014  break;
2015  }
2016  return None;
2017 }
llvm::X86TTIImpl::simplifyDemandedVectorEltsIntrinsic
Optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
Definition: X86InstCombineIntrinsic.cpp:1781
llvm::IRBuilderBase::SetInsertPoint
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:184
llvm
Definition: AllocatorList.h:23
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:66
llvm::MaskedValueIsZero
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return true if 'V & Mask' is known to be zero.
Definition: ValueTracking.cpp:360
InstCombiner.h
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1329
IntrinsicInst.h
llvm::KnownBits::getMinValue
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition: KnownBits.h:118
llvm::Function
Definition: Function.h:61
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
simplifyX86insertq
static Value * simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, APInt APLength, APInt APIndex, InstCombiner::BuilderTy &Builder)
Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant folding or conversion to a shu...
Definition: X86InstCombineIntrinsic.cpp:711
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:687
simplifyX86vpermilvar
static Value * simplifyX86vpermilvar(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermilvar* to shufflevector if the mask is constant.
Definition: X86InstCombineIntrinsic.cpp:849
llvm::BitCastInst
This class represents a no-op cast from one type to another.
Definition: Instructions.h:5166
llvm::KnownBits::Zero
APInt Zero
Definition: KnownBits.h:24
llvm::ConstantInt::getValue
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:131
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1167
simplifyX86pack
static Value * simplifyX86pack(IntrinsicInst &II, InstCombiner::BuilderTy &Builder, bool IsSigned)
Definition: X86InstCombineIntrinsic.cpp:438
llvm::InstCombiner::Builder
BuilderTy & Builder
Definition: InstCombiner.h:56
llvm::ConstantExpr::getICmp
static Constant * getICmp(unsigned short pred, Constant *LHS, Constant *RHS, bool OnlyIfReduced=false)
get* - Return some common constants without having to specify the full Instruction::OPCODE identifier...
Definition: Constants.cpp:2489
llvm::IRBuilder< TargetFolder, IRBuilderCallbackInserter >
llvm::APInt::zextOrTrunc
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:952
llvm::ConstantExpr::getBitCast
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2220
llvm::IRBuilderBase::CreateMaskedLoad
CallInst * CreateMaskedLoad(Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:489
llvm::APInt::getBitsSet
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:612
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:749
llvm::APInt::getSignedMaxValue
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:540
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::ConstantAggregateZero
All zero aggregate value.
Definition: Constants.h:334
llvm::IRBuilderBase::CreateFSub
Value * CreateFSub(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1438
llvm::Optional
Definition: APInt.h:33
llvm::KnownBits::isZero
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:72
llvm::InstCombiner::addToWorklist
void addToWorklist(Instruction *I)
Definition: InstCombiner.h:365
llvm::APInt::lshr
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:987
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:195
KnownBits.h
llvm::IRBuilderBase::CreateBinOp
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1538
llvm::APInt::setHighBits
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1510
llvm::APInt::uge
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1313
simplifyX86pshufb
static Value * simplifyX86pshufb(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert pshufb* to shufflevector if the mask is constant.
Definition: X86InstCombineIntrinsic.cpp:804
llvm::IRBuilderBase::CreateFMul
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1463
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::InstCombiner::replaceOperand
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:437
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1482
llvm::SelectInst::Create
static SelectInst * Create(Value *C, Value *S1, Value *S2, const Twine &NameStr="", Instruction *InsertBefore=nullptr, Instruction *MDFrom=nullptr)
Definition: Instructions.h:1774
simplifyX86varShift
static Value * simplifyX86varShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:301
llvm::APInt::setBit
void setBit(unsigned BitPosition)
Set a given bit to 1.
Definition: APInt.h:1442
llvm::APInt::lshrInPlace
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:994
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
getBoolVecFromMask
static Value * getBoolVecFromMask(Value *Mask)
Convert the x86 XMM integer vector mask to a vector of bools based on each element's most significant...
Definition: X86InstCombineIntrinsic.cpp:38
llvm::VectorType::getInteger
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:442
llvm::X86TTIImpl::instCombineIntrinsic
Optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
Definition: X86InstCombineIntrinsic.cpp:930
simplifyX86extrq
static Value * simplifyX86extrq(IntrinsicInst &II, Value *Op0, ConstantInt *CILength, ConstantInt *CIIndex, InstCombiner::BuilderTy &Builder)
Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding or conversion to a shuffle...
Definition: X86InstCombineIntrinsic.cpp:620
simplifyX86movmsk
static Value * simplifyX86movmsk(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:504
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
llvm::Log2_32
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:596
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:147
llvm::InstCombiner::eraseInstFromFunction
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1631
llvm::InstCombiner::SimplifyDemandedVectorElts
virtual Value * SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, unsigned Depth=0, bool AllowMultipleUsers=false)=0
llvm::IRBuilderBase::CreateInsertElement
Value * CreateInsertElement(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2412
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition: APInt.h:655
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1783
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:898
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:644
llvm::None
const NoneType None
Definition: None.h:23
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:96
llvm::IRBuilderBase::CreateAnd
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1348
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:202
llvm::APInt::isSubsetOf
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1349
llvm::APInt::getOneBitSet
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:593
getNegativeIsTrueBoolVec
static Constant * getNegativeIsTrueBoolVec(Constant *V)
Return a constant boolean vector that has true elements in all positions where the input constant dat...
Definition: X86InstCombineIntrinsic.cpp:28
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::IRBuilderBase::CreateFAdd
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1413
llvm::IRBuilderBase::CreateFDiv
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1488
simplifyX86MaskedStore
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC)
Definition: X86InstCombineIntrinsic.cpp:86
llvm::IRBuilderBase::CreateBitCast
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2074
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::IRBuilderBase::CreateSelect
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:952
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:634
llvm::KnownBits::getMaxValue
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:134
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:212
llvm::IRBuilderBase::CreateExtractElement
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2399
llvm::X86TTIImpl::simplifyDemandedUseBitsIntrinsic
Optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
Definition: X86InstCombineIntrinsic.cpp:1740
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:70
llvm::PatternMatch::m_SExt
CastClass_match< OpTy, Instruction::SExt > m_SExt(const OpTy &Op)
Matches SExt.
Definition: PatternMatch.h:1633
llvm::Type::getStructElementType
Type * getStructElementType(unsigned N) const
Definition: DerivedTypes.h:354
X86TargetTransformInfo.h
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Constant::getAggregateElement
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:420
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::ConstantInt::isZero
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:192
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:965
llvm::IRBuilderBase::InsertPointGuard
Definition: IRBuilder.h:367
llvm::ConstantVector::get
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1367
llvm::APInt::ult
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1205
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:281
llvm::APInt::clearBit
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1525
llvm::InstCombiner::replaceInstUsesWith
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:416
llvm::APInt::zext
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:934
llvm::APInt::getNullValue
static APInt getNullValue(unsigned numBits)
Get the '0' value.
Definition: APInt.h:574
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:198
simplifyX86insertps
static Value * simplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:561
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:346
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isIntOrIntVectorTy
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:208
llvm::APInt::getLoBits
APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition: APInt.cpp:571
llvm::APInt::getSplat
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:578
simplifyX86vpermv
static Value * simplifyX86vpermv(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
Definition: X86InstCombineIntrinsic.cpp:896
llvm::PatternMatch::m_ZeroInt
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:522
llvm::APInt::getSignedMinValue
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:550
llvm::InstCombiner
The core instruction combiner logic.
Definition: InstCombiner.h:45
llvm::APInt::sext
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:910
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::makeArrayRef
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition: ArrayRef.h:476
llvm::APInt::isNullValue
bool isNullValue() const
Determine if all bits are clear.
Definition: APInt.h:411
llvm::IRBuilderBase::getInt1Ty
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:503
llvm::Constant::getIntegerValue
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:387
simplifyX86addcarry
static Value * simplifyX86addcarry(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:534
llvm::IRBuilderBase::CreateLShr
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1308
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::IRBuilderBase::CreateMaskedStore
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:510
simplifyX86MaskedLoad
static Instruction * simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC)
Definition: X86InstCombineIntrinsic.cpp:56
llvm::IRBuilderBase::CreateShl
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1287
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:71
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:397
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition: APInt.h:667
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1478
llvm::APInt::setBitsFrom
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1500
llvm::InstCombiner::peekThroughBitcast
static Value * peekThroughBitcast(Value *V, bool OneUseOnly=false)
Return the source operand of a potentially bitcasted value while optionally checking if it has one us...
Definition: InstCombiner.h:99
llvm::APInt::shl
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:1009
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::ConstantAggregateZero::get
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1662
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
simplifyX86immShift
static Value * simplifyX86immShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Definition: X86InstCombineIntrinsic.cpp:119
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition: Type.cpp:122
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38