LLVM 23.0.0git
X86InstCombineIntrinsic.cpp
Go to the documentation of this file.
1//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15
18#include "llvm/IR/IntrinsicsX86.h"
21#include <optional>
22
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "x86tti"
27
28/// Return a constant boolean vector that has true elements in all positions
29/// where the input constant data vector has an element with the sign bit set.
32 V = ConstantExpr::getBitCast(V, IntTy);
34 Constant::getNullValue(IntTy), V, DL);
35 assert(V && "Vector must be foldable");
36 return V;
37}
38
39/// Convert the x86 XMM integer vector mask to a vector of bools based on
40/// each element's most significant bit (the sign bit).
41static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
42 // Fold Constant Mask.
43 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
44 return getNegativeIsTrueBoolVec(ConstantMask, DL);
45
46 // Mask was extended from a boolean vector.
47 Value *ExtMask;
48 if (match(Mask, m_SExt(m_Value(ExtMask))) &&
49 ExtMask->getType()->isIntOrIntVectorTy(1))
50 return ExtMask;
51
52 return nullptr;
53}
54
55// TODO: If the x86 backend knew how to convert a bool vector mask back to an
56// XMM register mask efficiently, we could transform all x86 masked intrinsics
57// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
59 Value *Ptr = II.getOperand(0);
60 Value *Mask = II.getOperand(1);
61 Constant *ZeroVec = Constant::getNullValue(II.getType());
62
63 // Zero Mask - masked load instruction creates a zero vector.
65 return IC.replaceInstUsesWith(II, ZeroVec);
66
67 // The mask is constant or extended from a bool vector. Convert this x86
68 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
69 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
70 // The pass-through vector for an x86 masked load is a zero vector.
71 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
72 II.getType(), Ptr, Align(1), BoolMask, ZeroVec);
73 return IC.replaceInstUsesWith(II, NewMaskedLoad);
74 }
75
76 return nullptr;
77}
78
79// TODO: If the x86 backend knew how to convert a bool vector mask back to an
80// XMM register mask efficiently, we could transform all x86 masked intrinsics
81// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
83 Value *Ptr = II.getOperand(0);
84 Value *Mask = II.getOperand(1);
85 Value *Vec = II.getOperand(2);
86
87 // Zero Mask - this masked store instruction does nothing.
90 return true;
91 }
92
93 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
94 // anything else at this level.
95 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
96 return false;
97
98 // The mask is constant or extended from a bool vector. Convert this x86
99 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
100 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
101 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
102 PointerType *VecPtrTy = PointerType::get(Vec->getContext(), AddrSpace);
103 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
104
105 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
106
107 // 'Replace uses' doesn't work for stores. Erase the original masked store.
109 return true;
110 }
111
112 return false;
113}
114
116 InstCombiner::BuilderTy &Builder) {
117 bool LogicalShift = false;
118 bool ShiftLeft = false;
119 bool IsImm = false;
120
121 switch (II.getIntrinsicID()) {
122 default:
123 llvm_unreachable("Unexpected intrinsic!");
124 case Intrinsic::x86_sse2_psrai_d:
125 case Intrinsic::x86_sse2_psrai_w:
126 case Intrinsic::x86_avx2_psrai_d:
127 case Intrinsic::x86_avx2_psrai_w:
128 case Intrinsic::x86_avx512_psrai_q_128:
129 case Intrinsic::x86_avx512_psrai_q_256:
130 case Intrinsic::x86_avx512_psrai_d_512:
131 case Intrinsic::x86_avx512_psrai_q_512:
132 case Intrinsic::x86_avx512_psrai_w_512:
133 IsImm = true;
134 [[fallthrough]];
135 case Intrinsic::x86_sse2_psra_d:
136 case Intrinsic::x86_sse2_psra_w:
137 case Intrinsic::x86_avx2_psra_d:
138 case Intrinsic::x86_avx2_psra_w:
139 case Intrinsic::x86_avx512_psra_q_128:
140 case Intrinsic::x86_avx512_psra_q_256:
141 case Intrinsic::x86_avx512_psra_d_512:
142 case Intrinsic::x86_avx512_psra_q_512:
143 case Intrinsic::x86_avx512_psra_w_512:
144 LogicalShift = false;
145 ShiftLeft = false;
146 break;
147 case Intrinsic::x86_sse2_psrli_d:
148 case Intrinsic::x86_sse2_psrli_q:
149 case Intrinsic::x86_sse2_psrli_w:
150 case Intrinsic::x86_avx2_psrli_d:
151 case Intrinsic::x86_avx2_psrli_q:
152 case Intrinsic::x86_avx2_psrli_w:
153 case Intrinsic::x86_avx512_psrli_d_512:
154 case Intrinsic::x86_avx512_psrli_q_512:
155 case Intrinsic::x86_avx512_psrli_w_512:
156 IsImm = true;
157 [[fallthrough]];
158 case Intrinsic::x86_sse2_psrl_d:
159 case Intrinsic::x86_sse2_psrl_q:
160 case Intrinsic::x86_sse2_psrl_w:
161 case Intrinsic::x86_avx2_psrl_d:
162 case Intrinsic::x86_avx2_psrl_q:
163 case Intrinsic::x86_avx2_psrl_w:
164 case Intrinsic::x86_avx512_psrl_d_512:
165 case Intrinsic::x86_avx512_psrl_q_512:
166 case Intrinsic::x86_avx512_psrl_w_512:
167 LogicalShift = true;
168 ShiftLeft = false;
169 break;
170 case Intrinsic::x86_sse2_pslli_d:
171 case Intrinsic::x86_sse2_pslli_q:
172 case Intrinsic::x86_sse2_pslli_w:
173 case Intrinsic::x86_avx2_pslli_d:
174 case Intrinsic::x86_avx2_pslli_q:
175 case Intrinsic::x86_avx2_pslli_w:
176 case Intrinsic::x86_avx512_pslli_d_512:
177 case Intrinsic::x86_avx512_pslli_q_512:
178 case Intrinsic::x86_avx512_pslli_w_512:
179 IsImm = true;
180 [[fallthrough]];
181 case Intrinsic::x86_sse2_psll_d:
182 case Intrinsic::x86_sse2_psll_q:
183 case Intrinsic::x86_sse2_psll_w:
184 case Intrinsic::x86_avx2_psll_d:
185 case Intrinsic::x86_avx2_psll_q:
186 case Intrinsic::x86_avx2_psll_w:
187 case Intrinsic::x86_avx512_psll_d_512:
188 case Intrinsic::x86_avx512_psll_q_512:
189 case Intrinsic::x86_avx512_psll_w_512:
190 LogicalShift = true;
191 ShiftLeft = true;
192 break;
193 }
194 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
195
196 Value *Vec = II.getArgOperand(0);
197 Value *Amt = II.getArgOperand(1);
198 auto *VT = cast<FixedVectorType>(Vec->getType());
199 Type *SVT = VT->getElementType();
200 Type *AmtVT = Amt->getType();
201 unsigned VWidth = VT->getNumElements();
202 unsigned BitWidth = SVT->getPrimitiveSizeInBits();
203
204 // If the shift amount is guaranteed to be in-range we can replace it with a
205 // generic shift. If its guaranteed to be out of range, logical shifts combine
206 // to zero and arithmetic shifts are clamped to (BitWidth - 1).
207 if (IsImm) {
208 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
209 KnownBits KnownAmtBits =
210 llvm::computeKnownBits(Amt, II.getDataLayout());
211 if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
212 Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
213 Amt = Builder.CreateVectorSplat(VWidth, Amt);
214 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
215 : Builder.CreateLShr(Vec, Amt))
216 : Builder.CreateAShr(Vec, Amt));
217 }
218 if (KnownAmtBits.getMinValue().uge(BitWidth)) {
219 if (LogicalShift)
221 Amt = ConstantInt::get(SVT, BitWidth - 1);
222 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
223 }
224 } else {
225 // Ensure the first element has an in-range value and the rest of the
226 // elements in the bottom 64 bits are zero.
227 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
228 cast<VectorType>(AmtVT)->getElementType() == SVT &&
229 "Unexpected shift-by-scalar type");
230 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
231 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
232 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
233 KnownBits KnownLowerBits = llvm::computeKnownBits(
234 Amt, DemandedLower, II.getDataLayout());
235 KnownBits KnownUpperBits = llvm::computeKnownBits(
236 Amt, DemandedUpper, II.getDataLayout());
237 if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
238 (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
239 SmallVector<int, 16> ZeroSplat(VWidth, 0);
240 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
241 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
242 : Builder.CreateLShr(Vec, Amt))
243 : Builder.CreateAShr(Vec, Amt));
244 }
245 }
246
247 // Simplify if count is constant vector.
248 auto *CDV = dyn_cast<ConstantDataVector>(Amt);
249 if (!CDV)
250 return nullptr;
251
252 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
253 // operand to compute the shift amount.
254 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
255 cast<VectorType>(AmtVT)->getElementType() == SVT &&
256 "Unexpected shift-by-scalar type");
257
258 // Concatenate the sub-elements to create the 64-bit value.
259 APInt Count(64, 0);
260 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
261 unsigned SubEltIdx = (NumSubElts - 1) - i;
262 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
263 Count <<= BitWidth;
264 Count |= SubElt->getValue().zextOrTrunc(64);
265 }
266
267 // If shift-by-zero then just return the original value.
268 if (Count.isZero())
269 return Vec;
270
271 // Handle cases when Shift >= BitWidth.
272 if (Count.uge(BitWidth)) {
273 // If LogicalShift - just return zero.
274 if (LogicalShift)
276
277 // If ArithmeticShift - clamp Shift to (BitWidth - 1).
278 Count = APInt(64, BitWidth - 1);
279 }
280
281 // Get a constant vector of the same type as the first operand.
282 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
283 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
284
285 if (ShiftLeft)
286 return Builder.CreateShl(Vec, ShiftVec);
287
288 if (LogicalShift)
289 return Builder.CreateLShr(Vec, ShiftVec);
290
291 return Builder.CreateAShr(Vec, ShiftVec);
292}
293
294// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
295// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
296// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
298 InstCombiner::BuilderTy &Builder) {
299 bool LogicalShift = false;
300 bool ShiftLeft = false;
301
302 switch (II.getIntrinsicID()) {
303 default:
304 llvm_unreachable("Unexpected intrinsic!");
305 case Intrinsic::x86_avx2_psrav_d:
306 case Intrinsic::x86_avx2_psrav_d_256:
307 case Intrinsic::x86_avx512_psrav_q_128:
308 case Intrinsic::x86_avx512_psrav_q_256:
309 case Intrinsic::x86_avx512_psrav_d_512:
310 case Intrinsic::x86_avx512_psrav_q_512:
311 case Intrinsic::x86_avx512_psrav_w_128:
312 case Intrinsic::x86_avx512_psrav_w_256:
313 case Intrinsic::x86_avx512_psrav_w_512:
314 LogicalShift = false;
315 ShiftLeft = false;
316 break;
317 case Intrinsic::x86_avx2_psrlv_d:
318 case Intrinsic::x86_avx2_psrlv_d_256:
319 case Intrinsic::x86_avx2_psrlv_q:
320 case Intrinsic::x86_avx2_psrlv_q_256:
321 case Intrinsic::x86_avx512_psrlv_d_512:
322 case Intrinsic::x86_avx512_psrlv_q_512:
323 case Intrinsic::x86_avx512_psrlv_w_128:
324 case Intrinsic::x86_avx512_psrlv_w_256:
325 case Intrinsic::x86_avx512_psrlv_w_512:
326 LogicalShift = true;
327 ShiftLeft = false;
328 break;
329 case Intrinsic::x86_avx2_psllv_d:
330 case Intrinsic::x86_avx2_psllv_d_256:
331 case Intrinsic::x86_avx2_psllv_q:
332 case Intrinsic::x86_avx2_psllv_q_256:
333 case Intrinsic::x86_avx512_psllv_d_512:
334 case Intrinsic::x86_avx512_psllv_q_512:
335 case Intrinsic::x86_avx512_psllv_w_128:
336 case Intrinsic::x86_avx512_psllv_w_256:
337 case Intrinsic::x86_avx512_psllv_w_512:
338 LogicalShift = true;
339 ShiftLeft = true;
340 break;
341 }
342 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
343
344 Value *Vec = II.getArgOperand(0);
345 Value *Amt = II.getArgOperand(1);
346 auto *VT = cast<FixedVectorType>(II.getType());
347 Type *SVT = VT->getElementType();
348 int NumElts = VT->getNumElements();
349 int BitWidth = SVT->getIntegerBitWidth();
350
351 // If the shift amount is guaranteed to be in-range we can replace it with a
352 // generic shift.
353 KnownBits KnownAmt =
354 llvm::computeKnownBits(Amt, II.getDataLayout());
355 if (KnownAmt.getMaxValue().ult(BitWidth)) {
356 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
357 : Builder.CreateLShr(Vec, Amt))
358 : Builder.CreateAShr(Vec, Amt));
359 }
360
361 // Simplify if all shift amounts are constant/undef.
362 auto *CShift = dyn_cast<Constant>(Amt);
363 if (!CShift)
364 return nullptr;
365
366 // Collect each element's shift amount.
367 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
368 bool AnyOutOfRange = false;
369 SmallVector<int, 8> ShiftAmts;
370 for (int I = 0; I < NumElts; ++I) {
371 auto *CElt = CShift->getAggregateElement(I);
372 if (isa_and_nonnull<UndefValue>(CElt)) {
373 ShiftAmts.push_back(-1);
374 continue;
375 }
376
377 auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
378 if (!COp)
379 return nullptr;
380
381 // Handle out of range shifts.
382 // If LogicalShift - set to BitWidth (special case).
383 // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
384 APInt ShiftVal = COp->getValue();
385 if (ShiftVal.uge(BitWidth)) {
386 AnyOutOfRange = LogicalShift;
387 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
388 continue;
389 }
390
391 ShiftAmts.push_back((int)ShiftVal.getZExtValue());
392 }
393
394 // If all elements out of range or UNDEF, return vector of zeros/undefs.
395 // ArithmeticShift should only hit this if they are all UNDEF.
396 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
397 if (llvm::all_of(ShiftAmts, OutOfRange)) {
398 SmallVector<Constant *, 8> ConstantVec;
399 for (int Idx : ShiftAmts) {
400 if (Idx < 0) {
401 ConstantVec.push_back(UndefValue::get(SVT));
402 } else {
403 assert(LogicalShift && "Logical shift expected");
404 ConstantVec.push_back(ConstantInt::getNullValue(SVT));
405 }
406 }
407 return ConstantVector::get(ConstantVec);
408 }
409
410 // We can't handle only some out of range values with generic logical shifts.
411 if (AnyOutOfRange)
412 return nullptr;
413
414 // Build the shift amount constant vector.
415 SmallVector<Constant *, 8> ShiftVecAmts;
416 for (int Idx : ShiftAmts) {
417 if (Idx < 0)
418 ShiftVecAmts.push_back(UndefValue::get(SVT));
419 else
420 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
421 }
422 auto ShiftVec = ConstantVector::get(ShiftVecAmts);
423
424 if (ShiftLeft)
425 return Builder.CreateShl(Vec, ShiftVec);
426
427 if (LogicalShift)
428 return Builder.CreateLShr(Vec, ShiftVec);
429
430 return Builder.CreateAShr(Vec, ShiftVec);
431}
432
434 InstCombiner::BuilderTy &Builder, bool IsSigned) {
435 Value *Arg0 = II.getArgOperand(0);
436 Value *Arg1 = II.getArgOperand(1);
437 Type *ResTy = II.getType();
438
439 // Fast all undef handling.
440 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
441 return UndefValue::get(ResTy);
442
443 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
444 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
445 unsigned NumSrcElts = ArgTy->getNumElements();
446 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
447 "Unexpected packing types");
448
449 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
450 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
451 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
452 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
453 "Unexpected packing types");
454
455 // Constant folding.
456 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
457 return nullptr;
458
459 // Clamp Values - signed/unsigned both use signed clamp values, but they
460 // differ on the min/max values.
461 APInt MinValue, MaxValue;
462 if (IsSigned) {
463 // PACKSS: Truncate signed value with signed saturation.
464 // Source values less than dst minint are saturated to minint.
465 // Source values greater than dst maxint are saturated to maxint.
466 MinValue =
467 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
468 MaxValue =
469 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
470 } else {
471 // PACKUS: Truncate signed value with unsigned saturation.
472 // Source values less than zero are saturated to zero.
473 // Source values greater than dst maxuint are saturated to maxuint.
474 MinValue = APInt::getZero(SrcScalarSizeInBits);
475 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
476 }
477
478 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
479 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
480 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
481 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
482 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
483 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
484
485 // Shuffle clamped args together at the lane level.
486 SmallVector<int, 32> PackMask;
487 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
488 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
489 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
490 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
491 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
492 }
493 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
494
495 // Truncate to dst size.
496 return Builder.CreateTrunc(Shuffle, ResTy);
497}
498
500 InstCombiner::BuilderTy &Builder, bool IsSigned,
501 bool IsRounding) {
502 Value *Arg0 = II.getArgOperand(0);
503 Value *Arg1 = II.getArgOperand(1);
504 auto *ResTy = cast<FixedVectorType>(II.getType());
505 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
506 assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&
507 "Unexpected PMULH types");
508 assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");
509
510 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
511 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
512 return ConstantAggregateZero::get(ResTy);
513
514 // Multiply by zero.
516 return ConstantAggregateZero::get(ResTy);
517
518 // Multiply by one.
519 if (!IsRounding) {
520 if (match(Arg0, m_One()))
521 return IsSigned ? Builder.CreateAShr(Arg1, 15)
523 if (match(Arg1, m_One()))
524 return IsSigned ? Builder.CreateAShr(Arg0, 15)
526 }
527
528 // Constant folding.
529 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
530 return nullptr;
531
532 // Extend to twice the width and multiply.
533 auto Cast =
534 IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
536 Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);
537 Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);
538 Value *Mul = Builder.CreateMul(LHS, RHS);
539
540 if (IsRounding) {
541 // PMULHRSW: truncate to vXi18 of the most significant bits, add one and
542 // extract bits[16:1].
543 auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);
544 auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);
545 Mul = Builder.CreateLShr(Mul, 14);
546 Mul = Builder.CreateTrunc(Mul, RndTy);
547 Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));
548 Mul = Builder.CreateLShr(Mul, 1);
549 } else {
550 // PMULH/PMULHU: extract the vXi16 most significant bits.
551 Mul = Builder.CreateLShr(Mul, 16);
552 }
553
554 return Builder.CreateTrunc(Mul, ResTy);
555}
556
559 bool IsPMADDWD) {
560 Value *Arg0 = II.getArgOperand(0);
561 Value *Arg1 = II.getArgOperand(1);
562 auto *ResTy = cast<FixedVectorType>(II.getType());
563 [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
564
565 unsigned NumDstElts = ResTy->getNumElements();
566 assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
567 ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
568 "Unexpected PMADD types");
569
570 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
571 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
572 return ConstantAggregateZero::get(ResTy);
573
574 // Multiply by zero.
576 return ConstantAggregateZero::get(ResTy);
577
578 // Constant folding.
579 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
580 return nullptr;
581
582 // Split Lo/Hi elements pairs, extend and add together.
583 // PMADDWD(X,Y) =
584 // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
585 // PMADDUBSW(X,Y) =
586 // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
587 SmallVector<int> LoMask, HiMask;
588 for (unsigned I = 0; I != NumDstElts; ++I) {
589 LoMask.push_back(2 * I + 0);
590 HiMask.push_back(2 * I + 1);
591 }
592
593 auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
594 auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
595 auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
596 auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);
597
598 auto LHSCast =
599 IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
600 LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
601 LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
602 RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
603 RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
604 Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
605 Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
606 return IsPMADDWD
607 ? Builder.CreateAdd(Lo, Hi)
608 : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
609}
610
612 InstCombiner::BuilderTy &Builder) {
613 Value *Arg = II.getArgOperand(0);
614 Type *ResTy = II.getType();
615
616 // movmsk(undef) -> zero as we must ensure the upper bits are zero.
617 if (isa<UndefValue>(Arg))
618 return Constant::getNullValue(ResTy);
619
620 // Preserve previous behavior and give up.
621 // TODO: treat as <8 x i8>.
622 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
623 return nullptr;
624
625 auto *ArgTy = cast<FixedVectorType>(Arg->getType());
626
627 // Expand MOVMSK to compare/bitcast/zext:
628 // e.g. PMOVMSKB(v16i8 x):
629 // %cmp = icmp slt <16 x i8> %x, zeroinitializer
630 // %int = bitcast <16 x i1> %cmp to i16
631 // %res = zext i16 %int to i32
632 unsigned NumElts = ArgTy->getNumElements();
633 Type *IntegerTy = Builder.getIntNTy(NumElts);
634
635 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
636 Res = Builder.CreateIsNeg(Res);
637 Res = Builder.CreateBitCast(Res, IntegerTy);
638 Res = Builder.CreateZExtOrTrunc(Res, ResTy);
639 return Res;
640}
641
643 InstCombiner::BuilderTy &Builder) {
644 Value *CarryIn = II.getArgOperand(0);
645 Value *Op1 = II.getArgOperand(1);
646 Value *Op2 = II.getArgOperand(2);
647 Type *RetTy = II.getType();
648 Type *OpTy = Op1->getType();
650 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
651 "Unexpected types for x86 addcarry");
652
653 // If carry-in is zero, this is just an unsigned add with overflow.
654 if (match(CarryIn, m_ZeroInt())) {
655 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
656 {Op1, Op2});
657 // The types have to be adjusted to match the x86 call types.
658 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
659 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
660 Builder.getInt8Ty());
661 Value *Res = PoisonValue::get(RetTy);
662 Res = Builder.CreateInsertValue(Res, UAddOV, 0);
663 return Builder.CreateInsertValue(Res, UAddResult, 1);
664 }
665
666 return nullptr;
667}
668
670 InstCombiner::BuilderTy &Builder) {
671
672 auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
673 if (!ArgImm || ArgImm->getValue().uge(256))
674 return nullptr;
675
676 Value *ArgA = II.getArgOperand(0);
677 Value *ArgB = II.getArgOperand(1);
678 Value *ArgC = II.getArgOperand(2);
679
680 Type *Ty = II.getType();
681
682 auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
683 return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
684 };
685 auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
686 return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
687 };
688 auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
689 return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
690 };
691 auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
692 return {Builder.CreateNot(V.first), ~V.second};
693 };
694 auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
695 auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
696 auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
697
698 bool AIsConst = match(ArgA, m_ImmConstant());
699 bool BIsConst = match(ArgB, m_ImmConstant());
700 bool CIsConst = match(ArgC, m_ImmConstant());
701
702 bool ABIsConst = AIsConst && BIsConst;
703 bool ACIsConst = AIsConst && CIsConst;
704 bool BCIsConst = BIsConst && CIsConst;
705 bool ABCIsConst = AIsConst && BIsConst && CIsConst;
706
707 // Use for verification. Its a big table. Its difficult to go from Imm ->
708 // logic ops, but easy to verify that a set of logic ops is correct. We track
709 // the logic ops through the second value in the pair. At the end it should
710 // equal Imm.
711 std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
712 std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
713 std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
714 std::pair<Value *, uint8_t> Res = {nullptr, 0};
715
716 // Currently we only handle cases that convert directly to another instruction
717 // or cases where all the ops are constant. This is because we don't properly
718 // handle creating ternary ops in the backend, so splitting them here may
719 // cause regressions. As the backend improves, uncomment more cases.
720
721 uint8_t Imm = ArgImm->getValue().getZExtValue();
722 switch (Imm) {
723 case 0x0:
724 Res = {Constant::getNullValue(Ty), 0};
725 break;
726 case 0x1:
727 if (ABCIsConst)
728 Res = Nor(Or(A, B), C);
729 break;
730 case 0x2:
731 if (ABCIsConst)
732 Res = And(Nor(A, B), C);
733 break;
734 case 0x3:
735 if (ABIsConst)
736 Res = Nor(A, B);
737 break;
738 case 0x4:
739 if (ABCIsConst)
740 Res = And(Nor(A, C), B);
741 break;
742 case 0x5:
743 if (ACIsConst)
744 Res = Nor(A, C);
745 break;
746 case 0x6:
747 if (ABCIsConst)
748 Res = Nor(A, Xnor(B, C));
749 break;
750 case 0x7:
751 if (ABCIsConst)
752 Res = Nor(A, And(B, C));
753 break;
754 case 0x8:
755 if (ABCIsConst)
756 Res = Nor(A, Nand(B, C));
757 break;
758 case 0x9:
759 if (ABCIsConst)
760 Res = Nor(A, Xor(B, C));
761 break;
762 case 0xa:
763 if (ACIsConst)
764 Res = Nor(A, Not(C));
765 break;
766 case 0xb:
767 if (ABCIsConst)
768 Res = Nor(A, Nor(C, Not(B)));
769 break;
770 case 0xc:
771 if (ABIsConst)
772 Res = Nor(A, Not(B));
773 break;
774 case 0xd:
775 if (ABCIsConst)
776 Res = Nor(A, Nor(B, Not(C)));
777 break;
778 case 0xe:
779 if (ABCIsConst)
780 Res = Nor(A, Nor(B, C));
781 break;
782 case 0xf:
783 Res = Not(A);
784 break;
785 case 0x10:
786 if (ABCIsConst)
787 Res = And(A, Nor(B, C));
788 break;
789 case 0x11:
790 if (BCIsConst)
791 Res = Nor(B, C);
792 break;
793 case 0x12:
794 if (ABCIsConst)
795 Res = Nor(Xnor(A, C), B);
796 break;
797 case 0x13:
798 if (ABCIsConst)
799 Res = Nor(And(A, C), B);
800 break;
801 case 0x14:
802 if (ABCIsConst)
803 Res = Nor(Xnor(A, B), C);
804 break;
805 case 0x15:
806 if (ABCIsConst)
807 Res = Nor(And(A, B), C);
808 break;
809 case 0x16:
810 if (ABCIsConst)
811 Res = Xor(Xor(A, B), And(Nand(A, B), C));
812 break;
813 case 0x17:
814 if (ABCIsConst)
815 Res = Xor(Or(A, B), Or(Xnor(A, B), C));
816 break;
817 case 0x18:
818 if (ABCIsConst)
819 Res = Nor(Xnor(A, B), Xnor(A, C));
820 break;
821 case 0x19:
822 if (ABCIsConst)
823 Res = And(Nand(A, B), Xnor(B, C));
824 break;
825 case 0x1a:
826 if (ABCIsConst)
827 Res = Xor(A, Or(And(A, B), C));
828 break;
829 case 0x1b:
830 if (ABCIsConst)
831 Res = Xor(A, Or(Xnor(A, B), C));
832 break;
833 case 0x1c:
834 if (ABCIsConst)
835 Res = Xor(A, Or(And(A, C), B));
836 break;
837 case 0x1d:
838 if (ABCIsConst)
839 Res = Xor(A, Or(Xnor(A, C), B));
840 break;
841 case 0x1e:
842 if (ABCIsConst)
843 Res = Xor(A, Or(B, C));
844 break;
845 case 0x1f:
846 if (ABCIsConst)
847 Res = Nand(A, Or(B, C));
848 break;
849 case 0x20:
850 if (ABCIsConst)
851 Res = Nor(Nand(A, C), B);
852 break;
853 case 0x21:
854 if (ABCIsConst)
855 Res = Nor(Xor(A, C), B);
856 break;
857 case 0x22:
858 if (BCIsConst)
859 Res = Nor(B, Not(C));
860 break;
861 case 0x23:
862 if (ABCIsConst)
863 Res = Nor(B, Nor(C, Not(A)));
864 break;
865 case 0x24:
866 if (ABCIsConst)
867 Res = Nor(Xnor(A, B), Xor(A, C));
868 break;
869 case 0x25:
870 if (ABCIsConst)
871 Res = Xor(A, Nand(Nand(A, B), C));
872 break;
873 case 0x26:
874 if (ABCIsConst)
875 Res = And(Nand(A, B), Xor(B, C));
876 break;
877 case 0x27:
878 if (ABCIsConst)
879 Res = Xor(Or(Xnor(A, B), C), B);
880 break;
881 case 0x28:
882 if (ABCIsConst)
883 Res = And(Xor(A, B), C);
884 break;
885 case 0x29:
886 if (ABCIsConst)
887 Res = Xor(Xor(A, B), Nor(And(A, B), C));
888 break;
889 case 0x2a:
890 if (ABCIsConst)
891 Res = And(Nand(A, B), C);
892 break;
893 case 0x2b:
894 if (ABCIsConst)
895 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
896 break;
897 case 0x2c:
898 if (ABCIsConst)
899 Res = Nor(Xnor(A, B), Nor(B, C));
900 break;
901 case 0x2d:
902 if (ABCIsConst)
903 Res = Xor(A, Or(B, Not(C)));
904 break;
905 case 0x2e:
906 if (ABCIsConst)
907 Res = Xor(A, Or(Xor(A, C), B));
908 break;
909 case 0x2f:
910 if (ABCIsConst)
911 Res = Nand(A, Or(B, Not(C)));
912 break;
913 case 0x30:
914 if (ABIsConst)
915 Res = Nor(B, Not(A));
916 break;
917 case 0x31:
918 if (ABCIsConst)
919 Res = Nor(Nor(A, Not(C)), B);
920 break;
921 case 0x32:
922 if (ABCIsConst)
923 Res = Nor(Nor(A, C), B);
924 break;
925 case 0x33:
926 Res = Not(B);
927 break;
928 case 0x34:
929 if (ABCIsConst)
930 Res = And(Xor(A, B), Nand(B, C));
931 break;
932 case 0x35:
933 if (ABCIsConst)
934 Res = Xor(B, Or(A, Xnor(B, C)));
935 break;
936 case 0x36:
937 if (ABCIsConst)
938 Res = Xor(Or(A, C), B);
939 break;
940 case 0x37:
941 if (ABCIsConst)
942 Res = Nand(Or(A, C), B);
943 break;
944 case 0x38:
945 if (ABCIsConst)
946 Res = Nor(Xnor(A, B), Nor(A, C));
947 break;
948 case 0x39:
949 if (ABCIsConst)
950 Res = Xor(Or(A, Not(C)), B);
951 break;
952 case 0x3a:
953 if (ABCIsConst)
954 Res = Xor(B, Or(A, Xor(B, C)));
955 break;
956 case 0x3b:
957 if (ABCIsConst)
958 Res = Nand(Or(A, Not(C)), B);
959 break;
960 case 0x3c:
961 Res = Xor(A, B);
962 break;
963 case 0x3d:
964 if (ABCIsConst)
965 Res = Xor(A, Or(Nor(A, C), B));
966 break;
967 case 0x3e:
968 if (ABCIsConst)
969 Res = Xor(A, Or(Nor(A, Not(C)), B));
970 break;
971 case 0x3f:
972 if (ABIsConst)
973 Res = Nand(A, B);
974 break;
975 case 0x40:
976 if (ABCIsConst)
977 Res = Nor(Nand(A, B), C);
978 break;
979 case 0x41:
980 if (ABCIsConst)
981 Res = Nor(Xor(A, B), C);
982 break;
983 case 0x42:
984 if (ABCIsConst)
985 Res = Nor(Xor(A, B), Xnor(A, C));
986 break;
987 case 0x43:
988 if (ABCIsConst)
989 Res = Xor(A, Nand(Nand(A, C), B));
990 break;
991 case 0x44:
992 if (BCIsConst)
993 Res = Nor(C, Not(B));
994 break;
995 case 0x45:
996 if (ABCIsConst)
997 Res = Nor(Nor(B, Not(A)), C);
998 break;
999 case 0x46:
1000 if (ABCIsConst)
1001 Res = Xor(Or(And(A, C), B), C);
1002 break;
1003 case 0x47:
1004 if (ABCIsConst)
1005 Res = Xor(Or(Xnor(A, C), B), C);
1006 break;
1007 case 0x48:
1008 if (ABCIsConst)
1009 Res = And(Xor(A, C), B);
1010 break;
1011 case 0x49:
1012 if (ABCIsConst)
1013 Res = Xor(Or(Xnor(A, B), And(A, C)), C);
1014 break;
1015 case 0x4a:
1016 if (ABCIsConst)
1017 Res = Nor(Xnor(A, C), Nor(B, C));
1018 break;
1019 case 0x4b:
1020 if (ABCIsConst)
1021 Res = Xor(A, Or(C, Not(B)));
1022 break;
1023 case 0x4c:
1024 if (ABCIsConst)
1025 Res = And(Nand(A, C), B);
1026 break;
1027 case 0x4d:
1028 if (ABCIsConst)
1029 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
1030 break;
1031 case 0x4e:
1032 if (ABCIsConst)
1033 Res = Xor(A, Or(Xor(A, B), C));
1034 break;
1035 case 0x4f:
1036 if (ABCIsConst)
1037 Res = Nand(A, Nand(B, Not(C)));
1038 break;
1039 case 0x50:
1040 if (ACIsConst)
1041 Res = Nor(C, Not(A));
1042 break;
1043 case 0x51:
1044 if (ABCIsConst)
1045 Res = Nor(Nor(A, Not(B)), C);
1046 break;
1047 case 0x52:
1048 if (ABCIsConst)
1049 Res = And(Xor(A, C), Nand(B, C));
1050 break;
1051 case 0x53:
1052 if (ABCIsConst)
1053 Res = Xor(Or(Xnor(B, C), A), C);
1054 break;
1055 case 0x54:
1056 if (ABCIsConst)
1057 Res = Nor(Nor(A, B), C);
1058 break;
1059 case 0x55:
1060 Res = Not(C);
1061 break;
1062 case 0x56:
1063 if (ABCIsConst)
1064 Res = Xor(Or(A, B), C);
1065 break;
1066 case 0x57:
1067 if (ABCIsConst)
1068 Res = Nand(Or(A, B), C);
1069 break;
1070 case 0x58:
1071 if (ABCIsConst)
1072 Res = Nor(Nor(A, B), Xnor(A, C));
1073 break;
1074 case 0x59:
1075 if (ABCIsConst)
1076 Res = Xor(Or(A, Not(B)), C);
1077 break;
1078 case 0x5a:
1079 Res = Xor(A, C);
1080 break;
1081 case 0x5b:
1082 if (ABCIsConst)
1083 Res = Xor(A, Or(Nor(A, B), C));
1084 break;
1085 case 0x5c:
1086 if (ABCIsConst)
1087 Res = Xor(Or(Xor(B, C), A), C);
1088 break;
1089 case 0x5d:
1090 if (ABCIsConst)
1091 Res = Nand(Or(A, Not(B)), C);
1092 break;
1093 case 0x5e:
1094 if (ABCIsConst)
1095 Res = Xor(A, Or(Nor(A, Not(B)), C));
1096 break;
1097 case 0x5f:
1098 if (ACIsConst)
1099 Res = Nand(A, C);
1100 break;
1101 case 0x60:
1102 if (ABCIsConst)
1103 Res = And(A, Xor(B, C));
1104 break;
1105 case 0x61:
1106 if (ABCIsConst)
1107 Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1108 break;
1109 case 0x62:
1110 if (ABCIsConst)
1111 Res = Nor(Nor(A, C), Xnor(B, C));
1112 break;
1113 case 0x63:
1114 if (ABCIsConst)
1115 Res = Xor(B, Or(C, Not(A)));
1116 break;
1117 case 0x64:
1118 if (ABCIsConst)
1119 Res = Nor(Nor(A, B), Xnor(B, C));
1120 break;
1121 case 0x65:
1122 if (ABCIsConst)
1123 Res = Xor(Or(B, Not(A)), C);
1124 break;
1125 case 0x66:
1126 Res = Xor(B, C);
1127 break;
1128 case 0x67:
1129 if (ABCIsConst)
1130 Res = Or(Nor(A, B), Xor(B, C));
1131 break;
1132 case 0x68:
1133 if (ABCIsConst)
1134 Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1135 break;
1136 case 0x69:
1137 if (ABCIsConst)
1138 Res = Xor(Xnor(A, B), C);
1139 break;
1140 case 0x6a:
1141 if (ABCIsConst)
1142 Res = Xor(And(A, B), C);
1143 break;
1144 case 0x6b:
1145 if (ABCIsConst)
1146 Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1147 break;
1148 case 0x6c:
1149 if (ABCIsConst)
1150 Res = Xor(And(A, C), B);
1151 break;
1152 case 0x6d:
1153 if (ABCIsConst)
1154 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1155 break;
1156 case 0x6e:
1157 if (ABCIsConst)
1158 Res = Or(Nor(A, Not(B)), Xor(B, C));
1159 break;
1160 case 0x6f:
1161 if (ABCIsConst)
1162 Res = Nand(A, Xnor(B, C));
1163 break;
1164 case 0x70:
1165 if (ABCIsConst)
1166 Res = And(A, Nand(B, C));
1167 break;
1168 case 0x71:
1169 if (ABCIsConst)
1170 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1171 break;
1172 case 0x72:
1173 if (ABCIsConst)
1174 Res = Xor(Or(Xor(A, B), C), B);
1175 break;
1176 case 0x73:
1177 if (ABCIsConst)
1178 Res = Nand(Nand(A, Not(C)), B);
1179 break;
1180 case 0x74:
1181 if (ABCIsConst)
1182 Res = Xor(Or(Xor(A, C), B), C);
1183 break;
1184 case 0x75:
1185 if (ABCIsConst)
1186 Res = Nand(Nand(A, Not(B)), C);
1187 break;
1188 case 0x76:
1189 if (ABCIsConst)
1190 Res = Xor(B, Or(Nor(B, Not(A)), C));
1191 break;
1192 case 0x77:
1193 if (BCIsConst)
1194 Res = Nand(B, C);
1195 break;
1196 case 0x78:
1197 if (ABCIsConst)
1198 Res = Xor(A, And(B, C));
1199 break;
1200 case 0x79:
1201 if (ABCIsConst)
1202 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1203 break;
1204 case 0x7a:
1205 if (ABCIsConst)
1206 Res = Or(Xor(A, C), Nor(B, Not(A)));
1207 break;
1208 case 0x7b:
1209 if (ABCIsConst)
1210 Res = Nand(Xnor(A, C), B);
1211 break;
1212 case 0x7c:
1213 if (ABCIsConst)
1214 Res = Or(Xor(A, B), Nor(C, Not(A)));
1215 break;
1216 case 0x7d:
1217 if (ABCIsConst)
1218 Res = Nand(Xnor(A, B), C);
1219 break;
1220 case 0x7e:
1221 if (ABCIsConst)
1222 Res = Or(Xor(A, B), Xor(A, C));
1223 break;
1224 case 0x7f:
1225 if (ABCIsConst)
1226 Res = Nand(And(A, B), C);
1227 break;
1228 case 0x80:
1229 if (ABCIsConst)
1230 Res = And(And(A, B), C);
1231 break;
1232 case 0x81:
1233 if (ABCIsConst)
1234 Res = Nor(Xor(A, B), Xor(A, C));
1235 break;
1236 case 0x82:
1237 if (ABCIsConst)
1238 Res = And(Xnor(A, B), C);
1239 break;
1240 case 0x83:
1241 if (ABCIsConst)
1242 Res = Nor(Xor(A, B), Nor(C, Not(A)));
1243 break;
1244 case 0x84:
1245 if (ABCIsConst)
1246 Res = And(Xnor(A, C), B);
1247 break;
1248 case 0x85:
1249 if (ABCIsConst)
1250 Res = Nor(Xor(A, C), Nor(B, Not(A)));
1251 break;
1252 case 0x86:
1253 if (ABCIsConst)
1254 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1255 break;
1256 case 0x87:
1257 if (ABCIsConst)
1258 Res = Xor(A, Nand(B, C));
1259 break;
1260 case 0x88:
1261 Res = And(B, C);
1262 break;
1263 case 0x89:
1264 if (ABCIsConst)
1265 Res = Xor(B, Nor(Nor(B, Not(A)), C));
1266 break;
1267 case 0x8a:
1268 if (ABCIsConst)
1269 Res = And(Nand(A, Not(B)), C);
1270 break;
1271 case 0x8b:
1272 if (ABCIsConst)
1273 Res = Xor(Nor(Xor(A, C), B), C);
1274 break;
1275 case 0x8c:
1276 if (ABCIsConst)
1277 Res = And(Nand(A, Not(C)), B);
1278 break;
1279 case 0x8d:
1280 if (ABCIsConst)
1281 Res = Xor(Nor(Xor(A, B), C), B);
1282 break;
1283 case 0x8e:
1284 if (ABCIsConst)
1285 Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1286 break;
1287 case 0x8f:
1288 if (ABCIsConst)
1289 Res = Nand(A, Nand(B, C));
1290 break;
1291 case 0x90:
1292 if (ABCIsConst)
1293 Res = And(A, Xnor(B, C));
1294 break;
1295 case 0x91:
1296 if (ABCIsConst)
1297 Res = Nor(Nor(A, Not(B)), Xor(B, C));
1298 break;
1299 case 0x92:
1300 if (ABCIsConst)
1301 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1302 break;
1303 case 0x93:
1304 if (ABCIsConst)
1305 Res = Xor(Nand(A, C), B);
1306 break;
1307 case 0x94:
1308 if (ABCIsConst)
1309 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1310 break;
1311 case 0x95:
1312 if (ABCIsConst)
1313 Res = Xor(Nand(A, B), C);
1314 break;
1315 case 0x96:
1316 if (ABCIsConst)
1317 Res = Xor(Xor(A, B), C);
1318 break;
1319 case 0x97:
1320 if (ABCIsConst)
1321 Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1322 break;
1323 case 0x98:
1324 if (ABCIsConst)
1325 Res = Nor(Nor(A, B), Xor(B, C));
1326 break;
1327 case 0x99:
1328 if (BCIsConst)
1329 Res = Xnor(B, C);
1330 break;
1331 case 0x9a:
1332 if (ABCIsConst)
1333 Res = Xor(Nor(B, Not(A)), C);
1334 break;
1335 case 0x9b:
1336 if (ABCIsConst)
1337 Res = Or(Nor(A, B), Xnor(B, C));
1338 break;
1339 case 0x9c:
1340 if (ABCIsConst)
1341 Res = Xor(B, Nor(C, Not(A)));
1342 break;
1343 case 0x9d:
1344 if (ABCIsConst)
1345 Res = Or(Nor(A, C), Xnor(B, C));
1346 break;
1347 case 0x9e:
1348 if (ABCIsConst)
1349 Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1350 break;
1351 case 0x9f:
1352 if (ABCIsConst)
1353 Res = Nand(A, Xor(B, C));
1354 break;
1355 case 0xa0:
1356 Res = And(A, C);
1357 break;
1358 case 0xa1:
1359 if (ABCIsConst)
1360 Res = Xor(A, Nor(Nor(A, Not(B)), C));
1361 break;
1362 case 0xa2:
1363 if (ABCIsConst)
1364 Res = And(Or(A, Not(B)), C);
1365 break;
1366 case 0xa3:
1367 if (ABCIsConst)
1368 Res = Xor(Nor(Xor(B, C), A), C);
1369 break;
1370 case 0xa4:
1371 if (ABCIsConst)
1372 Res = Xor(A, Nor(Nor(A, B), C));
1373 break;
1374 case 0xa5:
1375 if (ACIsConst)
1376 Res = Xnor(A, C);
1377 break;
1378 case 0xa6:
1379 if (ABCIsConst)
1380 Res = Xor(Nor(A, Not(B)), C);
1381 break;
1382 case 0xa7:
1383 if (ABCIsConst)
1384 Res = Or(Nor(A, B), Xnor(A, C));
1385 break;
1386 case 0xa8:
1387 if (ABCIsConst)
1388 Res = And(Or(A, B), C);
1389 break;
1390 case 0xa9:
1391 if (ABCIsConst)
1392 Res = Xor(Nor(A, B), C);
1393 break;
1394 case 0xaa:
1395 Res = C;
1396 break;
1397 case 0xab:
1398 if (ABCIsConst)
1399 Res = Or(Nor(A, B), C);
1400 break;
1401 case 0xac:
1402 if (ABCIsConst)
1403 Res = Xor(Nor(Xnor(B, C), A), C);
1404 break;
1405 case 0xad:
1406 if (ABCIsConst)
1407 Res = Or(Xnor(A, C), And(B, C));
1408 break;
1409 case 0xae:
1410 if (ABCIsConst)
1411 Res = Or(Nor(A, Not(B)), C);
1412 break;
1413 case 0xaf:
1414 if (ACIsConst)
1415 Res = Or(C, Not(A));
1416 break;
1417 case 0xb0:
1418 if (ABCIsConst)
1419 Res = And(A, Nand(B, Not(C)));
1420 break;
1421 case 0xb1:
1422 if (ABCIsConst)
1423 Res = Xor(A, Nor(Xor(A, B), C));
1424 break;
1425 case 0xb2:
1426 if (ABCIsConst)
1427 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1428 break;
1429 case 0xb3:
1430 if (ABCIsConst)
1431 Res = Nand(Nand(A, C), B);
1432 break;
1433 case 0xb4:
1434 if (ABCIsConst)
1435 Res = Xor(A, Nor(C, Not(B)));
1436 break;
1437 case 0xb5:
1438 if (ABCIsConst)
1439 Res = Or(Xnor(A, C), Nor(B, C));
1440 break;
1441 case 0xb6:
1442 if (ABCIsConst)
1443 Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1444 break;
1445 case 0xb7:
1446 if (ABCIsConst)
1447 Res = Nand(Xor(A, C), B);
1448 break;
1449 case 0xb8:
1450 if (ABCIsConst)
1451 Res = Xor(Nor(Xnor(A, C), B), C);
1452 break;
1453 case 0xb9:
1454 if (ABCIsConst)
1455 Res = Xor(Nor(And(A, C), B), C);
1456 break;
1457 case 0xba:
1458 if (ABCIsConst)
1459 Res = Or(Nor(B, Not(A)), C);
1460 break;
1461 case 0xbb:
1462 if (BCIsConst)
1463 Res = Or(C, Not(B));
1464 break;
1465 case 0xbc:
1466 if (ABCIsConst)
1467 Res = Xor(A, And(Nand(A, C), B));
1468 break;
1469 case 0xbd:
1470 if (ABCIsConst)
1471 Res = Or(Xor(A, B), Xnor(A, C));
1472 break;
1473 case 0xbe:
1474 if (ABCIsConst)
1475 Res = Or(Xor(A, B), C);
1476 break;
1477 case 0xbf:
1478 if (ABCIsConst)
1479 Res = Or(Nand(A, B), C);
1480 break;
1481 case 0xc0:
1482 Res = And(A, B);
1483 break;
1484 case 0xc1:
1485 if (ABCIsConst)
1486 Res = Xor(A, Nor(Nor(A, Not(C)), B));
1487 break;
1488 case 0xc2:
1489 if (ABCIsConst)
1490 Res = Xor(A, Nor(Nor(A, C), B));
1491 break;
1492 case 0xc3:
1493 if (ABIsConst)
1494 Res = Xnor(A, B);
1495 break;
1496 case 0xc4:
1497 if (ABCIsConst)
1498 Res = And(Or(A, Not(C)), B);
1499 break;
1500 case 0xc5:
1501 if (ABCIsConst)
1502 Res = Xor(B, Nor(A, Xor(B, C)));
1503 break;
1504 case 0xc6:
1505 if (ABCIsConst)
1506 Res = Xor(Nor(A, Not(C)), B);
1507 break;
1508 case 0xc7:
1509 if (ABCIsConst)
1510 Res = Or(Xnor(A, B), Nor(A, C));
1511 break;
1512 case 0xc8:
1513 if (ABCIsConst)
1514 Res = And(Or(A, C), B);
1515 break;
1516 case 0xc9:
1517 if (ABCIsConst)
1518 Res = Xor(Nor(A, C), B);
1519 break;
1520 case 0xca:
1521 if (ABCIsConst)
1522 Res = Xor(B, Nor(A, Xnor(B, C)));
1523 break;
1524 case 0xcb:
1525 if (ABCIsConst)
1526 Res = Or(Xnor(A, B), And(B, C));
1527 break;
1528 case 0xcc:
1529 Res = B;
1530 break;
1531 case 0xcd:
1532 if (ABCIsConst)
1533 Res = Or(Nor(A, C), B);
1534 break;
1535 case 0xce:
1536 if (ABCIsConst)
1537 Res = Or(Nor(A, Not(C)), B);
1538 break;
1539 case 0xcf:
1540 if (ABIsConst)
1541 Res = Or(B, Not(A));
1542 break;
1543 case 0xd0:
1544 if (ABCIsConst)
1545 Res = And(A, Or(B, Not(C)));
1546 break;
1547 case 0xd1:
1548 if (ABCIsConst)
1549 Res = Xor(A, Nor(Xor(A, C), B));
1550 break;
1551 case 0xd2:
1552 if (ABCIsConst)
1553 Res = Xor(A, Nor(B, Not(C)));
1554 break;
1555 case 0xd3:
1556 if (ABCIsConst)
1557 Res = Or(Xnor(A, B), Nor(B, C));
1558 break;
1559 case 0xd4:
1560 if (ABCIsConst)
1561 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1562 break;
1563 case 0xd5:
1564 if (ABCIsConst)
1565 Res = Nand(Nand(A, B), C);
1566 break;
1567 case 0xd6:
1568 if (ABCIsConst)
1569 Res = Xor(Xor(A, B), Or(And(A, B), C));
1570 break;
1571 case 0xd7:
1572 if (ABCIsConst)
1573 Res = Nand(Xor(A, B), C);
1574 break;
1575 case 0xd8:
1576 if (ABCIsConst)
1577 Res = Xor(Nor(Xnor(A, B), C), B);
1578 break;
1579 case 0xd9:
1580 if (ABCIsConst)
1581 Res = Or(And(A, B), Xnor(B, C));
1582 break;
1583 case 0xda:
1584 if (ABCIsConst)
1585 Res = Xor(A, And(Nand(A, B), C));
1586 break;
1587 case 0xdb:
1588 if (ABCIsConst)
1589 Res = Or(Xnor(A, B), Xor(A, C));
1590 break;
1591 case 0xdc:
1592 if (ABCIsConst)
1593 Res = Or(B, Nor(C, Not(A)));
1594 break;
1595 case 0xdd:
1596 if (BCIsConst)
1597 Res = Or(B, Not(C));
1598 break;
1599 case 0xde:
1600 if (ABCIsConst)
1601 Res = Or(Xor(A, C), B);
1602 break;
1603 case 0xdf:
1604 if (ABCIsConst)
1605 Res = Or(Nand(A, C), B);
1606 break;
1607 case 0xe0:
1608 if (ABCIsConst)
1609 Res = And(A, Or(B, C));
1610 break;
1611 case 0xe1:
1612 if (ABCIsConst)
1613 Res = Xor(A, Nor(B, C));
1614 break;
1615 case 0xe2:
1616 if (ABCIsConst)
1617 Res = Xor(A, Nor(Xnor(A, C), B));
1618 break;
1619 case 0xe3:
1620 if (ABCIsConst)
1621 Res = Xor(A, Nor(And(A, C), B));
1622 break;
1623 case 0xe4:
1624 if (ABCIsConst)
1625 Res = Xor(A, Nor(Xnor(A, B), C));
1626 break;
1627 case 0xe5:
1628 if (ABCIsConst)
1629 Res = Xor(A, Nor(And(A, B), C));
1630 break;
1631 case 0xe6:
1632 if (ABCIsConst)
1633 Res = Or(And(A, B), Xor(B, C));
1634 break;
1635 case 0xe7:
1636 if (ABCIsConst)
1637 Res = Or(Xnor(A, B), Xnor(A, C));
1638 break;
1639 case 0xe8:
1640 if (ABCIsConst)
1641 Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1642 break;
1643 case 0xe9:
1644 if (ABCIsConst)
1645 Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1646 break;
1647 case 0xea:
1648 if (ABCIsConst)
1649 Res = Or(And(A, B), C);
1650 break;
1651 case 0xeb:
1652 if (ABCIsConst)
1653 Res = Or(Xnor(A, B), C);
1654 break;
1655 case 0xec:
1656 if (ABCIsConst)
1657 Res = Or(And(A, C), B);
1658 break;
1659 case 0xed:
1660 if (ABCIsConst)
1661 Res = Or(Xnor(A, C), B);
1662 break;
1663 case 0xee:
1664 Res = Or(B, C);
1665 break;
1666 case 0xef:
1667 if (ABCIsConst)
1668 Res = Nand(A, Nor(B, C));
1669 break;
1670 case 0xf0:
1671 Res = A;
1672 break;
1673 case 0xf1:
1674 if (ABCIsConst)
1675 Res = Or(A, Nor(B, C));
1676 break;
1677 case 0xf2:
1678 if (ABCIsConst)
1679 Res = Or(A, Nor(B, Not(C)));
1680 break;
1681 case 0xf3:
1682 if (ABIsConst)
1683 Res = Or(A, Not(B));
1684 break;
1685 case 0xf4:
1686 if (ABCIsConst)
1687 Res = Or(A, Nor(C, Not(B)));
1688 break;
1689 case 0xf5:
1690 if (ACIsConst)
1691 Res = Or(A, Not(C));
1692 break;
1693 case 0xf6:
1694 if (ABCIsConst)
1695 Res = Or(A, Xor(B, C));
1696 break;
1697 case 0xf7:
1698 if (ABCIsConst)
1699 Res = Or(A, Nand(B, C));
1700 break;
1701 case 0xf8:
1702 if (ABCIsConst)
1703 Res = Or(A, And(B, C));
1704 break;
1705 case 0xf9:
1706 if (ABCIsConst)
1707 Res = Or(A, Xnor(B, C));
1708 break;
1709 case 0xfa:
1710 Res = Or(A, C);
1711 break;
1712 case 0xfb:
1713 if (ABCIsConst)
1714 Res = Nand(Nor(A, C), B);
1715 break;
1716 case 0xfc:
1717 Res = Or(A, B);
1718 break;
1719 case 0xfd:
1720 if (ABCIsConst)
1721 Res = Nand(Nor(A, B), C);
1722 break;
1723 case 0xfe:
1724 if (ABCIsConst)
1725 Res = Or(Or(A, B), C);
1726 break;
1727 case 0xff:
1728 Res = {Constant::getAllOnesValue(Ty), 0xff};
1729 break;
1730 }
1731
1732 assert((Res.first == nullptr || Res.second == Imm) &&
1733 "Simplification of ternary logic does not verify!");
1734 return Res.first;
1735}
1736
// NOTE(review): static helper that rewrites an x86 FP min/max intrinsic into
// the generic llvm.maxnum/llvm.minnum intrinsic (NewIID selects which), but
// only when known-FP-class analysis proves the operands avoid the value
// classes the guard below forbids. The function's opening line (its name and
// leading parameters, presumably `InstCombiner &IC, IntrinsicInst &II`) was
// dropped from this listing -- confirm against the full file.
1738 Intrinsic::ID NewIID, bool IsScalar = false) {
1739
// The two (vector) operands of the intrinsic being combined.
1740 Value *Arg0 = II.getArgOperand(0);
1741 Value *Arg1 = II.getArgOperand(1);
1742 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
1743
// Scalar (ss/sd-style) forms only compute on lane 0, so demand just that
// lane; the remaining lanes are passed through from Arg0 (see below).
1745 APInt DemandedElts =
1746 IsScalar ? APInt::getOneBitSet(VWidth, 0) : APInt::getAllOnes(VWidth);
1747
// Value classes for which the fold is not performed: NaN, infinity and
// subnormals are forbidden on both operands; negative zero only on the
// operand indicated below (it differs between maxnum and minnum).
1748 FPClassTest Forbidden0 = fcNan | fcInf | fcSubnormal;
1749 FPClassTest Forbidden1 = fcNan | fcInf | fcSubnormal;
1750 if (NewIID == Intrinsic::maxnum) {
1751 // For maxnum, only forbid NegZero in the second operand.
1752 Forbidden1 |= fcNegZero;
1753 } else {
1754 assert(NewIID == Intrinsic::minnum && "Unknown intrinsic");
1755 // For minnum, only forbid NegZero in the first operand.
1756 Forbidden0 |= fcNegZero;
1757 }
// SQ: simplify-query context; its declaration sits on a line (1744) that
// was dropped from this listing -- TODO confirm in the full file.
1758 KnownFPClass KnownArg0 =
1759 computeKnownFPClass(Arg0, DemandedElts, Forbidden0, SQ);
1760 KnownFPClass KnownArg1 =
1761 computeKnownFPClass(Arg1, DemandedElts, Forbidden1, SQ);
1762
// Only rewrite when the analysis proves neither operand can ever be one of
// its forbidden classes.
1763 if (KnownArg0.isKnownNever(Forbidden0) &&
1764 KnownArg1.isKnownNever(Forbidden1)) {
1765 if (IsScalar) {
1766 // It performs the operation on the first element and puts it back into
1767 // the vector.
1768 Value *Scalar0 = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
1769 Value *Scalar1 = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
1770
1771 Value *NewScalar = (NewIID == Intrinsic::maxnum)
1772 ? IC.Builder.CreateMaxNum(Scalar0, Scalar1)
1773 : IC.Builder.CreateMinNum(Scalar0, Scalar1);
// Upper lanes of the result come unchanged from Arg0.
1774 return IC.Builder.CreateInsertElement(Arg0, NewScalar, (uint64_t)0);
1775 } else {
// Full-vector form: replace the whole intrinsic with maxnum/minnum.
1776 return (NewIID == Intrinsic::maxnum)
1777 ? IC.Builder.CreateMaxNum(Arg0, Arg1)
1778 : IC.Builder.CreateMinNum(Arg0, Arg1);
1779 }
1780 }
1781
// No fold possible; caller keeps the original intrinsic.
1782 return nullptr;
1783}
1784
// NOTE(review): attempts to turn an SSE4.1 insertps intrinsic with a constant
// control byte into a generic shufflevector (or a zero vector). Returns
// nullptr when the immediate is non-constant or the mixed zero-mask case is
// not modeled. The function's opening line was dropped from this listing.
1786 InstCombiner::BuilderTy &Builder) {
// The control byte (operand 2) must be a compile-time constant.
1787 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1788 if (!CInt)
1789 return nullptr;
1790
1791 auto *VecTy = cast<FixedVectorType>(II.getType());
1792 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1793
1794 // The immediate permute control byte looks like this:
1795 // [3:0] - zero mask for each 32-bit lane
1796 // [5:4] - select one 32-bit destination lane
1797 // [7:6] - select one 32-bit source lane
1798
1799 uint8_t Imm = CInt->getZExtValue();
1800 uint8_t ZMask = Imm & 0xf;
1801 uint8_t DestLane = (Imm >> 4) & 0x3;
1802 uint8_t SourceLane = (Imm >> 6) & 0x3;
1803
// NOTE(review): ZeroVector is presumably the all-zero constant of VecTy; its
// declaration (line 1804) was dropped from this listing -- confirm.
1805
1806 // If all zero mask bits are set, this was just a weird way to
1807 // generate a zero vector.
1808 if (ZMask == 0xf)
1809 return ZeroVector;
1810
1811 // Initialize by passing all of the first source bits through.
// Shuffle indices 0-3 select from operand 0; 4-7 select from V1 below.
1812 int ShuffleMask[4] = {0, 1, 2, 3};
1813
1814 // We may replace the second operand with the zero vector.
1815 Value *V1 = II.getArgOperand(1);
1816
1817 if (ZMask) {
1818 // If the zero mask is being used with a single input or the zero mask
1819 // overrides the destination lane, this is a shuffle with the zero vector.
1820 if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1821 (ZMask & (1 << DestLane))) {
1822 V1 = ZeroVector;
1823 // We may still move 32-bits of the first source vector from one lane
1824 // to another.
1825 ShuffleMask[DestLane] = SourceLane;
1826 // The zero mask may override the previous insert operation.
1827 for (unsigned i = 0; i < 4; ++i)
1828 if ((ZMask >> i) & 0x1)
// i + 4 indexes into V1, which is now the zero vector.
1829 ShuffleMask[i] = i + 4;
1830 } else {
1831 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1832 return nullptr;
1833 }
1834 } else {
1835 // Replace the selected destination lane with the selected source lane.
// + 4 selects the lane from the second shuffle operand (arg 1).
1836 ShuffleMask[DestLane] = SourceLane + 4;
1837 }
1838
1839 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1840}
1841
1842/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1843/// or conversion to a shuffle vector.
// NOTE(review): SSE4A EXTRQ/EXTRQI simplification. CILength/CIIndex may be
// null (non-constant); constant folding and the shuffle conversion only fire
// when both are present. The function's opening line (name and leading
// parameters, presumably `IntrinsicInst &II, Value *Op0`) was dropped from
// this listing.
1845 ConstantInt *CILength, ConstantInt *CIIndex,
1846 InstCombiner::BuilderTy &Builder) {
// Helper: build the <2 x i64> result {Val, undef} -- EXTRQ only defines the
// low 64-bit element of its result.
1847 auto LowConstantHighUndef = [&](uint64_t Val) {
1848 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1849 Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1850 UndefValue::get(IntTy64)};
1851 return ConstantVector::get(Args);
1852 };
1853
1854 // See if we're dealing with constant values.
1855 auto *C0 = dyn_cast<Constant>(Op0);
// CI0: element 0 of Op0 when it is a constant integer, else null.
1856 auto *CI0 =
1857 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1858 : nullptr;
1859
1860 // Attempt to constant fold.
1861 if (CILength && CIIndex) {
1862 // From AMD documentation: "The bit index and field length are each six
1863 // bits in length other bits of the field are ignored."
1864 APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1865 APInt APLength = CILength->getValue().zextOrTrunc(6);
1866
1867 unsigned Index = APIndex.getZExtValue();
1868
1869 // From AMD documentation: "a value of zero in the field length is
1870 // defined as length of 64".
1871 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1872
1873 // From AMD documentation: "If the sum of the bit index + length field
1874 // is greater than 64, the results are undefined".
1875 unsigned End = Index + Length;
1876
1877 // Note that both field index and field length are 8-bit quantities.
1878 // Since variables 'Index' and 'Length' are unsigned values
1879 // obtained from zero-extending field index and field length
1880 // respectively, their sum should never wrap around.
1881 if (End > 64)
1882 return UndefValue::get(II.getType());
1883
1884 // If we are inserting whole bytes, we can convert this to a shuffle.
1885 // Lowering can recognize EXTRQI shuffle masks.
1886 if ((Length % 8) == 0 && (Index % 8) == 0) {
1887 // Convert bit indices to byte indices.
1888 Length /= 8;
1889 Index /= 8;
1890
1891 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1892 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1893
// Mask layout: bytes [0,Length) come from Op0 starting at Index; the rest
// of the low 8 bytes (indices 16+) come from the zero vector; the high 8
// bytes of the result are undef (-1).
1894 SmallVector<int, 16> ShuffleMask;
1895 for (int i = 0; i != (int)Length; ++i)
1896 ShuffleMask.push_back(i + Index);
1897 for (int i = Length; i != 8; ++i)
1898 ShuffleMask.push_back(i + 16);
1899 for (int i = 8; i != 16; ++i)
1900 ShuffleMask.push_back(-1);
1901
1902 Value *SV = Builder.CreateShuffleVector(
1903 Builder.CreateBitCast(Op0, ShufTy),
1904 ConstantAggregateZero::get(ShufTy), ShuffleMask);
1905 return Builder.CreateBitCast(SV, II.getType());
1906 }
1907
1908 // Constant Fold - shift Index'th bit to lowest position and mask off
1909 // Length bits.
1910 if (CI0) {
1911 APInt Elt = CI0->getValue();
1912 Elt.lshrInPlace(Index);
// Truncating to Length bits zero-fills everything above the field.
1913 Elt = Elt.zextOrTrunc(Length);
1914 return LowConstantHighUndef(Elt.getZExtValue());
1915 }
1916
1917 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1918 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1919 Value *Args[] = {Op0, CILength, CIIndex};
1920 return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, Args);
1921 }
1922 }
1923
1924 // Constant Fold - extraction from zero is always {zero, undef}.
1925 if (CI0 && CI0->isZero())
1926 return LowConstantHighUndef(0);
1927
1928 return nullptr;
1929}
1930
1931/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1932/// folding or conversion to a shuffle vector.
// NOTE(review): SSE4A INSERTQ/INSERTQI simplification. APLength/APIndex are
// taken by value and normalized below. The function's opening line (name and
// leading parameters, presumably `IntrinsicInst &II, Value *Op0, Value *Op1`)
// was dropped from this listing.
1934 APInt APLength, APInt APIndex,
1935 InstCombiner::BuilderTy &Builder) {
1936 // From AMD documentation: "The bit index and field length are each six bits
1937 // in length other bits of the field are ignored."
1938 APIndex = APIndex.zextOrTrunc(6);
1939 APLength = APLength.zextOrTrunc(6);
1940
1941 // Attempt to constant fold.
1942 unsigned Index = APIndex.getZExtValue();
1943
1944 // From AMD documentation: "a value of zero in the field length is
1945 // defined as length of 64".
1946 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1947
1948 // From AMD documentation: "If the sum of the bit index + length field
1949 // is greater than 64, the results are undefined".
1950 unsigned End = Index + Length;
1951
1952 // Note that both field index and field length are 8-bit quantities.
1953 // Since variables 'Index' and 'Length' are unsigned values
1954 // obtained from zero-extending field index and field length
1955 // respectively, their sum should never wrap around.
1956 if (End > 64)
1957 return UndefValue::get(II.getType());
1958
1959 // If we are inserting whole bytes, we can convert this to a shuffle.
1960 // Lowering can recognize INSERTQI shuffle masks.
1961 if ((Length % 8) == 0 && (Index % 8) == 0) {
1962 // Convert bit indices to byte indices.
1963 Length /= 8;
1964 Index /= 8;
1965
1966 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1967 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1968
// Mask layout: Op0 bytes below Index, then Length bytes from Op1 (indices
// 16+ select the second shuffle operand), then Op0 bytes up to byte 8; the
// high 8 bytes of the result are undef (-1).
1969 SmallVector<int, 16> ShuffleMask;
1970 for (int i = 0; i != (int)Index; ++i)
1971 ShuffleMask.push_back(i);
1972 for (int i = 0; i != (int)Length; ++i)
1973 ShuffleMask.push_back(i + 16);
1974 for (int i = Index + Length; i != 8; ++i)
1975 ShuffleMask.push_back(i);
1976 for (int i = 8; i != 16; ++i)
1977 ShuffleMask.push_back(-1);
1978
1979 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1980 Builder.CreateBitCast(Op1, ShufTy),
1981 ShuffleMask);
1982 return Builder.CreateBitCast(SV, II.getType());
1983 }
1984
1985 // See if we're dealing with constant values.
1986 auto *C0 = dyn_cast<Constant>(Op0);
1987 auto *C1 = dyn_cast<Constant>(Op1);
// CI00/CI10: element 0 of Op0/Op1 when they are constant integers.
1988 auto *CI00 =
1989 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1990 : nullptr;
1991 auto *CI10 =
1992 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1993 : nullptr;
1994
1995 // Constant Fold - insert bottom Length bits starting at the Index'th bit.
1996 if (CI00 && CI10) {
1997 APInt V00 = CI00->getValue();
1998 APInt V10 = CI10->getValue();
// Clear the destination field in V00, then position V10's low Length bits.
1999 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
2000 V00 = V00 & ~Mask;
2001 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
2002 APInt Val = V00 | V10;
2003 Type *IntTy64 = Type::getInt64Ty(II.getContext());
// INSERTQ only defines the low 64-bit element; the high one is undef.
2004 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
2005 UndefValue::get(IntTy64)};
2006 return ConstantVector::get(Args);
2007 }
2008
2009 // If we were an INSERTQ call, we'll save demanded elements if we convert to
2010 // INSERTQI.
2011 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
2012 Type *IntTy8 = Type::getInt8Ty(II.getContext());
2013 Constant *CILength = ConstantInt::get(IntTy8, Length, false);
2014 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
2015
2016 Value *Args[] = {Op0, Op1, CILength, CIIndex};
2017 return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, Args);
2018 }
2019
2020 return nullptr;
2021}
2022
2023/// Attempt to convert pshufb* to shufflevector if the mask is constant.
// NOTE(review): converts a pshufb-style intrinsic with a constant control
// mask into a generic shufflevector against a zero vector. Returns nullptr
// when the mask is not constant or contains unexpected element kinds. The
// function's opening line was dropped from this listing.
2025 InstCombiner::BuilderTy &Builder) {
2026 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2027 if (!V)
2028 return nullptr;
2029
2030 auto *VecTy = cast<FixedVectorType>(II.getType());
2031 unsigned NumElts = VecTy->getNumElements();
2032 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
2033 "Unexpected number of elements in shuffle mask!");
2034
2035 // Construct a shuffle mask from constant integers or UNDEFs.
2036 int Indexes[64];
2037
2038 // Each byte in the shuffle control mask forms an index to permute the
2039 // corresponding byte in the destination operand.
2040 for (unsigned I = 0; I < NumElts; ++I) {
2041 Constant *COp = V->getAggregateElement(I);
2042 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2043 return nullptr;
2044
// Undef control byte -> undef result lane.
2045 if (isa<UndefValue>(COp)) {
2046 Indexes[I] = -1;
2047 continue;
2048 }
2049
// Signed so that a set bit[7] makes the value negative (tested below).
2050 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
2051
2052 // If the most significant bit (bit[7]) of each byte of the shuffle
2053 // control mask is set, then zero is written in the result byte.
2054 // The zero vector is in the right-hand side of the resulting
2055 // shufflevector.
2056
2057 // The value of each index for the high 128-bit lane is the least
2058 // significant 4 bits of the respective shuffle control byte.
// (I & 0xF0) keeps each index within its own 128-bit lane; adding NumElts
// redirects zeroed lanes into the second (null) shuffle operand.
2059 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
2060 Indexes[I] = Index;
2061 }
2062
2063 auto V1 = II.getArgOperand(0);
2064 auto V2 = Constant::getNullValue(VecTy);
2065 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
2066}
2067
2068/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
// NOTE(review): converts a vpermilvar* intrinsic with a constant index vector
// into a generic single-source shufflevector. The function's opening line was
// dropped from this listing.
2070 InstCombiner::BuilderTy &Builder) {
2071 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2072 if (!V)
2073 return nullptr;
2074
2075 auto *VecTy = cast<FixedVectorType>(II.getType());
2076 unsigned NumElts = VecTy->getNumElements();
// PD (double) variants have 2 elements per 128-bit lane, PS variants 4.
2077 bool IsPD = VecTy->getScalarType()->isDoubleTy();
2078 unsigned NumLaneElts = IsPD ? 2 : 4;
2079 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
2080
2081 // Construct a shuffle mask from constant integers or UNDEFs.
2082 int Indexes[16];
2083
2084 // The intrinsics only read one or two bits, clear the rest.
2085 for (unsigned I = 0; I < NumElts; ++I) {
2086 Constant *COp = V->getAggregateElement(I);
2087 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2088 return nullptr;
2089
// Undef control element -> undef result lane.
2090 if (isa<UndefValue>(COp)) {
2091 Indexes[I] = -1;
2092 continue;
2093 }
2094
2095 APInt Index = cast<ConstantInt>(COp)->getValue();
2096 Index = Index.zextOrTrunc(32).getLoBits(2);
2097
2098 // The PD variants uses bit 1 to select per-lane element index, so
2099 // shift down to convert to generic shuffle mask index.
2100 if (IsPD)
2101 Index.lshrInPlace(1);
2102
2103 // The _256 variants are a bit trickier since the mask bits always index
2104 // into the corresponding 128 half. In order to convert to a generic
2105 // shuffle, we have to make that explicit.
2106 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2107
2108 Indexes[I] = Index.getZExtValue();
2109 }
2110
2111 auto V1 = II.getArgOperand(0);
2112 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2113}
2114
2115/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
// NOTE(review): converts a vpermd/vpermps-style intrinsic with a constant
// index vector into a generic single-source shufflevector. The function's
// opening line was dropped from this listing.
2117 InstCombiner::BuilderTy &Builder) {
2118 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2119 if (!V)
2120 return nullptr;
2121
2122 auto *VecTy = cast<FixedVectorType>(II.getType());
2123 unsigned Size = VecTy->getNumElements();
2124 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2125 "Unexpected shuffle mask size");
2126
2127 // Construct a shuffle mask from constant integers or UNDEFs.
2128 int Indexes[64];
2129
2130 for (unsigned I = 0; I < Size; ++I) {
2131 Constant *COp = V->getAggregateElement(I);
2132 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2133 return nullptr;
2134
// Undef control element -> undef result lane.
2135 if (isa<UndefValue>(COp)) {
2136 Indexes[I] = -1;
2137 continue;
2138 }
2139
2140 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
// Only the low log2(Size) bits select an element (Size is a power of two).
2141 Index &= Size - 1;
2142 Indexes[I] = Index;
2143 }
2144
2145 auto V1 = II.getArgOperand(0);
2146 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2147}
2148
2149/// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
// NOTE(review): converts a vpermi2/vpermt2-style intrinsic (two data sources,
// operands 0 and 2, indexed by operand 1) with a constant index vector into a
// generic two-source shufflevector. The function's opening line was dropped
// from this listing.
2151 InstCombiner::BuilderTy &Builder) {
2152 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2153 if (!V)
2154 return nullptr;
2155
2156 auto *VecTy = cast<FixedVectorType>(II.getType());
2157 unsigned Size = VecTy->getNumElements();
2158 assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
2159 Size == 64) &&
2160 "Unexpected shuffle mask size");
2161
2162 // Construct a shuffle mask from constant integers or UNDEFs.
2163 int Indexes[64];
2164
2165 for (unsigned I = 0; I < Size; ++I) {
2166 Constant *COp = V->getAggregateElement(I);
2167 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2168 return nullptr;
2169
// Undef control element -> undef result lane.
2170 if (isa<UndefValue>(COp)) {
2171 Indexes[I] = -1;
2172 continue;
2173 }
2174
2175 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
// One extra bit selects between the two sources: indices in [Size, 2*Size)
// pick from the second shuffle operand.
2176 Index &= (2 * Size) - 1;
2177 Indexes[I] = Index;
2178 }
2179
2180 auto V1 = II.getArgOperand(0);
2181 auto V2 = II.getArgOperand(2);
2182 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
2183}
2184
2185// Simplify VPERMV/VPERMV3 mask - only demand the active index bits.
2186static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary,
2187 InstCombiner &IC) {
2188 auto *VecTy = cast<FixedVectorType>(II->getType());
2189 unsigned EltSizeInBits = VecTy->getScalarSizeInBits();
2190 unsigned NumElts = VecTy->getNumElements();
2191 assert(isPowerOf2_32(NumElts) && isPowerOf2_32(EltSizeInBits) &&
2192 "Unexpected shuffle mask size");
2193
2194 unsigned IdxSizeInBits = Log2_32(IsBinary ? (2 * NumElts) : NumElts);
2195 APInt DemandedMask = APInt::getLowBitsSet(EltSizeInBits, IdxSizeInBits);
2196
2197 KnownBits KnownMask(EltSizeInBits);
2198 return IC.SimplifyDemandedBits(II, /*OpNo=*/1, DemandedMask, KnownMask);
2199}
2200
2201std::optional<Instruction *>
2203 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2204 unsigned DemandedWidth) {
2205 APInt UndefElts(Width, 0);
2206 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2207 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2208 };
2209
2210 Intrinsic::ID IID = II.getIntrinsicID();
2211 switch (IID) {
2212 case Intrinsic::x86_bmi_bextr_32:
2213 case Intrinsic::x86_bmi_bextr_64:
2214 case Intrinsic::x86_tbm_bextri_u32:
2215 case Intrinsic::x86_tbm_bextri_u64:
2216 // If the RHS is a constant we can try some simplifications.
2217 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2218 uint64_t Shift = C->getZExtValue();
2219 uint64_t Length = (Shift >> 8) & 0xff;
2220 Shift &= 0xff;
2221 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2222 // If the length is 0 or the shift is out of range, replace with zero.
2223 if (Length == 0 || Shift >= BitWidth) {
2224 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2225 }
2226 // If the LHS is also a constant, we can completely constant fold this.
2227 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2228 uint64_t Result = InC->getZExtValue() >> Shift;
2229 if (Length > BitWidth)
2230 Length = BitWidth;
2232 return IC.replaceInstUsesWith(II,
2233 ConstantInt::get(II.getType(), Result));
2234 }
2235 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2236 // are only masking bits that a shift already cleared?
2237 }
2238 break;
2239
2240 case Intrinsic::x86_bmi_bzhi_32:
2241 case Intrinsic::x86_bmi_bzhi_64:
2242 // If the RHS is a constant we can try some simplifications.
2243 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2244 uint64_t Index = C->getZExtValue() & 0xff;
2245 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2246 if (Index >= BitWidth) {
2247 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2248 }
2249 if (Index == 0) {
2250 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2251 }
2252 // If the LHS is also a constant, we can completely constant fold this.
2253 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2254 uint64_t Result = InC->getZExtValue();
2255 Result &= maskTrailingOnes<uint64_t>(Index);
2256 return IC.replaceInstUsesWith(II,
2257 ConstantInt::get(II.getType(), Result));
2258 }
2259 // TODO should we convert this to an AND if the RHS is constant?
2260 }
2261 break;
2262 case Intrinsic::x86_bmi_pext_32:
2263 case Intrinsic::x86_bmi_pext_64:
2264 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2265 if (MaskC->isNullValue()) {
2266 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2267 }
2268 if (MaskC->isAllOnesValue()) {
2269 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2270 }
2271
2272 unsigned MaskIdx, MaskLen;
2273 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2274 // Any single contiguous sequence of 1s anywhere in the mask simply
2275 // describes a subset of the input bits shifted to the appropriate
2276 // position. Replace with the straightforward IR.
2277 Value *Input = II.getArgOperand(0);
2278 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2279 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2280 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2281 return IC.replaceInstUsesWith(II, Shifted);
2282 }
2283
2284 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2285 uint64_t Src = SrcC->getZExtValue();
2286 uint64_t Mask = MaskC->getZExtValue();
2287 uint64_t Result = 0;
2288 uint64_t BitToSet = 1;
2289
2290 while (Mask) {
2291 // Isolate lowest set bit.
2292 uint64_t BitToTest = Mask & -Mask;
2293 if (BitToTest & Src)
2294 Result |= BitToSet;
2295
2296 BitToSet <<= 1;
2297 // Clear lowest set bit.
2298 Mask &= Mask - 1;
2299 }
2300
2301 return IC.replaceInstUsesWith(II,
2302 ConstantInt::get(II.getType(), Result));
2303 }
2304 }
2305 break;
2306 case Intrinsic::x86_bmi_pdep_32:
2307 case Intrinsic::x86_bmi_pdep_64:
2308 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2309 if (MaskC->isNullValue()) {
2310 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2311 }
2312 if (MaskC->isAllOnesValue()) {
2313 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2314 }
2315
2316 unsigned MaskIdx, MaskLen;
2317 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2318 // Any single contiguous sequence of 1s anywhere in the mask simply
2319 // describes a subset of the input bits shifted to the appropriate
2320 // position. Replace with the straightforward IR.
2321 Value *Input = II.getArgOperand(0);
2322 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2323 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2324 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2325 return IC.replaceInstUsesWith(II, Masked);
2326 }
2327
2328 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2329 uint64_t Src = SrcC->getZExtValue();
2330 uint64_t Mask = MaskC->getZExtValue();
2331 uint64_t Result = 0;
2332 uint64_t BitToTest = 1;
2333
2334 while (Mask) {
2335 // Isolate lowest set bit.
2336 uint64_t BitToSet = Mask & -Mask;
2337 if (BitToTest & Src)
2338 Result |= BitToSet;
2339
2340 BitToTest <<= 1;
2341 // Clear lowest set bit.
2342 Mask &= Mask - 1;
2343 }
2344
2345 return IC.replaceInstUsesWith(II,
2346 ConstantInt::get(II.getType(), Result));
2347 }
2348 }
2349 break;
2350
2351 case Intrinsic::x86_sse_cvtss2si:
2352 case Intrinsic::x86_sse_cvtss2si64:
2353 case Intrinsic::x86_sse_cvttss2si:
2354 case Intrinsic::x86_sse_cvttss2si64:
2355 case Intrinsic::x86_sse2_cvtsd2si:
2356 case Intrinsic::x86_sse2_cvtsd2si64:
2357 case Intrinsic::x86_sse2_cvttsd2si:
2358 case Intrinsic::x86_sse2_cvttsd2si64:
2359 case Intrinsic::x86_avx512_vcvtss2si32:
2360 case Intrinsic::x86_avx512_vcvtss2si64:
2361 case Intrinsic::x86_avx512_vcvtss2usi32:
2362 case Intrinsic::x86_avx512_vcvtss2usi64:
2363 case Intrinsic::x86_avx512_vcvtsd2si32:
2364 case Intrinsic::x86_avx512_vcvtsd2si64:
2365 case Intrinsic::x86_avx512_vcvtsd2usi32:
2366 case Intrinsic::x86_avx512_vcvtsd2usi64:
2367 case Intrinsic::x86_avx512_cvttss2si:
2368 case Intrinsic::x86_avx512_cvttss2si64:
2369 case Intrinsic::x86_avx512_cvttss2usi:
2370 case Intrinsic::x86_avx512_cvttss2usi64:
2371 case Intrinsic::x86_avx512_cvttsd2si:
2372 case Intrinsic::x86_avx512_cvttsd2si64:
2373 case Intrinsic::x86_avx512_cvttsd2usi:
2374 case Intrinsic::x86_avx512_cvttsd2usi64: {
2375 // These intrinsics only demand the 0th element of their input vectors. If
2376 // we can simplify the input based on that, do so now.
2377 Value *Arg = II.getArgOperand(0);
2378 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2379 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2380 return IC.replaceOperand(II, 0, V);
2381 }
2382 break;
2383 }
2384
2385 case Intrinsic::x86_mmx_pmovmskb:
2386 case Intrinsic::x86_sse_movmsk_ps:
2387 case Intrinsic::x86_sse2_movmsk_pd:
2388 case Intrinsic::x86_sse2_pmovmskb_128:
2389 case Intrinsic::x86_avx_movmsk_pd_256:
2390 case Intrinsic::x86_avx_movmsk_ps_256:
2391 case Intrinsic::x86_avx2_pmovmskb:
2392 if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2393 return IC.replaceInstUsesWith(II, V);
2394 }
2395 break;
2396
2397 case Intrinsic::x86_sse_comieq_ss:
2398 case Intrinsic::x86_sse_comige_ss:
2399 case Intrinsic::x86_sse_comigt_ss:
2400 case Intrinsic::x86_sse_comile_ss:
2401 case Intrinsic::x86_sse_comilt_ss:
2402 case Intrinsic::x86_sse_comineq_ss:
2403 case Intrinsic::x86_sse_ucomieq_ss:
2404 case Intrinsic::x86_sse_ucomige_ss:
2405 case Intrinsic::x86_sse_ucomigt_ss:
2406 case Intrinsic::x86_sse_ucomile_ss:
2407 case Intrinsic::x86_sse_ucomilt_ss:
2408 case Intrinsic::x86_sse_ucomineq_ss:
2409 case Intrinsic::x86_sse2_comieq_sd:
2410 case Intrinsic::x86_sse2_comige_sd:
2411 case Intrinsic::x86_sse2_comigt_sd:
2412 case Intrinsic::x86_sse2_comile_sd:
2413 case Intrinsic::x86_sse2_comilt_sd:
2414 case Intrinsic::x86_sse2_comineq_sd:
2415 case Intrinsic::x86_sse2_ucomieq_sd:
2416 case Intrinsic::x86_sse2_ucomige_sd:
2417 case Intrinsic::x86_sse2_ucomigt_sd:
2418 case Intrinsic::x86_sse2_ucomile_sd:
2419 case Intrinsic::x86_sse2_ucomilt_sd:
2420 case Intrinsic::x86_sse2_ucomineq_sd:
2421 case Intrinsic::x86_avx512_vcomi_ss:
2422 case Intrinsic::x86_avx512_vcomi_sd:
2423 case Intrinsic::x86_avx512_mask_cmp_ss:
2424 case Intrinsic::x86_avx512_mask_cmp_sd: {
2425 // These intrinsics only demand the 0th element of their input vectors. If
2426 // we can simplify the input based on that, do so now.
2427 bool MadeChange = false;
2428 Value *Arg0 = II.getArgOperand(0);
2429 Value *Arg1 = II.getArgOperand(1);
2430 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2431 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2432 IC.replaceOperand(II, 0, V);
2433 MadeChange = true;
2434 }
2435 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2436 IC.replaceOperand(II, 1, V);
2437 MadeChange = true;
2438 }
2439 if (MadeChange) {
2440 return &II;
2441 }
2442 break;
2443 }
2444
2445 case Intrinsic::x86_avx512_add_ps_512:
2446 case Intrinsic::x86_avx512_div_ps_512:
2447 case Intrinsic::x86_avx512_mul_ps_512:
2448 case Intrinsic::x86_avx512_sub_ps_512:
2449 case Intrinsic::x86_avx512_add_pd_512:
2450 case Intrinsic::x86_avx512_div_pd_512:
2451 case Intrinsic::x86_avx512_mul_pd_512:
2452 case Intrinsic::x86_avx512_sub_pd_512:
2453 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2454 // IR operations.
2455 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2456 if (R->getValue() == 4) {
2457 Value *Arg0 = II.getArgOperand(0);
2458 Value *Arg1 = II.getArgOperand(1);
2459
2460 Value *V;
2461 switch (IID) {
2462 default:
2463 llvm_unreachable("Case stmts out of sync!");
2464 case Intrinsic::x86_avx512_add_ps_512:
2465 case Intrinsic::x86_avx512_add_pd_512:
2466 V = IC.Builder.CreateFAdd(Arg0, Arg1);
2467 break;
2468 case Intrinsic::x86_avx512_sub_ps_512:
2469 case Intrinsic::x86_avx512_sub_pd_512:
2470 V = IC.Builder.CreateFSub(Arg0, Arg1);
2471 break;
2472 case Intrinsic::x86_avx512_mul_ps_512:
2473 case Intrinsic::x86_avx512_mul_pd_512:
2474 V = IC.Builder.CreateFMul(Arg0, Arg1);
2475 break;
2476 case Intrinsic::x86_avx512_div_ps_512:
2477 case Intrinsic::x86_avx512_div_pd_512:
2478 V = IC.Builder.CreateFDiv(Arg0, Arg1);
2479 break;
2480 }
2481
2482 return IC.replaceInstUsesWith(II, V);
2483 }
2484 }
2485 break;
2486
2487 case Intrinsic::x86_avx512_mask_add_ss_round:
2488 case Intrinsic::x86_avx512_mask_div_ss_round:
2489 case Intrinsic::x86_avx512_mask_mul_ss_round:
2490 case Intrinsic::x86_avx512_mask_sub_ss_round:
2491 case Intrinsic::x86_avx512_mask_add_sd_round:
2492 case Intrinsic::x86_avx512_mask_div_sd_round:
2493 case Intrinsic::x86_avx512_mask_mul_sd_round:
2494 case Intrinsic::x86_avx512_mask_sub_sd_round:
2495 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2496 // IR operations.
2497 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2498 if (R->getValue() == 4) {
2499 // Extract the element as scalars.
2500 Value *Arg0 = II.getArgOperand(0);
2501 Value *Arg1 = II.getArgOperand(1);
2502 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2503 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2504
2505 Value *V;
2506 switch (IID) {
2507 default:
2508 llvm_unreachable("Case stmts out of sync!");
2509 case Intrinsic::x86_avx512_mask_add_ss_round:
2510 case Intrinsic::x86_avx512_mask_add_sd_round:
2511 V = IC.Builder.CreateFAdd(LHS, RHS);
2512 break;
2513 case Intrinsic::x86_avx512_mask_sub_ss_round:
2514 case Intrinsic::x86_avx512_mask_sub_sd_round:
2515 V = IC.Builder.CreateFSub(LHS, RHS);
2516 break;
2517 case Intrinsic::x86_avx512_mask_mul_ss_round:
2518 case Intrinsic::x86_avx512_mask_mul_sd_round:
2519 V = IC.Builder.CreateFMul(LHS, RHS);
2520 break;
2521 case Intrinsic::x86_avx512_mask_div_ss_round:
2522 case Intrinsic::x86_avx512_mask_div_sd_round:
2523 V = IC.Builder.CreateFDiv(LHS, RHS);
2524 break;
2525 }
2526
2527 // Handle the masking aspect of the intrinsic.
2528 Value *Mask = II.getArgOperand(3);
2529 auto *C = dyn_cast<ConstantInt>(Mask);
2530 // We don't need a select if we know the mask bit is a 1.
2531 if (!C || !C->getValue()[0]) {
2532 // Cast the mask to an i1 vector and then extract the lowest element.
2533 auto *MaskTy = FixedVectorType::get(
2534 IC.Builder.getInt1Ty(),
2535 cast<IntegerType>(Mask->getType())->getBitWidth());
2536 Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2537 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2538 // Extract the lowest element from the passthru operand.
2539 Value *Passthru =
2540 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2541 V = IC.Builder.CreateSelect(Mask, V, Passthru);
2542 }
2543
2544 // Insert the result back into the original argument 0.
2545 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2546
2547 return IC.replaceInstUsesWith(II, V);
2548 }
2549 }
2550 break;
2551
2552 // Generalize SSE/AVX FP to maxnum/minnum.
2553 case Intrinsic::x86_sse_max_ps:
2554 case Intrinsic::x86_sse2_max_pd:
2555 case Intrinsic::x86_avx_max_pd_256:
2556 case Intrinsic::x86_avx_max_ps_256:
2557 case Intrinsic::x86_avx512_max_pd_512:
2558 case Intrinsic::x86_avx512_max_ps_512:
2559 case Intrinsic::x86_avx512fp16_max_ph_128:
2560 case Intrinsic::x86_avx512fp16_max_ph_256:
2561 case Intrinsic::x86_avx512fp16_max_ph_512:
2562 if (Value *V = simplifyX86FPMaxMin(II, IC, Intrinsic::maxnum))
2563 return IC.replaceInstUsesWith(II, V);
2564 break;
2565 case Intrinsic::x86_sse_max_ss:
2566 case Intrinsic::x86_sse2_max_sd: {
2567 if (Value *V = simplifyX86FPMaxMin(II, IC, Intrinsic::maxnum, true))
2568 return IC.replaceInstUsesWith(II, V);
2569 break;
2570 }
2571
2572 case Intrinsic::x86_sse_min_ps:
2573 case Intrinsic::x86_sse2_min_pd:
2574 case Intrinsic::x86_avx_min_pd_256:
2575 case Intrinsic::x86_avx_min_ps_256:
2576 case Intrinsic::x86_avx512_min_pd_512:
2577 case Intrinsic::x86_avx512_min_ps_512:
2578 case Intrinsic::x86_avx512fp16_min_ph_128:
2579 case Intrinsic::x86_avx512fp16_min_ph_256:
2580 case Intrinsic::x86_avx512fp16_min_ph_512:
2581 if (Value *V = simplifyX86FPMaxMin(II, IC, Intrinsic::minnum))
2582 return IC.replaceInstUsesWith(II, V);
2583 break;
2584
2585 case Intrinsic::x86_sse_min_ss:
2586 case Intrinsic::x86_sse2_min_sd: {
2587 if (Value *V = simplifyX86FPMaxMin(II, IC, Intrinsic::minnum, true))
2588 return IC.replaceInstUsesWith(II, V);
2589 break;
2590 }
2591
2592 // Constant fold ashr( <A x Bi>, Ci ).
2593 // Constant fold lshr( <A x Bi>, Ci ).
2594 // Constant fold shl( <A x Bi>, Ci ).
2595 case Intrinsic::x86_sse2_psrai_d:
2596 case Intrinsic::x86_sse2_psrai_w:
2597 case Intrinsic::x86_avx2_psrai_d:
2598 case Intrinsic::x86_avx2_psrai_w:
2599 case Intrinsic::x86_avx512_psrai_q_128:
2600 case Intrinsic::x86_avx512_psrai_q_256:
2601 case Intrinsic::x86_avx512_psrai_d_512:
2602 case Intrinsic::x86_avx512_psrai_q_512:
2603 case Intrinsic::x86_avx512_psrai_w_512:
2604 case Intrinsic::x86_sse2_psrli_d:
2605 case Intrinsic::x86_sse2_psrli_q:
2606 case Intrinsic::x86_sse2_psrli_w:
2607 case Intrinsic::x86_avx2_psrli_d:
2608 case Intrinsic::x86_avx2_psrli_q:
2609 case Intrinsic::x86_avx2_psrli_w:
2610 case Intrinsic::x86_avx512_psrli_d_512:
2611 case Intrinsic::x86_avx512_psrli_q_512:
2612 case Intrinsic::x86_avx512_psrli_w_512:
2613 case Intrinsic::x86_sse2_pslli_d:
2614 case Intrinsic::x86_sse2_pslli_q:
2615 case Intrinsic::x86_sse2_pslli_w:
2616 case Intrinsic::x86_avx2_pslli_d:
2617 case Intrinsic::x86_avx2_pslli_q:
2618 case Intrinsic::x86_avx2_pslli_w:
2619 case Intrinsic::x86_avx512_pslli_d_512:
2620 case Intrinsic::x86_avx512_pslli_q_512:
2621 case Intrinsic::x86_avx512_pslli_w_512:
2622 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2623 return IC.replaceInstUsesWith(II, V);
2624 }
2625 break;
2626
2627 case Intrinsic::x86_sse2_psra_d:
2628 case Intrinsic::x86_sse2_psra_w:
2629 case Intrinsic::x86_avx2_psra_d:
2630 case Intrinsic::x86_avx2_psra_w:
2631 case Intrinsic::x86_avx512_psra_q_128:
2632 case Intrinsic::x86_avx512_psra_q_256:
2633 case Intrinsic::x86_avx512_psra_d_512:
2634 case Intrinsic::x86_avx512_psra_q_512:
2635 case Intrinsic::x86_avx512_psra_w_512:
2636 case Intrinsic::x86_sse2_psrl_d:
2637 case Intrinsic::x86_sse2_psrl_q:
2638 case Intrinsic::x86_sse2_psrl_w:
2639 case Intrinsic::x86_avx2_psrl_d:
2640 case Intrinsic::x86_avx2_psrl_q:
2641 case Intrinsic::x86_avx2_psrl_w:
2642 case Intrinsic::x86_avx512_psrl_d_512:
2643 case Intrinsic::x86_avx512_psrl_q_512:
2644 case Intrinsic::x86_avx512_psrl_w_512:
2645 case Intrinsic::x86_sse2_psll_d:
2646 case Intrinsic::x86_sse2_psll_q:
2647 case Intrinsic::x86_sse2_psll_w:
2648 case Intrinsic::x86_avx2_psll_d:
2649 case Intrinsic::x86_avx2_psll_q:
2650 case Intrinsic::x86_avx2_psll_w:
2651 case Intrinsic::x86_avx512_psll_d_512:
2652 case Intrinsic::x86_avx512_psll_q_512:
2653 case Intrinsic::x86_avx512_psll_w_512: {
2654 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2655 return IC.replaceInstUsesWith(II, V);
2656 }
2657
2658 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2659 // operand to compute the shift amount.
2660 Value *Arg1 = II.getArgOperand(1);
2661 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2662 "Unexpected packed shift size");
2663 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2664
2665 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2666 return IC.replaceOperand(II, 1, V);
2667 }
2668 break;
2669 }
2670
2671 case Intrinsic::x86_avx2_psllv_d:
2672 case Intrinsic::x86_avx2_psllv_d_256:
2673 case Intrinsic::x86_avx2_psllv_q:
2674 case Intrinsic::x86_avx2_psllv_q_256:
2675 case Intrinsic::x86_avx512_psllv_d_512:
2676 case Intrinsic::x86_avx512_psllv_q_512:
2677 case Intrinsic::x86_avx512_psllv_w_128:
2678 case Intrinsic::x86_avx512_psllv_w_256:
2679 case Intrinsic::x86_avx512_psllv_w_512:
2680 case Intrinsic::x86_avx2_psrav_d:
2681 case Intrinsic::x86_avx2_psrav_d_256:
2682 case Intrinsic::x86_avx512_psrav_q_128:
2683 case Intrinsic::x86_avx512_psrav_q_256:
2684 case Intrinsic::x86_avx512_psrav_d_512:
2685 case Intrinsic::x86_avx512_psrav_q_512:
2686 case Intrinsic::x86_avx512_psrav_w_128:
2687 case Intrinsic::x86_avx512_psrav_w_256:
2688 case Intrinsic::x86_avx512_psrav_w_512:
2689 case Intrinsic::x86_avx2_psrlv_d:
2690 case Intrinsic::x86_avx2_psrlv_d_256:
2691 case Intrinsic::x86_avx2_psrlv_q:
2692 case Intrinsic::x86_avx2_psrlv_q_256:
2693 case Intrinsic::x86_avx512_psrlv_d_512:
2694 case Intrinsic::x86_avx512_psrlv_q_512:
2695 case Intrinsic::x86_avx512_psrlv_w_128:
2696 case Intrinsic::x86_avx512_psrlv_w_256:
2697 case Intrinsic::x86_avx512_psrlv_w_512:
2698 if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2699 return IC.replaceInstUsesWith(II, V);
2700 }
2701 break;
2702
2703 case Intrinsic::x86_sse2_packssdw_128:
2704 case Intrinsic::x86_sse2_packsswb_128:
2705 case Intrinsic::x86_avx2_packssdw:
2706 case Intrinsic::x86_avx2_packsswb:
2707 case Intrinsic::x86_avx512_packssdw_512:
2708 case Intrinsic::x86_avx512_packsswb_512:
2709 if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2710 return IC.replaceInstUsesWith(II, V);
2711 }
2712 break;
2713
2714 case Intrinsic::x86_sse2_packuswb_128:
2715 case Intrinsic::x86_sse41_packusdw:
2716 case Intrinsic::x86_avx2_packusdw:
2717 case Intrinsic::x86_avx2_packuswb:
2718 case Intrinsic::x86_avx512_packusdw_512:
2719 case Intrinsic::x86_avx512_packuswb_512:
2720 if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2721 return IC.replaceInstUsesWith(II, V);
2722 }
2723 break;
2724
2725 case Intrinsic::x86_sse2_pmulh_w:
2726 case Intrinsic::x86_avx2_pmulh_w:
2727 case Intrinsic::x86_avx512_pmulh_w_512:
2728 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {
2729 return IC.replaceInstUsesWith(II, V);
2730 }
2731 break;
2732
2733 case Intrinsic::x86_sse2_pmulhu_w:
2734 case Intrinsic::x86_avx2_pmulhu_w:
2735 case Intrinsic::x86_avx512_pmulhu_w_512:
2736 if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {
2737 return IC.replaceInstUsesWith(II, V);
2738 }
2739 break;
2740
2741 case Intrinsic::x86_ssse3_pmul_hr_sw_128:
2742 case Intrinsic::x86_avx2_pmul_hr_sw:
2743 case Intrinsic::x86_avx512_pmul_hr_sw_512:
2744 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {
2745 return IC.replaceInstUsesWith(II, V);
2746 }
2747 break;
2748
2749 case Intrinsic::x86_sse2_pmadd_wd:
2750 case Intrinsic::x86_avx2_pmadd_wd:
2751 case Intrinsic::x86_avx512_pmaddw_d_512:
2752 if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2753 return IC.replaceInstUsesWith(II, V);
2754 }
2755 break;
2756
2757 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2758 case Intrinsic::x86_avx2_pmadd_ub_sw:
2759 case Intrinsic::x86_avx512_pmaddubs_w_512:
2760 if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2761 return IC.replaceInstUsesWith(II, V);
2762 }
2763 break;
2764
2765 case Intrinsic::x86_pclmulqdq:
2766 case Intrinsic::x86_pclmulqdq_256:
2767 case Intrinsic::x86_pclmulqdq_512: {
2768 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2769 unsigned Imm = C->getZExtValue();
2770
2771 bool MadeChange = false;
2772 Value *Arg0 = II.getArgOperand(0);
2773 Value *Arg1 = II.getArgOperand(1);
2774 unsigned VWidth =
2775 cast<FixedVectorType>(Arg0->getType())->getNumElements();
2776
2777 APInt UndefElts1(VWidth, 0);
2778 APInt DemandedElts1 =
2779 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2780 if (Value *V =
2781 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2782 IC.replaceOperand(II, 0, V);
2783 MadeChange = true;
2784 }
2785
2786 APInt UndefElts2(VWidth, 0);
2787 APInt DemandedElts2 =
2788 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2789 if (Value *V =
2790 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2791 IC.replaceOperand(II, 1, V);
2792 MadeChange = true;
2793 }
2794
2795 // If either input elements are undef, the result is zero.
2796 if (DemandedElts1.isSubsetOf(UndefElts1) ||
2797 DemandedElts2.isSubsetOf(UndefElts2)) {
2798 return IC.replaceInstUsesWith(II,
2799 ConstantAggregateZero::get(II.getType()));
2800 }
2801
2802 if (MadeChange) {
2803 return &II;
2804 }
2805 }
2806 break;
2807 }
2808
2809 case Intrinsic::x86_sse41_insertps:
2810 if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2811 return IC.replaceInstUsesWith(II, V);
2812 }
2813 break;
2814
2815 case Intrinsic::x86_sse4a_extrq: {
2816 Value *Op0 = II.getArgOperand(0);
2817 Value *Op1 = II.getArgOperand(1);
2818 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2819 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2820 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2821 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2822 VWidth1 == 16 && "Unexpected operand sizes");
2823
2824 // See if we're dealing with constant values.
2825 auto *C1 = dyn_cast<Constant>(Op1);
2826 auto *CILength =
2827 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2828 : nullptr;
2829 auto *CIIndex =
2830 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2831 : nullptr;
2832
2833 // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2834 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2835 return IC.replaceInstUsesWith(II, V);
2836 }
2837
2838 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2839 // operands and the lowest 16-bits of the second.
2840 bool MadeChange = false;
2841 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2842 IC.replaceOperand(II, 0, V);
2843 MadeChange = true;
2844 }
2845 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2846 IC.replaceOperand(II, 1, V);
2847 MadeChange = true;
2848 }
2849 if (MadeChange) {
2850 return &II;
2851 }
2852 break;
2853 }
2854
2855 case Intrinsic::x86_sse4a_extrqi: {
2856 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2857 // bits of the lower 64-bits. The upper 64-bits are undefined.
2858 Value *Op0 = II.getArgOperand(0);
2859 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2860 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2861 "Unexpected operand size");
2862
2863 // See if we're dealing with constant values.
2864 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2865 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2866
2867 // Attempt to simplify to a constant or shuffle vector.
2868 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2869 return IC.replaceInstUsesWith(II, V);
2870 }
2871
2872 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2873 // operand.
2874 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2875 return IC.replaceOperand(II, 0, V);
2876 }
2877 break;
2878 }
2879
2880 case Intrinsic::x86_sse4a_insertq: {
2881 Value *Op0 = II.getArgOperand(0);
2882 Value *Op1 = II.getArgOperand(1);
2883 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2884 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2885 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2886 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2887 "Unexpected operand size");
2888
2889 // See if we're dealing with constant values.
2890 auto *C1 = dyn_cast<Constant>(Op1);
2891 auto *CI11 =
2892 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2893 : nullptr;
2894
2895 // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2896 if (CI11) {
2897 const APInt &V11 = CI11->getValue();
2898 APInt Len = V11.zextOrTrunc(6);
2899 APInt Idx = V11.lshr(8).zextOrTrunc(6);
2900 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2901 return IC.replaceInstUsesWith(II, V);
2902 }
2903 }
2904
2905 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2906 // operand.
2907 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2908 return IC.replaceOperand(II, 0, V);
2909 }
2910 break;
2911 }
2912
2913 case Intrinsic::x86_sse4a_insertqi: {
2914 // INSERTQI: Extract lowest Length bits from lower half of second source and
2915 // insert over first source starting at Index bit. The upper 64-bits are
2916 // undefined.
2917 Value *Op0 = II.getArgOperand(0);
2918 Value *Op1 = II.getArgOperand(1);
2919 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2920 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2921 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2922 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2923 VWidth1 == 2 && "Unexpected operand sizes");
2924
2925 // See if we're dealing with constant values.
2926 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2927 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2928
2929 // Attempt to simplify to a constant or shuffle vector.
2930 if (CILength && CIIndex) {
2931 APInt Len = CILength->getValue().zextOrTrunc(6);
2932 APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2933 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2934 return IC.replaceInstUsesWith(II, V);
2935 }
2936 }
2937
2938 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2939 // operands.
2940 bool MadeChange = false;
2941 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2942 IC.replaceOperand(II, 0, V);
2943 MadeChange = true;
2944 }
2945 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2946 IC.replaceOperand(II, 1, V);
2947 MadeChange = true;
2948 }
2949 if (MadeChange) {
2950 return &II;
2951 }
2952 break;
2953 }
2954
2955 case Intrinsic::x86_sse41_pblendvb:
2956 case Intrinsic::x86_sse41_blendvps:
2957 case Intrinsic::x86_sse41_blendvpd:
2958 case Intrinsic::x86_avx_blendv_ps_256:
2959 case Intrinsic::x86_avx_blendv_pd_256:
2960 case Intrinsic::x86_avx2_pblendvb: {
2961 // fold (blend A, A, Mask) -> A
2962 auto *OpTy = cast<FixedVectorType>(II.getType());
2963 Value *Op0 = II.getArgOperand(0);
2964 Value *Op1 = II.getArgOperand(1);
2965 Value *Mask = II.getArgOperand(2);
2966 if (Op0 == Op1) {
2967 return IC.replaceInstUsesWith(II, Op0);
2968 }
2969
2970 // Zero Mask - select 1st argument.
2971 if (isa<ConstantAggregateZero>(Mask)) {
2972 return IC.replaceInstUsesWith(II, Op0);
2973 }
2974
2975 // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2976 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2977 Constant *NewSelector =
2978 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2979 return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2980 }
2981 unsigned BitWidth = Mask->getType()->getScalarSizeInBits();
2982
2983 if (Mask->getType()->isIntOrIntVectorTy()) {
2984 KnownBits Known(BitWidth);
2986 return &II;
2987 } else if (auto *BC = dyn_cast<BitCastInst>(Mask)) {
2988 if (BC->hasOneUse()) {
2989 Value *Src = BC->getOperand(0);
2990 if (Src->getType()->isIntOrIntVectorTy()) {
2991 unsigned SrcBitWidth = Src->getType()->getScalarSizeInBits();
2992 if (SrcBitWidth == BitWidth) {
2993 KnownBits KnownSrc(SrcBitWidth);
2994 if (IC.SimplifyDemandedBits(BC, 0, APInt::getSignMask(SrcBitWidth),
2995 KnownSrc))
2996 return &II;
2997 }
2998 }
2999 }
3000 }
3002
3003 // Bitshift up to the signbit can always be converted to an efficient
3004 // test+select pattern.
3005 if (match(Mask, m_Shl(m_Value(), m_Value()))) {
3006 if (auto *MaskTy = dyn_cast<FixedVectorType>(Mask->getType())) {
3007 if (MaskTy->getScalarSizeInBits() == OpTy->getScalarSizeInBits()) {
3008 Value *BoolVec = IC.Builder.CreateICmpSGT(
3009 ConstantAggregateZero::get(MaskTy), Mask);
3010 Value *Sel = IC.Builder.CreateSelect(BoolVec, Op1, Op0);
3011 return new BitCastInst(Sel, II.getType());
3012 }
3013 }
3014 }
3015
3016 // Peek through a one-use shuffle - VectorCombine should have simplified
3017 // this for cases where we're splitting wider vectors to use blendv
3018 // intrinsics.
3019 Value *MaskSrc = nullptr;
3020 ArrayRef<int> ShuffleMask;
3021 if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
3022 m_Mask(ShuffleMask))))) {
3023 // Bail if the shuffle was irregular or contains undefs.
3024 int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
3025 if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
3026 any_of(ShuffleMask,
3027 [NumElts](int M) { return M < 0 || M >= NumElts; }))
3028 break;
3029 Mask = InstCombiner::peekThroughBitcast(MaskSrc);
3030 }
3031
3032 // Convert to a vector select if we can bypass casts and find a boolean
3033 // vector condition value.
3034 Value *BoolVec;
3035 if (match(Mask, m_SExt(m_Value(BoolVec))) &&
3036 BoolVec->getType()->isVectorTy() &&
3037 BoolVec->getType()->getScalarSizeInBits() == 1) {
3038 auto *MaskTy = cast<FixedVectorType>(Mask->getType());
3039 unsigned NumMaskElts = MaskTy->getNumElements();
3040 unsigned NumOperandElts = OpTy->getNumElements();
3041
3042 // If we peeked through a shuffle, reapply the shuffle to the bool vector.
3043 if (MaskSrc) {
3044 unsigned NumMaskSrcElts =
3045 cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
3046 NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
3047 // Multiple mask bits maps to the same operand element - bail out.
3048 if (NumMaskElts > NumOperandElts)
3049 break;
3050 SmallVector<int> ScaledMask;
3051 if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
3052 break;
3053 BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
3054 MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
3055 }
3056 assert(MaskTy->getPrimitiveSizeInBits() ==
3057 OpTy->getPrimitiveSizeInBits() &&
3058 "Not expecting mask and operands with different sizes");
3059
3060 if (NumMaskElts == NumOperandElts) {
3061 return SelectInst::Create(BoolVec, Op1, Op0);
3062 }
3063
3064 // If the mask has less elements than the operands, each mask bit maps to
3065 // multiple elements of the operands. Bitcast back and forth.
3066 if (NumMaskElts < NumOperandElts) {
3067 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
3068 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
3069 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
3070 return new BitCastInst(Sel, II.getType());
3071 }
3072 }
3073
3074 break;
3075 }
3076
3077 case Intrinsic::x86_ssse3_pshuf_b_128:
3078 case Intrinsic::x86_avx2_pshuf_b:
3079 case Intrinsic::x86_avx512_pshuf_b_512: {
3080 if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
3081 return IC.replaceInstUsesWith(II, V);
3082 }
3083
3084 KnownBits KnownMask(8);
3085 if (IC.SimplifyDemandedBits(&II, 1, APInt(8, 0b10001111), KnownMask))
3086 return &II;
3087 break;
3088 }
3089
3090 case Intrinsic::x86_avx_vpermilvar_ps:
3091 case Intrinsic::x86_avx_vpermilvar_ps_256:
3092 case Intrinsic::x86_avx512_vpermilvar_ps_512: {
3093 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
3094 return IC.replaceInstUsesWith(II, V);
3095 }
3096
3097 KnownBits KnownMask(32);
3098 if (IC.SimplifyDemandedBits(&II, 1, APInt(32, 0b00011), KnownMask))
3099 return &II;
3100 break;
3101 }
3102
3103 case Intrinsic::x86_avx_vpermilvar_pd:
3104 case Intrinsic::x86_avx_vpermilvar_pd_256:
3105 case Intrinsic::x86_avx512_vpermilvar_pd_512: {
3106 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
3107 return IC.replaceInstUsesWith(II, V);
3108 }
3109
3110 KnownBits KnownMask(64);
3111 if (IC.SimplifyDemandedBits(&II, 1, APInt(64, 0b00010), KnownMask))
3112 return &II;
3113 break;
3114 }
3115
3116 case Intrinsic::x86_avx2_permd:
3117 case Intrinsic::x86_avx2_permps:
3118 case Intrinsic::x86_avx512_permvar_df_256:
3119 case Intrinsic::x86_avx512_permvar_df_512:
3120 case Intrinsic::x86_avx512_permvar_di_256:
3121 case Intrinsic::x86_avx512_permvar_di_512:
3122 case Intrinsic::x86_avx512_permvar_hi_128:
3123 case Intrinsic::x86_avx512_permvar_hi_256:
3124 case Intrinsic::x86_avx512_permvar_hi_512:
3125 case Intrinsic::x86_avx512_permvar_qi_128:
3126 case Intrinsic::x86_avx512_permvar_qi_256:
3127 case Intrinsic::x86_avx512_permvar_qi_512:
3128 case Intrinsic::x86_avx512_permvar_sf_512:
3129 case Intrinsic::x86_avx512_permvar_si_512:
3130 if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
3131 return IC.replaceInstUsesWith(II, V);
3132 }
3133 if (simplifyX86VPERMMask(&II, /*IsBinary=*/false, IC))
3134 return &II;
3135 break;
3136
3137 case Intrinsic::x86_avx512_vpermi2var_d_128:
3138 case Intrinsic::x86_avx512_vpermi2var_d_256:
3139 case Intrinsic::x86_avx512_vpermi2var_d_512:
3140 case Intrinsic::x86_avx512_vpermi2var_hi_128:
3141 case Intrinsic::x86_avx512_vpermi2var_hi_256:
3142 case Intrinsic::x86_avx512_vpermi2var_hi_512:
3143 case Intrinsic::x86_avx512_vpermi2var_pd_128:
3144 case Intrinsic::x86_avx512_vpermi2var_pd_256:
3145 case Intrinsic::x86_avx512_vpermi2var_pd_512:
3146 case Intrinsic::x86_avx512_vpermi2var_ps_128:
3147 case Intrinsic::x86_avx512_vpermi2var_ps_256:
3148 case Intrinsic::x86_avx512_vpermi2var_ps_512:
3149 case Intrinsic::x86_avx512_vpermi2var_q_128:
3150 case Intrinsic::x86_avx512_vpermi2var_q_256:
3151 case Intrinsic::x86_avx512_vpermi2var_q_512:
3152 case Intrinsic::x86_avx512_vpermi2var_qi_128:
3153 case Intrinsic::x86_avx512_vpermi2var_qi_256:
3154 case Intrinsic::x86_avx512_vpermi2var_qi_512:
3155 if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
3156 return IC.replaceInstUsesWith(II, V);
3157 }
3158 if (simplifyX86VPERMMask(&II, /*IsBinary=*/true, IC))
3159 return &II;
3160 break;
3161
3162 case Intrinsic::x86_avx_maskload_ps:
3163 case Intrinsic::x86_avx_maskload_pd:
3164 case Intrinsic::x86_avx_maskload_ps_256:
3165 case Intrinsic::x86_avx_maskload_pd_256:
3166 case Intrinsic::x86_avx2_maskload_d:
3167 case Intrinsic::x86_avx2_maskload_q:
3168 case Intrinsic::x86_avx2_maskload_d_256:
3169 case Intrinsic::x86_avx2_maskload_q_256:
3170 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
3171 return I;
3172 }
3173 break;
3174
3175 case Intrinsic::x86_sse2_maskmov_dqu:
3176 case Intrinsic::x86_avx_maskstore_ps:
3177 case Intrinsic::x86_avx_maskstore_pd:
3178 case Intrinsic::x86_avx_maskstore_ps_256:
3179 case Intrinsic::x86_avx_maskstore_pd_256:
3180 case Intrinsic::x86_avx2_maskstore_d:
3181 case Intrinsic::x86_avx2_maskstore_q:
3182 case Intrinsic::x86_avx2_maskstore_d_256:
3183 case Intrinsic::x86_avx2_maskstore_q_256:
3184 if (simplifyX86MaskedStore(II, IC)) {
3185 return nullptr;
3186 }
3187 break;
3188
3189 case Intrinsic::x86_addcarry_32:
3190 case Intrinsic::x86_addcarry_64:
3191 if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
3192 return IC.replaceInstUsesWith(II, V);
3193 }
3194 break;
3195
3196 case Intrinsic::x86_avx512_pternlog_d_128:
3197 case Intrinsic::x86_avx512_pternlog_d_256:
3198 case Intrinsic::x86_avx512_pternlog_d_512:
3199 case Intrinsic::x86_avx512_pternlog_q_128:
3200 case Intrinsic::x86_avx512_pternlog_q_256:
3201 case Intrinsic::x86_avx512_pternlog_q_512:
3202 if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
3203 return IC.replaceInstUsesWith(II, V);
3204 }
3205 break;
3206
3207 default:
3208 break;
3209 }
3210 return std::nullopt;
3211}
3212
// X86 hook for InstCombine's demanded-bits simplification of target
// intrinsics. NOTE(review): this extraction dropped the line naming the
// function; per the symbol index below it is
// X86TTIImpl::simplifyDemandedUseBitsIntrinsic, and the parameter list here
// continues that signature. Returns a replacement Value, or std::nullopt if
// the instruction should be left alone (possibly with Known updated).
3214 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
3215 bool &KnownBitsComputed) const {
3216 switch (II.getIntrinsicID()) {
3217 default:
3218 break;
// MOVMSK-family intrinsics: result bit i is the sign bit of vector element
// i, so bits of DemandedMask map 1:1 onto vector elements.
3219 case Intrinsic::x86_mmx_pmovmskb:
3220 case Intrinsic::x86_sse_movmsk_ps:
3221 case Intrinsic::x86_sse2_movmsk_pd:
3222 case Intrinsic::x86_sse2_pmovmskb_128:
3223 case Intrinsic::x86_avx_movmsk_ps_256:
3224 case Intrinsic::x86_avx_movmsk_pd_256:
3225 case Intrinsic::x86_avx2_pmovmskb: {
3226 // MOVMSK copies the vector elements' sign bits to the low bits
3227 // and zeros the high bits.
3228 unsigned ArgWidth;
3229 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
3230 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
3231 } else {
3232 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
3233 ArgWidth = ArgType->getNumElements();
3234 }
3235
3236 // If we don't need any of low bits then return zero,
3237 // we know that DemandedMask is non-zero already.
3238 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
3239 Type *VTy = II.getType();
3240 if (DemandedElts.isZero()) {
3241 return ConstantInt::getNullValue(VTy);
3242 }
3243
3244 // We know that the upper bits are set to zero.
3245 Known.Zero.setBitsFrom(ArgWidth);
3246 KnownBitsComputed = true;
3247 break;
3248 }
3249 }
// No replacement value was produced; the caller may still use any Known
// bits recorded above.
3250 return std::nullopt;
3251}
3252
// X86 hook for InstCombine's demanded-vector-elements simplification of
// target intrinsics. NOTE(review): this extraction dropped the line naming
// the function; per the symbol index below it is
// X86TTIImpl::simplifyDemandedVectorEltsIntrinsic, and the parameter list
// here continues that signature. simplifyAndSetOp recursively simplifies a
// given operand against a demanded-elements mask and records its undef
// elements. Returns a replacement Value, or std::nullopt to keep II.
3254 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
3255 APInt &UndefElts2, APInt &UndefElts3,
3256 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3257 simplifyAndSetOp) const {
// Number of elements in the intrinsic's result vector.
3258 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
3259 switch (II.getIntrinsicID()) {
3260 default:
3261 break;
3262 case Intrinsic::x86_xop_vfrcz_ss:
3263 case Intrinsic::x86_xop_vfrcz_sd:
3264 // The instructions for these intrinsics are specified to zero upper bits
3265 // not pass them through like other scalar intrinsics. So we shouldn't
3266 // just use Arg0 if DemandedElts[0] is clear like we do for other
3267 // intrinsics. Instead we should return a zero vector.
3268 if (!DemandedElts[0]) {
3269 IC.addToWorklist(&II);
3270 return ConstantAggregateZero::get(II.getType());
3271 }
3272
3273 // Only the lower element is used.
3274 DemandedElts = 1;
3275 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3276
3277 // Only the lower element is undefined. The high elements are zero.
3278 UndefElts = UndefElts[0];
3279 break;
3280
3281 // Unary scalar-as-vector operations that work column-wise.
3282 case Intrinsic::x86_sse_rcp_ss:
3283 case Intrinsic::x86_sse_rsqrt_ss:
3284 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3285
3286 // If lowest element of a scalar op isn't used then use Arg0.
3287 if (!DemandedElts[0]) {
3288 IC.addToWorklist(&II);
3289 return II.getArgOperand(0);
3290 }
3291 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
3292 // checks).
3293 break;
3294
3295 // Binary scalar-as-vector operations that work column-wise. The high
3296 // elements come from operand 0. The low element is a function of both
3297 // operands.
3298 case Intrinsic::x86_sse_min_ss:
3299 case Intrinsic::x86_sse_max_ss:
3300 case Intrinsic::x86_sse_cmp_ss:
3301 case Intrinsic::x86_sse2_min_sd:
3302 case Intrinsic::x86_sse2_max_sd:
3303 case Intrinsic::x86_sse2_cmp_sd: {
3304 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3305
3306 // If lowest element of a scalar op isn't used then use Arg0.
3307 if (!DemandedElts[0]) {
3308 IC.addToWorklist(&II);
3309 return II.getArgOperand(0);
3310 }
3311
3312 // Only lower element is used for operand 1.
3313 DemandedElts = 1;
3314 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3315
3316 // Lower element is undefined if both lower elements are undefined.
3317 // Consider things like undef&0. The result is known zero, not undef.
3318 if (!UndefElts2[0])
3319 UndefElts.clearBit(0);
3320
3321 break;
3322 }
3323
3324 // Binary scalar-as-vector operations that work column-wise. The high
3325 // elements come from operand 0 and the low element comes from operand 1.
3326 case Intrinsic::x86_sse41_round_ss:
3327 case Intrinsic::x86_sse41_round_sd: {
3328 // Don't use the low element of operand 0.
3329 APInt DemandedElts2 = DemandedElts;
3330 DemandedElts2.clearBit(0);
3331 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3332
3333 // If lowest element of a scalar op isn't used then use Arg0.
3334 if (!DemandedElts[0]) {
3335 IC.addToWorklist(&II);
3336 return II.getArgOperand(0);
3337 }
3338
3339 // Only lower element is used for operand 1.
3340 DemandedElts = 1;
3341 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3342
3343 // Take the high undef elements from operand 0 and take the lower element
3344 // from operand 1.
3345 UndefElts.clearBit(0);
3346 UndefElts |= UndefElts2[0];
3347 break;
3348 }
3349
3350 // Three input scalar-as-vector operations that work column-wise. The high
3351 // elements come from operand 0 and the low element is a function of all
3352 // three inputs.
3353 case Intrinsic::x86_avx512_mask_add_ss_round:
3354 case Intrinsic::x86_avx512_mask_div_ss_round:
3355 case Intrinsic::x86_avx512_mask_mul_ss_round:
3356 case Intrinsic::x86_avx512_mask_sub_ss_round:
3357 case Intrinsic::x86_avx512_mask_max_ss_round:
3358 case Intrinsic::x86_avx512_mask_min_ss_round:
3359 case Intrinsic::x86_avx512_mask_add_sd_round:
3360 case Intrinsic::x86_avx512_mask_div_sd_round:
3361 case Intrinsic::x86_avx512_mask_mul_sd_round:
3362 case Intrinsic::x86_avx512_mask_sub_sd_round:
3363 case Intrinsic::x86_avx512_mask_max_sd_round:
3364 case Intrinsic::x86_avx512_mask_min_sd_round:
3365 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3366
3367 // If lowest element of a scalar op isn't used then use Arg0.
3368 if (!DemandedElts[0]) {
3369 IC.addToWorklist(&II);
3370 return II.getArgOperand(0);
3371 }
3372
3373 // Only lower element is used for operand 1 and 2.
3374 DemandedElts = 1;
3375 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3376 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3377
3378 // Lower element is undefined if all three lower elements are undefined.
3379 // Consider things like undef&0. The result is known zero, not undef.
3380 if (!UndefElts2[0] || !UndefElts3[0])
3381 UndefElts.clearBit(0);
3382 break;
3383
3384 // TODO: Add fmaddsub support?
3385 case Intrinsic::x86_sse3_addsub_pd:
3386 case Intrinsic::x86_sse3_addsub_ps:
3387 case Intrinsic::x86_avx_addsub_pd_256:
3388 case Intrinsic::x86_avx_addsub_ps_256: {
3389 // If none of the even or none of the odd lanes are required, turn this
3390 // into a generic FP math instruction.
// SubMask has the even element positions set (APInt(2, 0x1) splatted),
// AddMask the odd ones; demanding only one parity lets the whole addsub
// collapse to a single FSub/FAdd below.
3391 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3392 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3393 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3394 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3395 if (IsSubOnly || IsAddOnly) {
3396 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
// NOTE(review): two source lines (3397-3398) are missing from this
// extraction at this point — verify against the upstream file.
3399 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3400 return IC.Builder.CreateBinOp(
3401 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3402 }
3403
3404 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3405 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3406 UndefElts &= UndefElts2;
3407 break;
3408 }
3409
3410 // General per-element vector operations.
3411 case Intrinsic::x86_avx2_psllv_d:
3412 case Intrinsic::x86_avx2_psllv_d_256:
3413 case Intrinsic::x86_avx2_psllv_q:
3414 case Intrinsic::x86_avx2_psllv_q_256:
3415 case Intrinsic::x86_avx2_psrlv_d:
3416 case Intrinsic::x86_avx2_psrlv_d_256:
3417 case Intrinsic::x86_avx2_psrlv_q:
3418 case Intrinsic::x86_avx2_psrlv_q_256:
3419 case Intrinsic::x86_avx2_psrav_d:
3420 case Intrinsic::x86_avx2_psrav_d_256: {
3421 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts)
3422 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3423 UndefElts &= UndefElts2;
3424 break;
3425 }
3426
3427 case Intrinsic::x86_sse2_pmulh_w:
3428 case Intrinsic::x86_avx2_pmulh_w:
3429 case Intrinsic::x86_avx512_pmulh_w_512:
3430 case Intrinsic::x86_sse2_pmulhu_w:
3431 case Intrinsic::x86_avx2_pmulhu_w:
3432 case Intrinsic::x86_avx512_pmulhu_w_512:
3433 case Intrinsic::x86_ssse3_pmul_hr_sw_128:
3434 case Intrinsic::x86_avx2_pmul_hr_sw:
3435 case Intrinsic::x86_avx512_pmul_hr_sw_512: {
3436 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3437 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3438 // NOTE: mulh(undef,undef) != undef.
3439 break;
3440 }
3441
3442 case Intrinsic::x86_sse2_packssdw_128:
3443 case Intrinsic::x86_sse2_packsswb_128:
3444 case Intrinsic::x86_sse2_packuswb_128:
3445 case Intrinsic::x86_sse41_packusdw:
3446 case Intrinsic::x86_avx2_packssdw:
3447 case Intrinsic::x86_avx2_packsswb:
3448 case Intrinsic::x86_avx2_packusdw:
3449 case Intrinsic::x86_avx2_packuswb:
3450 case Intrinsic::x86_avx512_packssdw_512:
3451 case Intrinsic::x86_avx512_packsswb_512:
3452 case Intrinsic::x86_avx512_packusdw_512:
3453 case Intrinsic::x86_avx512_packuswb_512: {
// The result has twice the elements of each input at half the width
// (asserted below), and PACK interleaves the two inputs per 128-bit lane.
3454 auto *Ty0 = II.getArgOperand(0)->getType();
3455 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3456 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3457
3458 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3459 unsigned VWidthPerLane = VWidth / NumLanes;
3460 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3461
3462 // Per lane, pack the elements of the first input and then the second.
3463 // e.g.
3464 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3465 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3466 for (int OpNum = 0; OpNum != 2; ++OpNum) {
// Gather which elements of this operand are demanded by mapping each
// demanded result element back to its source lane/offset.
3467 APInt OpDemandedElts(InnerVWidth, 0);
3468 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3469 unsigned LaneIdx = Lane * VWidthPerLane;
3470 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3471 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3472 if (DemandedElts[Idx])
3473 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3474 }
3475 }
3476
3477 // Demand elements from the operand.
3478 APInt OpUndefElts(InnerVWidth, 0);
3479 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3480
3481 // Pack the operand's UNDEF elements, one lane at a time.
3482 OpUndefElts = OpUndefElts.zext(VWidth);
3483 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3484 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3485 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3486 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3487 UndefElts |= LaneElts;
3488 }
3489 }
3490 break;
3491 }
3492
3493 case Intrinsic::x86_sse2_pmadd_wd:
3494 case Intrinsic::x86_avx2_pmadd_wd:
3495 case Intrinsic::x86_avx512_pmaddw_d_512:
3496 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3497 case Intrinsic::x86_avx2_pmadd_ub_sw:
3498 case Intrinsic::x86_avx512_pmaddubs_w_512: {
3499 // PMADD - demand both src elements that map to each dst element.
3500 auto *ArgTy = II.getArgOperand(0)->getType();
3501 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3502 assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
// Widen the demanded mask so each demanded result element demands the two
// adjacent source elements it is computed from.
3503 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3504 APInt Op0UndefElts(InnerVWidth, 0);
3505 APInt Op1UndefElts(InnerVWidth, 0);
3506 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3507 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3508 // NOTE: madd(undef,undef) != undef.
3509 break;
3510 }
3511
// Variable shuffles: only the control/mask operand (operand 1) is
// simplified per-element here.
3512 // PSHUFB
3513 case Intrinsic::x86_ssse3_pshuf_b_128:
3514 case Intrinsic::x86_avx2_pshuf_b:
3515 case Intrinsic::x86_avx512_pshuf_b_512:
3516 // PERMILVAR
3517 case Intrinsic::x86_avx_vpermilvar_ps:
3518 case Intrinsic::x86_avx_vpermilvar_ps_256:
3519 case Intrinsic::x86_avx512_vpermilvar_ps_512:
3520 case Intrinsic::x86_avx_vpermilvar_pd:
3521 case Intrinsic::x86_avx_vpermilvar_pd_256:
3522 case Intrinsic::x86_avx512_vpermilvar_pd_512:
3523 // PERMV
3524 case Intrinsic::x86_avx2_permd:
3525 case Intrinsic::x86_avx2_permps: {
3526 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3527 break;
3528 }
3529
3530 // SSE4A instructions leave the upper 64-bits of the 128-bit result
3531 // in an undefined state.
3532 case Intrinsic::x86_sse4a_extrq:
3533 case Intrinsic::x86_sse4a_extrqi:
3534 case Intrinsic::x86_sse4a_insertq:
3535 case Intrinsic::x86_sse4a_insertqi:
3536 UndefElts.setHighBits(VWidth / 2);
3537 break;
3538 }
// No full replacement; operand/undef updates (if any) were recorded above.
3539 return std::nullopt;
3540}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file provides the interface for the instcombine pass implementation.
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
static unsigned getNumElements(Type *Ty)
static Value * simplifyTernarylogic(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Instruction * simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC)
static Value * simplifyX86immShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Value * simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, APInt APLength, APInt APIndex, InstCombiner::BuilderTy &Builder)
Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant folding or conversion to a shuffle vector.
static Value * simplifyX86addcarry(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Value * simplifyX86pack(IntrinsicInst &II, InstCombiner::BuilderTy &Builder, bool IsSigned)
static Constant * getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL)
Return a constant boolean vector that has true elements in all positions where the input constant dat...
static Value * simplifyX86FPMaxMin(const IntrinsicInst &II, InstCombiner &IC, Intrinsic::ID NewIID, bool IsScalar=false)
static Value * simplifyX86pshufb(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value * simplifyX86vpermv3(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC)
static Value * simplifyX86vpermilvar(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value * simplifyX86pmulh(IntrinsicInst &II, InstCombiner::BuilderTy &Builder, bool IsSigned, bool IsRounding)
static Value * simplifyX86movmsk(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Value * simplifyX86vpermv(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value * simplifyX86pmadd(IntrinsicInst &II, InstCombiner::BuilderTy &Builder, bool IsPMADDWD)
static Value * simplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary, InstCombiner &IC)
static Value * simplifyX86extrq(IntrinsicInst &II, Value *Op0, ConstantInt *CILength, ConstantInt *CIIndex, InstCombiner::BuilderTy &Builder)
Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding or conversion to a shuffle...
static Value * getBoolVecFromMask(Value *Mask, const DataLayout &DL)
Convert the x86 XMM integer vector mask to a vector of bools based on each element's most significant...
static Value * simplifyX86varShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Value * RHS
Value * LHS
BinaryOperator * Mul
This file provides a TargetTransformInfoImplBase conforming object specific to the X86 target machine.
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1421
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:230
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1044
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1118
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:996
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:880
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:865
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:858
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
This class represents a no-op cast from one type to another.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
All zero aggregate value.
Definition Constants.h:364
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
static LLVM_ABI Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static FixedVectorType * getExtendedElementVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
Value * CreateFSub(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1635
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2561
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:546
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1673
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2549
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2332
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1616
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1516
Value * CreateMaxNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the maxnum intrinsic.
Definition IRBuilder.h:1027
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1495
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2583
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1554
Value * CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the minnum intrinsic.
Definition IRBuilder.h:1015
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1711
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1654
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
IRBuilder< TargetFolder, IRBuilderCallbackInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
virtual Value * SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, unsigned Depth=0, bool AllowMultipleUsers=false)=0
static Value * peekThroughBitcast(Value *V, bool OneUseOnly=false)
Return the source operand of a potentially bitcasted value while optionally checking if it has one us...
void addToWorklist(Instruction *I)
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
BuilderTy & Builder
const SimplifyQuery & getSimplifyQuery() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
A wrapper class for inspecting calls to intrinsic functions.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
static SelectInst * Create(Value *C, Value *S1, Value *S2, const Twine &NameStr="", InsertPosition InsertBefore=nullptr, const Instruction *MDFrom=nullptr)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:246
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3020
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
match_immconstant_ty m_ImmConstant()
Match an arbitrary immediate Constant and ignore it.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Length
Definition DWP.cpp:532
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
@ Or
Bitwise or logical OR of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:132
bool isKnownNever(FPClassTest Mask) const
Return true if it's known this can never be one of the mask entries.
Matching combinators.
SimplifyQuery getWithInstruction(const Instruction *I) const