1//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements a TargetTransformInfo analysis pass specific to the
10/// X86 target machine. It uses the target's detailed information to provide
11/// more precise answers to certain TTI queries, while letting the target
12/// independent and default TTI implementations handle the rest.
13///
14//===----------------------------------------------------------------------===//
15
16#include "X86TargetTransformInfo.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsX86.h"
19#include "llvm/Support/KnownBits.h"
20#include "llvm/Transforms/InstCombine/InstCombiner.h"
21#include <optional>
22
23using namespace llvm;
24
25#define DEBUG_TYPE "x86tti"
26
27/// Return a constant boolean vector that has true elements in all positions
28/// where the input constant data vector has an element with the sign bit set.
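/// For example, given the constant <4 x i32> <i32 -1, i32 7, i32 -8, i32 0>,
/// the (0 > element) signed comparison below is true only for the negative
/// elements, so the result is <4 x i1> <i1 true, i1 false, i1 true, i1 false>.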
29static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
30 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
31 V = ConstantExpr::getBitCast(V, IntTy);
32 V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
33 Constant::getNullValue(IntTy), V, DL);
34 assert(V && "Vector must be foldable");
35 return V;
36}
37
38/// Convert the x86 XMM integer vector mask to a vector of bools based on
39/// each element's most significant bit (the sign bit).
40static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
41 // Fold Constant Mask.
42 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
43 return getNegativeIsTrueBoolVec(ConstantMask, DL);
44
45 // Mask was extended from a boolean vector.
46 Value *ExtMask;
47 if (match(Mask,
48 PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
49 ExtMask->getType()->isIntOrIntVectorTy(1))
50 return ExtMask;
51
52 return nullptr;
53}
54
55// TODO: If the x86 backend knew how to convert a bool vector mask back to an
56// XMM register mask efficiently, we could transform all x86 masked intrinsics
57// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
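// For instance, a call such as
//   %v = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %p, <4 x i32> %m)
// where %m was produced by sign-extending a <4 x i1> %b is rewritten below,
// roughly, into the target-independent form
//   %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1,
//                                                    <4 x i1> %b,
//                                                    <4 x float> zeroinitializer)
// (the %p/%m/%b names are illustrative only).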
58static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
59 Value *Ptr = II.getOperand(0);
60 Value *Mask = II.getOperand(1);
61 Constant *ZeroVec = Constant::getNullValue(II.getType());
62
63 // Zero Mask - masked load instruction creates a zero vector.
64 if (isa<ConstantAggregateZero>(Mask))
65 return IC.replaceInstUsesWith(II, ZeroVec);
66
67 // The mask is constant or extended from a bool vector. Convert this x86
68 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
69 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
70 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
71 // the LLVM intrinsic definition for the pointer argument.
72 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
73 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
74 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
75
76 // The pass-through vector for an x86 masked load is a zero vector.
77 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
78 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
79 return IC.replaceInstUsesWith(II, NewMaskedLoad);
80 }
81
82 return nullptr;
83}
84
85// TODO: If the x86 backend knew how to convert a bool vector mask back to an
86// XMM register mask efficiently, we could transform all x86 masked intrinsics
87// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
88static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
89 Value *Ptr = II.getOperand(0);
90 Value *Mask = II.getOperand(1);
91 Value *Vec = II.getOperand(2);
92
93 // Zero Mask - this masked store instruction does nothing.
94 if (isa<ConstantAggregateZero>(Mask)) {
95 IC.eraseInstFromFunction(II);
96 return true;
97 }
98
99 // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
100 // anything else at this level.
101 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
102 return false;
103
104 // The mask is constant or extended from a bool vector. Convert this x86
105 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
106 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
107 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
108 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
109 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
110
111 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
112
113 // 'Replace uses' doesn't work for stores. Erase the original masked store.
114 IC.eraseInstFromFunction(II);
115 return true;
116 }
117
118 return false;
119}
120
121static Value *simplifyX86immShift(const IntrinsicInst &II,
122 InstCombiner::BuilderTy &Builder) {
123 bool LogicalShift = false;
124 bool ShiftLeft = false;
125 bool IsImm = false;
126
127 switch (II.getIntrinsicID()) {
128 default:
129 llvm_unreachable("Unexpected intrinsic!");
130 case Intrinsic::x86_sse2_psrai_d:
131 case Intrinsic::x86_sse2_psrai_w:
132 case Intrinsic::x86_avx2_psrai_d:
133 case Intrinsic::x86_avx2_psrai_w:
134 case Intrinsic::x86_avx512_psrai_q_128:
135 case Intrinsic::x86_avx512_psrai_q_256:
136 case Intrinsic::x86_avx512_psrai_d_512:
137 case Intrinsic::x86_avx512_psrai_q_512:
138 case Intrinsic::x86_avx512_psrai_w_512:
139 IsImm = true;
140 [[fallthrough]];
141 case Intrinsic::x86_sse2_psra_d:
142 case Intrinsic::x86_sse2_psra_w:
143 case Intrinsic::x86_avx2_psra_d:
144 case Intrinsic::x86_avx2_psra_w:
145 case Intrinsic::x86_avx512_psra_q_128:
146 case Intrinsic::x86_avx512_psra_q_256:
147 case Intrinsic::x86_avx512_psra_d_512:
148 case Intrinsic::x86_avx512_psra_q_512:
149 case Intrinsic::x86_avx512_psra_w_512:
150 LogicalShift = false;
151 ShiftLeft = false;
152 break;
153 case Intrinsic::x86_sse2_psrli_d:
154 case Intrinsic::x86_sse2_psrli_q:
155 case Intrinsic::x86_sse2_psrli_w:
156 case Intrinsic::x86_avx2_psrli_d:
157 case Intrinsic::x86_avx2_psrli_q:
158 case Intrinsic::x86_avx2_psrli_w:
159 case Intrinsic::x86_avx512_psrli_d_512:
160 case Intrinsic::x86_avx512_psrli_q_512:
161 case Intrinsic::x86_avx512_psrli_w_512:
162 IsImm = true;
163 [[fallthrough]];
164 case Intrinsic::x86_sse2_psrl_d:
165 case Intrinsic::x86_sse2_psrl_q:
166 case Intrinsic::x86_sse2_psrl_w:
167 case Intrinsic::x86_avx2_psrl_d:
168 case Intrinsic::x86_avx2_psrl_q:
169 case Intrinsic::x86_avx2_psrl_w:
170 case Intrinsic::x86_avx512_psrl_d_512:
171 case Intrinsic::x86_avx512_psrl_q_512:
172 case Intrinsic::x86_avx512_psrl_w_512:
173 LogicalShift = true;
174 ShiftLeft = false;
175 break;
176 case Intrinsic::x86_sse2_pslli_d:
177 case Intrinsic::x86_sse2_pslli_q:
178 case Intrinsic::x86_sse2_pslli_w:
179 case Intrinsic::x86_avx2_pslli_d:
180 case Intrinsic::x86_avx2_pslli_q:
181 case Intrinsic::x86_avx2_pslli_w:
182 case Intrinsic::x86_avx512_pslli_d_512:
183 case Intrinsic::x86_avx512_pslli_q_512:
184 case Intrinsic::x86_avx512_pslli_w_512:
185 IsImm = true;
186 [[fallthrough]];
187 case Intrinsic::x86_sse2_psll_d:
188 case Intrinsic::x86_sse2_psll_q:
189 case Intrinsic::x86_sse2_psll_w:
190 case Intrinsic::x86_avx2_psll_d:
191 case Intrinsic::x86_avx2_psll_q:
192 case Intrinsic::x86_avx2_psll_w:
193 case Intrinsic::x86_avx512_psll_d_512:
194 case Intrinsic::x86_avx512_psll_q_512:
195 case Intrinsic::x86_avx512_psll_w_512:
196 LogicalShift = true;
197 ShiftLeft = true;
198 break;
199 }
200 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
201
202 Value *Vec = II.getArgOperand(0);
203 Value *Amt = II.getArgOperand(1);
204 auto *VT = cast<FixedVectorType>(Vec->getType());
205 Type *SVT = VT->getElementType();
206 Type *AmtVT = Amt->getType();
207 unsigned VWidth = VT->getNumElements();
208 unsigned BitWidth = SVT->getPrimitiveSizeInBits();
209
210 // If the shift amount is guaranteed to be in-range we can replace it with a
211 // generic shift. If it's guaranteed to be out of range, logical shifts combine
212 // to zero and arithmetic shifts are clamped to (BitWidth - 1).
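// For example, llvm.x86.sse2.psrai.w with an immediate of 3 becomes a plain
// 'ashr <8 x i16> %v, splat 3', while an immediate of 20 (>= 16) is clamped
// and becomes an 'ashr' by 15; the logical psrli/pslli forms instead fold to
// zero once the amount is known to be out of range.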
213 if (IsImm) {
214 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
215 KnownBits KnownAmtBits =
216 llvm::computeKnownBits(Amt, II.getDataLayout());
217 if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
218 Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
219 Amt = Builder.CreateVectorSplat(VWidth, Amt);
220 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
221 : Builder.CreateLShr(Vec, Amt))
222 : Builder.CreateAShr(Vec, Amt));
223 }
224 if (KnownAmtBits.getMinValue().uge(BitWidth)) {
225 if (LogicalShift)
226 return ConstantAggregateZero::get(VT);
227 Amt = ConstantInt::get(SVT, BitWidth - 1);
228 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
229 }
230 } else {
231 // Ensure the first element has an in-range value and the rest of the
232 // elements in the bottom 64 bits are zero.
233 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
234 cast<VectorType>(AmtVT)->getElementType() == SVT &&
235 "Unexpected shift-by-scalar type");
236 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
237 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
238 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
239 KnownBits KnownLowerBits = llvm::computeKnownBits(
240 Amt, DemandedLower, II.getDataLayout());
241 KnownBits KnownUpperBits = llvm::computeKnownBits(
242 Amt, DemandedUpper, II.getDataLayout());
243 if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
244 (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
245 SmallVector<int, 16> ZeroSplat(VWidth, 0);
246 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
247 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
248 : Builder.CreateLShr(Vec, Amt))
249 : Builder.CreateAShr(Vec, Amt));
250 }
251 }
252
253 // Simplify if count is constant vector.
254 auto *CDV = dyn_cast<ConstantDataVector>(Amt);
255 if (!CDV)
256 return nullptr;
257
258 // SSE2/AVX2 shift-by-scalar instructions use the low 64 bits of the 128-bit
259 // vector operand to compute the shift amount.
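// For example, for a 16-bit element shift such as psrl.w, the four i16
// sub-elements in the low 64 bits of the count operand are concatenated, so a
// count vector starting <i16 3, i16 0, i16 0, i16 0, ...> yields Count == 3.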
260 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
261 cast<VectorType>(AmtVT)->getElementType() == SVT &&
262 "Unexpected shift-by-scalar type");
263
264 // Concatenate the sub-elements to create the 64-bit value.
265 APInt Count(64, 0);
266 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
267 unsigned SubEltIdx = (NumSubElts - 1) - i;
268 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
269 Count <<= BitWidth;
270 Count |= SubElt->getValue().zextOrTrunc(64);
271 }
272
273 // If shift-by-zero then just return the original value.
274 if (Count.isZero())
275 return Vec;
276
277 // Handle cases when Shift >= BitWidth.
278 if (Count.uge(BitWidth)) {
279 // If LogicalShift - just return zero.
280 if (LogicalShift)
281 return ConstantAggregateZero::get(VT);
282
283 // If ArithmeticShift - clamp Shift to (BitWidth - 1).
284 Count = APInt(64, BitWidth - 1);
285 }
286
287 // Get a constant vector of the same type as the first operand.
288 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
289 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
290
291 if (ShiftLeft)
292 return Builder.CreateShl(Vec, ShiftVec);
293
294 if (LogicalShift)
295 return Builder.CreateLShr(Vec, ShiftVec);
296
297 return Builder.CreateAShr(Vec, ShiftVec);
298}
299
300// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
301// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
302// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
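// For example, llvm.x86.avx2.psrlv.d with constant amounts <0, 1, 2, 3> can
// become a plain 'lshr', but with amounts <0, 1, 2, 32> the lane shifted by 32
// must produce zero, which a generic 'lshr' does not guarantee, so such mixed
// in/out-of-range cases are left untouched below.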
303static Value *simplifyX86varShift(const IntrinsicInst &II,
304 InstCombiner::BuilderTy &Builder) {
305 bool LogicalShift = false;
306 bool ShiftLeft = false;
307
308 switch (II.getIntrinsicID()) {
309 default:
310 llvm_unreachable("Unexpected intrinsic!");
311 case Intrinsic::x86_avx2_psrav_d:
312 case Intrinsic::x86_avx2_psrav_d_256:
313 case Intrinsic::x86_avx512_psrav_q_128:
314 case Intrinsic::x86_avx512_psrav_q_256:
315 case Intrinsic::x86_avx512_psrav_d_512:
316 case Intrinsic::x86_avx512_psrav_q_512:
317 case Intrinsic::x86_avx512_psrav_w_128:
318 case Intrinsic::x86_avx512_psrav_w_256:
319 case Intrinsic::x86_avx512_psrav_w_512:
320 LogicalShift = false;
321 ShiftLeft = false;
322 break;
323 case Intrinsic::x86_avx2_psrlv_d:
324 case Intrinsic::x86_avx2_psrlv_d_256:
325 case Intrinsic::x86_avx2_psrlv_q:
326 case Intrinsic::x86_avx2_psrlv_q_256:
327 case Intrinsic::x86_avx512_psrlv_d_512:
328 case Intrinsic::x86_avx512_psrlv_q_512:
329 case Intrinsic::x86_avx512_psrlv_w_128:
330 case Intrinsic::x86_avx512_psrlv_w_256:
331 case Intrinsic::x86_avx512_psrlv_w_512:
332 LogicalShift = true;
333 ShiftLeft = false;
334 break;
335 case Intrinsic::x86_avx2_psllv_d:
336 case Intrinsic::x86_avx2_psllv_d_256:
337 case Intrinsic::x86_avx2_psllv_q:
338 case Intrinsic::x86_avx2_psllv_q_256:
339 case Intrinsic::x86_avx512_psllv_d_512:
340 case Intrinsic::x86_avx512_psllv_q_512:
341 case Intrinsic::x86_avx512_psllv_w_128:
342 case Intrinsic::x86_avx512_psllv_w_256:
343 case Intrinsic::x86_avx512_psllv_w_512:
344 LogicalShift = true;
345 ShiftLeft = true;
346 break;
347 }
348 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
349
350 Value *Vec = II.getArgOperand(0);
351 Value *Amt = II.getArgOperand(1);
352 auto *VT = cast<FixedVectorType>(II.getType());
353 Type *SVT = VT->getElementType();
354 int NumElts = VT->getNumElements();
355 int BitWidth = SVT->getIntegerBitWidth();
356
357 // If the shift amount is guaranteed to be in-range we can replace it with a
358 // generic shift.
359 KnownBits KnownAmt =
360 llvm::computeKnownBits(Amt, II.getDataLayout());
361 if (KnownAmt.getMaxValue().ult(BitWidth)) {
362 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
363 : Builder.CreateLShr(Vec, Amt))
364 : Builder.CreateAShr(Vec, Amt));
365 }
366
367 // Simplify if all shift amounts are constant/undef.
368 auto *CShift = dyn_cast<Constant>(Amt);
369 if (!CShift)
370 return nullptr;
371
372 // Collect each element's shift amount.
373 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
374 bool AnyOutOfRange = false;
375 SmallVector<int, 8> ShiftAmts;
376 for (int I = 0; I < NumElts; ++I) {
377 auto *CElt = CShift->getAggregateElement(I);
378 if (isa_and_nonnull<UndefValue>(CElt)) {
379 ShiftAmts.push_back(-1);
380 continue;
381 }
382
383 auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
384 if (!COp)
385 return nullptr;
386
387 // Handle out of range shifts.
388 // If LogicalShift - set to BitWidth (special case).
389 // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
390 APInt ShiftVal = COp->getValue();
391 if (ShiftVal.uge(BitWidth)) {
392 AnyOutOfRange = LogicalShift;
393 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
394 continue;
395 }
396
397 ShiftAmts.push_back((int)ShiftVal.getZExtValue());
398 }
399
400 // If all elements out of range or UNDEF, return vector of zeros/undefs.
401 // ArithmeticShift should only hit this if they are all UNDEF.
402 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
403 if (llvm::all_of(ShiftAmts, OutOfRange)) {
404 SmallVector<Constant *, 8> ConstantVec;
405 for (int Idx : ShiftAmts) {
406 if (Idx < 0) {
407 ConstantVec.push_back(UndefValue::get(SVT));
408 } else {
409 assert(LogicalShift && "Logical shift expected");
410 ConstantVec.push_back(ConstantInt::getNullValue(SVT));
411 }
412 }
413 return ConstantVector::get(ConstantVec);
414 }
415
416 // We can't handle only some out of range values with generic logical shifts.
417 if (AnyOutOfRange)
418 return nullptr;
419
420 // Build the shift amount constant vector.
421 SmallVector<Constant *, 8> ShiftVecAmts;
422 for (int Idx : ShiftAmts) {
423 if (Idx < 0)
424 ShiftVecAmts.push_back(UndefValue::get(SVT));
425 else
426 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
427 }
428 auto ShiftVec = ConstantVector::get(ShiftVecAmts);
429
430 if (ShiftLeft)
431 return Builder.CreateShl(Vec, ShiftVec);
432
433 if (LogicalShift)
434 return Builder.CreateLShr(Vec, ShiftVec);
435
436 return Builder.CreateAShr(Vec, ShiftVec);
437}
438
439static Value *simplifyX86pack(IntrinsicInst &II,
440 InstCombiner::BuilderTy &Builder, bool IsSigned) {
441 Value *Arg0 = II.getArgOperand(0);
442 Value *Arg1 = II.getArgOperand(1);
443 Type *ResTy = II.getType();
444
445 // Fast all undef handling.
446 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
447 return UndefValue::get(ResTy);
448
449 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
450 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
451 unsigned NumSrcElts = ArgTy->getNumElements();
452 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
453 "Unexpected packing types");
454
455 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
456 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
457 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
458 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
459 "Unexpected packing types");
460
461 // Constant folding.
462 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
463 return nullptr;
464
465 // Clamp Values - signed/unsigned both use signed clamp values, but they
466 // differ on the min/max values.
467 APInt MinValue, MaxValue;
468 if (IsSigned) {
469 // PACKSS: Truncate signed value with signed saturation.
470 // Source values less than dst minint are saturated to minint.
471 // Source values greater than dst maxint are saturated to maxint.
472 MinValue =
473 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
474 MaxValue =
475 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
476 } else {
477 // PACKUS: Truncate signed value with unsigned saturation.
478 // Source values less than zero are saturated to zero.
479 // Source values greater than dst maxuint are saturated to maxuint.
480 MinValue = APInt::getZero(SrcScalarSizeInBits);
481 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
482 }
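// For example, PACKSSWB saturates an i16 source value of 300 to 127 and -300
// to -128, while PACKUSWB saturates 300 to 255 and any negative value to 0;
// the clamping selects below implement this before the final truncation.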
483
484 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
485 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
486 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
487 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
488 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
489 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
490
491 // Shuffle clamped args together at the lane level.
492 SmallVector<int, 32> PackMask;
493 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
494 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
495 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
496 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
497 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
498 }
499 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
500
501 // Truncate to dst size.
502 return Builder.CreateTrunc(Shuffle, ResTy);
503}
504
505static Value *simplifyX86pmadd(IntrinsicInst &II,
506 InstCombiner::BuilderTy &Builder,
507 bool IsPMADDWD) {
508 Value *Arg0 = II.getArgOperand(0);
509 Value *Arg1 = II.getArgOperand(1);
510 auto *ResTy = cast<FixedVectorType>(II.getType());
511 [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
512
513 unsigned NumDstElts = ResTy->getNumElements();
514 assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
515 ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
516 "Unexpected PMADD types");
517
518 // Multiply by undef -> zero (NOT undef!) as the other arg could still be zero.
519 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
520 return ConstantAggregateZero::get(ResTy);
521
522 // Multiply by zero.
523 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
524 return ConstantAggregateZero::get(ResTy);
525
526 // Constant folding.
527 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
528 return nullptr;
529
530 // Split Lo/Hi elements pairs, extend and add together.
531 // PMADDWD(X,Y) =
532 // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
533 // PMADDUBSW(X,Y) =
534 // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
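// Worked example for PMADDWD on i16 inputs: with the lhs pair (2, 3) and the
// rhs pair (10, -4), the i32 result element is 2*10 + 3*(-4) = 8, matching the
// formula above with sign-extended operands.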
535 SmallVector<int> LoMask, HiMask;
536 for (unsigned I = 0; I != NumDstElts; ++I) {
537 LoMask.push_back(2 * I + 0);
538 HiMask.push_back(2 * I + 1);
539 }
540
541 auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
542 auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
543 auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
544 auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);
545
546 auto LHSCast =
547 IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
548 LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
549 LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
550 RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
551 RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
552 Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
553 Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
554 return IsPMADDWD
555 ? Builder.CreateAdd(Lo, Hi)
556 : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
557}
558
559static Value *simplifyX86movmsk(const IntrinsicInst &II,
560 InstCombiner::BuilderTy &Builder) {
561 Value *Arg = II.getArgOperand(0);
562 Type *ResTy = II.getType();
563
564 // movmsk(undef) -> zero as we must ensure the upper bits are zero.
565 if (isa<UndefValue>(Arg))
566 return Constant::getNullValue(ResTy);
567
568 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
569 // We can't easily peek through x86_mmx types.
570 if (!ArgTy)
571 return nullptr;
572
573 // Expand MOVMSK to compare/bitcast/zext:
574 // e.g. PMOVMSKB(v16i8 x):
575 // %cmp = icmp slt <16 x i8> %x, zeroinitializer
576 // %int = bitcast <16 x i1> %cmp to i16
577 // %res = zext i16 %int to i32
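// For example, if only bytes 0 and 2 of a <16 x i8> argument have their sign
// bit set, PMOVMSKB returns 0b101 (i.e. 5) with the upper bits of the i32
// result cleared, which is what the compare/bitcast/zext sequence below
// produces.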
578 unsigned NumElts = ArgTy->getNumElements();
579 Type *IntegerTy = Builder.getIntNTy(NumElts);
580
581 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
582 Res = Builder.CreateIsNeg(Res);
583 Res = Builder.CreateBitCast(Res, IntegerTy);
584 Res = Builder.CreateZExtOrTrunc(Res, ResTy);
585 return Res;
586}
587
588static Value *simplifyX86addcarry(const IntrinsicInst &II,
589 InstCombiner::BuilderTy &Builder) {
590 Value *CarryIn = II.getArgOperand(0);
591 Value *Op1 = II.getArgOperand(1);
592 Value *Op2 = II.getArgOperand(2);
593 Type *RetTy = II.getType();
594 Type *OpTy = Op1->getType();
595 assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
596 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
597 "Unexpected types for x86 addcarry");
598
599 // If carry-in is zero, this is just an unsigned add with overflow.
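// Sketch of the rewrite performed below (the SSA names are illustrative):
//   %r = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %a, i64 %b)
// becomes
//   %u = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
// with the overflow flag zero-extended to i8 and the pair re-packed to match
// the { i8, i64 } result type expected by the x86 intrinsic.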
600 if (match(CarryIn, PatternMatch::m_ZeroInt())) {
601 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
602 {Op1, Op2});
603 // The types have to be adjusted to match the x86 call types.
604 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
605 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
606 Builder.getInt8Ty());
607 Value *Res = PoisonValue::get(RetTy);
608 Res = Builder.CreateInsertValue(Res, UAddOV, 0);
609 return Builder.CreateInsertValue(Res, UAddResult, 1);
610 }
611
612 return nullptr;
613}
614
615static Value *simplifyTernarylogic(const IntrinsicInst &II,
616 InstCombiner::BuilderTy &Builder) {
617
618 auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
619 if (!ArgImm || ArgImm->getValue().uge(256))
620 return nullptr;
621
622 Value *ArgA = II.getArgOperand(0);
623 Value *ArgB = II.getArgOperand(1);
624 Value *ArgC = II.getArgOperand(2);
625
626 Type *Ty = II.getType();
627
628 auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
629 return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
630 };
631 auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
632 return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
633 };
634 auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
635 return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
636 };
637 auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
638 return {Builder.CreateNot(V.first), ~V.second};
639 };
640 auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
641 auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
642 auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
643
644 bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
645 bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
646 bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());
647
648 bool ABIsConst = AIsConst && BIsConst;
649 bool ACIsConst = AIsConst && CIsConst;
650 bool BCIsConst = BIsConst && CIsConst;
651 bool ABCIsConst = AIsConst && BIsConst && CIsConst;
652
653 // Use for verification. It's a big table. It's difficult to go from Imm ->
654 // logic ops, but easy to verify that a set of logic ops is correct. We track
655 // the logic ops through the second value in the pair. At the end it should
656 // equal Imm.
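// For example, with A = 0xf0, B = 0xcc and C = 0xaa, the expression And(B, C)
// tracks 0xcc & 0xaa = 0x88, so it may only be produced for Imm == 0x88; the
// assert after the switch checks this property for every handled case.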
657 std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
658 std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
659 std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
660 std::pair<Value *, uint8_t> Res = {nullptr, 0};
661
662 // Currently we only handle cases that convert directly to another instruction
663 // or cases where all the ops are constant. This is because we don't properly
664 // handle creating ternary ops in the backend, so splitting them here may
665 // cause regressions. As the backend improves, uncomment more cases.
666
667 uint8_t Imm = ArgImm->getValue().getZExtValue();
668 switch (Imm) {
669 case 0x0:
670 Res = {Constant::getNullValue(Ty), 0};
671 break;
672 case 0x1:
673 if (ABCIsConst)
674 Res = Nor(Or(A, B), C);
675 break;
676 case 0x2:
677 if (ABCIsConst)
678 Res = And(Nor(A, B), C);
679 break;
680 case 0x3:
681 if (ABIsConst)
682 Res = Nor(A, B);
683 break;
684 case 0x4:
685 if (ABCIsConst)
686 Res = And(Nor(A, C), B);
687 break;
688 case 0x5:
689 if (ACIsConst)
690 Res = Nor(A, C);
691 break;
692 case 0x6:
693 if (ABCIsConst)
694 Res = Nor(A, Xnor(B, C));
695 break;
696 case 0x7:
697 if (ABCIsConst)
698 Res = Nor(A, And(B, C));
699 break;
700 case 0x8:
701 if (ABCIsConst)
702 Res = Nor(A, Nand(B, C));
703 break;
704 case 0x9:
705 if (ABCIsConst)
706 Res = Nor(A, Xor(B, C));
707 break;
708 case 0xa:
709 if (ACIsConst)
710 Res = Nor(A, Not(C));
711 break;
712 case 0xb:
713 if (ABCIsConst)
714 Res = Nor(A, Nor(C, Not(B)));
715 break;
716 case 0xc:
717 if (ABIsConst)
718 Res = Nor(A, Not(B));
719 break;
720 case 0xd:
721 if (ABCIsConst)
722 Res = Nor(A, Nor(B, Not(C)));
723 break;
724 case 0xe:
725 if (ABCIsConst)
726 Res = Nor(A, Nor(B, C));
727 break;
728 case 0xf:
729 Res = Not(A);
730 break;
731 case 0x10:
732 if (ABCIsConst)
733 Res = And(A, Nor(B, C));
734 break;
735 case 0x11:
736 if (BCIsConst)
737 Res = Nor(B, C);
738 break;
739 case 0x12:
740 if (ABCIsConst)
741 Res = Nor(Xnor(A, C), B);
742 break;
743 case 0x13:
744 if (ABCIsConst)
745 Res = Nor(And(A, C), B);
746 break;
747 case 0x14:
748 if (ABCIsConst)
749 Res = Nor(Xnor(A, B), C);
750 break;
751 case 0x15:
752 if (ABCIsConst)
753 Res = Nor(And(A, B), C);
754 break;
755 case 0x16:
756 if (ABCIsConst)
757 Res = Xor(Xor(A, B), And(Nand(A, B), C));
758 break;
759 case 0x17:
760 if (ABCIsConst)
761 Res = Xor(Or(A, B), Or(Xnor(A, B), C));
762 break;
763 case 0x18:
764 if (ABCIsConst)
765 Res = Nor(Xnor(A, B), Xnor(A, C));
766 break;
767 case 0x19:
768 if (ABCIsConst)
769 Res = And(Nand(A, B), Xnor(B, C));
770 break;
771 case 0x1a:
772 if (ABCIsConst)
773 Res = Xor(A, Or(And(A, B), C));
774 break;
775 case 0x1b:
776 if (ABCIsConst)
777 Res = Xor(A, Or(Xnor(A, B), C));
778 break;
779 case 0x1c:
780 if (ABCIsConst)
781 Res = Xor(A, Or(And(A, C), B));
782 break;
783 case 0x1d:
784 if (ABCIsConst)
785 Res = Xor(A, Or(Xnor(A, C), B));
786 break;
787 case 0x1e:
788 if (ABCIsConst)
789 Res = Xor(A, Or(B, C));
790 break;
791 case 0x1f:
792 if (ABCIsConst)
793 Res = Nand(A, Or(B, C));
794 break;
795 case 0x20:
796 if (ABCIsConst)
797 Res = Nor(Nand(A, C), B);
798 break;
799 case 0x21:
800 if (ABCIsConst)
801 Res = Nor(Xor(A, C), B);
802 break;
803 case 0x22:
804 if (BCIsConst)
805 Res = Nor(B, Not(C));
806 break;
807 case 0x23:
808 if (ABCIsConst)
809 Res = Nor(B, Nor(C, Not(A)));
810 break;
811 case 0x24:
812 if (ABCIsConst)
813 Res = Nor(Xnor(A, B), Xor(A, C));
814 break;
815 case 0x25:
816 if (ABCIsConst)
817 Res = Xor(A, Nand(Nand(A, B), C));
818 break;
819 case 0x26:
820 if (ABCIsConst)
821 Res = And(Nand(A, B), Xor(B, C));
822 break;
823 case 0x27:
824 if (ABCIsConst)
825 Res = Xor(Or(Xnor(A, B), C), B);
826 break;
827 case 0x28:
828 if (ABCIsConst)
829 Res = And(Xor(A, B), C);
830 break;
831 case 0x29:
832 if (ABCIsConst)
833 Res = Xor(Xor(A, B), Nor(And(A, B), C));
834 break;
835 case 0x2a:
836 if (ABCIsConst)
837 Res = And(Nand(A, B), C);
838 break;
839 case 0x2b:
840 if (ABCIsConst)
841 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
842 break;
843 case 0x2c:
844 if (ABCIsConst)
845 Res = Nor(Xnor(A, B), Nor(B, C));
846 break;
847 case 0x2d:
848 if (ABCIsConst)
849 Res = Xor(A, Or(B, Not(C)));
850 break;
851 case 0x2e:
852 if (ABCIsConst)
853 Res = Xor(A, Or(Xor(A, C), B));
854 break;
855 case 0x2f:
856 if (ABCIsConst)
857 Res = Nand(A, Or(B, Not(C)));
858 break;
859 case 0x30:
860 if (ABIsConst)
861 Res = Nor(B, Not(A));
862 break;
863 case 0x31:
864 if (ABCIsConst)
865 Res = Nor(Nor(A, Not(C)), B);
866 break;
867 case 0x32:
868 if (ABCIsConst)
869 Res = Nor(Nor(A, C), B);
870 break;
871 case 0x33:
872 Res = Not(B);
873 break;
874 case 0x34:
875 if (ABCIsConst)
876 Res = And(Xor(A, B), Nand(B, C));
877 break;
878 case 0x35:
879 if (ABCIsConst)
880 Res = Xor(B, Or(A, Xnor(B, C)));
881 break;
882 case 0x36:
883 if (ABCIsConst)
884 Res = Xor(Or(A, C), B);
885 break;
886 case 0x37:
887 if (ABCIsConst)
888 Res = Nand(Or(A, C), B);
889 break;
890 case 0x38:
891 if (ABCIsConst)
892 Res = Nor(Xnor(A, B), Nor(A, C));
893 break;
894 case 0x39:
895 if (ABCIsConst)
896 Res = Xor(Or(A, Not(C)), B);
897 break;
898 case 0x3a:
899 if (ABCIsConst)
900 Res = Xor(B, Or(A, Xor(B, C)));
901 break;
902 case 0x3b:
903 if (ABCIsConst)
904 Res = Nand(Or(A, Not(C)), B);
905 break;
906 case 0x3c:
907 Res = Xor(A, B);
908 break;
909 case 0x3d:
910 if (ABCIsConst)
911 Res = Xor(A, Or(Nor(A, C), B));
912 break;
913 case 0x3e:
914 if (ABCIsConst)
915 Res = Xor(A, Or(Nor(A, Not(C)), B));
916 break;
917 case 0x3f:
918 if (ABIsConst)
919 Res = Nand(A, B);
920 break;
921 case 0x40:
922 if (ABCIsConst)
923 Res = Nor(Nand(A, B), C);
924 break;
925 case 0x41:
926 if (ABCIsConst)
927 Res = Nor(Xor(A, B), C);
928 break;
929 case 0x42:
930 if (ABCIsConst)
931 Res = Nor(Xor(A, B), Xnor(A, C));
932 break;
933 case 0x43:
934 if (ABCIsConst)
935 Res = Xor(A, Nand(Nand(A, C), B));
936 break;
937 case 0x44:
938 if (BCIsConst)
939 Res = Nor(C, Not(B));
940 break;
941 case 0x45:
942 if (ABCIsConst)
943 Res = Nor(Nor(B, Not(A)), C);
944 break;
945 case 0x46:
946 if (ABCIsConst)
947 Res = Xor(Or(And(A, C), B), C);
948 break;
949 case 0x47:
950 if (ABCIsConst)
951 Res = Xor(Or(Xnor(A, C), B), C);
952 break;
953 case 0x48:
954 if (ABCIsConst)
955 Res = And(Xor(A, C), B);
956 break;
957 case 0x49:
958 if (ABCIsConst)
959 Res = Xor(Or(Xnor(A, B), And(A, C)), C);
960 break;
961 case 0x4a:
962 if (ABCIsConst)
963 Res = Nor(Xnor(A, C), Nor(B, C));
964 break;
965 case 0x4b:
966 if (ABCIsConst)
967 Res = Xor(A, Or(C, Not(B)));
968 break;
969 case 0x4c:
970 if (ABCIsConst)
971 Res = And(Nand(A, C), B);
972 break;
973 case 0x4d:
974 if (ABCIsConst)
975 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
976 break;
977 case 0x4e:
978 if (ABCIsConst)
979 Res = Xor(A, Or(Xor(A, B), C));
980 break;
981 case 0x4f:
982 if (ABCIsConst)
983 Res = Nand(A, Nand(B, Not(C)));
984 break;
985 case 0x50:
986 if (ACIsConst)
987 Res = Nor(C, Not(A));
988 break;
989 case 0x51:
990 if (ABCIsConst)
991 Res = Nor(Nor(A, Not(B)), C);
992 break;
993 case 0x52:
994 if (ABCIsConst)
995 Res = And(Xor(A, C), Nand(B, C));
996 break;
997 case 0x53:
998 if (ABCIsConst)
999 Res = Xor(Or(Xnor(B, C), A), C);
1000 break;
1001 case 0x54:
1002 if (ABCIsConst)
1003 Res = Nor(Nor(A, B), C);
1004 break;
1005 case 0x55:
1006 Res = Not(C);
1007 break;
1008 case 0x56:
1009 if (ABCIsConst)
1010 Res = Xor(Or(A, B), C);
1011 break;
1012 case 0x57:
1013 if (ABCIsConst)
1014 Res = Nand(Or(A, B), C);
1015 break;
1016 case 0x58:
1017 if (ABCIsConst)
1018 Res = Nor(Nor(A, B), Xnor(A, C));
1019 break;
1020 case 0x59:
1021 if (ABCIsConst)
1022 Res = Xor(Or(A, Not(B)), C);
1023 break;
1024 case 0x5a:
1025 Res = Xor(A, C);
1026 break;
1027 case 0x5b:
1028 if (ABCIsConst)
1029 Res = Xor(A, Or(Nor(A, B), C));
1030 break;
1031 case 0x5c:
1032 if (ABCIsConst)
1033 Res = Xor(Or(Xor(B, C), A), C);
1034 break;
1035 case 0x5d:
1036 if (ABCIsConst)
1037 Res = Nand(Or(A, Not(B)), C);
1038 break;
1039 case 0x5e:
1040 if (ABCIsConst)
1041 Res = Xor(A, Or(Nor(A, Not(B)), C));
1042 break;
1043 case 0x5f:
1044 if (ACIsConst)
1045 Res = Nand(A, C);
1046 break;
1047 case 0x60:
1048 if (ABCIsConst)
1049 Res = And(A, Xor(B, C));
1050 break;
1051 case 0x61:
1052 if (ABCIsConst)
1053 Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1054 break;
1055 case 0x62:
1056 if (ABCIsConst)
1057 Res = Nor(Nor(A, C), Xnor(B, C));
1058 break;
1059 case 0x63:
1060 if (ABCIsConst)
1061 Res = Xor(B, Or(C, Not(A)));
1062 break;
1063 case 0x64:
1064 if (ABCIsConst)
1065 Res = Nor(Nor(A, B), Xnor(B, C));
1066 break;
1067 case 0x65:
1068 if (ABCIsConst)
1069 Res = Xor(Or(B, Not(A)), C);
1070 break;
1071 case 0x66:
1072 Res = Xor(B, C);
1073 break;
1074 case 0x67:
1075 if (ABCIsConst)
1076 Res = Or(Nor(A, B), Xor(B, C));
1077 break;
1078 case 0x68:
1079 if (ABCIsConst)
1080 Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1081 break;
1082 case 0x69:
1083 if (ABCIsConst)
1084 Res = Xor(Xnor(A, B), C);
1085 break;
1086 case 0x6a:
1087 if (ABCIsConst)
1088 Res = Xor(And(A, B), C);
1089 break;
1090 case 0x6b:
1091 if (ABCIsConst)
1092 Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1093 break;
1094 case 0x6c:
1095 if (ABCIsConst)
1096 Res = Xor(And(A, C), B);
1097 break;
1098 case 0x6d:
1099 if (ABCIsConst)
1100 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1101 break;
1102 case 0x6e:
1103 if (ABCIsConst)
1104 Res = Or(Nor(A, Not(B)), Xor(B, C));
1105 break;
1106 case 0x6f:
1107 if (ABCIsConst)
1108 Res = Nand(A, Xnor(B, C));
1109 break;
1110 case 0x70:
1111 if (ABCIsConst)
1112 Res = And(A, Nand(B, C));
1113 break;
1114 case 0x71:
1115 if (ABCIsConst)
1116 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1117 break;
1118 case 0x72:
1119 if (ABCIsConst)
1120 Res = Xor(Or(Xor(A, B), C), B);
1121 break;
1122 case 0x73:
1123 if (ABCIsConst)
1124 Res = Nand(Nand(A, Not(C)), B);
1125 break;
1126 case 0x74:
1127 if (ABCIsConst)
1128 Res = Xor(Or(Xor(A, C), B), C);
1129 break;
1130 case 0x75:
1131 if (ABCIsConst)
1132 Res = Nand(Nand(A, Not(B)), C);
1133 break;
1134 case 0x76:
1135 if (ABCIsConst)
1136 Res = Xor(B, Or(Nor(B, Not(A)), C));
1137 break;
1138 case 0x77:
1139 if (BCIsConst)
1140 Res = Nand(B, C);
1141 break;
1142 case 0x78:
1143 if (ABCIsConst)
1144 Res = Xor(A, And(B, C));
1145 break;
1146 case 0x79:
1147 if (ABCIsConst)
1148 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1149 break;
1150 case 0x7a:
1151 if (ABCIsConst)
1152 Res = Or(Xor(A, C), Nor(B, Not(A)));
1153 break;
1154 case 0x7b:
1155 if (ABCIsConst)
1156 Res = Nand(Xnor(A, C), B);
1157 break;
1158 case 0x7c:
1159 if (ABCIsConst)
1160 Res = Or(Xor(A, B), Nor(C, Not(A)));
1161 break;
1162 case 0x7d:
1163 if (ABCIsConst)
1164 Res = Nand(Xnor(A, B), C);
1165 break;
1166 case 0x7e:
1167 if (ABCIsConst)
1168 Res = Or(Xor(A, B), Xor(A, C));
1169 break;
1170 case 0x7f:
1171 if (ABCIsConst)
1172 Res = Nand(And(A, B), C);
1173 break;
1174 case 0x80:
1175 if (ABCIsConst)
1176 Res = And(And(A, B), C);
1177 break;
1178 case 0x81:
1179 if (ABCIsConst)
1180 Res = Nor(Xor(A, B), Xor(A, C));
1181 break;
1182 case 0x82:
1183 if (ABCIsConst)
1184 Res = And(Xnor(A, B), C);
1185 break;
1186 case 0x83:
1187 if (ABCIsConst)
1188 Res = Nor(Xor(A, B), Nor(C, Not(A)));
1189 break;
1190 case 0x84:
1191 if (ABCIsConst)
1192 Res = And(Xnor(A, C), B);
1193 break;
1194 case 0x85:
1195 if (ABCIsConst)
1196 Res = Nor(Xor(A, C), Nor(B, Not(A)));
1197 break;
1198 case 0x86:
1199 if (ABCIsConst)
1200 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1201 break;
1202 case 0x87:
1203 if (ABCIsConst)
1204 Res = Xor(A, Nand(B, C));
1205 break;
1206 case 0x88:
1207 Res = And(B, C);
1208 break;
1209 case 0x89:
1210 if (ABCIsConst)
1211 Res = Xor(B, Nor(Nor(B, Not(A)), C));
1212 break;
1213 case 0x8a:
1214 if (ABCIsConst)
1215 Res = And(Nand(A, Not(B)), C);
1216 break;
1217 case 0x8b:
1218 if (ABCIsConst)
1219 Res = Xor(Nor(Xor(A, C), B), C);
1220 break;
1221 case 0x8c:
1222 if (ABCIsConst)
1223 Res = And(Nand(A, Not(C)), B);
1224 break;
1225 case 0x8d:
1226 if (ABCIsConst)
1227 Res = Xor(Nor(Xor(A, B), C), B);
1228 break;
1229 case 0x8e:
1230 if (ABCIsConst)
1231 Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1232 break;
1233 case 0x8f:
1234 if (ABCIsConst)
1235 Res = Nand(A, Nand(B, C));
1236 break;
1237 case 0x90:
1238 if (ABCIsConst)
1239 Res = And(A, Xnor(B, C));
1240 break;
1241 case 0x91:
1242 if (ABCIsConst)
1243 Res = Nor(Nor(A, Not(B)), Xor(B, C));
1244 break;
1245 case 0x92:
1246 if (ABCIsConst)
1247 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1248 break;
1249 case 0x93:
1250 if (ABCIsConst)
1251 Res = Xor(Nand(A, C), B);
1252 break;
1253 case 0x94:
1254 if (ABCIsConst)
1255 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1256 break;
1257 case 0x95:
1258 if (ABCIsConst)
1259 Res = Xor(Nand(A, B), C);
1260 break;
1261 case 0x96:
1262 if (ABCIsConst)
1263 Res = Xor(Xor(A, B), C);
1264 break;
1265 case 0x97:
1266 if (ABCIsConst)
1267 Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1268 break;
1269 case 0x98:
1270 if (ABCIsConst)
1271 Res = Nor(Nor(A, B), Xor(B, C));
1272 break;
1273 case 0x99:
1274 if (BCIsConst)
1275 Res = Xnor(B, C);
1276 break;
1277 case 0x9a:
1278 if (ABCIsConst)
1279 Res = Xor(Nor(B, Not(A)), C);
1280 break;
1281 case 0x9b:
1282 if (ABCIsConst)
1283 Res = Or(Nor(A, B), Xnor(B, C));
1284 break;
1285 case 0x9c:
1286 if (ABCIsConst)
1287 Res = Xor(B, Nor(C, Not(A)));
1288 break;
1289 case 0x9d:
1290 if (ABCIsConst)
1291 Res = Or(Nor(A, C), Xnor(B, C));
1292 break;
1293 case 0x9e:
1294 if (ABCIsConst)
1295 Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1296 break;
1297 case 0x9f:
1298 if (ABCIsConst)
1299 Res = Nand(A, Xor(B, C));
1300 break;
1301 case 0xa0:
1302 Res = And(A, C);
1303 break;
1304 case 0xa1:
1305 if (ABCIsConst)
1306 Res = Xor(A, Nor(Nor(A, Not(B)), C));
1307 break;
1308 case 0xa2:
1309 if (ABCIsConst)
1310 Res = And(Or(A, Not(B)), C);
1311 break;
1312 case 0xa3:
1313 if (ABCIsConst)
1314 Res = Xor(Nor(Xor(B, C), A), C);
1315 break;
1316 case 0xa4:
1317 if (ABCIsConst)
1318 Res = Xor(A, Nor(Nor(A, B), C));
1319 break;
1320 case 0xa5:
1321 if (ACIsConst)
1322 Res = Xnor(A, C);
1323 break;
1324 case 0xa6:
1325 if (ABCIsConst)
1326 Res = Xor(Nor(A, Not(B)), C);
1327 break;
1328 case 0xa7:
1329 if (ABCIsConst)
1330 Res = Or(Nor(A, B), Xnor(A, C));
1331 break;
1332 case 0xa8:
1333 if (ABCIsConst)
1334 Res = And(Or(A, B), C);
1335 break;
1336 case 0xa9:
1337 if (ABCIsConst)
1338 Res = Xor(Nor(A, B), C);
1339 break;
1340 case 0xaa:
1341 Res = C;
1342 break;
1343 case 0xab:
1344 if (ABCIsConst)
1345 Res = Or(Nor(A, B), C);
1346 break;
1347 case 0xac:
1348 if (ABCIsConst)
1349 Res = Xor(Nor(Xnor(B, C), A), C);
1350 break;
1351 case 0xad:
1352 if (ABCIsConst)
1353 Res = Or(Xnor(A, C), And(B, C));
1354 break;
1355 case 0xae:
1356 if (ABCIsConst)
1357 Res = Or(Nor(A, Not(B)), C);
1358 break;
1359 case 0xaf:
1360 if (ACIsConst)
1361 Res = Or(C, Not(A));
1362 break;
1363 case 0xb0:
1364 if (ABCIsConst)
1365 Res = And(A, Nand(B, Not(C)));
1366 break;
1367 case 0xb1:
1368 if (ABCIsConst)
1369 Res = Xor(A, Nor(Xor(A, B), C));
1370 break;
1371 case 0xb2:
1372 if (ABCIsConst)
1373 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1374 break;
1375 case 0xb3:
1376 if (ABCIsConst)
1377 Res = Nand(Nand(A, C), B);
1378 break;
1379 case 0xb4:
1380 if (ABCIsConst)
1381 Res = Xor(A, Nor(C, Not(B)));
1382 break;
1383 case 0xb5:
1384 if (ABCIsConst)
1385 Res = Or(Xnor(A, C), Nor(B, C));
1386 break;
1387 case 0xb6:
1388 if (ABCIsConst)
1389 Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1390 break;
1391 case 0xb7:
1392 if (ABCIsConst)
1393 Res = Nand(Xor(A, C), B);
1394 break;
1395 case 0xb8:
1396 if (ABCIsConst)
1397 Res = Xor(Nor(Xnor(A, C), B), C);
1398 break;
1399 case 0xb9:
1400 if (ABCIsConst)
1401 Res = Xor(Nor(And(A, C), B), C);
1402 break;
1403 case 0xba:
1404 if (ABCIsConst)
1405 Res = Or(Nor(B, Not(A)), C);
1406 break;
1407 case 0xbb:
1408 if (BCIsConst)
1409 Res = Or(C, Not(B));
1410 break;
1411 case 0xbc:
1412 if (ABCIsConst)
1413 Res = Xor(A, And(Nand(A, C), B));
1414 break;
1415 case 0xbd:
1416 if (ABCIsConst)
1417 Res = Or(Xor(A, B), Xnor(A, C));
1418 break;
1419 case 0xbe:
1420 if (ABCIsConst)
1421 Res = Or(Xor(A, B), C);
1422 break;
1423 case 0xbf:
1424 if (ABCIsConst)
1425 Res = Or(Nand(A, B), C);
1426 break;
1427 case 0xc0:
1428 Res = And(A, B);
1429 break;
1430 case 0xc1:
1431 if (ABCIsConst)
1432 Res = Xor(A, Nor(Nor(A, Not(C)), B));
1433 break;
1434 case 0xc2:
1435 if (ABCIsConst)
1436 Res = Xor(A, Nor(Nor(A, C), B));
1437 break;
1438 case 0xc3:
1439 if (ABIsConst)
1440 Res = Xnor(A, B);
1441 break;
1442 case 0xc4:
1443 if (ABCIsConst)
1444 Res = And(Or(A, Not(C)), B);
1445 break;
1446 case 0xc5:
1447 if (ABCIsConst)
1448 Res = Xor(B, Nor(A, Xor(B, C)));
1449 break;
1450 case 0xc6:
1451 if (ABCIsConst)
1452 Res = Xor(Nor(A, Not(C)), B);
1453 break;
1454 case 0xc7:
1455 if (ABCIsConst)
1456 Res = Or(Xnor(A, B), Nor(A, C));
1457 break;
1458 case 0xc8:
1459 if (ABCIsConst)
1460 Res = And(Or(A, C), B);
1461 break;
1462 case 0xc9:
1463 if (ABCIsConst)
1464 Res = Xor(Nor(A, C), B);
1465 break;
1466 case 0xca:
1467 if (ABCIsConst)
1468 Res = Xor(B, Nor(A, Xnor(B, C)));
1469 break;
1470 case 0xcb:
1471 if (ABCIsConst)
1472 Res = Or(Xnor(A, B), And(B, C));
1473 break;
1474 case 0xcc:
1475 Res = B;
1476 break;
1477 case 0xcd:
1478 if (ABCIsConst)
1479 Res = Or(Nor(A, C), B);
1480 break;
1481 case 0xce:
1482 if (ABCIsConst)
1483 Res = Or(Nor(A, Not(C)), B);
1484 break;
1485 case 0xcf:
1486 if (ABIsConst)
1487 Res = Or(B, Not(A));
1488 break;
1489 case 0xd0:
1490 if (ABCIsConst)
1491 Res = And(A, Or(B, Not(C)));
1492 break;
1493 case 0xd1:
1494 if (ABCIsConst)
1495 Res = Xor(A, Nor(Xor(A, C), B));
1496 break;
1497 case 0xd2:
1498 if (ABCIsConst)
1499 Res = Xor(A, Nor(B, Not(C)));
1500 break;
1501 case 0xd3:
1502 if (ABCIsConst)
1503 Res = Or(Xnor(A, B), Nor(B, C));
1504 break;
1505 case 0xd4:
1506 if (ABCIsConst)
1507 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1508 break;
1509 case 0xd5:
1510 if (ABCIsConst)
1511 Res = Nand(Nand(A, B), C);
1512 break;
1513 case 0xd6:
1514 if (ABCIsConst)
1515 Res = Xor(Xor(A, B), Or(And(A, B), C));
1516 break;
1517 case 0xd7:
1518 if (ABCIsConst)
1519 Res = Nand(Xor(A, B), C);
1520 break;
1521 case 0xd8:
1522 if (ABCIsConst)
1523 Res = Xor(Nor(Xnor(A, B), C), B);
1524 break;
1525 case 0xd9:
1526 if (ABCIsConst)
1527 Res = Or(And(A, B), Xnor(B, C));
1528 break;
1529 case 0xda:
1530 if (ABCIsConst)
1531 Res = Xor(A, And(Nand(A, B), C));
1532 break;
1533 case 0xdb:
1534 if (ABCIsConst)
1535 Res = Or(Xnor(A, B), Xor(A, C));
1536 break;
1537 case 0xdc:
1538 if (ABCIsConst)
1539 Res = Or(B, Nor(C, Not(A)));
1540 break;
1541 case 0xdd:
1542 if (BCIsConst)
1543 Res = Or(B, Not(C));
1544 break;
1545 case 0xde:
1546 if (ABCIsConst)
1547 Res = Or(Xor(A, C), B);
1548 break;
1549 case 0xdf:
1550 if (ABCIsConst)
1551 Res = Or(Nand(A, C), B);
1552 break;
1553 case 0xe0:
1554 if (ABCIsConst)
1555 Res = And(A, Or(B, C));
1556 break;
1557 case 0xe1:
1558 if (ABCIsConst)
1559 Res = Xor(A, Nor(B, C));
1560 break;
1561 case 0xe2:
1562 if (ABCIsConst)
1563 Res = Xor(A, Nor(Xnor(A, C), B));
1564 break;
1565 case 0xe3:
1566 if (ABCIsConst)
1567 Res = Xor(A, Nor(And(A, C), B));
1568 break;
1569 case 0xe4:
1570 if (ABCIsConst)
1571 Res = Xor(A, Nor(Xnor(A, B), C));
1572 break;
1573 case 0xe5:
1574 if (ABCIsConst)
1575 Res = Xor(A, Nor(And(A, B), C));
1576 break;
1577 case 0xe6:
1578 if (ABCIsConst)
1579 Res = Or(And(A, B), Xor(B, C));
1580 break;
1581 case 0xe7:
1582 if (ABCIsConst)
1583 Res = Or(Xnor(A, B), Xnor(A, C));
1584 break;
1585 case 0xe8:
1586 if (ABCIsConst)
1587 Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1588 break;
1589 case 0xe9:
1590 if (ABCIsConst)
1591 Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1592 break;
1593 case 0xea:
1594 if (ABCIsConst)
1595 Res = Or(And(A, B), C);
1596 break;
1597 case 0xeb:
1598 if (ABCIsConst)
1599 Res = Or(Xnor(A, B), C);
1600 break;
1601 case 0xec:
1602 if (ABCIsConst)
1603 Res = Or(And(A, C), B);
1604 break;
1605 case 0xed:
1606 if (ABCIsConst)
1607 Res = Or(Xnor(A, C), B);
1608 break;
1609 case 0xee:
1610 Res = Or(B, C);
1611 break;
1612 case 0xef:
1613 if (ABCIsConst)
1614 Res = Nand(A, Nor(B, C));
1615 break;
1616 case 0xf0:
1617 Res = A;
1618 break;
1619 case 0xf1:
1620 if (ABCIsConst)
1621 Res = Or(A, Nor(B, C));
1622 break;
1623 case 0xf2:
1624 if (ABCIsConst)
1625 Res = Or(A, Nor(B, Not(C)));
1626 break;
1627 case 0xf3:
1628 if (ABIsConst)
1629 Res = Or(A, Not(B));
1630 break;
1631 case 0xf4:
1632 if (ABCIsConst)
1633 Res = Or(A, Nor(C, Not(B)));
1634 break;
1635 case 0xf5:
1636 if (ACIsConst)
1637 Res = Or(A, Not(C));
1638 break;
1639 case 0xf6:
1640 if (ABCIsConst)
1641 Res = Or(A, Xor(B, C));
1642 break;
1643 case 0xf7:
1644 if (ABCIsConst)
1645 Res = Or(A, Nand(B, C));
1646 break;
1647 case 0xf8:
1648 if (ABCIsConst)
1649 Res = Or(A, And(B, C));
1650 break;
1651 case 0xf9:
1652 if (ABCIsConst)
1653 Res = Or(A, Xnor(B, C));
1654 break;
1655 case 0xfa:
1656 Res = Or(A, C);
1657 break;
1658 case 0xfb:
1659 if (ABCIsConst)
1660 Res = Nand(Nor(A, C), B);
1661 break;
1662 case 0xfc:
1663 Res = Or(A, B);
1664 break;
1665 case 0xfd:
1666 if (ABCIsConst)
1667 Res = Nand(Nor(A, B), C);
1668 break;
1669 case 0xfe:
1670 if (ABCIsConst)
1671 Res = Or(Or(A, B), C);
1672 break;
1673 case 0xff:
1674 Res = {Constant::getAllOnesValue(Ty), 0xff};
1675 break;
1676 }
1677
1678 assert((Res.first == nullptr || Res.second == Imm) &&
1679 "Simplification of ternary logic does not verify!");
1680 return Res.first;
1681}
1682
1683static Value *simplifyX86insertps(const IntrinsicInst &II,
1684 InstCombiner::BuilderTy &Builder) {
1685 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1686 if (!CInt)
1687 return nullptr;
1688
1689 auto *VecTy = cast<FixedVectorType>(II.getType());
1690 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1691
1692 // The immediate permute control byte looks like this:
1693 // [3:0] - zero mask for each 32-bit lane
1694 // [5:4] - select one 32-bit destination lane
1695 // [7:6] - select one 32-bit source lane
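// For example, an immediate of 0x58 decodes as SourceLane = 1, DestLane = 1
// and ZMask = 0b1000: element 1 of the second source operand is inserted into
// element 1 of the result and element 3 is zeroed.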
1696
1697 uint8_t Imm = CInt->getZExtValue();
1698 uint8_t ZMask = Imm & 0xf;
1699 uint8_t DestLane = (Imm >> 4) & 0x3;
1700 uint8_t SourceLane = (Imm >> 6) & 0x3;
1701
1701
1702 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1703
1704 // If all zero mask bits are set, this was just a weird way to
1705 // generate a zero vector.
1706 if (ZMask == 0xf)
1707 return ZeroVector;
1708
1709 // Initialize by passing all of the first source bits through.
1710 int ShuffleMask[4] = {0, 1, 2, 3};
1711
1712 // We may replace the second operand with the zero vector.
1713 Value *V1 = II.getArgOperand(1);
1714
1715 if (ZMask) {
1716 // If the zero mask is being used with a single input or the zero mask
1717 // overrides the destination lane, this is a shuffle with the zero vector.
1718 if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1719 (ZMask & (1 << DestLane))) {
1720 V1 = ZeroVector;
1721 // We may still move 32-bits of the first source vector from one lane
1722 // to another.
1723 ShuffleMask[DestLane] = SourceLane;
1724 // The zero mask may override the previous insert operation.
1725 for (unsigned i = 0; i < 4; ++i)
1726 if ((ZMask >> i) & 0x1)
1727 ShuffleMask[i] = i + 4;
1728 } else {
1729 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1730 return nullptr;
1731 }
1732 } else {
1733 // Replace the selected destination lane with the selected source lane.
1734 ShuffleMask[DestLane] = SourceLane + 4;
1735 }
1736
1737 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1738}
1739
1740/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1741/// or conversion to a shuffle vector.
1742static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1743 ConstantInt *CILength, ConstantInt *CIIndex,
1744 InstCombiner::BuilderTy &Builder) {
1745 auto LowConstantHighUndef = [&](uint64_t Val) {
1746 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1747 Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1748 UndefValue::get(IntTy64)};
1749 return ConstantVector::get(Args);
1750 };
1751
1752 // See if we're dealing with constant values.
1753 auto *C0 = dyn_cast<Constant>(Op0);
1754 auto *CI0 =
1755 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1756 : nullptr;
1757
1758 // Attempt to constant fold.
1759 if (CILength && CIIndex) {
1760 // From AMD documentation: "The bit index and field length are each six
1761 // bits in length; other bits of the field are ignored."
1762 APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1763 APInt APLength = CILength->getValue().zextOrTrunc(6);
1764
1765 unsigned Index = APIndex.getZExtValue();
1766
1767 // From AMD documentation: "a value of zero in the field length is
1768 // defined as length of 64".
1769 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1770
1771 // From AMD documentation: "If the sum of the bit index + length field
1772 // is greater than 64, the results are undefined".
1773 unsigned End = Index + Length;
1774
1775 // Note that both field index and field length are 8-bit quantities.
1776 // Since variables 'Index' and 'Length' are unsigned values
1777 // obtained from zero-extending field index and field length
1778 // respectively, their sum should never wrap around.
1779 if (End > 64)
1780 return UndefValue::get(II.getType());
1781
1782 // If we are extracting whole bytes, we can convert this to a shuffle.
1783 // Lowering can recognize EXTRQI shuffle masks.
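// For instance, extracting Length = 16 bits starting at Index = 8 becomes a
// byte shuffle that places source bytes 1-2 in the low quadword and fills the
// remaining low-quadword bytes with zeros from the all-zero second operand.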
1784 if ((Length % 8) == 0 && (Index % 8) == 0) {
1785 // Convert bit indices to byte indices.
1786 Length /= 8;
1787 Index /= 8;
1788
1789 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1790 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1791
1792 SmallVector<int, 16> ShuffleMask;
1793 for (int i = 0; i != (int)Length; ++i)
1794 ShuffleMask.push_back(i + Index);
1795 for (int i = Length; i != 8; ++i)
1796 ShuffleMask.push_back(i + 16);
1797 for (int i = 8; i != 16; ++i)
1798 ShuffleMask.push_back(-1);
1799
1800 Value *SV = Builder.CreateShuffleVector(
1801 Builder.CreateBitCast(Op0, ShufTy),
1802 ConstantAggregateZero::get(ShufTy), ShuffleMask);
1803 return Builder.CreateBitCast(SV, II.getType());
1804 }
1805
1806 // Constant Fold - shift Index'th bit to lowest position and mask off
1807 // Length bits.
1808 if (CI0) {
1809 APInt Elt = CI0->getValue();
1810 Elt.lshrInPlace(Index);
1811 Elt = Elt.zextOrTrunc(Length);
1812 return LowConstantHighUndef(Elt.getZExtValue());
1813 }
1814
1815 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1816 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1817 Value *Args[] = {Op0, CILength, CIIndex};
1818 Module *M = II.getModule();
1819 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
1820 return Builder.CreateCall(F, Args);
1821 }
1822 }
1823
1824 // Constant Fold - extraction from zero is always {zero, undef}.
1825 if (CI0 && CI0->isZero())
1826 return LowConstantHighUndef(0);
1827
1828 return nullptr;
1829}
1830
1831/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1832/// folding or conversion to a shuffle vector.
1833static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1834 APInt APLength, APInt APIndex,
1835 InstCombiner::BuilderTy &Builder) {
1836 // From AMD documentation: "The bit index and field length are each six bits
1837 // in length; other bits of the field are ignored."
1838 APIndex = APIndex.zextOrTrunc(6);
1839 APLength = APLength.zextOrTrunc(6);
1840
1841 // Attempt to constant fold.
1842 unsigned Index = APIndex.getZExtValue();
1843
1844 // From AMD documentation: "a value of zero in the field length is
1845 // defined as length of 64".
1846 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1847
1848 // From AMD documentation: "If the sum of the bit index + length field
1849 // is greater than 64, the results are undefined".
1850 unsigned End = Index + Length;
1851
1852 // Note that both field index and field length are 8-bit quantities.
1853 // Since variables 'Index' and 'Length' are unsigned values
1854 // obtained from zero-extending field index and field length
1855 // respectively, their sum should never wrap around.
1856 if (End > 64)
1857 return UndefValue::get(II.getType());
1858
1859 // If we are inserting whole bytes, we can convert this to a shuffle.
1860 // Lowering can recognize INSERTQI shuffle masks.
1861 if ((Length % 8) == 0 && (Index % 8) == 0) {
1862 // Convert bit indices to byte indices.
1863 Length /= 8;
1864 Index /= 8;
1865
1866 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1867 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1868
1869 SmallVector<int, 16> ShuffleMask;
1870 for (int i = 0; i != (int)Index; ++i)
1871 ShuffleMask.push_back(i);
1872 for (int i = 0; i != (int)Length; ++i)
1873 ShuffleMask.push_back(i + 16);
1874 for (int i = Index + Length; i != 8; ++i)
1875 ShuffleMask.push_back(i);
1876 for (int i = 8; i != 16; ++i)
1877 ShuffleMask.push_back(-1);
1878
1879 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1880 Builder.CreateBitCast(Op1, ShufTy),
1881 ShuffleMask);
1882 return Builder.CreateBitCast(SV, II.getType());
1883 }
1884
1885 // See if we're dealing with constant values.
1886 auto *C0 = dyn_cast<Constant>(Op0);
1887 auto *C1 = dyn_cast<Constant>(Op1);
1888 auto *CI00 =
1889 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1890 : nullptr;
1891 auto *CI10 =
1892 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1893 : nullptr;
1894
1895 // Constant Fold - insert bottom Length bits starting at the Index'th bit.
1896 if (CI00 && CI10) {
1897 APInt V00 = CI00->getValue();
1898 APInt V10 = CI10->getValue();
1899 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1900 V00 = V00 & ~Mask;
1901 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1902 APInt Val = V00 | V10;
1903 Type *IntTy64 = Type::getInt64Ty(II.getContext());
1904 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1905 UndefValue::get(IntTy64)};
1906 return ConstantVector::get(Args);
1907 }
1908
1909 // If we were an INSERTQ call, we'll save demanded elements if we convert to
1910 // INSERTQI.
1911 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1912 Type *IntTy8 = Type::getInt8Ty(II.getContext());
1913 Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1914 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1915
1916 Value *Args[] = {Op0, Op1, CILength, CIIndex};
1917 Module *M = II.getModule();
1918 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
1919 return Builder.CreateCall(F, Args);
1920 }
1921
1922 return nullptr;
1923}
1924
1925/// Attempt to convert pshufb* to shufflevector if the mask is constant.
1926static Value *simplifyX86pshufb(const IntrinsicInst &II,
1927 InstCombiner::BuilderTy &Builder) {
1928 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1929 if (!V)
1930 return nullptr;
1931
1932 auto *VecTy = cast<FixedVectorType>(II.getType());
1933 unsigned NumElts = VecTy->getNumElements();
1934 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1935 "Unexpected number of elements in shuffle mask!");
1936
1937 // Construct a shuffle mask from constant integers or UNDEFs.
1938 int Indexes[64];
1939
1940 // Each byte in the shuffle control mask forms an index to permute the
1941 // corresponding byte in the destination operand.
1942 for (unsigned I = 0; I < NumElts; ++I) {
1943 Constant *COp = V->getAggregateElement(I);
1944 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1945 return nullptr;
1946
1947 if (isa<UndefValue>(COp)) {
1948 Indexes[I] = -1;
1949 continue;
1950 }
1951
1952 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
1953
1954 // If the most significant bit (bit[7]) of each byte of the shuffle
1955 // control mask is set, then zero is written in the result byte.
1956 // The zero vector is in the right-hand side of the resulting
1957 // shufflevector.
1958
1959 // The value of each index for the high 128-bit lane is the least
1960 // significant 4 bits of the respective shuffle control byte.
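// For example, a control byte of 0x83 has bit 7 set and therefore selects a
// zero byte, while a control byte of 0x03 at position 17 (the second 128-bit
// lane) selects source byte 3 + 16 = 19, keeping the shuffle within its lane.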
1961 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
1962 Indexes[I] = Index;
1963 }
1964
1965 auto V1 = II.getArgOperand(0);
1966 auto V2 = Constant::getNullValue(VecTy);
1967 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
1968}
1969
1970/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
1971static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
1972 InstCombiner::BuilderTy &Builder) {
1973 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1974 if (!V)
1975 return nullptr;
1976
1977 auto *VecTy = cast<FixedVectorType>(II.getType());
1978 unsigned NumElts = VecTy->getNumElements();
1979 bool IsPD = VecTy->getScalarType()->isDoubleTy();
1980 unsigned NumLaneElts = IsPD ? 2 : 4;
1981 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
1982
1983 // Construct a shuffle mask from constant integers or UNDEFs.
1984 int Indexes[16];
1985
1986 // The intrinsics only read one or two bits, clear the rest.
1987 for (unsigned I = 0; I < NumElts; ++I) {
1988 Constant *COp = V->getAggregateElement(I);
1989 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1990 return nullptr;
1991
1992 if (isa<UndefValue>(COp)) {
1993 Indexes[I] = -1;
1994 continue;
1995 }
1996
1997 APInt Index = cast<ConstantInt>(COp)->getValue();
1998 Index = Index.zextOrTrunc(32).getLoBits(2);
1999
2000 // The PD variants use bit 1 to select the per-lane element index, so
2001 // shift down to convert to a generic shuffle mask index.
2002 if (IsPD)
2003 Index.lshrInPlace(1);
2004
2005 // The _256 variants are a bit trickier since the mask bits always index
2006 // into the corresponding 128 half. In order to convert to a generic
2007 // shuffle, we have to make that explicit.
2008 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2009
2010 Indexes[I] = Index.getZExtValue();
2011 }
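  // Worked example (illustrative): for vpermilvar.pd.256 with a constant
  // control of <4 x i64> <2, 0, 2, 0>, the loop builds the generic shuffle
  // mask <1, 0, 3, 2>, i.e. the two doubles are swapped within each 128-bit
  // lane.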
2012
2013 auto V1 = II.getArgOperand(0);
2014 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2015}
2016
2017/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2018static Value *simplifyX86vpermv(const IntrinsicInst &II,
2019                                InstCombiner::BuilderTy &Builder) {
2020 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2021 if (!V)
2022 return nullptr;
2023
2024 auto *VecTy = cast<FixedVectorType>(II.getType());
2025 unsigned Size = VecTy->getNumElements();
2026 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2027 "Unexpected shuffle mask size");
2028
2029 // Construct a shuffle mask from constant integers or UNDEFs.
2030 int Indexes[64];
2031
2032 for (unsigned I = 0; I < Size; ++I) {
2033 Constant *COp = V->getAggregateElement(I);
2034 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2035 return nullptr;
2036
2037 if (isa<UndefValue>(COp)) {
2038 Indexes[I] = -1;
2039 continue;
2040 }
2041
2042 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2043 Index &= Size - 1;
2044 Indexes[I] = Index;
2045 }
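  // Illustrative example: vpermd with a constant index vector of
  // <8 x i32> <7, 6, 5, 4, 3, 2, 1, 0> becomes a single-source shufflevector
  // with mask <7,6,5,4,3,2,1,0>; out-of-range indices wrap modulo the
  // element count.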
2046
2047 auto V1 = II.getArgOperand(0);
2048 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2049}
2050
2051std::optional<Instruction *>
2052X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
2053 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2054 unsigned DemandedWidth) {
2055 APInt UndefElts(Width, 0);
2056 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2057 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2058 };
2059
2060 Intrinsic::ID IID = II.getIntrinsicID();
2061 switch (IID) {
2062 case Intrinsic::x86_bmi_bextr_32:
2063 case Intrinsic::x86_bmi_bextr_64:
2064 case Intrinsic::x86_tbm_bextri_u32:
2065 case Intrinsic::x86_tbm_bextri_u64:
2066 // If the RHS is a constant we can try some simplifications.
2067 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2068 uint64_t Shift = C->getZExtValue();
2069 uint64_t Length = (Shift >> 8) & 0xff;
2070 Shift &= 0xff;
2071 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2072 // If the length is 0 or the shift is out of range, replace with zero.
2073 if (Length == 0 || Shift >= BitWidth) {
2074 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2075 }
2076 // If the LHS is also a constant, we can completely constant fold this.
2077 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2078 uint64_t Result = InC->getZExtValue() >> Shift;
2079 if (Length > BitWidth)
2080 Length = BitWidth;
2081 Result &= maskTrailingOnes<uint64_t>(Length);
2082 return IC.replaceInstUsesWith(II,
2083 ConstantInt::get(II.getType(), Result));
2084 }
2085 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2086 // are only masking bits that a shift already cleared?
2087 }
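    // Worked example (illustrative): bextr.32(0x12345678, 0x0808) has
    // Shift = 8 and Length = 8, so it constant-folds to
    // (0x12345678 >> 8) & 0xff = 0x56.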
2088 break;
2089
2090 case Intrinsic::x86_bmi_bzhi_32:
2091 case Intrinsic::x86_bmi_bzhi_64:
2092 // If the RHS is a constant we can try some simplifications.
2093 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2094 uint64_t Index = C->getZExtValue() & 0xff;
2095 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2096 if (Index >= BitWidth) {
2097 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2098 }
2099 if (Index == 0) {
2100 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2101 }
2102 // If the LHS is also a constant, we can completely constant fold this.
2103 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2104 uint64_t Result = InC->getZExtValue();
2105 Result &= maskTrailingOnes<uint64_t>(Index);
2106 return IC.replaceInstUsesWith(II,
2107 ConstantInt::get(II.getType(), Result));
2108 }
2109 // TODO should we convert this to an AND if the RHS is constant?
2110 }
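    // Worked example (illustrative): bzhi.32(0x12345678, 16) keeps only the
    // low 16 bits and constant-folds to 0x5678.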
2111 break;
2112 case Intrinsic::x86_bmi_pext_32:
2113 case Intrinsic::x86_bmi_pext_64:
2114 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2115 if (MaskC->isNullValue()) {
2116 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2117 }
2118 if (MaskC->isAllOnesValue()) {
2119 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2120 }
2121
2122 unsigned MaskIdx, MaskLen;
2123 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2124 // Any single contiguous sequence of 1s anywhere in the mask simply
2125 // describes a subset of the input bits shifted to the appropriate
2126 // position. Replace with the straightforward IR.
2127 Value *Input = II.getArgOperand(0);
2128 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2129 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2130 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2131 return IC.replaceInstUsesWith(II, Shifted);
2132 }
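      // Illustrative example: pext.32(%x, 0x00FF0000) has MaskIdx = 16, so it
      // becomes lshr(and(%x, 0x00FF0000), 16).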
2133
2134 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2135 uint64_t Src = SrcC->getZExtValue();
2136 uint64_t Mask = MaskC->getZExtValue();
2137 uint64_t Result = 0;
2138 uint64_t BitToSet = 1;
2139
2140 while (Mask) {
2141 // Isolate lowest set bit.
2142 uint64_t BitToTest = Mask & -Mask;
2143 if (BitToTest & Src)
2144 Result |= BitToSet;
2145
2146 BitToSet <<= 1;
2147 // Clear lowest set bit.
2148 Mask &= Mask - 1;
2149 }
2150
2151 return IC.replaceInstUsesWith(II,
2152 ConstantInt::get(II.getType(), Result));
2153 }
2154 }
2155 break;
2156 case Intrinsic::x86_bmi_pdep_32:
2157 case Intrinsic::x86_bmi_pdep_64:
2158 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2159 if (MaskC->isNullValue()) {
2160 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2161 }
2162 if (MaskC->isAllOnesValue()) {
2163 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2164 }
2165
2166 unsigned MaskIdx, MaskLen;
2167 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2168 // Any single contiguous sequence of 1s anywhere in the mask simply
2169 // describes a subset of the input bits shifted to the appropriate
2170 // position. Replace with the straightforward IR.
2171 Value *Input = II.getArgOperand(0);
2172 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2173 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2174 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2175 return IC.replaceInstUsesWith(II, Masked);
2176 }
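      // Illustrative example: pdep.32(%x, 0xFF00) has MaskIdx = 8, so it
      // becomes and(shl(%x, 8), 0xFF00).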
2177
2178 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2179 uint64_t Src = SrcC->getZExtValue();
2180 uint64_t Mask = MaskC->getZExtValue();
2181 uint64_t Result = 0;
2182 uint64_t BitToTest = 1;
2183
2184 while (Mask) {
2185 // Isolate lowest set bit.
2186 uint64_t BitToSet = Mask & -Mask;
2187 if (BitToTest & Src)
2188 Result |= BitToSet;
2189
2190 BitToTest <<= 1;
2191 // Clear lowest set bit.
2192 Mask &= Mask - 1;
2193 }
2194
2195 return IC.replaceInstUsesWith(II,
2196 ConstantInt::get(II.getType(), Result));
2197 }
2198 }
2199 break;
2200
2201 case Intrinsic::x86_sse_cvtss2si:
2202 case Intrinsic::x86_sse_cvtss2si64:
2203 case Intrinsic::x86_sse_cvttss2si:
2204 case Intrinsic::x86_sse_cvttss2si64:
2205 case Intrinsic::x86_sse2_cvtsd2si:
2206 case Intrinsic::x86_sse2_cvtsd2si64:
2207 case Intrinsic::x86_sse2_cvttsd2si:
2208 case Intrinsic::x86_sse2_cvttsd2si64:
2209 case Intrinsic::x86_avx512_vcvtss2si32:
2210 case Intrinsic::x86_avx512_vcvtss2si64:
2211 case Intrinsic::x86_avx512_vcvtss2usi32:
2212 case Intrinsic::x86_avx512_vcvtss2usi64:
2213 case Intrinsic::x86_avx512_vcvtsd2si32:
2214 case Intrinsic::x86_avx512_vcvtsd2si64:
2215 case Intrinsic::x86_avx512_vcvtsd2usi32:
2216 case Intrinsic::x86_avx512_vcvtsd2usi64:
2217 case Intrinsic::x86_avx512_cvttss2si:
2218 case Intrinsic::x86_avx512_cvttss2si64:
2219 case Intrinsic::x86_avx512_cvttss2usi:
2220 case Intrinsic::x86_avx512_cvttss2usi64:
2221 case Intrinsic::x86_avx512_cvttsd2si:
2222 case Intrinsic::x86_avx512_cvttsd2si64:
2223 case Intrinsic::x86_avx512_cvttsd2usi:
2224 case Intrinsic::x86_avx512_cvttsd2usi64: {
2225 // These intrinsics only demand the 0th element of their input vectors. If
2226 // we can simplify the input based on that, do so now.
2227 Value *Arg = II.getArgOperand(0);
2228 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2229 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2230 return IC.replaceOperand(II, 0, V);
2231 }
2232 break;
2233 }
2234
2235 case Intrinsic::x86_mmx_pmovmskb:
2236 case Intrinsic::x86_sse_movmsk_ps:
2237 case Intrinsic::x86_sse2_movmsk_pd:
2238 case Intrinsic::x86_sse2_pmovmskb_128:
2239 case Intrinsic::x86_avx_movmsk_pd_256:
2240 case Intrinsic::x86_avx_movmsk_ps_256:
2241 case Intrinsic::x86_avx2_pmovmskb:
2242 if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2243 return IC.replaceInstUsesWith(II, V);
2244 }
2245 break;
2246
2247 case Intrinsic::x86_sse_comieq_ss:
2248 case Intrinsic::x86_sse_comige_ss:
2249 case Intrinsic::x86_sse_comigt_ss:
2250 case Intrinsic::x86_sse_comile_ss:
2251 case Intrinsic::x86_sse_comilt_ss:
2252 case Intrinsic::x86_sse_comineq_ss:
2253 case Intrinsic::x86_sse_ucomieq_ss:
2254 case Intrinsic::x86_sse_ucomige_ss:
2255 case Intrinsic::x86_sse_ucomigt_ss:
2256 case Intrinsic::x86_sse_ucomile_ss:
2257 case Intrinsic::x86_sse_ucomilt_ss:
2258 case Intrinsic::x86_sse_ucomineq_ss:
2259 case Intrinsic::x86_sse2_comieq_sd:
2260 case Intrinsic::x86_sse2_comige_sd:
2261 case Intrinsic::x86_sse2_comigt_sd:
2262 case Intrinsic::x86_sse2_comile_sd:
2263 case Intrinsic::x86_sse2_comilt_sd:
2264 case Intrinsic::x86_sse2_comineq_sd:
2265 case Intrinsic::x86_sse2_ucomieq_sd:
2266 case Intrinsic::x86_sse2_ucomige_sd:
2267 case Intrinsic::x86_sse2_ucomigt_sd:
2268 case Intrinsic::x86_sse2_ucomile_sd:
2269 case Intrinsic::x86_sse2_ucomilt_sd:
2270 case Intrinsic::x86_sse2_ucomineq_sd:
2271 case Intrinsic::x86_avx512_vcomi_ss:
2272 case Intrinsic::x86_avx512_vcomi_sd:
2273 case Intrinsic::x86_avx512_mask_cmp_ss:
2274 case Intrinsic::x86_avx512_mask_cmp_sd: {
2275 // These intrinsics only demand the 0th element of their input vectors. If
2276 // we can simplify the input based on that, do so now.
2277 bool MadeChange = false;
2278 Value *Arg0 = II.getArgOperand(0);
2279 Value *Arg1 = II.getArgOperand(1);
2280 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2281 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2282 IC.replaceOperand(II, 0, V);
2283 MadeChange = true;
2284 }
2285 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2286 IC.replaceOperand(II, 1, V);
2287 MadeChange = true;
2288 }
2289 if (MadeChange) {
2290 return &II;
2291 }
2292 break;
2293 }
2294
2295 case Intrinsic::x86_avx512_add_ps_512:
2296 case Intrinsic::x86_avx512_div_ps_512:
2297 case Intrinsic::x86_avx512_mul_ps_512:
2298 case Intrinsic::x86_avx512_sub_ps_512:
2299 case Intrinsic::x86_avx512_add_pd_512:
2300 case Intrinsic::x86_avx512_div_pd_512:
2301 case Intrinsic::x86_avx512_mul_pd_512:
2302 case Intrinsic::x86_avx512_sub_pd_512:
2303 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2304 // IR operations.
2305 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2306 if (R->getValue() == 4) {
2307 Value *Arg0 = II.getArgOperand(0);
2308 Value *Arg1 = II.getArgOperand(1);
2309
2310 Value *V;
2311 switch (IID) {
2312 default:
2313 llvm_unreachable("Case stmts out of sync!");
2314 case Intrinsic::x86_avx512_add_ps_512:
2315 case Intrinsic::x86_avx512_add_pd_512:
2316 V = IC.Builder.CreateFAdd(Arg0, Arg1);
2317 break;
2318 case Intrinsic::x86_avx512_sub_ps_512:
2319 case Intrinsic::x86_avx512_sub_pd_512:
2320 V = IC.Builder.CreateFSub(Arg0, Arg1);
2321 break;
2322 case Intrinsic::x86_avx512_mul_ps_512:
2323 case Intrinsic::x86_avx512_mul_pd_512:
2324 V = IC.Builder.CreateFMul(Arg0, Arg1);
2325 break;
2326 case Intrinsic::x86_avx512_div_ps_512:
2327 case Intrinsic::x86_avx512_div_pd_512:
2328 V = IC.Builder.CreateFDiv(Arg0, Arg1);
2329 break;
2330 }
2331
2332 return IC.replaceInstUsesWith(II, V);
2333 }
2334 }
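    // Illustrative example: llvm.x86.avx512.add.ps.512(%a, %b, i32 4), where
    // 4 is CUR_DIRECTION, is rewritten to a plain 'fadd <16 x float> %a, %b'.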
2335 break;
2336
2337 case Intrinsic::x86_avx512_mask_add_ss_round:
2338 case Intrinsic::x86_avx512_mask_div_ss_round:
2339 case Intrinsic::x86_avx512_mask_mul_ss_round:
2340 case Intrinsic::x86_avx512_mask_sub_ss_round:
2341 case Intrinsic::x86_avx512_mask_add_sd_round:
2342 case Intrinsic::x86_avx512_mask_div_sd_round:
2343 case Intrinsic::x86_avx512_mask_mul_sd_round:
2344 case Intrinsic::x86_avx512_mask_sub_sd_round:
2345 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2346 // IR operations.
2347 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2348 if (R->getValue() == 4) {
2349 // Extract the element as scalars.
2350 Value *Arg0 = II.getArgOperand(0);
2351 Value *Arg1 = II.getArgOperand(1);
2352 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2353 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2354
2355 Value *V;
2356 switch (IID) {
2357 default:
2358 llvm_unreachable("Case stmts out of sync!");
2359 case Intrinsic::x86_avx512_mask_add_ss_round:
2360 case Intrinsic::x86_avx512_mask_add_sd_round:
2361 V = IC.Builder.CreateFAdd(LHS, RHS);
2362 break;
2363 case Intrinsic::x86_avx512_mask_sub_ss_round:
2364 case Intrinsic::x86_avx512_mask_sub_sd_round:
2365 V = IC.Builder.CreateFSub(LHS, RHS);
2366 break;
2367 case Intrinsic::x86_avx512_mask_mul_ss_round:
2368 case Intrinsic::x86_avx512_mask_mul_sd_round:
2369 V = IC.Builder.CreateFMul(LHS, RHS);
2370 break;
2371 case Intrinsic::x86_avx512_mask_div_ss_round:
2372 case Intrinsic::x86_avx512_mask_div_sd_round:
2373 V = IC.Builder.CreateFDiv(LHS, RHS);
2374 break;
2375 }
2376
2377 // Handle the masking aspect of the intrinsic.
2378 Value *Mask = II.getArgOperand(3);
2379 auto *C = dyn_cast<ConstantInt>(Mask);
2380 // We don't need a select if we know the mask bit is a 1.
2381 if (!C || !C->getValue()[0]) {
2382 // Cast the mask to an i1 vector and then extract the lowest element.
2383 auto *MaskTy = FixedVectorType::get(
2384 IC.Builder.getInt1Ty(),
2385 cast<IntegerType>(Mask->getType())->getBitWidth());
2386 Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2387 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2388 // Extract the lowest element from the passthru operand.
2389 Value *Passthru =
2390 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2391 V = IC.Builder.CreateSelect(Mask, V, Passthru);
2392 }
2393
2394 // Insert the result back into the original argument 0.
2395 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2396
2397 return IC.replaceInstUsesWith(II, V);
2398 }
2399 }
2400 break;
2401
2402 // Constant fold ashr( <A x Bi>, Ci ).
2403 // Constant fold lshr( <A x Bi>, Ci ).
2404 // Constant fold shl( <A x Bi>, Ci ).
2405 case Intrinsic::x86_sse2_psrai_d:
2406 case Intrinsic::x86_sse2_psrai_w:
2407 case Intrinsic::x86_avx2_psrai_d:
2408 case Intrinsic::x86_avx2_psrai_w:
2409 case Intrinsic::x86_avx512_psrai_q_128:
2410 case Intrinsic::x86_avx512_psrai_q_256:
2411 case Intrinsic::x86_avx512_psrai_d_512:
2412 case Intrinsic::x86_avx512_psrai_q_512:
2413 case Intrinsic::x86_avx512_psrai_w_512:
2414 case Intrinsic::x86_sse2_psrli_d:
2415 case Intrinsic::x86_sse2_psrli_q:
2416 case Intrinsic::x86_sse2_psrli_w:
2417 case Intrinsic::x86_avx2_psrli_d:
2418 case Intrinsic::x86_avx2_psrli_q:
2419 case Intrinsic::x86_avx2_psrli_w:
2420 case Intrinsic::x86_avx512_psrli_d_512:
2421 case Intrinsic::x86_avx512_psrli_q_512:
2422 case Intrinsic::x86_avx512_psrli_w_512:
2423 case Intrinsic::x86_sse2_pslli_d:
2424 case Intrinsic::x86_sse2_pslli_q:
2425 case Intrinsic::x86_sse2_pslli_w:
2426 case Intrinsic::x86_avx2_pslli_d:
2427 case Intrinsic::x86_avx2_pslli_q:
2428 case Intrinsic::x86_avx2_pslli_w:
2429 case Intrinsic::x86_avx512_pslli_d_512:
2430 case Intrinsic::x86_avx512_pslli_q_512:
2431 case Intrinsic::x86_avx512_pslli_w_512:
2432 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2433 return IC.replaceInstUsesWith(II, V);
2434 }
2435 break;
2436
2437 case Intrinsic::x86_sse2_psra_d:
2438 case Intrinsic::x86_sse2_psra_w:
2439 case Intrinsic::x86_avx2_psra_d:
2440 case Intrinsic::x86_avx2_psra_w:
2441 case Intrinsic::x86_avx512_psra_q_128:
2442 case Intrinsic::x86_avx512_psra_q_256:
2443 case Intrinsic::x86_avx512_psra_d_512:
2444 case Intrinsic::x86_avx512_psra_q_512:
2445 case Intrinsic::x86_avx512_psra_w_512:
2446 case Intrinsic::x86_sse2_psrl_d:
2447 case Intrinsic::x86_sse2_psrl_q:
2448 case Intrinsic::x86_sse2_psrl_w:
2449 case Intrinsic::x86_avx2_psrl_d:
2450 case Intrinsic::x86_avx2_psrl_q:
2451 case Intrinsic::x86_avx2_psrl_w:
2452 case Intrinsic::x86_avx512_psrl_d_512:
2453 case Intrinsic::x86_avx512_psrl_q_512:
2454 case Intrinsic::x86_avx512_psrl_w_512:
2455 case Intrinsic::x86_sse2_psll_d:
2456 case Intrinsic::x86_sse2_psll_q:
2457 case Intrinsic::x86_sse2_psll_w:
2458 case Intrinsic::x86_avx2_psll_d:
2459 case Intrinsic::x86_avx2_psll_q:
2460 case Intrinsic::x86_avx2_psll_w:
2461 case Intrinsic::x86_avx512_psll_d_512:
2462 case Intrinsic::x86_avx512_psll_q_512:
2463 case Intrinsic::x86_avx512_psll_w_512: {
2464 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2465 return IC.replaceInstUsesWith(II, V);
2466 }
2467
2468 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
2469 // operand to compute the shift amount.
2470 Value *Arg1 = II.getArgOperand(1);
2471 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2472 "Unexpected packed shift size");
2473 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2474
2475 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2476 return IC.replaceOperand(II, 1, V);
2477 }
2478 break;
2479 }
2480
2481 case Intrinsic::x86_avx2_psllv_d:
2482 case Intrinsic::x86_avx2_psllv_d_256:
2483 case Intrinsic::x86_avx2_psllv_q:
2484 case Intrinsic::x86_avx2_psllv_q_256:
2485 case Intrinsic::x86_avx512_psllv_d_512:
2486 case Intrinsic::x86_avx512_psllv_q_512:
2487 case Intrinsic::x86_avx512_psllv_w_128:
2488 case Intrinsic::x86_avx512_psllv_w_256:
2489 case Intrinsic::x86_avx512_psllv_w_512:
2490 case Intrinsic::x86_avx2_psrav_d:
2491 case Intrinsic::x86_avx2_psrav_d_256:
2492 case Intrinsic::x86_avx512_psrav_q_128:
2493 case Intrinsic::x86_avx512_psrav_q_256:
2494 case Intrinsic::x86_avx512_psrav_d_512:
2495 case Intrinsic::x86_avx512_psrav_q_512:
2496 case Intrinsic::x86_avx512_psrav_w_128:
2497 case Intrinsic::x86_avx512_psrav_w_256:
2498 case Intrinsic::x86_avx512_psrav_w_512:
2499 case Intrinsic::x86_avx2_psrlv_d:
2500 case Intrinsic::x86_avx2_psrlv_d_256:
2501 case Intrinsic::x86_avx2_psrlv_q:
2502 case Intrinsic::x86_avx2_psrlv_q_256:
2503 case Intrinsic::x86_avx512_psrlv_d_512:
2504 case Intrinsic::x86_avx512_psrlv_q_512:
2505 case Intrinsic::x86_avx512_psrlv_w_128:
2506 case Intrinsic::x86_avx512_psrlv_w_256:
2507 case Intrinsic::x86_avx512_psrlv_w_512:
2508 if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2509 return IC.replaceInstUsesWith(II, V);
2510 }
2511 break;
2512
2513 case Intrinsic::x86_sse2_packssdw_128:
2514 case Intrinsic::x86_sse2_packsswb_128:
2515 case Intrinsic::x86_avx2_packssdw:
2516 case Intrinsic::x86_avx2_packsswb:
2517 case Intrinsic::x86_avx512_packssdw_512:
2518 case Intrinsic::x86_avx512_packsswb_512:
2519 if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2520 return IC.replaceInstUsesWith(II, V);
2521 }
2522 break;
2523
2524 case Intrinsic::x86_sse2_packuswb_128:
2525 case Intrinsic::x86_sse41_packusdw:
2526 case Intrinsic::x86_avx2_packusdw:
2527 case Intrinsic::x86_avx2_packuswb:
2528 case Intrinsic::x86_avx512_packusdw_512:
2529 case Intrinsic::x86_avx512_packuswb_512:
2530 if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2531 return IC.replaceInstUsesWith(II, V);
2532 }
2533 break;
2534
2535 case Intrinsic::x86_sse2_pmadd_wd:
2536 case Intrinsic::x86_avx2_pmadd_wd:
2537 case Intrinsic::x86_avx512_pmaddw_d_512:
2538 if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2539 return IC.replaceInstUsesWith(II, V);
2540 }
2541 break;
2542
2543 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2544 case Intrinsic::x86_avx2_pmadd_ub_sw:
2545 case Intrinsic::x86_avx512_pmaddubs_w_512:
2546 if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2547 return IC.replaceInstUsesWith(II, V);
2548 }
2549 break;
2550
2551 case Intrinsic::x86_pclmulqdq:
2552 case Intrinsic::x86_pclmulqdq_256:
2553 case Intrinsic::x86_pclmulqdq_512: {
2554 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2555 unsigned Imm = C->getZExtValue();
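      // Illustrative example: for the 128-bit pclmulqdq, immediate bit 0
      // selects the qword of Arg0 and bit 4 selects the qword of Arg1, so
      // imm 0x11 demands only element 1 (the upper qword) of each operand.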
2556
2557 bool MadeChange = false;
2558 Value *Arg0 = II.getArgOperand(0);
2559 Value *Arg1 = II.getArgOperand(1);
2560 unsigned VWidth =
2561 cast<FixedVectorType>(Arg0->getType())->getNumElements();
2562
2563 APInt UndefElts1(VWidth, 0);
2564 APInt DemandedElts1 =
2565 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2566 if (Value *V =
2567 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2568 IC.replaceOperand(II, 0, V);
2569 MadeChange = true;
2570 }
2571
2572 APInt UndefElts2(VWidth, 0);
2573 APInt DemandedElts2 =
2574 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2575 if (Value *V =
2576 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2577 IC.replaceOperand(II, 1, V);
2578 MadeChange = true;
2579 }
2580
2581 // If either operand's demanded elements are all undef, the result is zero.
2582 if (DemandedElts1.isSubsetOf(UndefElts1) ||
2583 DemandedElts2.isSubsetOf(UndefElts2)) {
2584 return IC.replaceInstUsesWith(II,
2585 ConstantAggregateZero::get(II.getType()));
2586 }
2587
2588 if (MadeChange) {
2589 return &II;
2590 }
2591 }
2592 break;
2593 }
2594
2595 case Intrinsic::x86_sse41_insertps:
2596 if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2597 return IC.replaceInstUsesWith(II, V);
2598 }
2599 break;
2600
2601 case Intrinsic::x86_sse4a_extrq: {
2602 Value *Op0 = II.getArgOperand(0);
2603 Value *Op1 = II.getArgOperand(1);
2604 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2605 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2606 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2607 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2608 VWidth1 == 16 && "Unexpected operand sizes");
2609
2610 // See if we're dealing with constant values.
2611 auto *C1 = dyn_cast<Constant>(Op1);
2612 auto *CILength =
2613 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2614 : nullptr;
2615 auto *CIIndex =
2616 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2617 : nullptr;
2618
2619 // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2620 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2621 return IC.replaceInstUsesWith(II, V);
2622 }
2623
2624 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
2625 // operand and the lowest 16-bits of the second.
2626 bool MadeChange = false;
2627 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2628 IC.replaceOperand(II, 0, V);
2629 MadeChange = true;
2630 }
2631 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2632 IC.replaceOperand(II, 1, V);
2633 MadeChange = true;
2634 }
2635 if (MadeChange) {
2636 return &II;
2637 }
2638 break;
2639 }
2640
2641 case Intrinsic::x86_sse4a_extrqi: {
2642 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2643 // bits of the lower 64-bits. The upper 64-bits are undefined.
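    // For example (illustrative): Length = 8 and Index = 16 extract bits
    // [16,24) of Op0's low qword into bits [7:0] of the result's low qword;
    // the rest of the low qword is zeroed and the high qword is undefined.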
2644 Value *Op0 = II.getArgOperand(0);
2645 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2646 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2647 "Unexpected operand size");
2648
2649 // See if we're dealing with constant values.
2650 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2651 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2652
2653 // Attempt to simplify to a constant or shuffle vector.
2654 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2655 return IC.replaceInstUsesWith(II, V);
2656 }
2657
2658 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2659 // operand.
2660 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2661 return IC.replaceOperand(II, 0, V);
2662 }
2663 break;
2664 }
2665
2666 case Intrinsic::x86_sse4a_insertq: {
2667 Value *Op0 = II.getArgOperand(0);
2668 Value *Op1 = II.getArgOperand(1);
2669 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2670 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2671 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2672 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2673 "Unexpected operand size");
2674
2675 // See if we're dealing with constant values.
2676 auto *C1 = dyn_cast<Constant>(Op1);
2677 auto *CI11 =
2678 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2679 : nullptr;
2680
2681 // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2682 if (CI11) {
2683 const APInt &V11 = CI11->getValue();
2684 APInt Len = V11.zextOrTrunc(6);
2685 APInt Idx = V11.lshr(8).zextOrTrunc(6);
2686 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2687 return IC.replaceInstUsesWith(II, V);
2688 }
2689 }
2690
2691 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2692 // operand.
2693 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2694 return IC.replaceOperand(II, 0, V);
2695 }
2696 break;
2697 }
2698
2699 case Intrinsic::x86_sse4a_insertqi: {
2700 // INSERTQI: Extract lowest Length bits from lower half of second source and
2701 // insert over first source starting at Index bit. The upper 64-bits are
2702 // undefined.
2703 Value *Op0 = II.getArgOperand(0);
2704 Value *Op1 = II.getArgOperand(1);
2705 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2706 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2707 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2708 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2709 VWidth1 == 2 && "Unexpected operand sizes");
2710
2711 // See if we're dealing with constant values.
2712 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2713 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2714
2715 // Attempt to simplify to a constant or shuffle vector.
2716 if (CILength && CIIndex) {
2717 APInt Len = CILength->getValue().zextOrTrunc(6);
2718 APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2719 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2720 return IC.replaceInstUsesWith(II, V);
2721 }
2722 }
2723
2724 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2725 // operands.
2726 bool MadeChange = false;
2727 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2728 IC.replaceOperand(II, 0, V);
2729 MadeChange = true;
2730 }
2731 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2732 IC.replaceOperand(II, 1, V);
2733 MadeChange = true;
2734 }
2735 if (MadeChange) {
2736 return &II;
2737 }
2738 break;
2739 }
2740
2741 case Intrinsic::x86_sse41_pblendvb:
2742 case Intrinsic::x86_sse41_blendvps:
2743 case Intrinsic::x86_sse41_blendvpd:
2744 case Intrinsic::x86_avx_blendv_ps_256:
2745 case Intrinsic::x86_avx_blendv_pd_256:
2746 case Intrinsic::x86_avx2_pblendvb: {
2747 // fold (blend A, A, Mask) -> A
2748 Value *Op0 = II.getArgOperand(0);
2749 Value *Op1 = II.getArgOperand(1);
2750 Value *Mask = II.getArgOperand(2);
2751 if (Op0 == Op1) {
2752 return IC.replaceInstUsesWith(II, Op0);
2753 }
2754
2755 // Zero Mask - select 1st argument.
2756 if (isa<ConstantAggregateZero>(Mask)) {
2757 return IC.replaceInstUsesWith(II, Op0);
2758 }
2759
2760 // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2761 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2762 Constant *NewSelector =
2763 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2764 return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2765 }
2766
2767 // Convert to a vector select if we can bypass casts and find a boolean
2768 // vector condition value.
2769 Value *BoolVec;
2770 Mask = InstCombiner::peekThroughBitcast(Mask);
2771 if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
2772 BoolVec->getType()->isVectorTy() &&
2773 BoolVec->getType()->getScalarSizeInBits() == 1) {
2774 auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2775 auto *OpTy = cast<FixedVectorType>(II.getType());
2776 assert(MaskTy->getPrimitiveSizeInBits() ==
2777 OpTy->getPrimitiveSizeInBits() &&
2778 "Not expecting mask and operands with different sizes");
2779 unsigned NumMaskElts = MaskTy->getNumElements();
2780 unsigned NumOperandElts = OpTy->getNumElements();
2781
2782 if (NumMaskElts == NumOperandElts) {
2783 return SelectInst::Create(BoolVec, Op1, Op0);
2784 }
2785
2786 // If the mask has fewer elements than the operands, each mask bit maps to
2787 // multiple elements of the operands. Bitcast back and forth.
2788 if (NumMaskElts < NumOperandElts) {
2789 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2790 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2791 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2792 return new BitCastInst(Sel, II.getType());
2793 }
2794 }
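    // Illustrative IR sketch: if %mask is (a bitcast of) 'sext <4 x i1> %c to
    // <4 x i32>', then blendvps(%a, %b, %mask) becomes
    // 'select <4 x i1> %c, <4 x float> %b, <4 x float> %a'.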
2795
2796 break;
2797 }
2798
2799 case Intrinsic::x86_ssse3_pshuf_b_128:
2800 case Intrinsic::x86_avx2_pshuf_b:
2801 case Intrinsic::x86_avx512_pshuf_b_512:
2802 if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2803 return IC.replaceInstUsesWith(II, V);
2804 }
2805 break;
2806
2807 case Intrinsic::x86_avx_vpermilvar_ps:
2808 case Intrinsic::x86_avx_vpermilvar_ps_256:
2809 case Intrinsic::x86_avx512_vpermilvar_ps_512:
2810 case Intrinsic::x86_avx_vpermilvar_pd:
2811 case Intrinsic::x86_avx_vpermilvar_pd_256:
2812 case Intrinsic::x86_avx512_vpermilvar_pd_512:
2813 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2814 return IC.replaceInstUsesWith(II, V);
2815 }
2816 break;
2817
2818 case Intrinsic::x86_avx2_permd:
2819 case Intrinsic::x86_avx2_permps:
2820 case Intrinsic::x86_avx512_permvar_df_256:
2821 case Intrinsic::x86_avx512_permvar_df_512:
2822 case Intrinsic::x86_avx512_permvar_di_256:
2823 case Intrinsic::x86_avx512_permvar_di_512:
2824 case Intrinsic::x86_avx512_permvar_hi_128:
2825 case Intrinsic::x86_avx512_permvar_hi_256:
2826 case Intrinsic::x86_avx512_permvar_hi_512:
2827 case Intrinsic::x86_avx512_permvar_qi_128:
2828 case Intrinsic::x86_avx512_permvar_qi_256:
2829 case Intrinsic::x86_avx512_permvar_qi_512:
2830 case Intrinsic::x86_avx512_permvar_sf_512:
2831 case Intrinsic::x86_avx512_permvar_si_512:
2832 if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2833 return IC.replaceInstUsesWith(II, V);
2834 }
2835 break;
2836
2837 case Intrinsic::x86_avx_maskload_ps:
2838 case Intrinsic::x86_avx_maskload_pd:
2839 case Intrinsic::x86_avx_maskload_ps_256:
2840 case Intrinsic::x86_avx_maskload_pd_256:
2841 case Intrinsic::x86_avx2_maskload_d:
2842 case Intrinsic::x86_avx2_maskload_q:
2843 case Intrinsic::x86_avx2_maskload_d_256:
2844 case Intrinsic::x86_avx2_maskload_q_256:
2845 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
2846 return I;
2847 }
2848 break;
2849
2850 case Intrinsic::x86_sse2_maskmov_dqu:
2851 case Intrinsic::x86_avx_maskstore_ps:
2852 case Intrinsic::x86_avx_maskstore_pd:
2853 case Intrinsic::x86_avx_maskstore_ps_256:
2854 case Intrinsic::x86_avx_maskstore_pd_256:
2855 case Intrinsic::x86_avx2_maskstore_d:
2856 case Intrinsic::x86_avx2_maskstore_q:
2857 case Intrinsic::x86_avx2_maskstore_d_256:
2858 case Intrinsic::x86_avx2_maskstore_q_256:
2859 if (simplifyX86MaskedStore(II, IC)) {
2860 return nullptr;
2861 }
2862 break;
2863
2864 case Intrinsic::x86_addcarry_32:
2865 case Intrinsic::x86_addcarry_64:
2866 if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
2867 return IC.replaceInstUsesWith(II, V);
2868 }
2869 break;
2870
2871 case Intrinsic::x86_avx512_pternlog_d_128:
2872 case Intrinsic::x86_avx512_pternlog_d_256:
2873 case Intrinsic::x86_avx512_pternlog_d_512:
2874 case Intrinsic::x86_avx512_pternlog_q_128:
2875 case Intrinsic::x86_avx512_pternlog_q_256:
2876 case Intrinsic::x86_avx512_pternlog_q_512:
2877 if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
2878 return IC.replaceInstUsesWith(II, V);
2879 }
2880 break;
2881 default:
2882 break;
2883 }
2884 return std::nullopt;
2885}
2886
2887std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
2888 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
2889 bool &KnownBitsComputed) const {
2890 switch (II.getIntrinsicID()) {
2891 default:
2892 break;
2893 case Intrinsic::x86_mmx_pmovmskb:
2894 case Intrinsic::x86_sse_movmsk_ps:
2895 case Intrinsic::x86_sse2_movmsk_pd:
2896 case Intrinsic::x86_sse2_pmovmskb_128:
2897 case Intrinsic::x86_avx_movmsk_ps_256:
2898 case Intrinsic::x86_avx_movmsk_pd_256:
2899 case Intrinsic::x86_avx2_pmovmskb: {
2900 // MOVMSK copies the vector elements' sign bits to the low bits
2901 // and zeros the high bits.
2902 unsigned ArgWidth;
2903 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
2904 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
2905 } else {
2906 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
2907 ArgWidth = ArgType->getNumElements();
2908 }
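    // For example (illustrative): pmovmskb.128 yields ArgWidth == 16, so only
    // the low 16 bits of the i32 result can ever be set.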
2909
2910 // If we don't need any of the low bits then return zero; we already know
2911 // that DemandedMask is non-zero.
2912 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
2913 Type *VTy = II.getType();
2914 if (DemandedElts.isZero()) {
2915 return ConstantInt::getNullValue(VTy);
2916 }
2917
2918 // We know that the upper bits are set to zero.
2919 Known.Zero.setBitsFrom(ArgWidth);
2920 KnownBitsComputed = true;
2921 break;
2922 }
2923 }
2924 return std::nullopt;
2925}
2926
2927std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2928 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2929 APInt &UndefElts2, APInt &UndefElts3,
2930 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2931 simplifyAndSetOp) const {
2932 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
2933 switch (II.getIntrinsicID()) {
2934 default:
2935 break;
2936 case Intrinsic::x86_xop_vfrcz_ss:
2937 case Intrinsic::x86_xop_vfrcz_sd:
2938 // The instructions for these intrinsics are specified to zero the upper bits
2939 // rather than pass them through like other scalar intrinsics. So we shouldn't
2940 // just use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
2941 // Instead we should return a zero vector.
2942 if (!DemandedElts[0]) {
2943 IC.addToWorklist(&II);
2944 return ConstantAggregateZero::get(II.getType());
2945 }
2946
2947 // Only the lower element is used.
2948 DemandedElts = 1;
2949 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2950
2951 // Only the lower element is undefined. The high elements are zero.
2952 UndefElts = UndefElts[0];
2953 break;
2954
2955 // Unary scalar-as-vector operations that work column-wise.
2956 case Intrinsic::x86_sse_rcp_ss:
2957 case Intrinsic::x86_sse_rsqrt_ss:
2958 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2959
2960 // If lowest element of a scalar op isn't used then use Arg0.
2961 if (!DemandedElts[0]) {
2962 IC.addToWorklist(&II);
2963 return II.getArgOperand(0);
2964 }
2965 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
2966 // checks).
2967 break;
2968
2969 // Binary scalar-as-vector operations that work column-wise. The high
2970 // elements come from operand 0. The low element is a function of both
2971 // operands.
2972 case Intrinsic::x86_sse_min_ss:
2973 case Intrinsic::x86_sse_max_ss:
2974 case Intrinsic::x86_sse_cmp_ss:
2975 case Intrinsic::x86_sse2_min_sd:
2976 case Intrinsic::x86_sse2_max_sd:
2977 case Intrinsic::x86_sse2_cmp_sd: {
2978 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2979
2980 // If lowest element of a scalar op isn't used then use Arg0.
2981 if (!DemandedElts[0]) {
2982 IC.addToWorklist(&II);
2983 return II.getArgOperand(0);
2984 }
2985
2986 // Only lower element is used for operand 1.
2987 DemandedElts = 1;
2988 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
2989
2990 // Lower element is undefined if both lower elements are undefined.
2991 // Consider things like undef&0. The result is known zero, not undef.
2992 if (!UndefElts2[0])
2993 UndefElts.clearBit(0);
2994
2995 break;
2996 }
2997
2998 // Binary scalar-as-vector operations that work column-wise. The high
2999 // elements come from operand 0 and the low element comes from operand 1.
3000 case Intrinsic::x86_sse41_round_ss:
3001 case Intrinsic::x86_sse41_round_sd: {
3002 // Don't use the low element of operand 0.
3003 APInt DemandedElts2 = DemandedElts;
3004 DemandedElts2.clearBit(0);
3005 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3006
3007 // If lowest element of a scalar op isn't used then use Arg0.
3008 if (!DemandedElts[0]) {
3009 IC.addToWorklist(&II);
3010 return II.getArgOperand(0);
3011 }
3012
3013 // Only lower element is used for operand 1.
3014 DemandedElts = 1;
3015 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3016
3017 // Take the high undef elements from operand 0 and take the lower element
3018 // from operand 1.
3019 UndefElts.clearBit(0);
3020 UndefElts |= UndefElts2[0];
3021 break;
3022 }
3023
3024 // Three input scalar-as-vector operations that work column-wise. The high
3025 // elements come from operand 0 and the low element is a function of all
3026 // three inputs.
3027 case Intrinsic::x86_avx512_mask_add_ss_round:
3028 case Intrinsic::x86_avx512_mask_div_ss_round:
3029 case Intrinsic::x86_avx512_mask_mul_ss_round:
3030 case Intrinsic::x86_avx512_mask_sub_ss_round:
3031 case Intrinsic::x86_avx512_mask_max_ss_round:
3032 case Intrinsic::x86_avx512_mask_min_ss_round:
3033 case Intrinsic::x86_avx512_mask_add_sd_round:
3034 case Intrinsic::x86_avx512_mask_div_sd_round:
3035 case Intrinsic::x86_avx512_mask_mul_sd_round:
3036 case Intrinsic::x86_avx512_mask_sub_sd_round:
3037 case Intrinsic::x86_avx512_mask_max_sd_round:
3038 case Intrinsic::x86_avx512_mask_min_sd_round:
3039 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3040
3041 // If lowest element of a scalar op isn't used then use Arg0.
3042 if (!DemandedElts[0]) {
3043 IC.addToWorklist(&II);
3044 return II.getArgOperand(0);
3045 }
3046
3047 // Only the lower element is used for operands 1 and 2.
3048 DemandedElts = 1;
3049 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3050 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3051
3052 // Lower element is undefined if all three lower elements are undefined.
3053 // Consider things like undef&0. The result is known zero, not undef.
3054 if (!UndefElts2[0] || !UndefElts3[0])
3055 UndefElts.clearBit(0);
3056 break;
3057
3058 // TODO: Add fmaddsub support?
3059 case Intrinsic::x86_sse3_addsub_pd:
3060 case Intrinsic::x86_sse3_addsub_ps:
3061 case Intrinsic::x86_avx_addsub_pd_256:
3062 case Intrinsic::x86_avx_addsub_ps_256: {
3063 // If none of the even or none of the odd lanes are required, turn this
3064 // into a generic FP math instruction.
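    // For example (illustrative): for the v4f32 variant SubMask is 0b0101
    // (even lanes, which subtract) and AddMask is 0b1010 (odd lanes, which
    // add), so if only the odd lanes are demanded the whole intrinsic becomes
    // a single fadd.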
3065 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3066 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3067 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3068 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3069 if (IsSubOnly || IsAddOnly) {
3070 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3071 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3072 IC.Builder.SetInsertPoint(&II);
3073 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3074 return IC.Builder.CreateBinOp(
3075 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3076 }
3077
3078 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3079 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3080 UndefElts &= UndefElts2;
3081 break;
3082 }
3083
3084 // General per-element vector operations.
3085 case Intrinsic::x86_avx2_psllv_d:
3086 case Intrinsic::x86_avx2_psllv_d_256:
3087 case Intrinsic::x86_avx2_psllv_q:
3088 case Intrinsic::x86_avx2_psllv_q_256:
3089 case Intrinsic::x86_avx2_psrlv_d:
3090 case Intrinsic::x86_avx2_psrlv_d_256:
3091 case Intrinsic::x86_avx2_psrlv_q:
3092 case Intrinsic::x86_avx2_psrlv_q_256:
3093 case Intrinsic::x86_avx2_psrav_d:
3094 case Intrinsic::x86_avx2_psrav_d_256: {
3095 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3096 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3097 UndefElts &= UndefElts2;
3098 break;
3099 }
3100
3101 case Intrinsic::x86_sse2_packssdw_128:
3102 case Intrinsic::x86_sse2_packsswb_128:
3103 case Intrinsic::x86_sse2_packuswb_128:
3104 case Intrinsic::x86_sse41_packusdw:
3105 case Intrinsic::x86_avx2_packssdw:
3106 case Intrinsic::x86_avx2_packsswb:
3107 case Intrinsic::x86_avx2_packusdw:
3108 case Intrinsic::x86_avx2_packuswb:
3109 case Intrinsic::x86_avx512_packssdw_512:
3110 case Intrinsic::x86_avx512_packsswb_512:
3111 case Intrinsic::x86_avx512_packusdw_512:
3112 case Intrinsic::x86_avx512_packuswb_512: {
3113 auto *Ty0 = II.getArgOperand(0)->getType();
3114 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3115 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3116
3117 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3118 unsigned VWidthPerLane = VWidth / NumLanes;
3119 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3120
3121 // Per lane, pack the elements of the first input and then the second.
3122 // e.g.
3123 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3124 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3125 for (int OpNum = 0; OpNum != 2; ++OpNum) {
3126 APInt OpDemandedElts(InnerVWidth, 0);
3127 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3128 unsigned LaneIdx = Lane * VWidthPerLane;
3129 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3130 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3131 if (DemandedElts[Idx])
3132 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3133 }
3134 }
3135
3136 // Demand elements from the operand.
3137 APInt OpUndefElts(InnerVWidth, 0);
3138 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3139
3140 // Pack the operand's UNDEF elements, one lane at a time.
3141 OpUndefElts = OpUndefElts.zext(VWidth);
3142 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3143 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3144 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3145 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3146 UndefElts |= LaneElts;
3147 }
3148 }
3149 break;
3150 }
3151
3152 case Intrinsic::x86_sse2_pmadd_wd:
3153 case Intrinsic::x86_avx2_pmadd_wd:
3154 case Intrinsic::x86_avx512_pmaddw_d_512:
3155 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3156 case Intrinsic::x86_avx2_pmadd_ub_sw:
3157 case Intrinsic::x86_avx512_pmaddubs_w_512: {
3158 // PMADD - demand both src elements that map to each dst element.
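    // For example (illustrative): for pmaddwd, demanding i32 result element 0
    // demands i16 source elements 0 and 1 of both operands.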
3159 auto *ArgTy = II.getArgOperand(0)->getType();
3160 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3161 assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3162 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3163 APInt Op0UndefElts(InnerVWidth, 0);
3164 APInt Op1UndefElts(InnerVWidth, 0);
3165 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3166 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3167 break;
3168 }
3169
3170 // PSHUFB
3171 case Intrinsic::x86_ssse3_pshuf_b_128:
3172 case Intrinsic::x86_avx2_pshuf_b:
3173 case Intrinsic::x86_avx512_pshuf_b_512:
3174 // PERMILVAR
3175 case Intrinsic::x86_avx_vpermilvar_ps:
3176 case Intrinsic::x86_avx_vpermilvar_ps_256:
3177 case Intrinsic::x86_avx512_vpermilvar_ps_512:
3178 case Intrinsic::x86_avx_vpermilvar_pd:
3179 case Intrinsic::x86_avx_vpermilvar_pd_256:
3180 case Intrinsic::x86_avx512_vpermilvar_pd_512:
3181 // PERMV
3182 case Intrinsic::x86_avx2_permd:
3183 case Intrinsic::x86_avx2_permps: {
3184 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3185 break;
3186 }
3187
3188 // SSE4A instructions leave the upper 64-bits of the 128-bit result
3189 // in an undefined state.
3190 case Intrinsic::x86_sse4a_extrq:
3191 case Intrinsic::x86_sse4a_extrqi:
3192 case Intrinsic::x86_sse4a_insertq:
3193 case Intrinsic::x86_sse4a_insertqi:
3194 UndefElts.setHighBits(VWidth / 2);
3195 break;
3196 }
3197 return std::nullopt;
3198}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
This file provides the interface for the instcombine pass implementation.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static Value * simplifyTernarylogic(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Instruction * simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC)
static Value * simplifyX86immShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Value * simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, APInt APLength, APInt APIndex, InstCombiner::BuilderTy &Builder)
Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant folding or conversion to a shu...
static Value * simplifyX86addcarry(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Value * simplifyX86pack(IntrinsicInst &II, InstCombiner::BuilderTy &Builder, bool IsSigned)
static Constant * getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL)
Return a constant boolean vector that has true elements in all positions where the input constant dat...
static Value * simplifyX86pshufb(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert pshufb* to shufflevector if the mask is constant.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC)
static Value * simplifyX86vpermilvar(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value * simplifyX86movmsk(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Value * simplifyX86vpermv(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value * simplifyX86pmadd(IntrinsicInst &II, InstCombiner::BuilderTy &Builder, bool IsPMADDWD)
static Value * simplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
static Value * simplifyX86extrq(IntrinsicInst &II, Value *Op0, ConstantInt *CILength, ConstantInt *CIIndex, InstCombiner::BuilderTy &Builder)
Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding or conversion to a shuffle...
static Value * getBoolVecFromMask(Value *Mask, const DataLayout &DL)
Convert the x86 XMM integer vector mask to a vector of bools based on each element's most significant...
static Value * simplifyX86varShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder)
Value * RHS
Value * LHS
This file a TargetTransformInfo::Concept conforming object specific to the X86 target machine.
support::ulittle16_t & Lo
Definition: aarch32.cpp:206
support::ulittle16_t & Hi
Definition: aarch32.cpp:205
Class for arbitrary precision integers.
Definition: APInt.h:77
APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition: APInt.cpp:613
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1386
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1499
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1371
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1365
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1309
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:237
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:359
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1090
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:188
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:198
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:852
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1236
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:285
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:179
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:218
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:837
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:830
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1200
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
This class represents a no-op cast from one type to another.
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
All zero aggregate value.
Definition: Constants.h:351
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1650
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2245
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:146
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:400
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
Value * CreateFSub(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1558
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2470
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:509
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2521
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1612
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2458
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:537
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2263
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2037
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1531
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1192
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2514
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:578
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1090
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1435
Value * CreateNot(Value *V, const Twine &Name="")
Definition: IRBuilder.h:1747
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition: IRBuilder.h:2552
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2125
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1414
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2019
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2492
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1473
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:598
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1325
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2005
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1495
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1664
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2271
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2159
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:178
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2410
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1454
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1517
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1585
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:514
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1359
The core instruction combiner logic.
Definition: InstCombiner.h:47
const DataLayout & getDataLayout() const
Definition: InstCombiner.h:341
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:386
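A hedged sketch of the calling convention around replaceInstUsesWith (the fold itself is hypothetical and not one performed by this file): returning the Instruction it hands back signals a successful combine, while returning nullptr declines.
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;

// Hypothetical fold: an intrinsic that is idempotent when both vector
// operands are the same value (e.g. a lane-wise maximum of X and X).
static Instruction *foldSelfOperand(InstCombiner &IC, IntrinsicInst &II) {
  Value *Op0 = II.getArgOperand(0);
  if (Op0 == II.getArgOperand(1))
    return IC.replaceInstUsesWith(II, Op0);
  return nullptr;
}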
virtual Value * SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, unsigned Depth=0, bool AllowMultipleUsers=false)=0
static Value * peekThroughBitcast(Value *V, bool OneUseOnly=false)
Return the source operand of a potentially bitcasted value while optionally checking if it has one use.
Definition: InstCombiner.h:113
void addToWorklist(Instruction *I)
Definition: InstCombiner.h:336
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
Definition: InstCombiner.h:410
BuilderTy & Builder
Definition: InstCombiner.h:60
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1814
static SelectInst * Create(Value *C, Value *S1, Value *S2, const Twine &NameStr="", InsertPosition InsertBefore=nullptr, Instruction *MDFrom=nullptr)
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1795
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
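A worked example (values chosen for illustration): widening a 4-bit lane mask to 8 bits splats each bit across the two lanes it now covers, so 0b1010 becomes 0b11001100.
#include "llvm/ADT/APInt.h"
using namespace llvm;

// Hypothetical helper: widen a 4-lane mask to an 8-lane mask.
static APInt widenLaneMask() {
  // APIntOps::ScaleBitMask(0b1010, 8) == 0b11001100.
  return APIntOps::ScaleBitMask(APInt(4, 0b1010), /*NewBitWidth=*/8);
}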
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1484
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
match_combine_and< class_match< Constant >, match_unless< constantexpr_match > > m_ImmConstant()
Match an arbitrary immediate Constant and ignore it.
Definition: PatternMatch.h:854
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
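A small sketch of the PatternMatch idiom (the patterns shown are generic examples, not the specific ones this file matches): match returns true on success and binds any m_Value placeholders.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Hypothetical helper: recognize "sext X" or "X + 0" and bind X.
static bool matchWidenedOrTrivialAdd(Value *V, Value *&X) {
  return match(V, m_SExt(m_Value(X))) ||
         match(V, m_Add(m_Value(X), m_ZeroInt()));
}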
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
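A hedged sketch (the helper name is an assumption): folding an integer compare of two constants; the call returns nullptr when the operands do not fold.
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Hypothetical helper: fold "icmp ne C, 0" to an i1 (or bool-vector) constant.
static Constant *foldIsNonZero(Constant *C, const DataLayout &DL) {
  return ConstantFoldCompareInstOperands(
      CmpInst::ICMP_NE, C, Constant::getNullValue(C->getType()), DL);
}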
@ Or
Bitwise or logical OR of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
DWARFExpression::Operation Op
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:76
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:134
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition: KnownBits.h:118
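Closing sketch tying computeKnownBits to the KnownBits accessors above (the helper is hypothetical and assumes V has integer type): getMaxValue and getMinValue bound the unsigned range consistent with the known bits.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Hypothetical helper: is V provably an unsigned value that fits in a byte?
static bool fitsInByte(const Value *V, const DataLayout &DL) {
  KnownBits Known(V->getType()->getScalarSizeInBits());
  computeKnownBits(V, Known, DL);
  return Known.getMaxValue().ule(255);
}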