LLVM 23.0.0git
AMDGPUInstCombineIntrinsic.cpp File Reference
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
#include "AMDGPUGenSearchableTables.inc"

Go to the source code of this file.

Macros

#define DEBUG_TYPE   "AMDGPUtti"
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL

Functions

static APFloat fmed3AMDGCN (const APFloat &Src0, const APFloat &Src1, const APFloat &Src2)
static bool canSafelyConvertTo16Bit (Value &V, bool IsFloat)
static Value * convertTo16Bit (Value &V, InstCombiner::BuilderTy &Builder)
static std::optional< Instruction * > modifyIntrinsicCall (IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, InstCombiner &IC, std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func)
 Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with modified arguments (based on OldIntr) and replaces InstToReplace with this newly created intrinsic call.
static std::optional< Instruction * > simplifyAMDGCNImageIntrinsic (const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC)
static Value * matchFPExtFromF16 (Value *Arg)
 Match an fpext from half to float, or a constant we can convert.
static APInt trimTrailingZerosInVector (InstCombiner &IC, Value *UseV, Instruction *I)
static APInt defaultComponentBroadcast (Value *V)
static Value * simplifyAMDGCNMemoryIntrinsicDemanded (InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, int DMaskIdx, bool IsLoad)
 Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
static bool canContractSqrtToRsq (const FPMathOperator *SqrtOp)
 Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool isTriviallyUniform (const Use &U)
 Return true if we can easily prove that use U is uniform.
static CallInst * rewriteCall (IRBuilderBase &B, CallInst &Old, Function &NewCallee, ArrayRef< Value * > Ops)
static bool isThreadID (const GCNSubtarget &ST, Value *V)
static std::optional< unsigned > evalLaneExpr (Value *V, unsigned Lane, const GCNSubtarget &ST, const DataLayout &DL, unsigned Depth=0)
 Evaluate V as a function of the lane ID and return its value on Lane, or std::nullopt if V is not a closed-form expression of the lane ID.
static bool tryBuildShuffleMap (Value *Index, const GCNSubtarget &ST, SmallVectorImpl< uint8_t > &Ids, const DataLayout &DL)
 Build the per-lane shuffle map by evaluating Index for every lane in the wave.
template<unsigned Period>
static bool hasPeriodicLayout (ArrayRef< uint8_t > Ids)
 Lanes are partitioned into groups of Period; each group is a translated copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
template<unsigned N>
static bool isRowPattern (ArrayRef< uint8_t > Ids)
 Match an N-lane row pattern: each lane in [0, N) reads from a source lane in the same N-lane row, and the pattern repeats periodically across rows.
static std::optional< unsigned > matchQuadPermPattern (ArrayRef< uint8_t > Ids)
 Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2], [7:6]=Ids[3].
template<unsigned N>
static bool matchMirrorPattern (ArrayRef< uint8_t > Ids)
 Match an N-lane reversal (mirror) pattern.
static std::optional< unsigned > matchRowRotatePattern (ArrayRef< uint8_t > Ids)
 Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
static std::optional< unsigned > matchRowSharePattern (ArrayRef< uint8_t > Ids)
 Match a row-share pattern: all 16 lanes of each row read the same source lane.
static std::optional< unsigned > matchRowXMaskPattern (ArrayRef< uint8_t > Ids)
 Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J, with Mask in [1, 15].
static std::optional< unsigned > matchHalfRowPermPattern (ArrayRef< uint8_t > Ids)
 Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8 24-bit selector (three bits per output lane).
static uint64_t computePermlane16Masks (ArrayRef< uint8_t > Ids)
 Pack a 16-lane permutation into a single 64-bit value: four bits per output lane, lane J in bits [J*4 + 3 : J*4].
static bool matchHalfWaveSwapPattern (ArrayRef< uint8_t > Ids)
 Match a half-wave swap: lane J reads from lane J ^ 32.
static bool isCrossRowPattern (ArrayRef< uint8_t > Ids)
 Match a cross-row permutation suitable for v_permlanex16: every lane in the low 16-lane half reads from the high half of its own row, and vice versa.
static std::optional< unsigned > matchDsSwizzleBitmaskPattern (ArrayRef< uint8_t > Ids)
 Match a DS_SWIZZLE bitmask-mode permutation: dst_lane = ((src_lane & AND) | OR) ^ XOR with each mask being five bits.
static Value * createUpdateDpp (IRBuilderBase &B, Value *Val, unsigned Ctrl)
 Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can be folded into a consuming VALU op by GCNDPPCombine.
static Value * createMovDpp8 (IRBuilderBase &B, Value *Val, unsigned Selector)
 Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
static Value * createPermlane16 (IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
 Emit v_permlane16 with the precomputed lane-select halves.
static Value * createPermlaneX16 (IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
 Emit v_permlanex16 with the precomputed lane-select halves.
static Value * createDsSwizzle (IRBuilderBase &B, Value *Val, unsigned Offset, const DataLayout &DL)
 Emit ds_swizzle with the given immediate, bitcasting/converting between pointer/float types and i32 as required by the intrinsic signature.
static Value * createPermlane64 (IRBuilderBase &B, Value *Val)
 Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
static Value * matchShuffleToHWIntrinsic (IRBuilderBase &B, Value *Src, ArrayRef< uint8_t > Ids, const GCNSubtarget &ST, const DataLayout &DL)
 Given a shuffle map, try to emit the best hardware intrinsic.
static std::optional< Instruction * > tryOptimizeShufflePattern (InstCombiner &IC, IntrinsicInst &II, const GCNSubtarget &ST)
 Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant function of the lane ID into a hardware-specific lane permutation intrinsic.

Variables

static constexpr auto isQuadPattern = isRowPattern<4>
static constexpr auto isHalfRowPattern = isRowPattern<8>
static constexpr auto isFullRowPattern = isRowPattern<16>
static constexpr auto matchHalfRowMirrorPattern = matchMirrorPattern<8>
static constexpr auto matchFullRowMirrorPattern = matchMirrorPattern<16>

Macro Definition Documentation

◆ DEBUG_TYPE

#define DEBUG_TYPE   "AMDGPUtti"

Definition at line 36 of file AMDGPUInstCombineIntrinsic.cpp.

◆ GET_AMDGPUImageDMaskIntrinsicTable_IMPL

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL

Definition at line 44 of file AMDGPUInstCombineIntrinsic.cpp.

Function Documentation

◆ canContractSqrtToRsq()

bool canContractSqrtToRsq ( const FPMathOperator * SqrtOp)
static

◆ canSafelyConvertTo16Bit()

◆ computePermlane16Masks()

uint64_t computePermlane16Masks ( ArrayRef< uint8_t > Ids)
static

Pack a 16-lane permutation into a single 64-bit value: four bits per output lane, lane J in bits [J*4 + 3 : J*4].

The caller splits it into the low and high 32-bit selector operands of v_permlane16 / v_permlanex16.

Definition at line 812 of file AMDGPUInstCombineIntrinsic.cpp.

Referenced by matchShuffleToHWIntrinsic().

◆ convertTo16Bit()

Value * convertTo16Bit ( Value & V,
InstCombiner::BuilderTy & Builder )
static

◆ createDsSwizzle()

Value * createDsSwizzle ( IRBuilderBase & B,
Value * Val,
unsigned Offset,
const DataLayout & DL )
static

Emit ds_swizzle with the given immediate, bitcasting/converting between pointer/float types and i32 as required by the intrinsic signature.

Definition at line 922 of file AMDGPUInstCombineIntrinsic.cpp.

References assert(), B(), DL, llvm::Value::getType(), llvm::Type::isPointerTy(), and llvm::Offset.

Referenced by matchShuffleToHWIntrinsic().

◆ createMovDpp8()

Value * createMovDpp8 ( IRBuilderBase & B,
Value * Val,
unsigned Selector )
static

Emit v_mov_b32_dpp8 with the given 24-bit lane selector.

Definition at line 896 of file AMDGPUInstCombineIntrinsic.cpp.

References B(), and llvm::Value::getType().

Referenced by matchShuffleToHWIntrinsic().

◆ createPermlane16()

Value * createPermlane16 ( IRBuilderBase & B,
Value * Val,
uint32_t Lo,
uint32_t Hi )
static

Emit v_permlane16 with the precomputed lane-select halves.

Definition at line 902 of file AMDGPUInstCombineIntrinsic.cpp.

References B(), llvm::PoisonValue::get(), llvm::Value::getType(), llvm::Hi, and llvm::Lo.

Referenced by matchShuffleToHWIntrinsic().

◆ createPermlane64()

Value * createPermlane64 ( IRBuilderBase & B,
Value * Val )
static

Emit v_permlane64 (swap of the two 32-lane halves of a wave64).

Definition at line 943 of file AMDGPUInstCombineIntrinsic.cpp.

References B(), and llvm::Value::getType().

Referenced by matchShuffleToHWIntrinsic().

◆ createPermlaneX16()

Value * createPermlaneX16 ( IRBuilderBase & B,
Value * Val,
uint32_t Lo,
uint32_t Hi )
static

Emit v_permlanex16 with the precomputed lane-select halves.

Each output lane reads from the other 16-lane half of the same row.

Definition at line 912 of file AMDGPUInstCombineIntrinsic.cpp.

References B(), llvm::PoisonValue::get(), llvm::Value::getType(), llvm::Hi, and llvm::Lo.

Referenced by matchShuffleToHWIntrinsic().

◆ createUpdateDpp()

Value * createUpdateDpp ( IRBuilderBase & B,
Value * Val,
unsigned Ctrl )
static

Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can be folded into a consuming VALU op by GCNDPPCombine.

Definition at line 888 of file AMDGPUInstCombineIntrinsic.cpp.

References B(), llvm::PoisonValue::get(), and llvm::Value::getType().

Referenced by matchShuffleToHWIntrinsic().

◆ defaultComponentBroadcast()

◆ evalLaneExpr()

std::optional< unsigned > evalLaneExpr ( Value * V,
unsigned Lane,
const GCNSubtarget & ST,
const DataLayout & DL,
unsigned Depth = 0 )
static

Evaluate V as a function of the lane ID and return its value on Lane, or std::nullopt if V is not a closed-form expression of the lane ID.

Definition at line 666 of file AMDGPUInstCombineIntrinsic.cpp.

References AbstractManglingParser< Derived, Alloc >::Ops, llvm::ConstantFoldInstOperands(), llvm::Depth, DL, llvm::dyn_cast(), llvm::dyn_cast_or_null(), evalLaneExpr(), llvm::User::getOperand(), llvm::Value::getType(), llvm::isa(), isThreadID(), LHS, llvm::MaxAnalysisRecursionDepth, and RHS.

Referenced by evalLaneExpr(), tryBuildShuffleMap(), and tryOptimizeShufflePattern().

◆ fmed3AMDGCN()

◆ hasPeriodicLayout()

template<unsigned Period>
bool hasPeriodicLayout ( ArrayRef< uint8_t > Ids)
static

Lanes are partitioned into groups of Period; each group is a translated copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).

Definition at line 723 of file AMDGPUInstCombineIntrinsic.cpp.

References E(), I, llvm::isPowerOf2_32(), and llvm::ArrayRef< T >::size().

Referenced by isCrossRowPattern(), isRowPattern(), and matchDsSwizzleBitmaskPattern().

◆ isCrossRowPattern()

bool isCrossRowPattern ( ArrayRef< uint8_t > Ids)
static

Match a cross-row permutation suitable for v_permlanex16: every lane in the low 16-lane half reads from the high half of its own row, and vice versa.

Definition at line 833 of file AMDGPUInstCombineIntrinsic.cpp.

References hasPeriodicLayout().

Referenced by matchShuffleToHWIntrinsic().

◆ isRowPattern()

template<unsigned N>
bool isRowPattern ( ArrayRef< uint8_t > Ids)
static

Match an N-lane row pattern: each lane in [0, N) reads from a source lane in the same N-lane row, and the pattern repeats periodically across rows.

Definition at line 733 of file AMDGPUInstCombineIntrinsic.cpp.

References hasPeriodicLayout(), I, and N.

Referenced by matchMirrorPattern().

◆ isThreadID()

◆ isTriviallyUniform()

bool isTriviallyUniform ( const Use & U)
static

◆ matchDsSwizzleBitmaskPattern()

std::optional< unsigned > matchDsSwizzleBitmaskPattern ( ArrayRef< uint8_t > Ids)
static

Match a DS_SWIZZLE bitmask-mode permutation: dst_lane = ((src_lane & AND) | OR) ^ XOR with each mask being five bits.

Returns the encoded swizzle immediate. The hardware applies the formula independently within each 32-lane group, so on wave64 the high group must replicate the low one (translated by 32).

Definition at line 851 of file AMDGPUInstCombineIntrinsic.cpp.

References B(), llvm::AMDGPU::Swizzle::BITMASK_AND_SHIFT, llvm::AMDGPU::Swizzle::BITMASK_OR_SHIFT, llvm::AMDGPU::Swizzle::BITMASK_PERM_ENC, llvm::AMDGPU::Swizzle::BITMASK_XOR_SHIFT, hasPeriodicLayout(), I, and llvm::seq().

Referenced by matchShuffleToHWIntrinsic().

◆ matchFPExtFromF16()

◆ matchHalfRowPermPattern()

std::optional< unsigned > matchHalfRowPermPattern ( ArrayRef< uint8_t > Ids)
static

Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8 24-bit selector (three bits per output lane).

Definition at line 800 of file AMDGPUInstCombineIntrinsic.cpp.

References isHalfRowPattern.

Referenced by matchShuffleToHWIntrinsic().

◆ matchHalfWaveSwapPattern()

bool matchHalfWaveSwapPattern ( ArrayRef< uint8_t > Ids)
static

Match a half-wave swap: lane J reads from lane J ^ 32.

Only meaningful on wave64 targets.

Definition at line 821 of file AMDGPUInstCombineIntrinsic.cpp.

References llvm::ArrayRef< T >::size().

Referenced by matchShuffleToHWIntrinsic().

◆ matchMirrorPattern()

template<unsigned N>
bool matchMirrorPattern ( ArrayRef< uint8_t > Ids)
static

Match an N-lane reversal (mirror) pattern.

Definition at line 754 of file AMDGPUInstCombineIntrinsic.cpp.

References isRowPattern(), and N.

◆ matchQuadPermPattern()

std::optional< unsigned > matchQuadPermPattern ( ArrayRef< uint8_t > Ids)
static

Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2], [7:6]=Ids[3].

Definition at line 747 of file AMDGPUInstCombineIntrinsic.cpp.

References isQuadPattern.

Referenced by matchShuffleToHWIntrinsic().

◆ matchRowRotatePattern()

std::optional< unsigned > matchRowRotatePattern ( ArrayRef< uint8_t > Ids)
static

Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].

Definition at line 767 of file AMDGPUInstCombineIntrinsic.cpp.

References isFullRowPattern.

Referenced by matchShuffleToHWIntrinsic().

◆ matchRowSharePattern()

std::optional< unsigned > matchRowSharePattern ( ArrayRef< uint8_t > Ids)
static

Match a row-share pattern: all 16 lanes of each row read the same source lane.

Returns the shared source lane index in [0, 16).

Definition at line 778 of file AMDGPUInstCombineIntrinsic.cpp.

References llvm::all_equal(), isFullRowPattern, and llvm::ArrayRef< T >::take_front().

Referenced by matchShuffleToHWIntrinsic().

◆ matchRowXMaskPattern()

std::optional< unsigned > matchRowXMaskPattern ( ArrayRef< uint8_t > Ids)
static

Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J, with Mask in [1, 15].

Definition at line 788 of file AMDGPUInstCombineIntrinsic.cpp.

References isFullRowPattern.

Referenced by matchShuffleToHWIntrinsic().

◆ matchShuffleToHWIntrinsic()

◆ modifyIntrinsicCall()

std::optional< Instruction * > modifyIntrinsicCall ( IntrinsicInst & OldIntr,
Instruction & InstToReplace,
unsigned NewIntr,
InstCombiner & IC,
std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func )
static

◆ rewriteCall()

◆ simplifyAMDGCNImageIntrinsic()

std::optional< Instruction * > simplifyAMDGCNImageIntrinsic ( const GCNSubtarget * ST,
const AMDGPU::ImageDimIntrinsicInfo * ImageDimIntr,
IntrinsicInst & II,
InstCombiner & IC )
static

Definition at line 161 of file AMDGPUInstCombineIntrinsic.cpp.

References assert(), llvm::AMDGPU::ImageDimIntrinsicInfo::BaseOpcode, llvm::AMDGPU::ImageDimIntrinsicInfo::BiasIndex, canSafelyConvertTo16Bit(), llvm::cast(), llvm::AMDGPU::ImageDimIntrinsicInfo::CoordStart, llvm::AMDGPU::ImageDimIntrinsicInfo::Dim, llvm::dyn_cast(), llvm::SmallVectorImpl< T >::emplace_back(), llvm::SmallVectorTemplateCommon< T, typename >::empty(), llvm::InstCombiner::eraseInstFromFunction(), for(), llvm::Type::getHalfTy(), llvm::AMDGPU::getImageDimIntrinsicByBaseOpcode(), llvm::Type::getInt16Ty(), llvm::AMDGPU::getMIMGBaseOpcodeInfo(), llvm::AMDGPU::getMIMGBiasMappingInfo(), llvm::AMDGPU::getMIMGLZMappingInfo(), llvm::AMDGPU::getMIMGMIPMappingInfo(), llvm::AMDGPU::getMIMGOffsetMappingInfo(), llvm::Intrinsic::getOrInsertDeclaration(), llvm::Type::getScalarType(), llvm::Value::getType(), llvm::Type::getWithNewType(), llvm::AMDGPU::ImageDimIntrinsicInfo::GradientStart, llvm::AMDGPU::MIMGBaseOpcodeInfo::HasD16, if(), II, llvm::AMDGPU::ImageDimIntrinsicInfo::Intr, llvm::Type::isFloatingPointTy(), llvm::Type::isHalfTy(), llvm::Intrinsic::isSignatureValid(), llvm::AMDGPU::ImageDimIntrinsicInfo::LodIndex, llvm::AMDGPU::ImageDimIntrinsicInfo::MipIndex, modifyIntrinsicCall(), llvm::AMDGPU::ImageDimIntrinsicInfo::NumBiasArgs, llvm::AMDGPU::ImageDimIntrinsicInfo::OffsetIndex, llvm::AMDGPU::MIMGBaseOpcodeInfo::Sampler, llvm::Value::takeName(), and llvm::AMDGPU::ImageDimIntrinsicInfo::VAddrEnd.

Referenced by llvm::GCNTTIImpl::instCombineIntrinsic().

◆ simplifyAMDGCNMemoryIntrinsicDemanded()

◆ trimTrailingZerosInVector()

◆ tryBuildShuffleMap()

bool tryBuildShuffleMap ( Value * Index,
const GCNSubtarget & ST,
SmallVectorImpl< uint8_t > & Ids,
const DataLayout & DL )
static

Build the per-lane shuffle map by evaluating Index for every lane in the wave.

Returns false if any lane index is non-constant or out of range.

Definition at line 706 of file AMDGPUInstCombineIntrinsic.cpp.

References DL, evalLaneExpr(), llvm::SmallVectorImpl< T >::resize(), and llvm::seq().

Referenced by tryOptimizeShufflePattern().

◆ tryOptimizeShufflePattern()

std::optional< Instruction * > tryOptimizeShufflePattern ( InstCombiner & IC,
IntrinsicInst & II,
const GCNSubtarget & ST )
static

Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant function of the lane ID into a hardware-specific lane permutation intrinsic.

Definition at line 1016 of file AMDGPUInstCombineIntrinsic.cpp.

References llvm::InstCombiner::Builder, DL, evalLaneExpr(), llvm::InstCombiner::getDataLayout(), II, matchShuffleToHWIntrinsic(), llvm::InstCombiner::replaceInstUsesWith(), llvm::SmallVectorImpl< T >::resize(), llvm::seq(), and tryBuildShuffleMap().

Referenced by llvm::GCNTTIImpl::instCombineIntrinsic().

Variable Documentation

◆ isFullRowPattern

auto isFullRowPattern = isRowPattern<16>
static constexpr

◆ isHalfRowPattern

auto isHalfRowPattern = isRowPattern<8>
static constexpr

Definition at line 741 of file AMDGPUInstCombineIntrinsic.cpp.

Referenced by matchHalfRowPermPattern().

◆ isQuadPattern

auto isQuadPattern = isRowPattern<4>
static constexpr

Definition at line 740 of file AMDGPUInstCombineIntrinsic.cpp.

Referenced by matchQuadPermPattern().

◆ matchFullRowMirrorPattern

auto matchFullRowMirrorPattern = matchMirrorPattern<16>
static constexpr

Definition at line 764 of file AMDGPUInstCombineIntrinsic.cpp.

Referenced by matchShuffleToHWIntrinsic().

◆ matchHalfRowMirrorPattern

auto matchHalfRowMirrorPattern = matchMirrorPattern<8>
static constexpr

Definition at line 763 of file AMDGPUInstCombineIntrinsic.cpp.

Referenced by matchShuffleToHWIntrinsic().