| File: | build/source/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |
| Warning: | line 3340, column 62: The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int' |
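The diagnostic refers to line 3340 of the file, which lies outside the excerpt reproduced below, so the flagged expression itself is not shown here. As a minimal, hypothetical sketch of the class of bug being reported (the function names below are illustrative and not taken from the file): in C++, shifting a 32-bit unsigned value by 32 or more positions is undefined behavior, and the usual fix is to guard the shift count before shifting.

    // Hypothetical illustration only; not the code at line 3340.
    // Undefined behavior when NumLoBits == 32: the shift count equals the
    // width of 'unsigned int'.
    unsigned highBitsUnsafe(unsigned Value, unsigned NumLoBits) {
      return Value >> NumLoBits;
    }

    // Guarded version: treat a full-width shift as producing zero.
    unsigned highBitsSafe(unsigned Value, unsigned NumLoBits) {
      return NumLoBits >= 32 ? 0u : Value >> NumLoBits;
    }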
| 1 | //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// | ||||
| 2 | // | ||||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
| 6 | // | ||||
| 7 | //===----------------------------------------------------------------------===// | ||||
| 8 | /// \file | ||||
| 9 | /// This file implements the targeting of the MachineLegalizer class for | ||||
| 10 | /// AMDGPU. | ||||
| 11 | /// \todo This should be generated by TableGen. | ||||
| 12 | //===----------------------------------------------------------------------===// | ||||
| 13 | |||||
| 14 | #include "AMDGPULegalizerInfo.h" | ||||
| 15 | |||||
| 16 | #include "AMDGPU.h" | ||||
| 17 | #include "AMDGPUGlobalISelUtils.h" | ||||
| 18 | #include "AMDGPUInstrInfo.h" | ||||
| 19 | #include "AMDGPUTargetMachine.h" | ||||
| 20 | #include "SIMachineFunctionInfo.h" | ||||
| 21 | #include "Utils/AMDGPUBaseInfo.h" | ||||
| 22 | #include "llvm/ADT/ScopeExit.h" | ||||
| 23 | #include "llvm/BinaryFormat/ELF.h" | ||||
| 24 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" | ||||
| 25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | ||||
| 26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" | ||||
| 27 | #include "llvm/IR/DiagnosticInfo.h" | ||||
| 28 | #include "llvm/IR/IntrinsicsAMDGPU.h" | ||||
| 29 | #include "llvm/IR/IntrinsicsR600.h" | ||||
| 30 | |||||
| 31 | #define DEBUG_TYPE "amdgpu-legalinfo" | ||||
| 32 | |||||
| 33 | using namespace llvm; | ||||
| 34 | using namespace LegalizeActions; | ||||
| 35 | using namespace LegalizeMutations; | ||||
| 36 | using namespace LegalityPredicates; | ||||
| 37 | using namespace MIPatternMatch; | ||||
| 38 | |||||
| 39 | // Hack until load/store selection patterns support any tuple of legal types. | ||||
| 40 | static cl::opt<bool> EnableNewLegality( | ||||
| 41 | "amdgpu-global-isel-new-legality", | ||||
| 42 | cl::desc("Use GlobalISel desired legality, rather than try to use " | ||||
| 43 | "rules compatible with selection patterns"), | ||||
| 44 | cl::init(false), | ||||
| 45 | cl::ReallyHidden); | ||||
| 46 | |||||
| 47 | static constexpr unsigned MaxRegisterSize = 1024; | ||||
| 48 | |||||
| 49 | // Round the number of elements to the next power of two elements | ||||
| 50 | static LLT getPow2VectorType(LLT Ty) { | ||||
| 51 | unsigned NElts = Ty.getNumElements(); | ||||
| 52 | unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); | ||||
| 53 | return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); | ||||
| 54 | } | ||||
| 55 | |||||
| 56 | // Round the number of bits to the next power of two bits | ||||
| 57 | static LLT getPow2ScalarType(LLT Ty) { | ||||
| 58 | unsigned Bits = Ty.getSizeInBits(); | ||||
| 59 | unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); | ||||
| 60 | return LLT::scalar(Pow2Bits); | ||||
| 61 | } | ||||
| 62 | |||||
| 63 | /// \returns true if this is an odd-sized vector that should be widened by adding an | ||||
| 64 | /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This | ||||
| 65 | /// excludes s1 vectors, which should always be scalarized. | ||||
| 66 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { | ||||
| 67 | return [=](const LegalityQuery &Query) { | ||||
| 68 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 69 | if (!Ty.isVector()) | ||||
| 70 | return false; | ||||
| 71 | |||||
| 72 | const LLT EltTy = Ty.getElementType(); | ||||
| 73 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||
| 74 | return Ty.getNumElements() % 2 != 0 && | ||||
| 75 | EltSize > 1 && EltSize < 32 && | ||||
| 76 | Ty.getSizeInBits() % 32 != 0; | ||||
| 77 | }; | ||||
| 78 | } | ||||
| 79 | |||||
| 80 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { | ||||
| 81 | return [=](const LegalityQuery &Query) { | ||||
| 82 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 83 | return Ty.getSizeInBits() % 32 == 0; | ||||
| 84 | }; | ||||
| 85 | } | ||||
| 86 | |||||
| 87 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { | ||||
| 88 | return [=](const LegalityQuery &Query) { | ||||
| 89 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 90 | const LLT EltTy = Ty.getScalarType(); | ||||
| 91 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; | ||||
| 92 | }; | ||||
| 93 | } | ||||
| 94 | |||||
| 95 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { | ||||
| 96 | return [=](const LegalityQuery &Query) { | ||||
| 97 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 98 | const LLT EltTy = Ty.getElementType(); | ||||
| 99 | return std::pair(TypeIdx, | ||||
| 100 | LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); | ||||
| 101 | }; | ||||
| 102 | } | ||||
| 103 | |||||
| 104 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { | ||||
| 105 | return [=](const LegalityQuery &Query) { | ||||
| 106 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 107 | const LLT EltTy = Ty.getElementType(); | ||||
| 108 | unsigned Size = Ty.getSizeInBits(); | ||||
| 109 | unsigned Pieces = (Size + 63) / 64; | ||||
| 110 | unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; | ||||
| 111 | return std::pair(TypeIdx, LLT::scalarOrVector( | ||||
| 112 | ElementCount::getFixed(NewNumElts), EltTy)); | ||||
| 113 | }; | ||||
| 114 | } | ||||
| 115 | |||||
| 116 | // Increase the number of vector elements so the total size reaches the next | ||||
| 117 | // multiple of 32 bits. | ||||
| 118 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { | ||||
| 119 | return [=](const LegalityQuery &Query) { | ||||
| 120 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 121 | |||||
| 122 | const LLT EltTy = Ty.getElementType(); | ||||
| 123 | const int Size = Ty.getSizeInBits(); | ||||
| 124 | const int EltSize = EltTy.getSizeInBits(); | ||||
| 125 | const int NextMul32 = (Size + 31) / 32; | ||||
| 126 | |||||
| 127 | assert(EltSize < 32); | ||||
| 128 | |||||
| 129 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; | ||||
| 130 | return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); | ||||
| 131 | }; | ||||
| 132 | } | ||||
| 133 | |||||
| 134 | // Increase the number of vector elements to reach the next legal RegClass. | ||||
| 135 | static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { | ||||
| 136 | return [=](const LegalityQuery &Query) { | ||||
| 137 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 138 | const unsigned NumElts = Ty.getNumElements(); | ||||
| 139 | const unsigned EltSize = Ty.getElementType().getSizeInBits(); | ||||
| 140 | const unsigned MaxNumElts = MaxRegisterSize / EltSize; | ||||
| 141 | |||||
| 142 | assert(EltSize == 32 || EltSize == 64); | ||||
| 143 | assert(Ty.getSizeInBits() < MaxRegisterSize); | ||||
| 144 | |||||
| 145 | unsigned NewNumElts; | ||||
| 146 | // Find the nearest legal RegClass that is larger than the current type. | ||||
| 147 | for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) { | ||||
| 148 | if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) | ||||
| 149 | break; | ||||
| 150 | } | ||||
| 151 | |||||
| 152 | return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); | ||||
| 153 | }; | ||||
| 154 | } | ||||
| 155 | |||||
| 156 | static LLT getBitcastRegisterType(const LLT Ty) { | ||||
| 157 | const unsigned Size = Ty.getSizeInBits(); | ||||
| 158 | |||||
| 159 | if (Size <= 32) { | ||||
| 160 | // <2 x s8> -> s16 | ||||
| 161 | // <4 x s8> -> s32 | ||||
| 162 | return LLT::scalar(Size); | ||||
| 163 | } | ||||
| 164 | |||||
| 165 | return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); | ||||
| 166 | } | ||||
| 167 | |||||
| 168 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { | ||||
| 169 | return [=](const LegalityQuery &Query) { | ||||
| 170 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 171 | return std::pair(TypeIdx, getBitcastRegisterType(Ty)); | ||||
| 172 | }; | ||||
| 173 | } | ||||
| 174 | |||||
| 175 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { | ||||
| 176 | return [=](const LegalityQuery &Query) { | ||||
| 177 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 178 | unsigned Size = Ty.getSizeInBits(); | ||||
| 179 | assert(Size % 32 == 0); | ||||
| 180 | return std::pair( | ||||
| 181 | TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); | ||||
| 182 | }; | ||||
| 183 | } | ||||
| 184 | |||||
| 185 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { | ||||
| 186 | return [=](const LegalityQuery &Query) { | ||||
| 187 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
| 188 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; | ||||
| 189 | }; | ||||
| 190 | } | ||||
| 191 | |||||
| 192 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { | ||||
| 193 | return [=](const LegalityQuery &Query) { | ||||
| 194 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
| 195 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; | ||||
| 196 | }; | ||||
| 197 | } | ||||
| 198 | |||||
| 199 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { | ||||
| 200 | return [=](const LegalityQuery &Query) { | ||||
| 201 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
| 202 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; | ||||
| 203 | }; | ||||
| 204 | } | ||||
| 205 | |||||
| 206 | static bool isRegisterSize(unsigned Size) { | ||||
| 207 | return Size % 32 == 0 && Size <= MaxRegisterSize; | ||||
| 208 | } | ||||
| 209 | |||||
| 210 | static bool isRegisterVectorElementType(LLT EltTy) { | ||||
| 211 | const int EltSize = EltTy.getSizeInBits(); | ||||
| 212 | return EltSize == 16 || EltSize % 32 == 0; | ||||
| 213 | } | ||||
| 214 | |||||
| 215 | static bool isRegisterVectorType(LLT Ty) { | ||||
| 216 | const int EltSize = Ty.getElementType().getSizeInBits(); | ||||
| 217 | return EltSize == 32 || EltSize == 64 || | ||||
| 218 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || | ||||
| 219 | EltSize == 128 || EltSize == 256; | ||||
| 220 | } | ||||
| 221 | |||||
| 222 | static bool isRegisterType(LLT Ty) { | ||||
| 223 | if (!isRegisterSize(Ty.getSizeInBits())) | ||||
| 224 | return false; | ||||
| 225 | |||||
| 226 | if (Ty.isVector()) | ||||
| 227 | return isRegisterVectorType(Ty); | ||||
| 228 | |||||
| 229 | return true; | ||||
| 230 | } | ||||
| 231 | |||||
| 232 | // Any combination of 32 or 64-bit elements up to the maximum register size, and | ||||
| 233 | // multiples of v2s16. | ||||
| 234 | static LegalityPredicate isRegisterType(unsigned TypeIdx) { | ||||
| 235 | return [=](const LegalityQuery &Query) { | ||||
| 236 | return isRegisterType(Query.Types[TypeIdx]); | ||||
| 237 | }; | ||||
| 238 | } | ||||
| 239 | |||||
| 240 | // RegisterType that doesn't have a corresponding RegClass. | ||||
| 241 | static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { | ||||
| 242 | return [=](const LegalityQuery &Query) { | ||||
| 243 | LLT Ty = Query.Types[TypeIdx]; | ||||
| 244 | return isRegisterType(Ty) && | ||||
| 245 | !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); | ||||
| 246 | }; | ||||
| 247 | } | ||||
| 248 | |||||
| 249 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { | ||||
| 250 | return [=](const LegalityQuery &Query) { | ||||
| 251 | const LLT QueryTy = Query.Types[TypeIdx]; | ||||
| 252 | if (!QueryTy.isVector()) | ||||
| 253 | return false; | ||||
| 254 | const LLT EltTy = QueryTy.getElementType(); | ||||
| 255 | return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; | ||||
| 256 | }; | ||||
| 257 | } | ||||
| 258 | |||||
| 259 | // If we have a truncating store or an extending load with a data size larger | ||||
| 260 | // than 32-bits, we need to reduce to a 32-bit type. | ||||
| 261 | static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { | ||||
| 262 | return [=](const LegalityQuery &Query) { | ||||
| 263 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 264 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && | ||||
| 265 | Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); | ||||
| 266 | }; | ||||
| 267 | } | ||||
| 268 | |||||
| 269 | // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we | ||||
| 270 | // handle some operations by just promoting the register during | ||||
| 271 | // selection. There are also d16 loads on GFX9+ which preserve the high bits. | ||||
| 272 | static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, | ||||
| 273 | bool IsLoad, bool IsAtomic) { | ||||
| 274 | switch (AS) { | ||||
| 275 | case AMDGPUAS::PRIVATE_ADDRESS: | ||||
| 276 | // FIXME: Private element size. | ||||
| 277 | return ST.enableFlatScratch() ? 128 : 32; | ||||
| 278 | case AMDGPUAS::LOCAL_ADDRESS: | ||||
| 279 | return ST.useDS128() ? 128 : 64; | ||||
| 280 | case AMDGPUAS::GLOBAL_ADDRESS: | ||||
| 281 | case AMDGPUAS::CONSTANT_ADDRESS: | ||||
| 282 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: | ||||
| 283 | // Treat constant and global as identical. SMRD loads are sometimes usable for | ||||
| 284 | // global loads (ideally constant address space should be eliminated) | ||||
| 285 | // depending on the context. Legality cannot be context dependent, but | ||||
| 286 | // RegBankSelect can split the load as necessary depending on the pointer | ||||
| 287 | // register bank/uniformity and if the memory is invariant or not written in a | ||||
| 288 | // kernel. | ||||
| 289 | return IsLoad ? 512 : 128; | ||||
| 290 | default: | ||||
| 291 | // FIXME: Flat addresses may contextually need to be split to 32-bit parts | ||||
| 292 | // if they may alias scratch depending on the subtarget. This needs to be | ||||
| 293 | // moved to custom handling to use addressMayBeAccessedAsPrivate | ||||
| 294 | return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32; | ||||
| 295 | } | ||||
| 296 | } | ||||
| 297 | |||||
| 298 | static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, | ||||
| 299 | const LegalityQuery &Query) { | ||||
| 300 | const LLT Ty = Query.Types[0]; | ||||
| 301 | |||||
| 302 | // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD | ||||
| 303 | const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; | ||||
| 304 | |||||
| 305 | unsigned RegSize = Ty.getSizeInBits(); | ||||
| 306 | uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||
| 307 | uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; | ||||
| 308 | unsigned AS = Query.Types[1].getAddressSpace(); | ||||
| 309 | |||||
| 310 | // All of these need to be custom lowered to cast the pointer operand. | ||||
| 311 | if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) | ||||
| 312 | return false; | ||||
| 313 | |||||
| 314 | // Do not handle extending vector loads. | ||||
| 315 | if (Ty.isVector() && MemSize != RegSize) | ||||
| 316 | return false; | ||||
| 317 | |||||
| 318 | // TODO: We should be able to widen loads if the alignment is high enough, but | ||||
| 319 | // we also need to modify the memory access size. | ||||
| 320 | #if 0 | ||||
| 321 | // Accept widening loads based on alignment. | ||||
| 322 | if (IsLoad && MemSize < Size) | ||||
| 323 | MemSize = std::max(MemSize, Align); | ||||
| 324 | #endif | ||||
| 325 | |||||
| 326 | // Only 1-byte and 2-byte to 32-bit extloads are valid. | ||||
| 327 | if (MemSize != RegSize && RegSize != 32) | ||||
| 328 | return false; | ||||
| 329 | |||||
| 330 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, | ||||
| 331 | Query.MMODescrs[0].Ordering != | ||||
| 332 | AtomicOrdering::NotAtomic)) | ||||
| 333 | return false; | ||||
| 334 | |||||
| 335 | switch (MemSize) { | ||||
| 336 | case 8: | ||||
| 337 | case 16: | ||||
| 338 | case 32: | ||||
| 339 | case 64: | ||||
| 340 | case 128: | ||||
| 341 | break; | ||||
| 342 | case 96: | ||||
| 343 | if (!ST.hasDwordx3LoadStores()) | ||||
| 344 | return false; | ||||
| 345 | break; | ||||
| 346 | case 256: | ||||
| 347 | case 512: | ||||
| 348 | // These may contextually need to be broken down. | ||||
| 349 | break; | ||||
| 350 | default: | ||||
| 351 | return false; | ||||
| 352 | } | ||||
| 353 | |||||
| 354 | assert(RegSize >= MemSize); | ||||
| 355 | |||||
| 356 | if (AlignBits < MemSize) { | ||||
| 357 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||
| 358 | if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, | ||||
| 359 | Align(AlignBits / 8))) | ||||
| 360 | return false; | ||||
| 361 | } | ||||
| 362 | |||||
| 363 | return true; | ||||
| 364 | } | ||||
| 365 | |||||
| 366 | // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so | ||||
| 367 | // work around this. Eventually it should ignore the type for loads and only care | ||||
| 368 | // about the size. Return true in cases where we will work around this for now by | ||||
| 369 | // bitcasting. | ||||
| 370 | static bool loadStoreBitcastWorkaround(const LLT Ty) { | ||||
| 371 | if (EnableNewLegality) | ||||
| 372 | return false; | ||||
| 373 | |||||
| 374 | const unsigned Size = Ty.getSizeInBits(); | ||||
| 375 | if (Size <= 64) | ||||
| 376 | return false; | ||||
| 377 | if (!Ty.isVector()) | ||||
| 378 | return true; | ||||
| 379 | |||||
| 380 | LLT EltTy = Ty.getElementType(); | ||||
| 381 | if (EltTy.isPointer()) | ||||
| 382 | return true; | ||||
| 383 | |||||
| 384 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
| 385 | return EltSize != 32 && EltSize != 64; | ||||
| 386 | } | ||||
| 387 | |||||
| 388 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { | ||||
| 389 | const LLT Ty = Query.Types[0]; | ||||
| 390 | return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && | ||||
| 391 | !loadStoreBitcastWorkaround(Ty); | ||||
| 392 | } | ||||
| 393 | |||||
| 394 | /// Return true if a load or store of the type should be lowered with a bitcast | ||||
| 395 | /// to a different type. | ||||
| 396 | static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, | ||||
| 397 | const LLT MemTy) { | ||||
| 398 | const unsigned MemSizeInBits = MemTy.getSizeInBits(); | ||||
| 399 | const unsigned Size = Ty.getSizeInBits(); | ||||
| 400 | if (Size != MemSizeInBits) | ||||
| 401 | return Size <= 32 && Ty.isVector(); | ||||
| 402 | |||||
| 403 | if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) | ||||
| 404 | return true; | ||||
| 405 | |||||
| 406 | // Don't try to handle bitcasting vector ext loads for now. | ||||
| 407 | return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && | ||||
| 408 | (Size <= 32 || isRegisterSize(Size)) && | ||||
| 409 | !isRegisterVectorElementType(Ty.getElementType()); | ||||
| 410 | } | ||||
| 411 | |||||
| 412 | /// Return true if we should legalize a load by widening an odd-sized memory | ||||
| 413 | /// access up to the alignment. Note this is the case where the memory access | ||||
| 414 | /// itself changes, not the size of the result register. | ||||
| 415 | static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, | ||||
| 416 | uint64_t AlignInBits, unsigned AddrSpace, | ||||
| 417 | unsigned Opcode) { | ||||
| 418 | unsigned SizeInBits = MemoryTy.getSizeInBits(); | ||||
| 419 | // We don't want to widen cases that are naturally legal. | ||||
| 420 | if (isPowerOf2_32(SizeInBits)) | ||||
| 421 | return false; | ||||
| 422 | |||||
| 423 | // If we have 96-bit memory operations, we shouldn't touch them. Note we may | ||||
| 424 | // end up widening these for a scalar load during RegBankSelect, since there | ||||
| 425 | // aren't 96-bit scalar loads. | ||||
| 426 | if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) | ||||
| 427 | return false; | ||||
| 428 | |||||
| 429 | if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false)) | ||||
| 430 | return false; | ||||
| 431 | |||||
| 432 | // A load is known dereferenceable up to the alignment, so it's legal to widen | ||||
| 433 | // to it. | ||||
| 434 | // | ||||
| 435 | // TODO: Could check dereferenceable for less aligned cases. | ||||
| 436 | unsigned RoundedSize = NextPowerOf2(SizeInBits); | ||||
| 437 | if (AlignInBits < RoundedSize) | ||||
| 438 | return false; | ||||
| 439 | |||||
| 440 | // Do not widen if it would introduce a slow unaligned load. | ||||
| 441 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||
| 442 | unsigned Fast = 0; | ||||
| 443 | return TLI->allowsMisalignedMemoryAccessesImpl( | ||||
| 444 | RoundedSize, AddrSpace, Align(AlignInBits / 8), | ||||
| 445 | MachineMemOperand::MOLoad, &Fast) && | ||||
| 446 | Fast; | ||||
| 447 | } | ||||
| 448 | |||||
| 449 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, | ||||
| 450 | unsigned Opcode) { | ||||
| 451 | if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) | ||||
| 452 | return false; | ||||
| 453 | |||||
| 454 | return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, | ||||
| 455 | Query.MMODescrs[0].AlignInBits, | ||||
| 456 | Query.Types[1].getAddressSpace(), Opcode); | ||||
| 457 | } | ||||
| 458 | |||||
| 459 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, | ||||
| 460 | const GCNTargetMachine &TM) | ||||
| 461 | : ST(ST_) { | ||||
| 462 | using namespace TargetOpcode; | ||||
| 463 | |||||
| 464 | auto GetAddrSpacePtr = [&TM](unsigned AS) { | ||||
| 465 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); | ||||
| 466 | }; | ||||
| 467 | |||||
| 468 | const LLT S1 = LLT::scalar(1); | ||||
| 469 | const LLT S8 = LLT::scalar(8); | ||||
| 470 | const LLT S16 = LLT::scalar(16); | ||||
| 471 | const LLT S32 = LLT::scalar(32); | ||||
| 472 | const LLT S64 = LLT::scalar(64); | ||||
| 473 | const LLT S128 = LLT::scalar(128); | ||||
| 474 | const LLT S256 = LLT::scalar(256); | ||||
| 475 | const LLT S512 = LLT::scalar(512); | ||||
| 476 | const LLT MaxScalar = LLT::scalar(MaxRegisterSize); | ||||
| 477 | |||||
| 478 | const LLT V2S8 = LLT::fixed_vector(2, 8); | ||||
| 479 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
| 480 | const LLT V4S16 = LLT::fixed_vector(4, 16); | ||||
| 481 | |||||
| 482 | const LLT V2S32 = LLT::fixed_vector(2, 32); | ||||
| 483 | const LLT V3S32 = LLT::fixed_vector(3, 32); | ||||
| 484 | const LLT V4S32 = LLT::fixed_vector(4, 32); | ||||
| 485 | const LLT V5S32 = LLT::fixed_vector(5, 32); | ||||
| 486 | const LLT V6S32 = LLT::fixed_vector(6, 32); | ||||
| 487 | const LLT V7S32 = LLT::fixed_vector(7, 32); | ||||
| 488 | const LLT V8S32 = LLT::fixed_vector(8, 32); | ||||
| 489 | const LLT V9S32 = LLT::fixed_vector(9, 32); | ||||
| 490 | const LLT V10S32 = LLT::fixed_vector(10, 32); | ||||
| 491 | const LLT V11S32 = LLT::fixed_vector(11, 32); | ||||
| 492 | const LLT V12S32 = LLT::fixed_vector(12, 32); | ||||
| 493 | const LLT V13S32 = LLT::fixed_vector(13, 32); | ||||
| 494 | const LLT V14S32 = LLT::fixed_vector(14, 32); | ||||
| 495 | const LLT V15S32 = LLT::fixed_vector(15, 32); | ||||
| 496 | const LLT V16S32 = LLT::fixed_vector(16, 32); | ||||
| 497 | const LLT V32S32 = LLT::fixed_vector(32, 32); | ||||
| 498 | |||||
| 499 | const LLT V2S64 = LLT::fixed_vector(2, 64); | ||||
| 500 | const LLT V3S64 = LLT::fixed_vector(3, 64); | ||||
| 501 | const LLT V4S64 = LLT::fixed_vector(4, 64); | ||||
| 502 | const LLT V5S64 = LLT::fixed_vector(5, 64); | ||||
| 503 | const LLT V6S64 = LLT::fixed_vector(6, 64); | ||||
| 504 | const LLT V7S64 = LLT::fixed_vector(7, 64); | ||||
| 505 | const LLT V8S64 = LLT::fixed_vector(8, 64); | ||||
| 506 | const LLT V16S64 = LLT::fixed_vector(16, 64); | ||||
| 507 | |||||
| 508 | std::initializer_list<LLT> AllS32Vectors = | ||||
| 509 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, | ||||
| 510 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; | ||||
| 511 | std::initializer_list<LLT> AllS64Vectors = | ||||
| 512 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; | ||||
| 513 | |||||
| 514 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); | ||||
| 515 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); | ||||
| 516 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); | ||||
| 517 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); | ||||
| 518 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); | ||||
| 519 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); | ||||
| 520 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); | ||||
| 521 | |||||
| 522 | const LLT CodePtr = FlatPtr; | ||||
| 523 | |||||
| 524 | const std::initializer_list<LLT> AddrSpaces64 = { | ||||
| 525 | GlobalPtr, ConstantPtr, FlatPtr | ||||
| 526 | }; | ||||
| 527 | |||||
| 528 | const std::initializer_list<LLT> AddrSpaces32 = { | ||||
| 529 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr | ||||
| 530 | }; | ||||
| 531 | |||||
| 532 | const std::initializer_list<LLT> FPTypesBase = { | ||||
| 533 | S32, S64 | ||||
| 534 | }; | ||||
| 535 | |||||
| 536 | const std::initializer_list<LLT> FPTypes16 = { | ||||
| 537 | S32, S64, S16 | ||||
| 538 | }; | ||||
| 539 | |||||
| 540 | const std::initializer_list<LLT> FPTypesPK16 = { | ||||
| 541 | S32, S64, S16, V2S16 | ||||
| 542 | }; | ||||
| 543 | |||||
| 544 | const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; | ||||
| 545 | |||||
| 546 | // s1 for VCC branches, s32 for SCC branches. | ||||
| 547 | getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); | ||||
| 548 | |||||
| 549 | // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more | ||||
| 550 | // elements for v3s16 | ||||
| 551 | getActionDefinitionsBuilder(G_PHI) | ||||
| 552 | .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) | ||||
| 553 | .legalFor(AllS32Vectors) | ||||
| 554 | .legalFor(AllS64Vectors) | ||||
| 555 | .legalFor(AddrSpaces64) | ||||
| 556 | .legalFor(AddrSpaces32) | ||||
| 557 | .legalIf(isPointer(0)) | ||||
| 558 | .clampScalar(0, S16, S256) | ||||
| 559 | .widenScalarToNextPow2(0, 32) | ||||
| 560 | .clampMaxNumElements(0, S32, 16) | ||||
| 561 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
| 562 | .scalarize(0); | ||||
| 563 | |||||
| 564 | if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { | ||||
| 565 | // Full set of gfx9 features. | ||||
| 566 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
| 567 | .legalFor({S32, S16, V2S16}) | ||||
| 568 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
| 569 | .scalarize(0) | ||||
| 570 | .minScalar(0, S16) | ||||
| 571 | .widenScalarToNextMultipleOf(0, 32) | ||||
| 572 | .maxScalar(0, S32); | ||||
| 573 | |||||
| 574 | getActionDefinitionsBuilder(G_MUL) | ||||
| 575 | .legalFor({S32, S16, V2S16}) | ||||
| 576 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
| 577 | .scalarize(0) | ||||
| 578 | .minScalar(0, S16) | ||||
| 579 | .widenScalarToNextMultipleOf(0, 32) | ||||
| 580 | .custom(); | ||||
| 581 | assert(ST.hasMad64_32()); | ||||
| 582 | |||||
| 583 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) | ||||
| 584 | .legalFor({S32, S16, V2S16}) // Clamp modifier | ||||
| 585 | .minScalarOrElt(0, S16) | ||||
| 586 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
| 587 | .scalarize(0) | ||||
| 588 | .widenScalarToNextPow2(0, 32) | ||||
| 589 | .lower(); | ||||
| 590 | } else if (ST.has16BitInsts()) { | ||||
| 591 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
| 592 | .legalFor({S32, S16}) | ||||
| 593 | .minScalar(0, S16) | ||||
| 594 | .widenScalarToNextMultipleOf(0, 32) | ||||
| 595 | .maxScalar(0, S32) | ||||
| 596 | .scalarize(0); | ||||
| 597 | |||||
| 598 | getActionDefinitionsBuilder(G_MUL) | ||||
| 599 | .legalFor({S32, S16}) | ||||
| 600 | .scalarize(0) | ||||
| 601 | .minScalar(0, S16) | ||||
| 602 | .widenScalarToNextMultipleOf(0, 32) | ||||
| 603 | .custom(); | ||||
| 604 | assert(ST.hasMad64_32()); | ||||
| 605 | |||||
| 606 | // Technically the saturating operations require clamp bit support, but this | ||||
| 607 | // was introduced at the same time as 16-bit operations. | ||||
| 608 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
| 609 | .legalFor({S32, S16}) // Clamp modifier | ||||
| 610 | .minScalar(0, S16) | ||||
| 611 | .scalarize(0) | ||||
| 612 | .widenScalarToNextPow2(0, 16) | ||||
| 613 | .lower(); | ||||
| 614 | |||||
| 615 | // We're just lowering this, but it helps get a better result to try to | ||||
| 616 | // coerce to the desired type first. | ||||
| 617 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||
| 618 | .minScalar(0, S16) | ||||
| 619 | .scalarize(0) | ||||
| 620 | .lower(); | ||||
| 621 | } else { | ||||
| 622 | getActionDefinitionsBuilder({G_ADD, G_SUB}) | ||||
| 623 | .legalFor({S32}) | ||||
| 624 | .widenScalarToNextMultipleOf(0, 32) | ||||
| 625 | .clampScalar(0, S32, S32) | ||||
| 626 | .scalarize(0); | ||||
| 627 | |||||
| 628 | auto &Mul = getActionDefinitionsBuilder(G_MUL) | ||||
| 629 | .legalFor({S32}) | ||||
| 630 | .scalarize(0) | ||||
| 631 | .minScalar(0, S32) | ||||
| 632 | .widenScalarToNextMultipleOf(0, 32); | ||||
| 633 | |||||
| 634 | if (ST.hasMad64_32()) | ||||
| 635 | Mul.custom(); | ||||
| 636 | else | ||||
| 637 | Mul.maxScalar(0, S32); | ||||
| 638 | |||||
| 639 | if (ST.hasIntClamp()) { | ||||
| 640 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
| 641 | .legalFor({S32}) // Clamp modifier. | ||||
| 642 | .scalarize(0) | ||||
| 643 | .minScalarOrElt(0, S32) | ||||
| 644 | .lower(); | ||||
| 645 | } else { | ||||
| 646 | // Clamp bit support was added in VI, along with 16-bit operations. | ||||
| 647 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) | ||||
| 648 | .minScalar(0, S32) | ||||
| 649 | .scalarize(0) | ||||
| 650 | .lower(); | ||||
| 651 | } | ||||
| 652 | |||||
| 653 | // FIXME: DAG expansion gets better results. The widening uses the smaller | ||||
| 654 | // range values and goes for the min/max lowering directly. | ||||
| 655 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) | ||||
| 656 | .minScalar(0, S32) | ||||
| 657 | .scalarize(0) | ||||
| 658 | .lower(); | ||||
| 659 | } | ||||
| 660 | |||||
| 661 | getActionDefinitionsBuilder( | ||||
| 662 | {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) | ||||
| 663 | .customFor({S32, S64}) | ||||
| 664 | .clampScalar(0, S32, S64) | ||||
| 665 | .widenScalarToNextPow2(0, 32) | ||||
| 666 | .scalarize(0); | ||||
| 667 | |||||
| 668 | auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) | ||||
| 669 | .legalFor({S32}) | ||||
| 670 | .maxScalar(0, S32); | ||||
| 671 | |||||
| 672 | if (ST.hasVOP3PInsts()) { | ||||
| 673 | Mulh | ||||
| 674 | .clampMaxNumElements(0, S8, 2) | ||||
| 675 | .lowerFor({V2S8}); | ||||
| 676 | } | ||||
| 677 | |||||
| 678 | Mulh | ||||
| 679 | .scalarize(0) | ||||
| 680 | .lower(); | ||||
| 681 | |||||
| 682 | // Report legal for any types we can handle anywhere. For the cases only legal | ||||
| 683 | // on the SALU, RegBankSelect will be able to re-legalize. | ||||
| 684 | getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) | ||||
| 685 | .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) | ||||
| 686 | .clampScalar(0, S32, S64) | ||||
| 687 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
| 688 | .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) | ||||
| 689 | .widenScalarToNextPow2(0) | ||||
| 690 | .scalarize(0); | ||||
| 691 | |||||
| 692 | getActionDefinitionsBuilder( | ||||
| 693 | {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) | ||||
| 694 | .legalFor({{S32, S1}, {S32, S32}}) | ||||
| 695 | .clampScalar(0, S32, S32) | ||||
| 696 | .scalarize(0); | ||||
| 697 | |||||
| 698 | getActionDefinitionsBuilder(G_BITCAST) | ||||
| 699 | // Don't worry about the size constraint. | ||||
| 700 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||
| 701 | .lower(); | ||||
| 702 | |||||
| 703 | |||||
| 704 | getActionDefinitionsBuilder(G_CONSTANT) | ||||
| 705 | .legalFor({S1, S32, S64, S16, GlobalPtr, | ||||
| 706 | LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) | ||||
| 707 | .legalIf(isPointer(0)) | ||||
| 708 | .clampScalar(0, S32, S64) | ||||
| 709 | .widenScalarToNextPow2(0); | ||||
| 710 | |||||
| 711 | getActionDefinitionsBuilder(G_FCONSTANT) | ||||
| 712 | .legalFor({S32, S64, S16}) | ||||
| 713 | .clampScalar(0, S16, S64); | ||||
| 714 | |||||
| 715 | getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) | ||||
| 716 | .legalIf(isRegisterType(0)) | ||||
| 717 | // s1 and s16 are special cases because they have legal operations on | ||||
| 718 | // them, but don't really occupy registers in the normal way. | ||||
| 719 | .legalFor({S1, S16}) | ||||
| 720 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
| 721 | .clampScalarOrElt(0, S32, MaxScalar) | ||||
| 722 | .widenScalarToNextPow2(0, 32) | ||||
| 723 | .clampMaxNumElements(0, S32, 16); | ||||
| 724 | |||||
| 725 | getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); | ||||
| 726 | |||||
| 727 | // If the amount is divergent, we have to do a wave reduction to get the | ||||
| 728 | // maximum value, so this is expanded during RegBankSelect. | ||||
| 729 | getActionDefinitionsBuilder(G_DYN_STACKALLOC) | ||||
| 730 | .legalFor({{PrivatePtr, S32}}); | ||||
| 731 | |||||
| 732 | getActionDefinitionsBuilder(G_GLOBAL_VALUE) | ||||
| 733 | .customIf(typeIsNot(0, PrivatePtr)); | ||||
| 734 | |||||
| 735 | getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); | ||||
| 736 | |||||
| 737 | auto &FPOpActions = getActionDefinitionsBuilder( | ||||
| 738 | { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, | ||||
| 739 | G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) | ||||
| 740 | .legalFor({S32, S64}); | ||||
| 741 | auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) | ||||
| 742 | .customFor({S32, S64}); | ||||
| 743 | auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) | ||||
| 744 | .customFor({S32, S64}); | ||||
| 745 | |||||
| 746 | if (ST.has16BitInsts()) { | ||||
| 747 | if (ST.hasVOP3PInsts()) | ||||
| 748 | FPOpActions.legalFor({S16, V2S16}); | ||||
| 749 | else | ||||
| 750 | FPOpActions.legalFor({S16}); | ||||
| 751 | |||||
| 752 | TrigActions.customFor({S16}); | ||||
| 753 | FDIVActions.customFor({S16}); | ||||
| 754 | } | ||||
| 755 | |||||
| 756 | auto &MinNumMaxNum = getActionDefinitionsBuilder({ | ||||
| 757 | G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); | ||||
| 758 | |||||
| 759 | if (ST.hasVOP3PInsts()) { | ||||
| 760 | MinNumMaxNum.customFor(FPTypesPK16) | ||||
| 761 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
| 762 | .clampMaxNumElements(0, S16, 2) | ||||
| 763 | .clampScalar(0, S16, S64) | ||||
| 764 | .scalarize(0); | ||||
| 765 | } else if (ST.has16BitInsts()) { | ||||
| 766 | MinNumMaxNum.customFor(FPTypes16) | ||||
| 767 | .clampScalar(0, S16, S64) | ||||
| 768 | .scalarize(0); | ||||
| 769 | } else { | ||||
| 770 | MinNumMaxNum.customFor(FPTypesBase) | ||||
| 771 | .clampScalar(0, S32, S64) | ||||
| 772 | .scalarize(0); | ||||
| 773 | } | ||||
| 774 | |||||
| 775 | if (ST.hasVOP3PInsts()) | ||||
| 776 | FPOpActions.clampMaxNumElementsStrict(0, S16, 2); | ||||
| 777 | |||||
| 778 | FPOpActions | ||||
| 779 | .scalarize(0) | ||||
| 780 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||
| 781 | |||||
| 782 | TrigActions | ||||
| 783 | .scalarize(0) | ||||
| 784 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||
| 785 | |||||
| 786 | FDIVActions | ||||
| 787 | .scalarize(0) | ||||
| 788 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); | ||||
| 789 | |||||
| 790 | getActionDefinitionsBuilder({G_FNEG, G_FABS}) | ||||
| 791 | .legalFor(FPTypesPK16) | ||||
| 792 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
| 793 | .scalarize(0) | ||||
| 794 | .clampScalar(0, S16, S64); | ||||
| 795 | |||||
| 796 | if (ST.has16BitInsts()) { | ||||
| 797 | getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) | ||||
| 798 | .legalFor({S32, S64, S16}) | ||||
| 799 | .scalarize(0) | ||||
| 800 | .clampScalar(0, S16, S64); | ||||
| 801 | } else { | ||||
| 802 | getActionDefinitionsBuilder(G_FSQRT) | ||||
| 803 | .legalFor({S32, S64}) | ||||
| 804 | .scalarize(0) | ||||
| 805 | .clampScalar(0, S32, S64); | ||||
| 806 | |||||
| 807 | if (ST.hasFractBug()) { | ||||
| 808 | getActionDefinitionsBuilder(G_FFLOOR) | ||||
| 809 | .customFor({S64}) | ||||
| 810 | .legalFor({S32, S64}) | ||||
| 811 | .scalarize(0) | ||||
| 812 | .clampScalar(0, S32, S64); | ||||
| 813 | } else { | ||||
| 814 | getActionDefinitionsBuilder(G_FFLOOR) | ||||
| 815 | .legalFor({S32, S64}) | ||||
| 816 | .scalarize(0) | ||||
| 817 | .clampScalar(0, S32, S64); | ||||
| 818 | } | ||||
| 819 | } | ||||
| 820 | |||||
| 821 | getActionDefinitionsBuilder(G_FPTRUNC) | ||||
| 822 | .legalFor({{S32, S64}, {S16, S32}}) | ||||
| 823 | .scalarize(0) | ||||
| 824 | .lower(); | ||||
| 825 | |||||
| 826 | getActionDefinitionsBuilder(G_FPEXT) | ||||
| 827 | .legalFor({{S64, S32}, {S32, S16}}) | ||||
| 828 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) | ||||
| 829 | .scalarize(0); | ||||
| 830 | |||||
| 831 | auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); | ||||
| 832 | if (ST.has16BitInsts()) { | ||||
| 833 | FSubActions | ||||
| 834 | // Use actual fsub instruction | ||||
| 835 | .legalFor({S32, S16}) | ||||
| 836 | // Must use fadd + fneg | ||||
| 837 | .lowerFor({S64, V2S16}); | ||||
| 838 | } else { | ||||
| 839 | FSubActions | ||||
| 840 | // Use actual fsub instruction | ||||
| 841 | .legalFor({S32}) | ||||
| 842 | // Must use fadd + fneg | ||||
| 843 | .lowerFor({S64, S16, V2S16}); | ||||
| 844 | } | ||||
| 845 | |||||
| 846 | FSubActions | ||||
| 847 | .scalarize(0) | ||||
| 848 | .clampScalar(0, S32, S64); | ||||
| 849 | |||||
| 850 | // Whether this is legal depends on the floating point mode for the function. | ||||
| 851 | auto &FMad = getActionDefinitionsBuilder(G_FMAD); | ||||
| 852 | if (ST.hasMadF16() && ST.hasMadMacF32Insts()) | ||||
| 853 | FMad.customFor({S32, S16}); | ||||
| 854 | else if (ST.hasMadMacF32Insts()) | ||||
| 855 | FMad.customFor({S32}); | ||||
| 856 | else if (ST.hasMadF16()) | ||||
| 857 | FMad.customFor({S16}); | ||||
| 858 | FMad.scalarize(0) | ||||
| 859 | .lower(); | ||||
| 860 | |||||
| 861 | auto &FRem = getActionDefinitionsBuilder(G_FREM); | ||||
| 862 | if (ST.has16BitInsts()) { | ||||
| 863 | FRem.customFor({S16, S32, S64}); | ||||
| 864 | } else { | ||||
| 865 | FRem.minScalar(0, S32) | ||||
| 866 | .customFor({S32, S64}); | ||||
| 867 | } | ||||
| 868 | FRem.scalarize(0); | ||||
| 869 | |||||
| 870 | // TODO: Do we need to clamp maximum bitwidth? | ||||
| 871 | getActionDefinitionsBuilder(G_TRUNC) | ||||
| 872 | .legalIf(isScalar(0)) | ||||
| 873 | .legalFor({{V2S16, V2S32}}) | ||||
| 874 | .clampMaxNumElements(0, S16, 2) | ||||
| 875 | // Avoid scalarizing in cases that should be truly illegal. In unresolvable | ||||
| 876 | // situations (like an invalid implicit use), we don't want to infinite loop | ||||
| 877 | // in the legalizer. | ||||
| 878 | .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) | ||||
| 879 | .alwaysLegal(); | ||||
| 880 | |||||
| 881 | getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) | ||||
| 882 | .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, | ||||
| 883 | {S32, S1}, {S64, S1}, {S16, S1}}) | ||||
| 884 | .scalarize(0) | ||||
| 885 | .clampScalar(0, S32, S64) | ||||
| 886 | .widenScalarToNextPow2(1, 32); | ||||
| 887 | |||||
| 888 | // TODO: Split s1->s64 during regbankselect for VALU. | ||||
| 889 | auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) | ||||
| 890 | .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) | ||||
| 891 | .lowerIf(typeIs(1, S1)) | ||||
| 892 | .customFor({{S32, S64}, {S64, S64}}); | ||||
| 893 | if (ST.has16BitInsts()) | ||||
| 894 | IToFP.legalFor({{S16, S16}}); | ||||
| 895 | IToFP.clampScalar(1, S32, S64) | ||||
| 896 | .minScalar(0, S32) | ||||
| 897 | .scalarize(0) | ||||
| 898 | .widenScalarToNextPow2(1); | ||||
| 899 | |||||
| 900 | auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) | ||||
| 901 | .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) | ||||
| 902 | .customFor({{S64, S32}, {S64, S64}}) | ||||
| 903 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); | ||||
| 904 | if (ST.has16BitInsts()) | ||||
| 905 | FPToI.legalFor({{S16, S16}}); | ||||
| 906 | else | ||||
| 907 | FPToI.minScalar(1, S32); | ||||
| 908 | |||||
| 909 | FPToI.minScalar(0, S32) | ||||
| 910 | .widenScalarToNextPow2(0, 32) | ||||
| 911 | .scalarize(0) | ||||
| 912 | .lower(); | ||||
| 913 | |||||
| 914 | getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) | ||||
| 915 | .customFor({S16, S32}) | ||||
| 916 | .scalarize(0) | ||||
| 917 | .lower(); | ||||
| 918 | |||||
| 919 | // Lower roundeven into G_FRINT | ||||
| 920 | getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) | ||||
| 921 | .scalarize(0) | ||||
| 922 | .lower(); | ||||
| 923 | |||||
| 924 | if (ST.has16BitInsts()) { | ||||
| 925 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||
| 926 | .legalFor({S16, S32, S64}) | ||||
| 927 | .clampScalar(0, S16, S64) | ||||
| 928 | .scalarize(0); | ||||
| 929 | } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { | ||||
| 930 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||
| 931 | .legalFor({S32, S64}) | ||||
| 932 | .clampScalar(0, S32, S64) | ||||
| 933 | .scalarize(0); | ||||
| 934 | } else { | ||||
| 935 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) | ||||
| 936 | .legalFor({S32}) | ||||
| 937 | .customFor({S64}) | ||||
| 938 | .clampScalar(0, S32, S64) | ||||
| 939 | .scalarize(0); | ||||
| 940 | } | ||||
| 941 | |||||
| 942 | getActionDefinitionsBuilder(G_PTR_ADD) | ||||
| 943 | .legalIf(all(isPointer(0), sameSize(0, 1))) | ||||
| 944 | .scalarize(0) | ||||
| 945 | .scalarSameSizeAs(1, 0); | ||||
| 946 | |||||
| 947 | getActionDefinitionsBuilder(G_PTRMASK) | ||||
| 948 | .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) | ||||
| 949 | .scalarSameSizeAs(1, 0) | ||||
| 950 | .scalarize(0); | ||||
| 951 | |||||
| 952 | auto &CmpBuilder = | ||||
| 953 | getActionDefinitionsBuilder(G_ICMP) | ||||
| 954 | // The compare output type differs based on the register bank of the output, | ||||
| 955 | // so make both s1 and s32 legal. | ||||
| 956 | // | ||||
| 957 | // Scalar compares producing output in scc will be promoted to s32, as that | ||||
| 958 | // is the allocatable register type that will be needed for the copy from | ||||
| 959 | // scc. This will be promoted during RegBankSelect, and we assume something | ||||
| 960 | // before that won't try to use s32 result types. | ||||
| 961 | // | ||||
| 962 | // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg | ||||
| 963 | // bank. | ||||
| 964 | .legalForCartesianProduct( | ||||
| 965 | {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) | ||||
| 966 | .legalForCartesianProduct( | ||||
| 967 | {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); | ||||
| 968 | if (ST.has16BitInsts()) { | ||||
| 969 | CmpBuilder.legalFor({{S1, S16}}); | ||||
| 970 | } | ||||
| 971 | |||||
| 972 | CmpBuilder | ||||
| 973 | .widenScalarToNextPow2(1) | ||||
| 974 | .clampScalar(1, S32, S64) | ||||
| 975 | .scalarize(0) | ||||
| 976 | .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); | ||||
| 977 | |||||
| 978 | getActionDefinitionsBuilder(G_FCMP) | ||||
| 979 | .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) | ||||
| 980 | .widenScalarToNextPow2(1) | ||||
| 981 | .clampScalar(1, S32, S64) | ||||
| 982 | .scalarize(0); | ||||
| 983 | |||||
| 984 | // FIXME: fpow has a selection pattern that should move to custom lowering. | ||||
| 985 | auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); | ||||
| 986 | if (ST.has16BitInsts()) | ||||
| 987 | Exp2Ops.legalFor({S32, S16}); | ||||
| 988 | else | ||||
| 989 | Exp2Ops.legalFor({S32}); | ||||
| 990 | Exp2Ops.clampScalar(0, MinScalarFPTy, S32); | ||||
| 991 | Exp2Ops.scalarize(0); | ||||
| 992 | |||||
| 993 | auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); | ||||
| 994 | if (ST.has16BitInsts()) | ||||
| 995 | ExpOps.customFor({{S32}, {S16}}); | ||||
| 996 | else | ||||
| 997 | ExpOps.customFor({S32}); | ||||
| 998 | ExpOps.clampScalar(0, MinScalarFPTy, S32) | ||||
| 999 | .scalarize(0); | ||||
| 1000 | |||||
| 1001 | getActionDefinitionsBuilder(G_FPOWI) | ||||
| 1002 | .clampScalar(0, MinScalarFPTy, S32) | ||||
| 1003 | .lower(); | ||||
| 1004 | |||||
| 1005 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||
| 1006 | getActionDefinitionsBuilder(G_CTPOP) | ||||
| 1007 | .legalFor({{S32, S32}, {S32, S64}}) | ||||
| 1008 | .clampScalar(0, S32, S32) | ||||
| 1009 | .widenScalarToNextPow2(1, 32) | ||||
| 1010 | .clampScalar(1, S32, S64) | ||||
| 1011 | .scalarize(0) | ||||
| 1012 | .widenScalarToNextPow2(0, 32); | ||||
| 1013 | |||||
| 1014 | // If no 16-bit instruction is available, lower into different instructions. | ||||
| 1015 | if (ST.has16BitInsts()) | ||||
| 1016 | getActionDefinitionsBuilder(G_IS_FPCLASS) | ||||
| 1017 | .legalForCartesianProduct({S1}, FPTypes16) | ||||
| 1018 | .widenScalarToNextPow2(1) | ||||
| 1019 | .scalarize(0) | ||||
| 1020 | .lower(); | ||||
| 1021 | else | ||||
| 1022 | getActionDefinitionsBuilder(G_IS_FPCLASS) | ||||
| 1023 | .legalForCartesianProduct({S1}, FPTypesBase) | ||||
| 1024 | .lowerFor({S1, S16}) | ||||
| 1025 | .widenScalarToNextPow2(1) | ||||
| 1026 | .scalarize(0) | ||||
| 1027 | .lower(); | ||||
| 1028 | |||||
| 1029 | // The hardware instructions return a different result on 0 than the generic | ||||
| 1030 | // instructions expect. The hardware produces -1, but these produce the | ||||
| 1031 | // bitwidth. | ||||
| 1032 | getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) | ||||
| 1033 | .scalarize(0) | ||||
| 1034 | .clampScalar(0, S32, S32) | ||||
| 1035 | .clampScalar(1, S32, S64) | ||||
| 1036 | .widenScalarToNextPow2(0, 32) | ||||
| 1037 | .widenScalarToNextPow2(1, 32) | ||||
| 1038 | .custom(); | ||||
| 1039 | |||||
| 1040 | // The 64-bit versions produce 32-bit results, but only on the SALU. | ||||
| 1041 | getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) | ||||
| 1042 | .legalFor({{S32, S32}, {S32, S64}}) | ||||
| 1043 | .clampScalar(0, S32, S32) | ||||
| 1044 | .clampScalar(1, S32, S64) | ||||
| 1045 | .scalarize(0) | ||||
| 1046 | .widenScalarToNextPow2(0, 32) | ||||
| 1047 | .widenScalarToNextPow2(1, 32); | ||||
| 1048 | |||||
| 1049 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||
| 1050 | // RegBankSelect. | ||||
| 1051 | getActionDefinitionsBuilder(G_BITREVERSE) | ||||
| 1052 | .legalFor({S32, S64}) | ||||
| 1053 | .clampScalar(0, S32, S64) | ||||
| 1054 | .scalarize(0) | ||||
| 1055 | .widenScalarToNextPow2(0); | ||||
| 1056 | |||||
| 1057 | if (ST.has16BitInsts()) { | ||||
| 1058 | getActionDefinitionsBuilder(G_BSWAP) | ||||
| 1059 | .legalFor({S16, S32, V2S16}) | ||||
| 1060 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
| 1061 | // FIXME: Fixing non-power-of-2 before clamp is workaround for | ||||
| 1062 | // narrowScalar limitation. | ||||
| 1063 | .widenScalarToNextPow2(0) | ||||
| 1064 | .clampScalar(0, S16, S32) | ||||
| 1065 | .scalarize(0); | ||||
| 1066 | |||||
| 1067 | if (ST.hasVOP3PInsts()) { | ||||
| 1068 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||
| 1069 | .legalFor({S32, S16, V2S16}) | ||||
| 1070 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
| 1071 | .clampMaxNumElements(0, S16, 2) | ||||
| 1072 | .minScalar(0, S16) | ||||
| 1073 | .widenScalarToNextPow2(0) | ||||
| 1074 | .scalarize(0) | ||||
| 1075 | .lower(); | ||||
| 1076 | } else { | ||||
| 1077 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||
| 1078 | .legalFor({S32, S16}) | ||||
| 1079 | .widenScalarToNextPow2(0) | ||||
| 1080 | .minScalar(0, S16) | ||||
| 1081 | .scalarize(0) | ||||
| 1082 | .lower(); | ||||
| 1083 | } | ||||
| 1084 | } else { | ||||
| 1085 | // TODO: Should have same legality without v_perm_b32 | ||||
| 1086 | getActionDefinitionsBuilder(G_BSWAP) | ||||
| 1087 | .legalFor({S32}) | ||||
| 1088 | .lowerIf(scalarNarrowerThan(0, 32)) | ||||
| 1089 | // FIXME: Fixing non-power-of-2 before clamp is workaround for | ||||
| 1090 | // narrowScalar limitation. | ||||
| 1091 | .widenScalarToNextPow2(0) | ||||
| 1092 | .maxScalar(0, S32) | ||||
| 1093 | .scalarize(0) | ||||
| 1094 | .lower(); | ||||
| 1095 | |||||
| 1096 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) | ||||
| 1097 | .legalFor({S32}) | ||||
| 1098 | .minScalar(0, S32) | ||||
| 1099 | .widenScalarToNextPow2(0) | ||||
| 1100 | .scalarize(0) | ||||
| 1101 | .lower(); | ||||
| 1102 | } | ||||
| 1103 | |||||
| 1104 | getActionDefinitionsBuilder(G_INTTOPTR) | ||||
| 1105 | // List the common cases | ||||
| 1106 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||
| 1107 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||
| 1108 | .scalarize(0) | ||||
| 1109 | // Accept any address space as long as the size matches | ||||
| 1110 | .legalIf(sameSize(0, 1)) | ||||
| 1111 | .widenScalarIf(smallerThan(1, 0), | ||||
| 1112 | [](const LegalityQuery &Query) { | ||||
| 1113 | return std::pair( | ||||
| 1114 | 1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||
| 1115 | }) | ||||
| 1116 | .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { | ||||
| 1117 | return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); | ||||
| 1118 | }); | ||||
| 1119 | |||||
| 1120 | getActionDefinitionsBuilder(G_PTRTOINT) | ||||
| 1121 | // List the common cases | ||||
| 1122 | .legalForCartesianProduct(AddrSpaces64, {S64}) | ||||
| 1123 | .legalForCartesianProduct(AddrSpaces32, {S32}) | ||||
| 1124 | .scalarize(0) | ||||
| 1125 | // Accept any address space as long as the size matches | ||||
| 1126 | .legalIf(sameSize(0, 1)) | ||||
| 1127 | .widenScalarIf(smallerThan(0, 1), | ||||
| 1128 | [](const LegalityQuery &Query) { | ||||
| 1129 | return std::pair( | ||||
| 1130 | 0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||
| 1131 | }) | ||||
| 1132 | .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { | ||||
| 1133 | return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); | ||||
| 1134 | }); | ||||
| 1135 | |||||
| 1136 | getActionDefinitionsBuilder(G_ADDRSPACE_CAST) | ||||
| 1137 | .scalarize(0) | ||||
| 1138 | .custom(); | ||||
| 1139 | |||||
| 1140 | const auto needToSplitMemOp = [=](const LegalityQuery &Query, | ||||
| 1141 | bool IsLoad) -> bool { | ||||
| 1142 | const LLT DstTy = Query.Types[0]; | ||||
| 1143 | |||||
| 1144 | // Split vector extloads. | ||||
| 1145 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||
| 1146 | |||||
| 1147 | if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) | ||||
| 1148 | return true; | ||||
| 1149 | |||||
| 1150 | const LLT PtrTy = Query.Types[1]; | ||||
| 1151 | unsigned AS = PtrTy.getAddressSpace(); | ||||
| 1152 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, | ||||
| 1153 | Query.MMODescrs[0].Ordering != | ||||
| 1154 | AtomicOrdering::NotAtomic)) | ||||
| 1155 | return true; | ||||
| 1156 | |||||
| 1157 | // Catch weird sized loads that don't evenly divide into the access sizes | ||||
| 1158 | // TODO: May be able to widen depending on alignment etc. | ||||
| 1159 | unsigned NumRegs = (MemSize + 31) / 32; | ||||
| 1160 | if (NumRegs == 3) { | ||||
| 1161 | if (!ST.hasDwordx3LoadStores()) | ||||
| 1162 | return true; | ||||
| 1163 | } else { | ||||
| 1164 | // If the alignment allows, these should have been widened. | ||||
| 1165 | if (!isPowerOf2_32(NumRegs)) | ||||
| 1166 | return true; | ||||
| 1167 | } | ||||
| 1168 | |||||
| 1169 | return false; | ||||
| 1170 | }; | ||||
| 1171 | |||||
| 1172 | unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; | ||||
| 1173 | unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; | ||||
| 1174 | unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; | ||||
| 1175 | |||||
| 1176 | // TODO: Refine based on subtargets which support unaligned access or 128-bit | ||||
| 1177 | // LDS | ||||
| 1178 | // TODO: Unsupported flat for SI. | ||||
| 1179 | |||||
| 1180 | for (unsigned Op : {G_LOAD, G_STORE}) { | ||||
| 1181 | const bool IsStore = Op == G_STORE; | ||||
| 1182 | |||||
| 1183 | auto &Actions = getActionDefinitionsBuilder(Op); | ||||
| 1184 | // Explicitly list some common cases. | ||||
| 1185 | // TODO: Does this help compile time at all? | ||||
| 1186 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, | ||||
| 1187 | {V2S32, GlobalPtr, V2S32, GlobalAlign32}, | ||||
| 1188 | {V4S32, GlobalPtr, V4S32, GlobalAlign32}, | ||||
| 1189 | {S64, GlobalPtr, S64, GlobalAlign32}, | ||||
| 1190 | {V2S64, GlobalPtr, V2S64, GlobalAlign32}, | ||||
| 1191 | {V2S16, GlobalPtr, V2S16, GlobalAlign32}, | ||||
| 1192 | {S32, GlobalPtr, S8, GlobalAlign8}, | ||||
| 1193 | {S32, GlobalPtr, S16, GlobalAlign16}, | ||||
| 1194 | |||||
| 1195 | {S32, LocalPtr, S32, 32}, | ||||
| 1196 | {S64, LocalPtr, S64, 32}, | ||||
| 1197 | {V2S32, LocalPtr, V2S32, 32}, | ||||
| 1198 | {S32, LocalPtr, S8, 8}, | ||||
| 1199 | {S32, LocalPtr, S16, 16}, | ||||
| 1200 | {V2S16, LocalPtr, S32, 32}, | ||||
| 1201 | |||||
| 1202 | {S32, PrivatePtr, S32, 32}, | ||||
| 1203 | {S32, PrivatePtr, S8, 8}, | ||||
| 1204 | {S32, PrivatePtr, S16, 16}, | ||||
| 1205 | {V2S16, PrivatePtr, S32, 32}, | ||||
| 1206 | |||||
| 1207 | {S32, ConstantPtr, S32, GlobalAlign32}, | ||||
| 1208 | {V2S32, ConstantPtr, V2S32, GlobalAlign32}, | ||||
| 1209 | {V4S32, ConstantPtr, V4S32, GlobalAlign32}, | ||||
| 1210 | {S64, ConstantPtr, S64, GlobalAlign32}, | ||||
| 1211 | {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); | ||||
| 1212 | Actions.legalIf( | ||||
| 1213 | [=](const LegalityQuery &Query) -> bool { | ||||
| 1214 | return isLoadStoreLegal(ST, Query); | ||||
| 1215 | }); | ||||
| 1216 | |||||
| 1217 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||
| 1218 | // 64-bits. | ||||
| 1219 | // | ||||
| 1220 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||
| 1221 | // inserting addrspacecasts. | ||||
| 1222 | Actions.customIf(typeIs(1, Constant32Ptr)); | ||||
| 1223 | |||||
| 1224 | // Turn any illegal element vectors into something easier to deal | ||||
| 1225 | // with. These will ultimately produce 32-bit scalar shifts to extract the | ||||
| 1226 | // parts anyway. | ||||
| 1227 | // | ||||
| 1228 | // For odd 16-bit element vectors, prefer to split those into pieces with | ||||
| 1229 | // 16-bit vector parts. | ||||
| 1230 | Actions.bitcastIf( | ||||
| 1231 | [=](const LegalityQuery &Query) -> bool { | ||||
| 1232 | return shouldBitcastLoadStoreType(ST, Query.Types[0], | ||||
| 1233 | Query.MMODescrs[0].MemoryTy); | ||||
| 1234 | }, bitcastToRegisterType(0)); | ||||
| 1235 | |||||
| 1236 | if (!IsStore) { | ||||
| 1237 | // Widen suitably aligned loads by loading extra bytes. The standard | ||||
| 1238 | // legalization actions can't properly express widening memory operands. | ||||
| 1239 | Actions.customIf([=](const LegalityQuery &Query) -> bool { | ||||
| 1240 | return shouldWidenLoad(ST, Query, G_LOAD); | ||||
| 1241 | }); | ||||
| 1242 | } | ||||
| 1243 | |||||
| 1244 | // FIXME: load/store narrowing should be moved to lower action | ||||
| 1245 | Actions | ||||
| 1246 | .narrowScalarIf( | ||||
| 1247 | [=](const LegalityQuery &Query) -> bool { | ||||
| 1248 | return !Query.Types[0].isVector() && | ||||
| 1249 | needToSplitMemOp(Query, Op == G_LOAD); | ||||
| 1250 | }, | ||||
| 1251 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||
| 1252 | const LLT DstTy = Query.Types[0]; | ||||
| 1253 | const LLT PtrTy = Query.Types[1]; | ||||
| 1254 | |||||
| 1255 | const unsigned DstSize = DstTy.getSizeInBits(); | ||||
| 1256 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||
| 1257 | |||||
| 1258 | // Split extloads. | ||||
| 1259 | if (DstSize > MemSize) | ||||
| 1260 | return std::pair(0, LLT::scalar(MemSize)); | ||||
| 1261 | |||||
| 1262 | unsigned MaxSize = maxSizeForAddrSpace( | ||||
| 1263 | ST, PtrTy.getAddressSpace(), Op == G_LOAD, | ||||
| 1264 | Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); | ||||
| 1265 | if (MemSize > MaxSize) | ||||
| 1266 | return std::pair(0, LLT::scalar(MaxSize)); | ||||
| 1267 | |||||
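| | // Otherwise split the access down to its known alignment, e.g. an s96 | ||||
| | // access that is only 32-bit aligned is narrowed to s32 pieces. | ||||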
| 1268 | uint64_t Align = Query.MMODescrs[0].AlignInBits; | ||||
| 1269 | return std::pair(0, LLT::scalar(Align)); | ||||
| 1270 | }) | ||||
| 1271 | .fewerElementsIf( | ||||
| 1272 | [=](const LegalityQuery &Query) -> bool { | ||||
| 1273 | return Query.Types[0].isVector() && | ||||
| 1274 | needToSplitMemOp(Query, Op == G_LOAD); | ||||
| 1275 | }, | ||||
| 1276 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { | ||||
| 1277 | const LLT DstTy = Query.Types[0]; | ||||
| 1278 | const LLT PtrTy = Query.Types[1]; | ||||
| 1279 | |||||
| 1280 | LLT EltTy = DstTy.getElementType(); | ||||
| 1281 | unsigned MaxSize = maxSizeForAddrSpace( | ||||
| 1282 | ST, PtrTy.getAddressSpace(), Op == G_LOAD, | ||||
| 1283 | Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); | ||||
| 1284 | |||||
| 1285 | // FIXME: Handle widened to power of 2 results better. This ends | ||||
| 1286 | // up scalarizing. | ||||
| 1287 | // FIXME: 3 element stores scalarized on SI | ||||
| 1288 | |||||
| 1289 | // Split if it's too large for the address space. | ||||
| 1290 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); | ||||
| 1291 | if (MemSize > MaxSize) { | ||||
| 1292 | unsigned NumElts = DstTy.getNumElements(); | ||||
| 1293 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
| 1294 | |||||
| 1295 | if (MaxSize % EltSize == 0) { | ||||
| 1296 | return std::pair( | ||||
| 1297 | 0, LLT::scalarOrVector( | ||||
| 1298 | ElementCount::getFixed(MaxSize / EltSize), EltTy)); | ||||
| 1299 | } | ||||
| 1300 | |||||
| 1301 | unsigned NumPieces = MemSize / MaxSize; | ||||
| 1302 | |||||
| 1303 | // FIXME: Refine when odd breakdowns handled | ||||
| 1304 | // The scalars will need to be re-legalized. | ||||
| 1305 | if (NumPieces == 1 || NumPieces >= NumElts || | ||||
| 1306 | NumElts % NumPieces != 0) | ||||
| 1307 | return std::pair(0, EltTy); | ||||
| 1308 | |||||
| 1309 | return std::pair(0, | ||||
| 1310 | LLT::fixed_vector(NumElts / NumPieces, EltTy)); | ||||
| 1311 | } | ||||
| 1312 | |||||
| 1313 | // FIXME: We could probably handle weird extending loads better. | ||||
| 1314 | if (DstTy.getSizeInBits() > MemSize) | ||||
| 1315 | return std::pair(0, EltTy); | ||||
| 1316 | |||||
| 1317 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
| 1318 | unsigned DstSize = DstTy.getSizeInBits(); | ||||
| 1319 | if (!isPowerOf2_32(DstSize)) { | ||||
| 1320 | // We're probably decomposing an odd sized store. Try to split | ||||
| 1321 | // to the widest type. TODO: Account for alignment. As-is it | ||||
| 1322 | // should be OK, since the new parts will be further legalized. | ||||
| 1323 | unsigned FloorSize = llvm::bit_floor(DstSize); | ||||
| 1324 | return std::pair( | ||||
| 1325 | 0, LLT::scalarOrVector( | ||||
| 1326 | ElementCount::getFixed(FloorSize / EltSize), EltTy)); | ||||
| 1327 | } | ||||
| 1328 | |||||
| 1329 | // May need relegalization for the scalars. | ||||
| 1330 | return std::pair(0, EltTy); | ||||
| 1331 | }) | ||||
| 1332 | .minScalar(0, S32) | ||||
| 1333 | .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) | ||||
| 1334 | .widenScalarToNextPow2(0) | ||||
| 1335 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) | ||||
| 1336 | .lower(); | ||||
| 1337 | } | ||||
| 1338 | |||||
| 1339 | // FIXME: Unaligned accesses not lowered. | ||||
| 1340 | auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) | ||||
| 1341 | .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, | ||||
| 1342 | {S32, GlobalPtr, S16, 2 * 8}, | ||||
| 1343 | {S32, LocalPtr, S8, 8}, | ||||
| 1344 | {S32, LocalPtr, S16, 16}, | ||||
| 1345 | {S32, PrivatePtr, S8, 8}, | ||||
| 1346 | {S32, PrivatePtr, S16, 16}, | ||||
| 1347 | {S32, ConstantPtr, S8, 8}, | ||||
| 1348 | {S32, ConstantPtr, S16, 2 * 8}}) | ||||
| 1349 | .legalIf( | ||||
| 1350 | [=](const LegalityQuery &Query) -> bool { | ||||
| 1351 | return isLoadStoreLegal(ST, Query); | ||||
| 1352 | }); | ||||
| 1353 | |||||
| 1354 | if (ST.hasFlatAddressSpace()) { | ||||
| 1355 | ExtLoads.legalForTypesWithMemDesc( | ||||
| 1356 | {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); | ||||
| 1357 | } | ||||
| 1358 | |||||
| 1359 | // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to | ||||
| 1360 | // 64-bits. | ||||
| 1361 | // | ||||
| 1362 | // TODO: Should generalize bitcast action into coerce, which will also cover | ||||
| 1363 | // inserting addrspacecasts. | ||||
| 1364 | ExtLoads.customIf(typeIs(1, Constant32Ptr)); | ||||
| 1365 | |||||
| 1366 | ExtLoads.clampScalar(0, S32, S32) | ||||
| 1367 | .widenScalarToNextPow2(0) | ||||
| 1368 | .lower(); | ||||
| 1369 | |||||
| 1370 | auto &Atomics = getActionDefinitionsBuilder( | ||||
| 1371 | {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, | ||||
| 1372 | G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, | ||||
| 1373 | G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, | ||||
| 1374 | G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) | ||||
| 1375 | .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, | ||||
| 1376 | {S64, GlobalPtr}, {S64, LocalPtr}, | ||||
| 1377 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||
| 1378 | if (ST.hasFlatAddressSpace()) { | ||||
| 1379 | Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); | ||||
| 1380 | } | ||||
| 1381 | |||||
| 1382 | auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); | ||||
| 1383 | if (ST.hasLDSFPAtomicAdd()) { | ||||
| 1384 | Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); | ||||
| 1385 | if (ST.hasGFX90AInsts()) | ||||
| 1386 | Atomic.legalFor({{S64, LocalPtr}}); | ||||
| 1387 | if (ST.hasAtomicDsPkAdd16Insts()) | ||||
| 1388 | Atomic.legalFor({{V2S16, LocalPtr}}); | ||||
| 1389 | } | ||||
| 1390 | if (ST.hasAtomicFaddInsts()) | ||||
| 1391 | Atomic.legalFor({{S32, GlobalPtr}}); | ||||
| 1392 | if (ST.hasFlatAtomicFaddF32Inst()) | ||||
| 1393 | Atomic.legalFor({{S32, FlatPtr}}); | ||||
| 1394 | |||||
| 1395 | if (ST.hasGFX90AInsts()) { | ||||
| 1396 | // These are legal with some caveats, and should have undergone expansion in | ||||
| 1397 | // the IR in most situations | ||||
| 1398 | // TODO: Move atomic expansion into legalizer | ||||
| 1399 | Atomic.legalFor({ | ||||
| 1400 | {S32, GlobalPtr}, | ||||
| 1401 | {S64, GlobalPtr}, | ||||
| 1402 | {S64, FlatPtr} | ||||
| 1403 | }); | ||||
| 1404 | } | ||||
| 1405 | |||||
| 1406 | // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output | ||||
| 1407 | // demarshalling. | ||||
| 1408 | getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) | ||||
| 1409 | .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, | ||||
| 1410 | {S32, FlatPtr}, {S64, FlatPtr}}) | ||||
| 1411 | .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, | ||||
| 1412 | {S32, RegionPtr}, {S64, RegionPtr}}); | ||||
| 1413 | // TODO: Pointer types, any 32-bit or 64-bit vector | ||||
| 1414 | |||||
| 1415 | // Condition should be s32 for scalar, s1 for vector. | ||||
| 1416 | getActionDefinitionsBuilder(G_SELECT) | ||||
| 1417 | .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, | ||||
| 1418 | LocalPtr, FlatPtr, PrivatePtr, | ||||
| 1419 | LLT::fixed_vector(2, LocalPtr), | ||||
| 1420 | LLT::fixed_vector(2, PrivatePtr)}, | ||||
| 1421 | {S1, S32}) | ||||
| 1422 | .clampScalar(0, S16, S64) | ||||
| 1423 | .scalarize(1) | ||||
| 1424 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) | ||||
| 1425 | .fewerElementsIf(numElementsNotEven(0), scalarize(0)) | ||||
| 1426 | .clampMaxNumElements(0, S32, 2) | ||||
| 1427 | .clampMaxNumElements(0, LocalPtr, 2) | ||||
| 1428 | .clampMaxNumElements(0, PrivatePtr, 2) | ||||
| 1429 | .scalarize(0) | ||||
| 1430 | .widenScalarToNextPow2(0) | ||||
| 1431 | .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); | ||||
| 1432 | |||||
| 1433 | // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can | ||||
| 1434 | // be more flexible with the shift amount type. | ||||
| 1435 | auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) | ||||
| 1436 | .legalFor({{S32, S32}, {S64, S32}}); | ||||
| 1437 | if (ST.has16BitInsts()) { | ||||
| 1438 | if (ST.hasVOP3PInsts()) { | ||||
| 1439 | Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) | ||||
| 1440 | .clampMaxNumElements(0, S16, 2); | ||||
| 1441 | } else | ||||
| 1442 | Shifts.legalFor({{S16, S16}}); | ||||
| 1443 | |||||
| 1444 | // TODO: Support 16-bit shift amounts for all types | ||||
| 1445 | Shifts.widenScalarIf( | ||||
| 1446 | [=](const LegalityQuery &Query) { | ||||
| 1447 | // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a | ||||
| 1448 | // 32-bit amount. | ||||
| 1449 | const LLT ValTy = Query.Types[0]; | ||||
| 1450 | const LLT AmountTy = Query.Types[1]; | ||||
| 1451 | return ValTy.getSizeInBits() <= 16 && | ||||
| 1452 | AmountTy.getSizeInBits() < 16; | ||||
| 1453 | }, changeTo(1, S16)); | ||||
| 1454 | Shifts.maxScalarIf(typeIs(0, S16), 1, S16); | ||||
| 1455 | Shifts.clampScalar(1, S32, S32); | ||||
| 1456 | Shifts.widenScalarToNextPow2(0, 16); | ||||
| 1457 | Shifts.clampScalar(0, S16, S64); | ||||
| 1458 | |||||
| 1459 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||
| 1460 | .minScalar(0, S16) | ||||
| 1461 | .scalarize(0) | ||||
| 1462 | .lower(); | ||||
| 1463 | } else { | ||||
| 1464 | // Make sure we legalize the shift amount type first, as the general | ||||
| 1465 | // expansion for the shifted type will produce much worse code if it hasn't | ||||
| 1466 | // been truncated already. | ||||
| 1467 | Shifts.clampScalar(1, S32, S32); | ||||
| 1468 | Shifts.widenScalarToNextPow2(0, 32); | ||||
| 1469 | Shifts.clampScalar(0, S32, S64); | ||||
| 1470 | |||||
| 1471 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) | ||||
| 1472 | .minScalar(0, S32) | ||||
| 1473 | .scalarize(0) | ||||
| 1474 | .lower(); | ||||
| 1475 | } | ||||
| 1476 | Shifts.scalarize(0); | ||||
| 1477 | |||||
| 1478 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { | ||||
| 1479 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; | ||||
| 1480 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; | ||||
| 1481 | unsigned IdxTypeIdx = 2; | ||||
| 1482 | |||||
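| | // Use the custom action for 32- and 64-bit elements when the whole vector | ||||
| | // fits in a legal SGPR register class; other element widths are first | ||||
| | // bitcast to a vector of 32- or 64-bit elements by the rules below. | ||||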
| 1483 | getActionDefinitionsBuilder(Op) | ||||
| 1484 | .customIf([=](const LegalityQuery &Query) { | ||||
| 1485 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||
| 1486 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||
| 1487 | const LLT IdxTy = Query.Types[IdxTypeIdx]; | ||||
| 1488 | const unsigned EltSize = EltTy.getSizeInBits(); | ||||
| 1489 | const bool isLegalVecType = | ||||
| 1490 | !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); | ||||
| 1491 | return (EltSize == 32 || EltSize == 64) && | ||||
| 1492 | VecTy.getSizeInBits() % 32 == 0 && | ||||
| 1493 | VecTy.getSizeInBits() <= MaxRegisterSize && | ||||
| 1494 | IdxTy.getSizeInBits() == 32 && | ||||
| 1495 | isLegalVecType; | ||||
| 1496 | }) | ||||
| 1497 | .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), | ||||
| 1498 | bitcastToVectorElement32(VecTypeIdx)) | ||||
| 1499 | //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) | ||||
| 1500 | .bitcastIf( | ||||
| 1501 | all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), | ||||
| 1502 | [=](const LegalityQuery &Query) { | ||||
| 1503 | // For > 64-bit element types, try to turn this into a 64-bit | ||||
| 1504 | // element vector since we may be able to do better indexing | ||||
| 1505 | // if this is scalar. If not, fall back to 32. | ||||
| 1506 | const LLT EltTy = Query.Types[EltTypeIdx]; | ||||
| 1507 | const LLT VecTy = Query.Types[VecTypeIdx]; | ||||
| 1508 | const unsigned DstEltSize = EltTy.getSizeInBits(); | ||||
| 1509 | const unsigned VecSize = VecTy.getSizeInBits(); | ||||
| 1510 | |||||
| 1511 | const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; | ||||
| 1512 | return std::pair( | ||||
| 1513 | VecTypeIdx, | ||||
| 1514 | LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); | ||||
| 1515 | }) | ||||
| 1516 | .clampScalar(EltTypeIdx, S32, S64) | ||||
| 1517 | .clampScalar(VecTypeIdx, S32, S64) | ||||
| 1518 | .clampScalar(IdxTypeIdx, S32, S32) | ||||
| 1519 | .clampMaxNumElements(VecTypeIdx, S32, 32) | ||||
| 1520 | // TODO: Clamp elements for 64-bit vectors? | ||||
| 1521 | .moreElementsIf( | ||||
| 1522 | isIllegalRegisterType(VecTypeIdx), | ||||
| 1523 | moreElementsToNextExistingRegClass(VecTypeIdx)) | ||||
| 1524 | // It should only be necessary with variable indexes. | ||||
| 1525 | // As a last resort, lower to the stack | ||||
| 1526 | .lower(); | ||||
| 1527 | } | ||||
| 1528 | |||||
| 1529 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) | ||||
| 1530 | .unsupportedIf([=](const LegalityQuery &Query) { | ||||
| 1531 | const LLT &EltTy = Query.Types[1].getElementType(); | ||||
| 1532 | return Query.Types[0] != EltTy; | ||||
| 1533 | }); | ||||
| 1534 | |||||
| 1535 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { | ||||
| 1536 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; | ||||
| 1537 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; | ||||
| 1538 | |||||
| 1539 | // FIXME: Doesn't handle extract of illegal sizes. | ||||
| 1540 | getActionDefinitionsBuilder(Op) | ||||
| 1541 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) | ||||
| 1542 | .lowerIf([=](const LegalityQuery &Query) { | ||||
| 1543 | // Sub-vector (or single element) insert and extract. | ||||
| 1544 | // TODO: verify immediate offset here since lower only works with | ||||
| 1545 | // whole elements. | ||||
| 1546 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
| 1547 | return BigTy.isVector(); | ||||
| 1548 | }) | ||||
| 1549 | // FIXME: Multiples of 16 should not be legal. | ||||
| 1550 | .legalIf([=](const LegalityQuery &Query) { | ||||
| 1551 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
| 1552 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||
| 1553 | return (BigTy.getSizeInBits() % 32 == 0) && | ||||
| 1554 | (LitTy.getSizeInBits() % 16 == 0); | ||||
| 1555 | }) | ||||
| 1556 | .widenScalarIf( | ||||
| 1557 | [=](const LegalityQuery &Query) { | ||||
| 1558 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
| 1559 | return (BigTy.getScalarSizeInBits() < 16); | ||||
| 1560 | }, | ||||
| 1561 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) | ||||
| 1562 | .widenScalarIf( | ||||
| 1563 | [=](const LegalityQuery &Query) { | ||||
| 1564 | const LLT LitTy = Query.Types[LitTyIdx]; | ||||
| 1565 | return (LitTy.getScalarSizeInBits() < 16); | ||||
| 1566 | }, | ||||
| 1567 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) | ||||
| 1568 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||
| 1569 | .widenScalarToNextPow2(BigTyIdx, 32); | ||||
| 1570 | |||||
| 1571 | } | ||||
| 1572 | |||||
| 1573 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||||
| 1574 | .legalForCartesianProduct(AllS32Vectors, {S32}) | ||||
| 1575 | .legalForCartesianProduct(AllS64Vectors, {S64}) | ||||
| 1576 | .clampNumElements(0, V16S32, V32S32) | ||||
| 1577 | .clampNumElements(0, V2S64, V16S64) | ||||
| 1578 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) | ||||
| 1579 | .moreElementsIf( | ||||
| 1580 | isIllegalRegisterType(0), | ||||
| 1581 | moreElementsToNextExistingRegClass(0)); | ||||
| 1582 | |||||
| 1583 | if (ST.hasScalarPackInsts()) { | ||||
| 1584 | BuildVector | ||||
| 1585 | // FIXME: Should probably widen s1 vectors straight to s32 | ||||
| 1586 | .minScalarOrElt(0, S16) | ||||
| 1587 | .minScalar(1, S16); | ||||
| 1588 | |||||
| 1589 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||
| 1590 | .legalFor({V2S16, S32}) | ||||
| 1591 | .lower(); | ||||
| 1592 | } else { | ||||
| 1593 | BuildVector.customFor({V2S16, S16}); | ||||
| 1594 | BuildVector.minScalarOrElt(0, S32); | ||||
| 1595 | |||||
| 1596 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) | ||||
| 1597 | .customFor({V2S16, S32}) | ||||
| 1598 | .lower(); | ||||
| 1599 | } | ||||
| 1600 | |||||
| 1601 | BuildVector.legalIf(isRegisterType(0)); | ||||
| 1602 | |||||
| 1603 | // FIXME: Clamp maximum size | ||||
| 1604 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) | ||||
| 1605 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||
| 1606 | .clampMaxNumElements(0, S32, 32) | ||||
| 1607 | .clampMaxNumElements(1, S16, 2) // TODO: Make 4? | ||||
| 1608 | .clampMaxNumElements(0, S16, 64); | ||||
| 1609 | |||||
| 1610 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); | ||||
| 1611 | |||||
| 1612 | // Merge/Unmerge | ||||
| 1613 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { | ||||
| 1614 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; | ||||
| 1615 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; | ||||
| 1616 | |||||
| 1617 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { | ||||
| 1618 | const LLT Ty = Query.Types[TypeIdx]; | ||||
| 1619 | if (Ty.isVector()) { | ||||
| 1620 | const LLT &EltTy = Ty.getElementType(); | ||||
| 1621 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) | ||||
| 1622 | return true; | ||||
| 1623 | if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) | ||||
| 1624 | return true; | ||||
| 1625 | } | ||||
| 1626 | return false; | ||||
| 1627 | }; | ||||
| 1628 | |||||
| 1629 | auto &Builder = getActionDefinitionsBuilder(Op) | ||||
| 1630 | .legalIf(all(isRegisterType(0), isRegisterType(1))) | ||||
| 1631 | .lowerFor({{S16, V2S16}}) | ||||
| 1632 | .lowerIf([=](const LegalityQuery &Query) { | ||||
| 1633 | const LLT BigTy = Query.Types[BigTyIdx]; | ||||
| 1634 | return BigTy.getSizeInBits() == 32; | ||||
| 1635 | }) | ||||
| 1636 | // Try to widen to s16 first for small types. | ||||
| 1637 | // TODO: Only do this on targets with legal s16 shifts | ||||
| 1638 | .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) | ||||
| 1639 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) | ||||
| 1640 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) | ||||
| 1641 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), | ||||
| 1642 | elementTypeIs(1, S16)), | ||||
| 1643 | changeTo(1, V2S16)) | ||||
| 1644 | // Clamp the little scalar to s32-s512 and make it a power of 2. It's not | ||||
| 1645 | // worth considering the multiples of 64 since 2*192 and 2*384 are not | ||||
| 1646 | // valid. | ||||
| 1647 | .clampScalar(LitTyIdx, S32, S512) | ||||
| 1648 | .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) | ||||
| 1649 | // Break up vectors with weird elements into scalars | ||||
| 1650 | .fewerElementsIf( | ||||
| 1651 | [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, | ||||
| 1652 | scalarize(0)) | ||||
| 1653 | .fewerElementsIf( | ||||
| 1654 | [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, | ||||
| 1655 | scalarize(1)) | ||||
| 1656 | .clampScalar(BigTyIdx, S32, MaxScalar); | ||||
| 1657 | |||||
| 1658 | if (Op == G_MERGE_VALUES) { | ||||
| 1659 | Builder.widenScalarIf( | ||||
| 1660 | // TODO: Use 16-bit shifts if legal for 8-bit values? | ||||
| 1661 | [=](const LegalityQuery &Query) { | ||||
| 1662 | const LLT Ty = Query.Types[LitTyIdx]; | ||||
| 1663 | return Ty.getSizeInBits() < 32; | ||||
| 1664 | }, | ||||
| 1665 | changeTo(LitTyIdx, S32)); | ||||
| 1666 | } | ||||
| 1667 | |||||
| 1668 | Builder.widenScalarIf( | ||||
| 1669 | [=](const LegalityQuery &Query) { | ||||
| 1670 | const LLT Ty = Query.Types[BigTyIdx]; | ||||
| 1671 | return Ty.getSizeInBits() % 16 != 0; | ||||
| 1672 | }, | ||||
| 1673 | [=](const LegalityQuery &Query) { | ||||
| 1674 | // Pick the next power of 2, or (for sizes over 128) a multiple of 64, | ||||
| 1675 | // whichever is smaller. | ||||
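| | // For example a 90-bit type widens to 128 (the next power of 2), while | ||||
| | // a 300-bit type rounds up to 320 rather than all the way to 512. | ||||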
| 1676 | const LLT &Ty = Query.Types[BigTyIdx]; | ||||
| 1677 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); | ||||
| 1678 | if (NewSizeInBits >= 256) { | ||||
| 1679 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); | ||||
| 1680 | if (RoundedTo < NewSizeInBits) | ||||
| 1681 | NewSizeInBits = RoundedTo; | ||||
| 1682 | } | ||||
| 1683 | return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); | ||||
| 1684 | }) | ||||
| 1685 | // Any vectors left are the wrong size. Scalarize them. | ||||
| 1686 | .scalarize(0) | ||||
| 1687 | .scalarize(1); | ||||
| 1688 | } | ||||
| 1689 | |||||
| 1690 | // S64 is only legal on SALU, and needs to be broken into 32-bit elements in | ||||
| 1691 | // RegBankSelect. | ||||
| 1692 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) | ||||
| 1693 | .legalFor({{S32}, {S64}}); | ||||
| 1694 | |||||
| 1695 | if (ST.hasVOP3PInsts()) { | ||||
| 1696 | SextInReg.lowerFor({{V2S16}}) | ||||
| 1697 | // Prefer to reduce vector widths for 16-bit vectors before lowering, to | ||||
| 1698 | // get more vector shift opportunities, since we'll get those when | ||||
| 1699 | // expanded. | ||||
| 1700 | .clampMaxNumElementsStrict(0, S16, 2); | ||||
| 1701 | } else if (ST.has16BitInsts()) { | ||||
| 1702 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); | ||||
| 1703 | } else { | ||||
| 1704 | // Prefer to promote to s32 before lowering if we don't have 16-bit | ||||
| 1705 | // shifts. This avoids a lot of intermediate truncate and extend operations. | ||||
| 1706 | SextInReg.lowerFor({{S32}, {S64}}); | ||||
| 1707 | } | ||||
| 1708 | |||||
| 1709 | SextInReg | ||||
| 1710 | .scalarize(0) | ||||
| 1711 | .clampScalar(0, S32, S64) | ||||
| 1712 | .lower(); | ||||
| 1713 | |||||
| 1714 | getActionDefinitionsBuilder({G_ROTR, G_ROTL}) | ||||
| 1715 | .scalarize(0) | ||||
| 1716 | .lower(); | ||||
| 1717 | |||||
| 1718 | // TODO: Only try to form v2s16 with legal packed instructions. | ||||
| 1719 | getActionDefinitionsBuilder(G_FSHR) | ||||
| 1720 | .legalFor({{S32, S32}}) | ||||
| 1721 | .lowerFor({{V2S16, V2S16}}) | ||||
| 1722 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
| 1723 | .scalarize(0) | ||||
| 1724 | .lower(); | ||||
| 1725 | |||||
| 1726 | if (ST.hasVOP3PInsts()) { | ||||
| 1727 | getActionDefinitionsBuilder(G_FSHL) | ||||
| 1728 | .lowerFor({{V2S16, V2S16}}) | ||||
| 1729 | .clampMaxNumElementsStrict(0, S16, 2) | ||||
| 1730 | .scalarize(0) | ||||
| 1731 | .lower(); | ||||
| 1732 | } else { | ||||
| 1733 | getActionDefinitionsBuilder(G_FSHL) | ||||
| 1734 | .scalarize(0) | ||||
| 1735 | .lower(); | ||||
| 1736 | } | ||||
| 1737 | |||||
| 1738 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) | ||||
| 1739 | .legalFor({S64}); | ||||
| 1740 | |||||
| 1741 | getActionDefinitionsBuilder(G_FENCE) | ||||
| 1742 | .alwaysLegal(); | ||||
| 1743 | |||||
| 1744 | getActionDefinitionsBuilder({G_SMULO, G_UMULO}) | ||||
| 1745 | .scalarize(0) | ||||
| 1746 | .minScalar(0, S32) | ||||
| 1747 | .lower(); | ||||
| 1748 | |||||
| 1749 | getActionDefinitionsBuilder({G_SBFX, G_UBFX}) | ||||
| 1750 | .legalFor({{S32, S32}, {S64, S32}}) | ||||
| 1751 | .clampScalar(1, S32, S32) | ||||
| 1752 | .clampScalar(0, S32, S64) | ||||
| 1753 | .widenScalarToNextPow2(0) | ||||
| 1754 | .scalarize(0); | ||||
| 1755 | |||||
| 1756 | getActionDefinitionsBuilder({ | ||||
| 1757 | // TODO: Verify V_BFI_B32 is generated from expanded bit ops | ||||
| 1758 | G_FCOPYSIGN, | ||||
| 1759 | |||||
| 1760 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, | ||||
| 1761 | G_ATOMICRMW_NAND, | ||||
| 1762 | G_ATOMICRMW_FSUB, | ||||
| 1763 | G_READ_REGISTER, | ||||
| 1764 | G_WRITE_REGISTER, | ||||
| 1765 | |||||
| 1766 | G_SADDO, G_SSUBO, | ||||
| 1767 | |||||
| 1768 | // TODO: Implement | ||||
| 1769 | G_FMINIMUM, G_FMAXIMUM}).lower(); | ||||
| 1770 | |||||
| 1771 | getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) | ||||
| 1772 | .lower(); | ||||
| 1773 | |||||
| 1774 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, | ||||
| 1775 | G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, | ||||
| 1776 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) | ||||
| 1777 | .unsupported(); | ||||
| 1778 | |||||
| 1779 | getLegacyLegalizerInfo().computeTables(); | ||||
| 1780 | verify(*ST.getInstrInfo()); | ||||
| 1781 | } | ||||
| 1782 | |||||
| 1783 | bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, | ||||
| 1784 | MachineInstr &MI) const { | ||||
| 1785 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
| 1786 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
| 1787 | |||||
| 1788 | switch (MI.getOpcode()) { | ||||
| 1789 | case TargetOpcode::G_ADDRSPACE_CAST: | ||||
| 1790 | return legalizeAddrSpaceCast(MI, MRI, B); | ||||
| 1791 | case TargetOpcode::G_FRINT: | ||||
| 1792 | return legalizeFrint(MI, MRI, B); | ||||
| 1793 | case TargetOpcode::G_FCEIL: | ||||
| 1794 | return legalizeFceil(MI, MRI, B); | ||||
| 1795 | case TargetOpcode::G_FREM: | ||||
| 1796 | return legalizeFrem(MI, MRI, B); | ||||
| 1797 | case TargetOpcode::G_INTRINSIC_TRUNC: | ||||
| 1798 | return legalizeIntrinsicTrunc(MI, MRI, B); | ||||
| 1799 | case TargetOpcode::G_SITOFP: | ||||
| 1800 | return legalizeITOFP(MI, MRI, B, true); | ||||
| 1801 | case TargetOpcode::G_UITOFP: | ||||
| 1802 | return legalizeITOFP(MI, MRI, B, false); | ||||
| 1803 | case TargetOpcode::G_FPTOSI: | ||||
| 1804 | return legalizeFPTOI(MI, MRI, B, true); | ||||
| 1805 | case TargetOpcode::G_FPTOUI: | ||||
| 1806 | return legalizeFPTOI(MI, MRI, B, false); | ||||
| 1807 | case TargetOpcode::G_FMINNUM: | ||||
| 1808 | case TargetOpcode::G_FMAXNUM: | ||||
| 1809 | case TargetOpcode::G_FMINNUM_IEEE: | ||||
| 1810 | case TargetOpcode::G_FMAXNUM_IEEE: | ||||
| 1811 | return legalizeMinNumMaxNum(Helper, MI); | ||||
| 1812 | case TargetOpcode::G_EXTRACT_VECTOR_ELT: | ||||
| 1813 | return legalizeExtractVectorElt(MI, MRI, B); | ||||
| 1814 | case TargetOpcode::G_INSERT_VECTOR_ELT: | ||||
| 1815 | return legalizeInsertVectorElt(MI, MRI, B); | ||||
| 1816 | case TargetOpcode::G_FSIN: | ||||
| 1817 | case TargetOpcode::G_FCOS: | ||||
| 1818 | return legalizeSinCos(MI, MRI, B); | ||||
| 1819 | case TargetOpcode::G_GLOBAL_VALUE: | ||||
| 1820 | return legalizeGlobalValue(MI, MRI, B); | ||||
| 1821 | case TargetOpcode::G_LOAD: | ||||
| 1822 | case TargetOpcode::G_SEXTLOAD: | ||||
| 1823 | case TargetOpcode::G_ZEXTLOAD: | ||||
| 1824 | return legalizeLoad(Helper, MI); | ||||
| 1825 | case TargetOpcode::G_FMAD: | ||||
| 1826 | return legalizeFMad(MI, MRI, B); | ||||
| 1827 | case TargetOpcode::G_FDIV: | ||||
| 1828 | return legalizeFDIV(MI, MRI, B); | ||||
| 1829 | case TargetOpcode::G_UDIV: | ||||
| 1830 | case TargetOpcode::G_UREM: | ||||
| 1831 | case TargetOpcode::G_UDIVREM: | ||||
| 1832 | return legalizeUnsignedDIV_REM(MI, MRI, B); | ||||
| 1833 | case TargetOpcode::G_SDIV: | ||||
| 1834 | case TargetOpcode::G_SREM: | ||||
| 1835 | case TargetOpcode::G_SDIVREM: | ||||
| 1836 | return legalizeSignedDIV_REM(MI, MRI, B); | ||||
| 1837 | case TargetOpcode::G_ATOMIC_CMPXCHG: | ||||
| 1838 | return legalizeAtomicCmpXChg(MI, MRI, B); | ||||
| 1839 | case TargetOpcode::G_FLOG: | ||||
| 1840 | return legalizeFlog(MI, B, numbers::ln2f); | ||||
| 1841 | case TargetOpcode::G_FLOG10: | ||||
| 1842 | return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); | ||||
| 1843 | case TargetOpcode::G_FEXP: | ||||
| 1844 | return legalizeFExp(MI, B); | ||||
| 1845 | case TargetOpcode::G_FPOW: | ||||
| 1846 | return legalizeFPow(MI, B); | ||||
| 1847 | case TargetOpcode::G_FFLOOR: | ||||
| 1848 | return legalizeFFloor(MI, MRI, B); | ||||
| 1849 | case TargetOpcode::G_BUILD_VECTOR: | ||||
| 1850 | case TargetOpcode::G_BUILD_VECTOR_TRUNC: | ||||
| 1851 | return legalizeBuildVector(MI, MRI, B); | ||||
| 1852 | case TargetOpcode::G_MUL: | ||||
| 1853 | return legalizeMul(Helper, MI); | ||||
| 1854 | case TargetOpcode::G_CTLZ: | ||||
| 1855 | case TargetOpcode::G_CTTZ: | ||||
| 1856 | return legalizeCTLZ_CTTZ(MI, MRI, B); | ||||
| 1857 | case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: | ||||
| 1858 | return legalizeFPTruncRound(MI, B); | ||||
| 1859 | default: | ||||
| 1860 | return false; | ||||
| 1861 | } | ||||
| 1862 | |||||
| 1863 | llvm_unreachable("expected switch to return"); | ||||
| 1864 | } | ||||
| 1865 | |||||
| 1866 | Register AMDGPULegalizerInfo::getSegmentAperture( | ||||
| 1867 | unsigned AS, | ||||
| 1868 | MachineRegisterInfo &MRI, | ||||
| 1869 | MachineIRBuilder &B) const { | ||||
| 1870 | MachineFunction &MF = B.getMF(); | ||||
| 1871 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | ||||
| 1872 | const LLT S32 = LLT::scalar(32); | ||||
| 1873 | const LLT S64 = LLT::scalar(64); | ||||
| 1874 | |||||
| 1875 | assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); | ||||
| 1876 | |||||
| 1877 | if (ST.hasApertureRegs()) { | ||||
| 1878 | // Note: this register is somewhat broken. When used as a 32-bit operand, | ||||
| 1879 | // it only returns zeroes. The real value is in the upper 32 bits. | ||||
| 1880 | // Thus, we must explicitly extract the high 32 bits. | ||||
| 1881 | const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) | ||||
| 1882 | ? AMDGPU::SRC_SHARED_BASE | ||||
| 1883 | : AMDGPU::SRC_PRIVATE_BASE; | ||||
| 1884 | // FIXME: It would be more natural to emit a COPY here, but then copy | ||||
| 1885 | // coalescing would kick in and it would think it's okay to use the "HI" | ||||
| 1886 | // subregister (instead of extracting the HI 32 bits) which is an artificial | ||||
| 1887 | // (unusable) register. | ||||
| 1888 | // Register TableGen definitions would need an overhaul to get rid of the | ||||
| 1889 | // artificial "HI" aperture registers and prevent this kind of issue from | ||||
| 1890 | // happening. | ||||
| 1891 | Register Dst = MRI.createGenericVirtualRegister(S64); | ||||
| 1892 | MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); | ||||
| 1893 | B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); | ||||
| 1894 | return B.buildUnmerge(S32, Dst).getReg(1); | ||||
| 1895 | } | ||||
| 1896 | |||||
| 1897 | // TODO: can we be smarter about machine pointer info? | ||||
| 1898 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||
| 1899 | Register LoadAddr = MRI.createGenericVirtualRegister( | ||||
| 1900 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
| 1901 | // For code object version 5, private_base and shared_base are passed through | ||||
| 1902 | // implicit kernargs. | ||||
| 1903 | if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= | ||||
| 1904 | AMDGPU::AMDHSA_COV5) { | ||||
| 1905 | AMDGPUTargetLowering::ImplicitParameter Param = | ||||
| 1906 | AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE | ||||
| 1907 | : AMDGPUTargetLowering::PRIVATE_BASE; | ||||
| 1908 | uint64_t Offset = | ||||
| 1909 | ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); | ||||
| 1910 | |||||
| 1911 | Register KernargPtrReg = MRI.createGenericVirtualRegister( | ||||
| 1912 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
| 1913 | |||||
| 1914 | if (!loadInputValue(KernargPtrReg, B, | ||||
| 1915 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
| 1916 | return Register(); | ||||
| 1917 | |||||
| 1918 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
| 1919 | PtrInfo, | ||||
| 1920 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
| 1921 | MachineMemOperand::MOInvariant, | ||||
| 1922 | LLT::scalar(32), commonAlignment(Align(64), Offset)); | ||||
| 1923 | |||||
| 1924 | // Pointer address | ||||
| 1925 | B.buildPtrAdd(LoadAddr, KernargPtrReg, | ||||
| 1926 | B.buildConstant(LLT::scalar(64), Offset).getReg(0)); | ||||
| 1927 | // Load address | ||||
| 1928 | return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); | ||||
| 1929 | } | ||||
| 1930 | |||||
| 1931 | Register QueuePtr = MRI.createGenericVirtualRegister( | ||||
| 1932 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
| 1933 | |||||
| 1934 | if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||
| 1935 | return Register(); | ||||
| 1936 | |||||
| 1937 | // Offset into amd_queue_t for group_segment_aperture_base_hi / | ||||
| 1938 | // private_segment_aperture_base_hi. | ||||
| 1939 | uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; | ||||
| 1940 | |||||
| 1941 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
| 1942 | PtrInfo, | ||||
| 1943 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
| 1944 | MachineMemOperand::MOInvariant, | ||||
| 1945 | LLT::scalar(32), commonAlignment(Align(64), StructOffset)); | ||||
| 1946 | |||||
| 1947 | B.buildPtrAdd(LoadAddr, QueuePtr, | ||||
| 1948 | B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); | ||||
| 1949 | return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); | ||||
| 1950 | } | ||||
| 1951 | |||||
| 1952 | /// Return true if the value is a known valid address, such that a null check is | ||||
| 1953 | /// not necessary. | ||||
| 1954 | static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, | ||||
| 1955 | const AMDGPUTargetMachine &TM, unsigned AddrSpace) { | ||||
| 1956 | MachineInstr *Def = MRI.getVRegDef(Val); | ||||
| 1957 | switch (Def->getOpcode()) { | ||||
| 1958 | case AMDGPU::G_FRAME_INDEX: | ||||
| 1959 | case AMDGPU::G_GLOBAL_VALUE: | ||||
| 1960 | case AMDGPU::G_BLOCK_ADDR: | ||||
| 1961 | return true; | ||||
| 1962 | case AMDGPU::G_CONSTANT: { | ||||
| 1963 | const ConstantInt *CI = Def->getOperand(1).getCImm(); | ||||
| 1964 | return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); | ||||
| 1965 | } | ||||
| 1966 | default: | ||||
| 1967 | return false; | ||||
| 1968 | } | ||||
| 1969 | |||||
| 1970 | return false; | ||||
| 1971 | } | ||||
| 1972 | |||||
| 1973 | bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( | ||||
| 1974 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 1975 | MachineIRBuilder &B) const { | ||||
| 1976 | MachineFunction &MF = B.getMF(); | ||||
| 1977 | |||||
| 1978 | const LLT S32 = LLT::scalar(32); | ||||
| 1979 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 1980 | Register Src = MI.getOperand(1).getReg(); | ||||
| 1981 | |||||
| 1982 | LLT DstTy = MRI.getType(Dst); | ||||
| 1983 | LLT SrcTy = MRI.getType(Src); | ||||
| 1984 | unsigned DestAS = DstTy.getAddressSpace(); | ||||
| 1985 | unsigned SrcAS = SrcTy.getAddressSpace(); | ||||
| 1986 | |||||
| 1987 | // TODO: Avoid reloading from the queue ptr for each cast, or at least each | ||||
| 1988 | // vector element. | ||||
| 1989 | assert(!DstTy.isVector()); | ||||
| 1990 | |||||
| 1991 | const AMDGPUTargetMachine &TM | ||||
| 1992 | = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); | ||||
| 1993 | |||||
| 1994 | if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { | ||||
| 1995 | MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); | ||||
| 1996 | return true; | ||||
| 1997 | } | ||||
| 1998 | |||||
| 1999 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS && | ||||
| 2000 | (DestAS == AMDGPUAS::LOCAL_ADDRESS || | ||||
| 2001 | DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { | ||||
| 2002 | if (isKnownNonNull(Src, MRI, TM, SrcAS)) { | ||||
| 2003 | // Extract low 32-bits of the pointer. | ||||
| 2004 | B.buildExtract(Dst, Src, 0); | ||||
| 2005 | MI.eraseFromParent(); | ||||
| 2006 | return true; | ||||
| 2007 | } | ||||
| 2008 | |||||
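| | // A flat null pointer must map to the segment's null value, so compare | ||||
| | // against flat null and select between the truncated pointer and the | ||||
| | // segment null constant. | ||||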
| 2009 | unsigned NullVal = TM.getNullPointerValue(DestAS); | ||||
| 2010 | |||||
| 2011 | auto SegmentNull = B.buildConstant(DstTy, NullVal); | ||||
| 2012 | auto FlatNull = B.buildConstant(SrcTy, 0); | ||||
| 2013 | |||||
| 2014 | // Extract low 32-bits of the pointer. | ||||
| 2015 | auto PtrLo32 = B.buildExtract(DstTy, Src, 0); | ||||
| 2016 | |||||
| 2017 | auto CmpRes = | ||||
| 2018 | B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); | ||||
| 2019 | B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); | ||||
| 2020 | |||||
| 2021 | MI.eraseFromParent(); | ||||
| 2022 | return true; | ||||
| 2023 | } | ||||
| 2024 | |||||
| 2025 | if (DestAS == AMDGPUAS::FLAT_ADDRESS && | ||||
| 2026 | (SrcAS == AMDGPUAS::LOCAL_ADDRESS || | ||||
| 2027 | SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { | ||||
| 2028 | Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); | ||||
| 2029 | if (!ApertureReg.isValid()) | ||||
| 2030 | return false; | ||||
| 2031 | |||||
| 2032 | // Coerce the type of the low half of the result so we can use merge_values. | ||||
| 2033 | Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); | ||||
| 2034 | |||||
| 2035 | // TODO: Should we allow mismatched types but matching sizes in merges to | ||||
| 2036 | // avoid the ptrtoint? | ||||
| 2037 | auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); | ||||
| 2038 | |||||
| 2039 | if (isKnownNonNull(Src, MRI, TM, SrcAS)) { | ||||
| 2040 | B.buildCopy(Dst, BuildPtr); | ||||
| 2041 | MI.eraseFromParent(); | ||||
| 2042 | return true; | ||||
| 2043 | } | ||||
| 2044 | |||||
| 2045 | auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); | ||||
| 2046 | auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); | ||||
| 2047 | |||||
| 2048 | auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, | ||||
| 2049 | SegmentNull.getReg(0)); | ||||
| 2050 | |||||
| 2051 | B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); | ||||
| 2052 | |||||
| 2053 | MI.eraseFromParent(); | ||||
| 2054 | return true; | ||||
| 2055 | } | ||||
| 2056 | |||||
| 2057 | if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && | ||||
| 2058 | SrcTy.getSizeInBits() == 64) { | ||||
| 2059 | // Truncate. | ||||
| 2060 | B.buildExtract(Dst, Src, 0); | ||||
| 2061 | MI.eraseFromParent(); | ||||
| 2062 | return true; | ||||
| 2063 | } | ||||
| 2064 | |||||
| 2065 | if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && | ||||
| 2066 | DstTy.getSizeInBits() == 64) { | ||||
| 2067 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); | ||||
| 2068 | uint32_t AddrHiVal = Info->get32BitAddressHighBits(); | ||||
| 2069 | auto PtrLo = B.buildPtrToInt(S32, Src); | ||||
| 2070 | auto HighAddr = B.buildConstant(S32, AddrHiVal); | ||||
| 2071 | B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); | ||||
| 2072 | MI.eraseFromParent(); | ||||
| 2073 | return true; | ||||
| 2074 | } | ||||
| 2075 | |||||
| 2076 | DiagnosticInfoUnsupported InvalidAddrSpaceCast( | ||||
| 2077 | MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); | ||||
| 2078 | |||||
| 2079 | LLVMContext &Ctx = MF.getFunction().getContext(); | ||||
| 2080 | Ctx.diagnose(InvalidAddrSpaceCast); | ||||
| 2081 | B.buildUndef(Dst); | ||||
| 2082 | MI.eraseFromParent(); | ||||
| 2083 | return true; | ||||
| 2084 | } | ||||
| 2085 | |||||
| 2086 | bool AMDGPULegalizerInfo::legalizeFrint( | ||||
| 2087 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2088 | MachineIRBuilder &B) const { | ||||
| 2089 | Register Src = MI.getOperand(1).getReg(); | ||||
| 2090 | LLT Ty = MRI.getType(Src); | ||||
| 2091 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64); | ||||
| 2092 | |||||
| 2093 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); | ||||
| 2094 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); | ||||
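| | // Adding and then subtracting 2^52 (with the sign of the input) rounds | ||||
| | // the value to an integer in the current rounding mode; inputs whose | ||||
| | // magnitude exceeds 0x1.fffffffffffffp+51 are already integral and are | ||||
| | // passed through unchanged by the final select. | ||||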
| 2095 | |||||
| 2096 | auto C1 = B.buildFConstant(Ty, C1Val); | ||||
| 2097 | auto CopySign = B.buildFCopysign(Ty, C1, Src); | ||||
| 2098 | |||||
| 2099 | // TODO: Should this propagate fast-math-flags? | ||||
| 2100 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); | ||||
| 2101 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); | ||||
| 2102 | |||||
| 2103 | auto C2 = B.buildFConstant(Ty, C2Val); | ||||
| 2104 | auto Fabs = B.buildFAbs(Ty, Src); | ||||
| 2105 | |||||
| 2106 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); | ||||
| 2107 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); | ||||
| 2108 | MI.eraseFromParent(); | ||||
| 2109 | return true; | ||||
| 2110 | } | ||||
| 2111 | |||||
| 2112 | bool AMDGPULegalizerInfo::legalizeFceil( | ||||
| 2113 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2114 | MachineIRBuilder &B) const { | ||||
| 2115 | |||||
| 2116 | const LLT S1 = LLT::scalar(1); | ||||
| 2117 | const LLT S64 = LLT::scalar(64); | ||||
| 2118 | |||||
| 2119 | Register Src = MI.getOperand(1).getReg(); | ||||
| 2120 | assert(MRI.getType(Src) == S64); | ||||
| 2121 | |||||
| 2122 | // result = trunc(src) | ||||
| 2123 | // if (src > 0.0 && src != result) | ||||
| 2124 | // result += 1.0 | ||||
| 2125 | |||||
| 2126 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); | ||||
| 2127 | |||||
| 2128 | const auto Zero = B.buildFConstant(S64, 0.0); | ||||
| 2129 | const auto One = B.buildFConstant(S64, 1.0); | ||||
| 2130 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); | ||||
| 2131 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); | ||||
| 2132 | auto And = B.buildAnd(S1, Lt0, NeTrunc); | ||||
| 2133 | auto Add = B.buildSelect(S64, And, One, Zero); | ||||
| 2134 | |||||
| 2135 | // TODO: Should this propagate fast-math-flags? | ||||
| 2136 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); | ||||
| 2137 | MI.eraseFromParent(); | ||||
| 2138 | return true; | ||||
| 2139 | } | ||||
| 2140 | |||||
| 2141 | bool AMDGPULegalizerInfo::legalizeFrem( | ||||
| 2142 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2143 | MachineIRBuilder &B) const { | ||||
| 2144 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 2145 | Register Src0Reg = MI.getOperand(1).getReg(); | ||||
| 2146 | Register Src1Reg = MI.getOperand(2).getReg(); | ||||
| 2147 | auto Flags = MI.getFlags(); | ||||
| 2148 | LLT Ty = MRI.getType(DstReg); | ||||
| 2149 | |||||
| 2150 | auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); | ||||
| 2151 | auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); | ||||
| 2152 | auto Neg = B.buildFNeg(Ty, Trunc, Flags); | ||||
| 2153 | B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); | ||||
| 2154 | MI.eraseFromParent(); | ||||
| 2155 | return true; | ||||
| 2156 | } | ||||
| 2157 | |||||
| 2158 | static MachineInstrBuilder extractF64Exponent(Register Hi, | ||||
| 2159 | MachineIRBuilder &B) { | ||||
| 2160 | const unsigned FractBits = 52; | ||||
| 2161 | const unsigned ExpBits = 11; | ||||
| 2162 | LLT S32 = LLT::scalar(32); | ||||
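| | // The exponent occupies bits [62:52] of the double, i.e. bits [30:20] of | ||||
| | // the high word; extract those 11 bits and remove the 1023 bias. | ||||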
| 2163 | |||||
| 2164 | auto Const0 = B.buildConstant(S32, FractBits - 32); | ||||
| 2165 | auto Const1 = B.buildConstant(S32, ExpBits); | ||||
| 2166 | |||||
| 2167 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) | ||||
| 2168 | .addUse(Hi) | ||||
| 2169 | .addUse(Const0.getReg(0)) | ||||
| 2170 | .addUse(Const1.getReg(0)); | ||||
| 2171 | |||||
| 2172 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); | ||||
| 2173 | } | ||||
| 2174 | |||||
| 2175 | bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( | ||||
| 2176 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2177 | MachineIRBuilder &B) const { | ||||
| 2178 | const LLT S1 = LLT::scalar(1); | ||||
| 2179 | const LLT S32 = LLT::scalar(32); | ||||
| 2180 | const LLT S64 = LLT::scalar(64); | ||||
| 2181 | |||||
| 2182 | Register Src = MI.getOperand(1).getReg(); | ||||
| 2183 | assert(MRI.getType(Src) == S64); | ||||
| 2184 | |||||
| 2185 | // TODO: Should this use extract since the low half is unused? | ||||
| 2186 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||
| 2187 | Register Hi = Unmerge.getReg(1); | ||||
| 2188 | |||||
| 2189 | // Extract the upper half, since this is where we will find the sign and | ||||
| 2190 | // exponent. | ||||
| 2191 | auto Exp = extractF64Exponent(Hi, B); | ||||
| 2192 | |||||
| 2193 | const unsigned FractBits = 52; | ||||
| 2194 | |||||
| 2195 | // Extract the sign bit. | ||||
| 2196 | const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); | ||||
| 2197 | auto SignBit = B.buildAnd(S32, Hi, SignBitMask); | ||||
| 2198 | |||||
| 2199 | const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); | ||||
| 2200 | |||||
| 2201 | const auto Zero32 = B.buildConstant(S32, 0); | ||||
| 2202 | |||||
| 2203 | // Extend back to 64-bits. | ||||
| 2204 | auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); | ||||
| 2205 | |||||
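| | // Shifting the fraction mask right by the exponent leaves set bits only | ||||
| | // in the fractional positions; clearing those bits in the source gives | ||||
| | // the truncated value. Negative exponents produce a signed zero, and | ||||
| | // exponents above 51 mean the value is already integral. | ||||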
| 2206 | auto Shr = B.buildAShr(S64, FractMask, Exp); | ||||
| 2207 | auto Not = B.buildNot(S64, Shr); | ||||
| 2208 | auto Tmp0 = B.buildAnd(S64, Src, Not); | ||||
| 2209 | auto FiftyOne = B.buildConstant(S32, FractBits - 1); | ||||
| 2210 | |||||
| 2211 | auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); | ||||
| 2212 | auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); | ||||
| 2213 | |||||
| 2214 | auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); | ||||
| 2215 | B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); | ||||
| 2216 | MI.eraseFromParent(); | ||||
| 2217 | return true; | ||||
| 2218 | } | ||||
| 2219 | |||||
| 2220 | bool AMDGPULegalizerInfo::legalizeITOFP( | ||||
| 2221 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2222 | MachineIRBuilder &B, bool Signed) const { | ||||
| 2223 | |||||
| 2224 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2225 | Register Src = MI.getOperand(1).getReg(); | ||||
| 2226 | |||||
| 2227 | const LLT S64 = LLT::scalar(64); | ||||
| 2228 | const LLT S32 = LLT::scalar(32); | ||||
| 2229 | |||||
| 2230 | assert(MRI.getType(Src) == S64); | ||||
| 2231 | |||||
| 2232 | auto Unmerge = B.buildUnmerge({S32, S32}, Src); | ||||
| 2233 | auto ThirtyTwo = B.buildConstant(S32, 32); | ||||
| 2234 | |||||
| 2235 | if (MRI.getType(Dst) == S64) { | ||||
| 2236 | auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) | ||||
| 2237 | : B.buildUITOFP(S64, Unmerge.getReg(1)); | ||||
| 2238 | |||||
| 2239 | auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); | ||||
| 2240 | auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) | ||||
| 2241 | .addUse(CvtHi.getReg(0)) | ||||
| 2242 | .addUse(ThirtyTwo.getReg(0)); | ||||
| 2243 | |||||
| 2244 | // TODO: Should this propagate fast-math-flags? | ||||
| 2245 | B.buildFAdd(Dst, LdExp, CvtLo); | ||||
| 2246 | MI.eraseFromParent(); | ||||
| 2247 | return true; | ||||
| 2248 | } | ||||
| 2249 | |||||
| 2250 | assert(MRI.getType(Dst) == S32); | ||||
| 2251 | |||||
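| | // For the 64-bit integer to 32-bit float case: shift the value left so | ||||
| | // its significant bits land in the high word, convert that word, and | ||||
| | // rescale with ldexp; any nonzero low bits are ORed in as a sticky bit | ||||
| | // so the rounding of the final result stays correct. | ||||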
| 2252 | auto One = B.buildConstant(S32, 1); | ||||
| 2253 | |||||
| 2254 | MachineInstrBuilder ShAmt; | ||||
| 2255 | if (Signed) { | ||||
| 2256 | auto ThirtyOne = B.buildConstant(S32, 31); | ||||
| 2257 | auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); | ||||
| 2258 | auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); | ||||
| 2259 | auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); | ||||
| 2260 | auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, | ||||
| 2261 | /*HasSideEffects=*/false) | ||||
| 2262 | .addUse(Unmerge.getReg(1)); | ||||
| 2263 | auto LS2 = B.buildSub(S32, LS, One); | ||||
| 2264 | ShAmt = B.buildUMin(S32, LS2, MaxShAmt); | ||||
| 2265 | } else | ||||
| 2266 | ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); | ||||
| 2267 | auto Norm = B.buildShl(S64, Src, ShAmt); | ||||
| 2268 | auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); | ||||
| 2269 | auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); | ||||
| 2270 | auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); | ||||
| 2271 | auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); | ||||
| 2272 | auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); | ||||
| 2273 | B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, | ||||
| 2274 | /*HasSideEffects=*/false) | ||||
| 2275 | .addUse(FVal.getReg(0)) | ||||
| 2276 | .addUse(Scale.getReg(0)); | ||||
| 2277 | MI.eraseFromParent(); | ||||
| 2278 | return true; | ||||
| 2279 | } | ||||
| 2280 | |||||
| 2281 | // TODO: Copied from DAG implementation. Verify logic and document how this | ||||
| 2282 | // actually works. | ||||
| 2283 | bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, | ||||
| 2284 | MachineRegisterInfo &MRI, | ||||
| 2285 | MachineIRBuilder &B, | ||||
| 2286 | bool Signed) const { | ||||
| 2287 | |||||
| 2288 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2289 | Register Src = MI.getOperand(1).getReg(); | ||||
| 2290 | |||||
| 2291 | const LLT S64 = LLT::scalar(64); | ||||
| 2292 | const LLT S32 = LLT::scalar(32); | ||||
| 2293 | |||||
| 2294 | const LLT SrcLT = MRI.getType(Src); | ||||
| 2295 | assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); | ||||
| 2296 | |||||
| 2297 | unsigned Flags = MI.getFlags(); | ||||
| 2298 | |||||
| 2299 | // The basic idea of converting a floating point number into a pair of 32-bit | ||||
| 2300 | // integers is illustrated as follows: | ||||
| 2301 | // | ||||
| 2302 | // tf := trunc(val); | ||||
| 2303 | // hif := floor(tf * 2^-32); | ||||
| 2304 | // lof := tf - hif * 2^32; // lof is always positive due to floor. | ||||
| 2305 | // hi := fptoi(hif); | ||||
| 2306 | // lo := fptoi(lof); | ||||
| 2307 | // | ||||
| 2308 | auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); | ||||
| 2309 | MachineInstrBuilder Sign; | ||||
| 2310 | if (Signed && SrcLT == S32) { | ||||
| 2311 | // However, a 32-bit floating point number has only 23 bits mantissa and | ||||
| 2312 | // it's not enough to hold all the significant bits of `lof` if val is | ||||
| 2313 | // negative. To avoid the loss of precision, we need to take the absolute | ||||
| 2314 | // value after truncating and flip the result back based on the original | ||||
| 2315 | // signedness. | ||||
| 2316 | Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); | ||||
| 2317 | Trunc = B.buildFAbs(S32, Trunc, Flags); | ||||
| 2318 | } | ||||
| 2319 | MachineInstrBuilder K0, K1; | ||||
| 2320 | if (SrcLT == S64) { | ||||
| 2321 | K0 = B.buildFConstant( | ||||
| 2322 | S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000))); | ||||
| 2323 | K1 = B.buildFConstant( | ||||
| 2324 | S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); | ||||
| 2325 | } else { | ||||
| 2326 | K0 = B.buildFConstant( | ||||
| 2327 | S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000))); | ||||
| 2328 | K1 = B.buildFConstant( | ||||
| 2329 | S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000))); | ||||
| 2330 | } | ||||
| 2331 | |||||
| 2332 | auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); | ||||
| 2333 | auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); | ||||
| 2334 | auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); | ||||
| 2335 | |||||
| 2336 | auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) | ||||
| 2337 | : B.buildFPTOUI(S32, FloorMul); | ||||
| 2338 | auto Lo = B.buildFPTOUI(S32, Fma); | ||||
| 2339 | |||||
| 2340 | if (Signed && SrcLT == S32) { | ||||
| 2341 | // Flip the result based on the signedness, which is either all 0s or 1s. | ||||
| 2342 | Sign = B.buildMergeLikeInstr(S64, {Sign, Sign}); | ||||
| 2343 | // r := xor({lo, hi}, sign) - sign; | ||||
| 2344 | B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign), | ||||
| 2345 | Sign); | ||||
| 2346 | } else | ||||
| 2347 | B.buildMergeLikeInstr(Dst, {Lo, Hi}); | ||||
| 2348 | MI.eraseFromParent(); | ||||
| 2349 | |||||
| 2350 | return true; | ||||
| 2351 | } | ||||
| 2352 | |||||
| 2353 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, | ||||
| 2354 | MachineInstr &MI) const { | ||||
| 2355 | MachineFunction &MF = Helper.MIRBuilder.getMF(); | ||||
| 2356 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||
| 2357 | |||||
| 2358 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || | ||||
| 2359 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; | ||||
| 2360 | |||||
| 2361 | // With ieee_mode disabled, the instructions have the correct behavior | ||||
| 2362 | // already for G_FMINNUM/G_FMAXNUM | ||||
| 2363 | if (!MFI->getMode().IEEE) | ||||
| 2364 | return !IsIEEEOp; | ||||
| 2365 | |||||
| 2366 | if (IsIEEEOp) | ||||
| 2367 | return true; | ||||
| 2368 | |||||
| 2369 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; | ||||
| 2370 | } | ||||
| 2371 | |||||
| 2372 | bool AMDGPULegalizerInfo::legalizeExtractVectorElt( | ||||
| 2373 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2374 | MachineIRBuilder &B) const { | ||||
| 2375 | // TODO: Should move some of this into LegalizerHelper. | ||||
| 2376 | |||||
| 2377 | // TODO: Promote dynamic indexing of s16 to s32 | ||||
| 2378 | |||||
| 2379 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||
| 2380 | // constant before this, so we shouldn't need | ||||
| 2381 | // getIConstantVRegValWithLookThrough. | ||||
| 2382 | std::optional<ValueAndVReg> MaybeIdxVal = | ||||
| 2383 | getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); | ||||
| 2384 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||
| 2385 | return true; | ||||
| 2386 | const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); | ||||
| 2387 | |||||
| 2388 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2389 | Register Vec = MI.getOperand(1).getReg(); | ||||
| 2390 | |||||
| 2391 | LLT VecTy = MRI.getType(Vec); | ||||
| 2392 | LLT EltTy = VecTy.getElementType(); | ||||
| 2393 | assert(EltTy == MRI.getType(Dst)); | ||||
| 2394 | |||||
| 2395 | if (IdxVal < VecTy.getNumElements()) { | ||||
| 2396 | auto Unmerge = B.buildUnmerge(EltTy, Vec); | ||||
| 2397 | B.buildCopy(Dst, Unmerge.getReg(IdxVal)); | ||||
| 2398 | } else { | ||||
| 2399 | B.buildUndef(Dst); | ||||
| 2400 | } | ||||
| 2401 | |||||
| 2402 | MI.eraseFromParent(); | ||||
| 2403 | return true; | ||||
| 2404 | } | ||||
| 2405 | |||||
| 2406 | bool AMDGPULegalizerInfo::legalizeInsertVectorElt( | ||||
| 2407 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2408 | MachineIRBuilder &B) const { | ||||
| 2409 | // TODO: Should move some of this into LegalizerHelper. | ||||
| 2410 | |||||
| 2411 | // TODO: Promote dynamic indexing of s16 to s32 | ||||
| 2412 | |||||
| 2413 | // FIXME: Artifact combiner probably should have replaced the truncated | ||||
| 2414 | // constant before this, so we shouldn't need | ||||
| 2415 | // getIConstantVRegValWithLookThrough. | ||||
| 2416 | std::optional<ValueAndVReg> MaybeIdxVal = | ||||
| 2417 | getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); | ||||
| 2418 | if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. | ||||
| 2419 | return true; | ||||
| 2420 | |||||
| 2421 | const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); | ||||
| 2422 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2423 | Register Vec = MI.getOperand(1).getReg(); | ||||
| 2424 | Register Ins = MI.getOperand(2).getReg(); | ||||
| 2425 | |||||
| 2426 | LLT VecTy = MRI.getType(Vec); | ||||
| 2427 | LLT EltTy = VecTy.getElementType(); | ||||
| 2428 | assert(EltTy == MRI.getType(Ins)); | ||||
| 2429 | (void)Ins; | ||||
| 2430 | |||||
| 2431 | unsigned NumElts = VecTy.getNumElements(); | ||||
| 2432 | if (IdxVal < NumElts) { | ||||
| 2433 | SmallVector<Register, 8> SrcRegs; | ||||
| 2434 | for (unsigned i = 0; i < NumElts; ++i) | ||||
| 2435 | SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); | ||||
| 2436 | B.buildUnmerge(SrcRegs, Vec); | ||||
| 2437 | |||||
| 2438 | SrcRegs[IdxVal] = MI.getOperand(2).getReg(); | ||||
| 2439 | B.buildMergeLikeInstr(Dst, SrcRegs); | ||||
| 2440 | } else { | ||||
| 2441 | B.buildUndef(Dst); | ||||
| 2442 | } | ||||
| 2443 | |||||
| 2444 | MI.eraseFromParent(); | ||||
| 2445 | return true; | ||||
| 2446 | } | ||||
| 2447 | |||||
| 2448 | bool AMDGPULegalizerInfo::legalizeSinCos( | ||||
| 2449 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2450 | MachineIRBuilder &B) const { | ||||
| 2451 | |||||
| 2452 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 2453 | Register SrcReg = MI.getOperand(1).getReg(); | ||||
| 2454 | LLT Ty = MRI.getType(DstReg); | ||||
| 2455 | unsigned Flags = MI.getFlags(); | ||||
| 2456 | |||||
| 2457 | Register TrigVal; | ||||
| 2458 | auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); | ||||
| 2459 | if (ST.hasTrigReducedRange()) { | ||||
| 2460 | auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); | ||||
| 2461 | TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) | ||||
| 2462 | .addUse(MulVal.getReg(0)) | ||||
| 2463 | .setMIFlags(Flags).getReg(0); | ||||
| 2464 | } else | ||||
| 2465 | TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); | ||||
| 2466 | |||||
| 2467 | Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? | ||||
| 2468 | Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; | ||||
| 2469 | B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) | ||||
| 2470 | .addUse(TrigVal) | ||||
| 2471 | .setMIFlags(Flags); | ||||
| 2472 | MI.eraseFromParent(); | ||||
| 2473 | return true; | ||||
| 2474 | } | ||||
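| | // Rough scalar shape of the reduction above: the hardware sin/cos take the | ||||
| | // angle in turns, so the input is scaled by 1/(2*pi) and, on subtargets with | ||||
| | // the reduced-range requirement, wrapped into [0, 1) first. The std::sin call | ||||
| | // merely stands in for amdgcn.sin; illustrative only. | ||||
| | #include <cmath> | ||||
| | static double sinViaTurns(double X, bool HasTrigReducedRange) { | ||||
| |   double Turns = X * 0.15915494309189535;      // 0.5 * 1/pi | ||||
| |   if (HasTrigReducedRange) | ||||
| |     Turns -= std::floor(Turns);                // amdgcn.fract equivalent | ||||
| |   return std::sin(Turns * 6.283185307179586);  // amdgcn.sin stand-in | ||||
| | } | ||||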
| 2475 | |||||
| 2476 | bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, | ||||
| 2477 | MachineIRBuilder &B, | ||||
| 2478 | const GlobalValue *GV, | ||||
| 2479 | int64_t Offset, | ||||
| 2480 | unsigned GAFlags) const { | ||||
| 2481 | assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); | ||||
| 2482 | // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered | ||||
| 2483 | // to the following code sequence: | ||||
| 2484 | // | ||||
| 2485 | // For constant address space: | ||||
| 2486 | // s_getpc_b64 s[0:1] | ||||
| 2487 | // s_add_u32 s0, s0, $symbol | ||||
| 2488 | // s_addc_u32 s1, s1, 0 | ||||
| 2489 | // | ||||
| 2490 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||
| 2491 | // a fixup or relocation is emitted to replace $symbol with a literal | ||||
| 2492 | // constant, which is a pc-relative offset from the encoding of the $symbol | ||||
| 2493 | // operand to the global variable. | ||||
| 2494 | // | ||||
| 2495 | // For global address space: | ||||
| 2496 | // s_getpc_b64 s[0:1] | ||||
| 2497 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo | ||||
| 2498 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi | ||||
| 2499 | // | ||||
| 2500 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then | ||||
| 2501 | // fixups or relocations are emitted to replace $symbol@*@lo and | ||||
| 2502 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, | ||||
| 2503 | // which is a 64-bit pc-relative offset from the encoding of the $symbol | ||||
| 2504 | // operand to the global variable. | ||||
| 2505 | // | ||||
| 2506 | // What we want here is an offset from the value returned by s_getpc | ||||
| 2507 | // (which is the address of the s_add_u32 instruction) to the global | ||||
| 2508 | // variable, but since the encoding of $symbol starts 4 bytes after the start | ||||
| 2509 | // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too | ||||
| 2510 | // small. This requires us to add 4 to the global variable offset in order to | ||||
| 2511 | // compute the correct address. Similarly for the s_addc_u32 instruction, the | ||||
| 2512 | // encoding of $symbol starts 12 bytes after the start of the s_add_u32 | ||||
| 2513 | // instruction. | ||||
| 2514 | |||||
| 2515 | LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
| 2516 | |||||
| 2517 | Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : | ||||
| 2518 | B.getMRI()->createGenericVirtualRegister(ConstPtrTy); | ||||
| 2519 | |||||
| 2520 | MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) | ||||
| 2521 | .addDef(PCReg); | ||||
| 2522 | |||||
| 2523 | MIB.addGlobalAddress(GV, Offset + 4, GAFlags); | ||||
| 2524 | if (GAFlags == SIInstrInfo::MO_NONE) | ||||
| 2525 | MIB.addImm(0); | ||||
| 2526 | else | ||||
| 2527 | MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); | ||||
| 2528 | |||||
| 2529 | if (!B.getMRI()->getRegClassOrNull(PCReg)) | ||||
| 2530 | B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); | ||||
| 2531 | |||||
| 2532 | if (PtrTy.getSizeInBits() == 32) | ||||
| 2533 | B.buildExtract(DstReg, PCReg, 0); | ||||
| 2534 | return true; | ||||
| 2535 | } | ||||
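| | // A scalar sketch of the offset bookkeeping: s_getpc_b64 yields the address | ||||
| | // of the s_add_u32, the rel32 fixup resolves to Sym + Addend - OperandAddr, | ||||
| | // and the +4 added to the operand above cancels the 4-byte gap between those | ||||
| | // two points (likewise +12 for the s_addc_u32 operand). Names are illustrative. | ||||
| | #include <cstdint> | ||||
| | static uint64_t pcRelResult(uint64_t SymAddr, uint64_t AddU32Addr) { | ||||
| |   uint64_t PC = AddU32Addr;                // value returned by s_getpc_b64 | ||||
| |   uint64_t OperandAddr = AddU32Addr + 4;   // where $symbol is encoded | ||||
| |   int64_t Fixup = (int64_t)(SymAddr + 4 - OperandAddr); // addend biased by +4 | ||||
| |   return PC + (uint64_t)Fixup;             // == SymAddr | ||||
| | } | ||||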
| 2536 | |||||
| 2537 | bool AMDGPULegalizerInfo::legalizeGlobalValue( | ||||
| 2538 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2539 | MachineIRBuilder &B) const { | ||||
| 2540 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 2541 | LLT Ty = MRI.getType(DstReg); | ||||
| 2542 | unsigned AS = Ty.getAddressSpace(); | ||||
| 2543 | |||||
| 2544 | const GlobalValue *GV = MI.getOperand(1).getGlobal(); | ||||
| 2545 | MachineFunction &MF = B.getMF(); | ||||
| 2546 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||
| 2547 | |||||
| 2548 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { | ||||
| 2549 | if (!MFI->isModuleEntryFunction() && | ||||
| 2550 | !GV->getName().equals("llvm.amdgcn.module.lds")) { | ||||
| 2551 | const Function &Fn = MF.getFunction(); | ||||
| 2552 | DiagnosticInfoUnsupported BadLDSDecl( | ||||
| 2553 | Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), | ||||
| 2554 | DS_Warning); | ||||
| 2555 | Fn.getContext().diagnose(BadLDSDecl); | ||||
| 2556 | |||||
| 2557 | // We currently don't have a way to correctly allocate LDS objects that | ||||
| 2558 | // aren't directly associated with a kernel. We do force inlining of | ||||
| 2559 | // functions that use local objects. However, if these dead functions are | ||||
| 2560 | // not eliminated, we don't want a compile time error. Just emit a warning | ||||
| 2561 | // and a trap, since there should be no callable path here. | ||||
| 2562 | B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); | ||||
| 2563 | B.buildUndef(DstReg); | ||||
| 2564 | MI.eraseFromParent(); | ||||
| 2565 | return true; | ||||
| 2566 | } | ||||
| 2567 | |||||
| 2568 | // TODO: We could emit code to handle the initialization somewhere. | ||||
| 2569 | // We ignore the initializer for now and legalize it to allow selection. | ||||
| 2570 | // The initializer will anyway get errored out during assembly emission. | ||||
| 2571 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||
| 2572 | if (!TLI->shouldUseLDSConstAddress(GV)) { | ||||
| 2573 | MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); | ||||
| 2574 | return true; // Leave in place; | ||||
| 2575 | } | ||||
| 2576 | |||||
| 2577 | if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { | ||||
| 2578 | Type *Ty = GV->getValueType(); | ||||
| 2579 | // HIP uses an unsized array `extern __shared__ T s[]` (or a similar | ||||
| 2580 | // zero-sized type in other languages) to declare dynamic shared | ||||
| 2581 | // memory whose size is not known at compile time. It is allocated by | ||||
| 2582 | // the runtime and placed directly after the statically allocated | ||||
| 2583 | // objects, so all such declarations share the same offset. | ||||
| 2584 | if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { | ||||
| 2585 | // Adjust alignment for that dynamic shared memory array. | ||||
| 2586 | MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); | ||||
| 2587 | LLT S32 = LLT::scalar(32); | ||||
| 2588 | auto Sz = | ||||
| 2589 | B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); | ||||
| 2590 | B.buildIntToPtr(DstReg, Sz); | ||||
| 2591 | MI.eraseFromParent(); | ||||
| 2592 | return true; | ||||
| 2593 | } | ||||
| 2594 | } | ||||
| 2595 | |||||
| 2596 | B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), | ||||
| 2597 | *cast<GlobalVariable>(GV))); | ||||
| 2598 | MI.eraseFromParent(); | ||||
| 2599 | return true; | ||||
| 2600 | } | ||||
| 2601 | |||||
| 2602 | const SITargetLowering *TLI = ST.getTargetLowering(); | ||||
| 2603 | |||||
| 2604 | if (TLI->shouldEmitFixup(GV)) { | ||||
| 2605 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); | ||||
| 2606 | MI.eraseFromParent(); | ||||
| 2607 | return true; | ||||
| 2608 | } | ||||
| 2609 | |||||
| 2610 | if (TLI->shouldEmitPCReloc(GV)) { | ||||
| 2611 | buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); | ||||
| 2612 | MI.eraseFromParent(); | ||||
| 2613 | return true; | ||||
| 2614 | } | ||||
| 2615 | |||||
| 2616 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
| 2617 | Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); | ||||
| 2618 | |||||
| 2619 | LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; | ||||
| 2620 | MachineMemOperand *GOTMMO = MF.getMachineMemOperand( | ||||
| 2621 | MachinePointerInfo::getGOT(MF), | ||||
| 2622 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
| 2623 | MachineMemOperand::MOInvariant, | ||||
| 2624 | LoadTy, Align(8)); | ||||
| 2625 | |||||
| 2626 | buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); | ||||
| 2627 | |||||
| 2628 | if (Ty.getSizeInBits() == 32) { | ||||
| 2629 | // Truncate if this is a 32-bit constant address. | ||||
| 2630 | auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); | ||||
| 2631 | B.buildExtract(DstReg, Load, 0); | ||||
| 2632 | } else | ||||
| 2633 | B.buildLoad(DstReg, GOTAddr, *GOTMMO); | ||||
| 2634 | |||||
| 2635 | MI.eraseFromParent(); | ||||
| 2636 | return true; | ||||
| 2637 | } | ||||
| 2638 | |||||
| 2639 | static LLT widenToNextPowerOf2(LLT Ty) { | ||||
| 2640 | if (Ty.isVector()) | ||||
| 2641 | return Ty.changeElementCount( | ||||
| 2642 | ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); | ||||
| 2643 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); | ||||
| 2644 | } | ||||
| 2645 | |||||
| 2646 | bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, | ||||
| 2647 | MachineInstr &MI) const { | ||||
| 2648 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
| 2649 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
| 2650 | GISelChangeObserver &Observer = Helper.Observer; | ||||
| 2651 | |||||
| 2652 | Register PtrReg = MI.getOperand(1).getReg(); | ||||
| 2653 | LLT PtrTy = MRI.getType(PtrReg); | ||||
| 2654 | unsigned AddrSpace = PtrTy.getAddressSpace(); | ||||
| 2655 | |||||
| 2656 | if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { | ||||
| 2657 | LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
| 2658 | auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); | ||||
| 2659 | Observer.changingInstr(MI); | ||||
| 2660 | MI.getOperand(1).setReg(Cast.getReg(0)); | ||||
| 2661 | Observer.changedInstr(MI); | ||||
| 2662 | return true; | ||||
| 2663 | } | ||||
| 2664 | |||||
| 2665 | if (MI.getOpcode() != AMDGPU::G_LOAD) | ||||
| 2666 | return false; | ||||
| 2667 | |||||
| 2668 | Register ValReg = MI.getOperand(0).getReg(); | ||||
| 2669 | LLT ValTy = MRI.getType(ValReg); | ||||
| 2670 | |||||
| 2671 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||
| 2672 | const unsigned ValSize = ValTy.getSizeInBits(); | ||||
| 2673 | const LLT MemTy = MMO->getMemoryType(); | ||||
| 2674 | const Align MemAlign = MMO->getAlign(); | ||||
| 2675 | const unsigned MemSize = MemTy.getSizeInBits(); | ||||
| 2676 | const uint64_t AlignInBits = 8 * MemAlign.value(); | ||||
| 2677 | |||||
| 2678 | // Widen non-power-of-2 loads to the alignment if needed | ||||
| 2679 | if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { | ||||
| 2680 | const unsigned WideMemSize = PowerOf2Ceil(MemSize); | ||||
| 2681 | |||||
| 2682 | // This was already the correct extending load result type, so just adjust | ||||
| 2683 | // the memory type. | ||||
| 2684 | if (WideMemSize == ValSize) { | ||||
| 2685 | MachineFunction &MF = B.getMF(); | ||||
| 2686 | |||||
| 2687 | MachineMemOperand *WideMMO = | ||||
| 2688 | MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); | ||||
| 2689 | Observer.changingInstr(MI); | ||||
| 2690 | MI.setMemRefs(MF, {WideMMO}); | ||||
| 2691 | Observer.changedInstr(MI); | ||||
| 2692 | return true; | ||||
| 2693 | } | ||||
| 2694 | |||||
| 2695 | // Don't bother handling edge case that should probably never be produced. | ||||
| 2696 | if (ValSize > WideMemSize) | ||||
| 2697 | return false; | ||||
| 2698 | |||||
| 2699 | LLT WideTy = widenToNextPowerOf2(ValTy); | ||||
| 2700 | |||||
| 2701 | Register WideLoad; | ||||
| 2702 | if (!WideTy.isVector()) { | ||||
| 2703 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||
| 2704 | B.buildTrunc(ValReg, WideLoad).getReg(0); | ||||
| 2705 | } else { | ||||
| 2706 | // Extract the subvector. | ||||
| 2707 | |||||
| 2708 | if (isRegisterType(ValTy)) { | ||||
| 2709 | // If this is a case where G_EXTRACT is legal, use it. | ||||
| 2710 | // (e.g. <3 x s32> -> <4 x s32>) | ||||
| 2711 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||
| 2712 | B.buildExtract(ValReg, WideLoad, 0); | ||||
| 2713 | } else { | ||||
| 2714 | // For cases where the widened type isn't a nice register value, unmerge | ||||
| 2715 | // from a widened register (e.g. <3 x s16> -> <4 x s16>) | ||||
| 2716 | WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); | ||||
| 2717 | B.buildDeleteTrailingVectorElements(ValReg, WideLoad); | ||||
| 2718 | } | ||||
| 2719 | } | ||||
| 2720 | |||||
| 2721 | MI.eraseFromParent(); | ||||
| 2722 | return true; | ||||
| 2723 | } | ||||
| 2724 | |||||
| 2725 | return false; | ||||
| 2726 | } | ||||
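| | // The widening decision above, restated as a tiny standalone classifier | ||||
| | // (sizes in bits; the helper and enum names are illustrative, not part of | ||||
| | // the legalizer): | ||||
| | #include <cstdint> | ||||
| | enum class WidenAction { AdjustMemOpOnly, WidenAndTrim, Unsupported }; | ||||
| | static WidenAction classifyWidenedLoad(uint64_t MemSizeBits, uint64_t ValSizeBits) { | ||||
| |   uint64_t WideMemSize = 1; | ||||
| |   while (WideMemSize < MemSizeBits)          // PowerOf2Ceil | ||||
| |     WideMemSize <<= 1; | ||||
| |   if (WideMemSize == ValSizeBits) | ||||
| |     return WidenAction::AdjustMemOpOnly;     // only the MMO changes | ||||
| |   if (ValSizeBits > WideMemSize) | ||||
| |     return WidenAction::Unsupported;         // edge case declined above | ||||
| |   return WidenAction::WidenAndTrim;          // wide load, then trunc/extract | ||||
| | } | ||||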
| 2727 | |||||
| 2728 | bool AMDGPULegalizerInfo::legalizeFMad( | ||||
| 2729 | MachineInstr &MI, MachineRegisterInfo &MRI, | ||||
| 2730 | MachineIRBuilder &B) const { | ||||
| 2731 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||
| 2732 | assert(Ty.isScalar()); | ||||
| 2733 | |||||
| 2734 | MachineFunction &MF = B.getMF(); | ||||
| 2735 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||||
| 2736 | |||||
| 2737 | // TODO: Always legal with future ftz flag. | ||||
| 2738 | // FIXME: Do we need just output? | ||||
| 2739 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) | ||||
| 2740 | return true; | ||||
| 2741 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) | ||||
| 2742 | return true; | ||||
| 2743 | |||||
| 2744 | MachineIRBuilder HelperBuilder(MI); | ||||
| 2745 | GISelObserverWrapper DummyObserver; | ||||
| 2746 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); | ||||
| 2747 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; | ||||
| 2748 | } | ||||
| 2749 | |||||
| 2750 | bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( | ||||
| 2751 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
| 2752 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 2753 | Register PtrReg = MI.getOperand(1).getReg(); | ||||
| 2754 | Register CmpVal = MI.getOperand(2).getReg(); | ||||
| 2755 | Register NewVal = MI.getOperand(3).getReg(); | ||||
| 2756 | |||||
| 2757 | assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && | ||||
| 2758 | "this should not have been custom lowered"); | ||||
| 2759 | |||||
| 2760 | LLT ValTy = MRI.getType(CmpVal); | ||||
| 2761 | LLT VecTy = LLT::fixed_vector(2, ValTy); | ||||
| 2762 | |||||
| 2763 | Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); | ||||
| 2764 | |||||
| 2765 | B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) | ||||
| 2766 | .addDef(DstReg) | ||||
| 2767 | .addUse(PtrReg) | ||||
| 2768 | .addUse(PackedVal) | ||||
| 2769 | .setMemRefs(MI.memoperands()); | ||||
| 2770 | |||||
| 2771 | MI.eraseFromParent(); | ||||
| 2772 | return true; | ||||
| 2773 | } | ||||
| 2774 | |||||
| 2775 | bool AMDGPULegalizerInfo::legalizeFlog( | ||||
| 2776 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { | ||||
| 2777 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2778 | Register Src = MI.getOperand(1).getReg(); | ||||
| 2779 | LLT Ty = B.getMRI()->getType(Dst); | ||||
| 2780 | unsigned Flags = MI.getFlags(); | ||||
| 2781 | |||||
| 2782 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); | ||||
| 2783 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); | ||||
| 2784 | |||||
| 2785 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); | ||||
| 2786 | MI.eraseFromParent(); | ||||
| 2787 | return true; | ||||
| 2788 | } | ||||
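| | // The identity used above, in scalar form: log_b(x) = log2(x) * (1/log2(b)). | ||||
| | // A sketch only; pass e.g. ln(2) as Log2BaseInverted for a natural log. | ||||
| | #include <cmath> | ||||
| | static double logViaLog2(double X, double Log2BaseInverted) { | ||||
| |   return std::log2(X) * Log2BaseInverted; | ||||
| | } | ||||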
| 2789 | |||||
| 2790 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, | ||||
| 2791 | MachineIRBuilder &B) const { | ||||
| 2792 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2793 | Register Src = MI.getOperand(1).getReg(); | ||||
| 2794 | unsigned Flags = MI.getFlags(); | ||||
| 2795 | LLT Ty = B.getMRI()->getType(Dst); | ||||
| 2796 | |||||
| 2797 | auto K = B.buildFConstant(Ty, numbers::log2e); | ||||
| 2798 | auto Mul = B.buildFMul(Ty, Src, K, Flags); | ||||
| 2799 | B.buildFExp2(Dst, Mul, Flags); | ||||
| 2800 | MI.eraseFromParent(); | ||||
| 2801 | return true; | ||||
| 2802 | } | ||||
| 2803 | |||||
| 2804 | bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, | ||||
| 2805 | MachineIRBuilder &B) const { | ||||
| 2806 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2807 | Register Src0 = MI.getOperand(1).getReg(); | ||||
| 2808 | Register Src1 = MI.getOperand(2).getReg(); | ||||
| 2809 | unsigned Flags = MI.getFlags(); | ||||
| 2810 | LLT Ty = B.getMRI()->getType(Dst); | ||||
| 2811 | const LLT S16 = LLT::scalar(16); | ||||
| 2812 | const LLT S32 = LLT::scalar(32); | ||||
| 2813 | |||||
| 2814 | if (Ty == S32) { | ||||
| 2815 | auto Log = B.buildFLog2(S32, Src0, Flags); | ||||
| 2816 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||
| 2817 | .addUse(Log.getReg(0)) | ||||
| 2818 | .addUse(Src1) | ||||
| 2819 | .setMIFlags(Flags); | ||||
| 2820 | B.buildFExp2(Dst, Mul, Flags); | ||||
| 2821 | } else if (Ty == S16) { | ||||
| 2822 | // There's no f16 fmul_legacy, so we need to convert for it. | ||||
| 2823 | auto Log = B.buildFLog2(S16, Src0, Flags); | ||||
| 2824 | auto Ext0 = B.buildFPExt(S32, Log, Flags); | ||||
| 2825 | auto Ext1 = B.buildFPExt(S32, Src1, Flags); | ||||
| 2826 | auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) | ||||
| 2827 | .addUse(Ext0.getReg(0)) | ||||
| 2828 | .addUse(Ext1.getReg(0)) | ||||
| 2829 | .setMIFlags(Flags); | ||||
| 2830 | |||||
| 2831 | B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); | ||||
| 2832 | } else | ||||
| 2833 | return false; | ||||
| 2834 | |||||
| 2835 | MI.eraseFromParent(); | ||||
| 2836 | return true; | ||||
| 2837 | } | ||||
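| | // Scalar shape of the f32 expansion above: pow(x, y) = exp2(log2(x) * y), | ||||
| | // with the multiply done "legacy" style (anything times zero is zero). The | ||||
| | // sketch ignores the f16 extend/trunc detour; names are illustrative. | ||||
| | #include <cmath> | ||||
| | static float fmulLegacy(float A, float B) { | ||||
| |   return (A == 0.0f || B == 0.0f) ? 0.0f : A * B; | ||||
| | } | ||||
| | static float powViaExp2(float X, float Y) { | ||||
| |   return std::exp2(fmulLegacy(std::log2(X), Y)); | ||||
| | } | ||||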
| 2838 | |||||
| 2839 | // Find a source register, ignoring any possible source modifiers. | ||||
| 2840 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { | ||||
| 2841 | Register ModSrc = OrigSrc; | ||||
| 2842 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { | ||||
| 2843 | ModSrc = SrcFNeg->getOperand(1).getReg(); | ||||
| 2844 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||
| 2845 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||
| 2846 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) | ||||
| 2847 | ModSrc = SrcFAbs->getOperand(1).getReg(); | ||||
| 2848 | return ModSrc; | ||||
| 2849 | } | ||||
| 2850 | |||||
| 2851 | bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, | ||||
| 2852 | MachineRegisterInfo &MRI, | ||||
| 2853 | MachineIRBuilder &B) const { | ||||
| 2854 | |||||
| 2855 | const LLT S1 = LLT::scalar(1); | ||||
| 2856 | const LLT S64 = LLT::scalar(64); | ||||
| 2857 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2858 | Register OrigSrc = MI.getOperand(1).getReg(); | ||||
| 2859 | unsigned Flags = MI.getFlags(); | ||||
| 2860 | assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && | ||||
| 2861 | "this should not have been custom lowered"); | ||||
| 2862 | |||||
| 2863 | // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) | ||||
| 2864 | // is used instead. However, SI doesn't have V_FLOOR_F64, so the most | ||||
| 2865 | // efficient way to implement it is using V_FRACT_F64. The workaround for the | ||||
| 2866 | // V_FRACT bug is: | ||||
| 2867 | // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) | ||||
| 2868 | // | ||||
| 2869 | // Convert floor(x) to (x - fract(x)) | ||||
| 2870 | |||||
| 2871 | auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) | ||||
| 2872 | .addUse(OrigSrc) | ||||
| 2873 | .setMIFlags(Flags); | ||||
| 2874 | |||||
| 2875 | // Give source modifier matching some assistance before obscuring a foldable | ||||
| 2876 | // pattern. | ||||
| 2877 | |||||
| 2878 | // TODO: We can avoid the neg on the fract? The input sign to fract | ||||
| 2879 | // shouldn't matter? | ||||
| 2880 | Register ModSrc = stripAnySourceMods(OrigSrc, MRI); | ||||
| 2881 | |||||
| 2882 | auto Const = | ||||
| 2883 | B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff)); | ||||
| 2884 | |||||
| 2885 | Register Min = MRI.createGenericVirtualRegister(S64); | ||||
| 2886 | |||||
| 2887 | // We don't need to concern ourselves with the snan handling difference, so | ||||
| 2888 | // use the one which will directly select. | ||||
| 2889 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
| 2890 | if (MFI->getMode().IEEE) | ||||
| 2891 | B.buildFMinNumIEEE(Min, Fract, Const, Flags); | ||||
| 2892 | else | ||||
| 2893 | B.buildFMinNum(Min, Fract, Const, Flags); | ||||
| 2894 | |||||
| 2895 | Register CorrectedFract = Min; | ||||
| 2896 | if (!MI.getFlag(MachineInstr::FmNoNans)) { | ||||
| 2897 | auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); | ||||
| 2898 | CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); | ||||
| 2899 | } | ||||
| 2900 | |||||
| 2901 | auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); | ||||
| 2902 | B.buildFAdd(Dst, OrigSrc, NegFract, Flags); | ||||
| 2903 | |||||
| 2904 | MI.eraseFromParent(); | ||||
| 2905 | return true; | ||||
| 2906 | } | ||||
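| | // Scalar model of the f64 floor workaround above. std::floor is used here | ||||
| | // only to model amdgcn.fract, so this is a sketch of the shape of the | ||||
| | // computation rather than a faithful reimplementation. | ||||
| | #include <cmath> | ||||
| | static double floorViaFract(double X) { | ||||
| |   double Fract = X - std::floor(X);                     // amdgcn.fract stand-in | ||||
| |   double Min = std::fmin(Fract, 0x1.fffffffffffffp-1);  // clamp just below 1.0 | ||||
| |   double Corrected = std::isnan(X) ? X : Min;           // keep NaN inputs as-is | ||||
| |   return X + (-Corrected);                              // x - fract(x) | ||||
| | } | ||||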
| 2907 | |||||
| 2908 | // Turn an illegal packed v2s16 build vector into bit operations. | ||||
| 2909 | // TODO: This should probably be a bitcast action in LegalizerHelper. | ||||
| 2910 | bool AMDGPULegalizerInfo::legalizeBuildVector( | ||||
| 2911 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
| 2912 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 2913 | const LLT S32 = LLT::scalar(32); | ||||
| 2914 | const LLT S16 = LLT::scalar(16); | ||||
| 2915 | assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); | ||||
| 2916 | |||||
| 2917 | Register Src0 = MI.getOperand(1).getReg(); | ||||
| 2918 | Register Src1 = MI.getOperand(2).getReg(); | ||||
| 2919 | |||||
| 2920 | if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { | ||||
| 2921 | assert(MRI.getType(Src0) == S32); | ||||
| 2922 | Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); | ||||
| 2923 | Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); | ||||
| 2924 | } | ||||
| 2925 | |||||
| 2926 | auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); | ||||
| 2927 | B.buildBitcast(Dst, Merge); | ||||
| 2928 | |||||
| 2929 | MI.eraseFromParent(); | ||||
| 2930 | return true; | ||||
| 2931 | } | ||||
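| | // What the merge + bitcast above amounts to for a packed v2s16, written as | ||||
| | // plain integer packing (illustrative helper): | ||||
| | #include <cstdint> | ||||
| | static uint32_t packV2S16(uint16_t Elt0, uint16_t Elt1) { | ||||
| |   return (uint32_t)Elt0 | ((uint32_t)Elt1 << 16);  // element 0 in the low half | ||||
| | } | ||||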
| 2932 | |||||
| 2933 | // Build a big integer multiply or multiply-add using MAD_64_32 instructions. | ||||
| 2934 | // | ||||
| 2935 | // Source and accumulation registers must all be 32 bits wide. | ||||
| 2936 | // | ||||
| 2937 | // TODO: When the multiply is uniform, we should produce a code sequence | ||||
| 2938 | // that is better suited to instruction selection on the SALU. Instead of | ||||
| 2939 | // the outer loop going over parts of the result, the outer loop should go | ||||
| 2940 | // over parts of one of the factors. This should result in instruction | ||||
| 2941 | // selection that makes full use of S_ADDC_U32 instructions. | ||||
| 2942 | void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, | ||||
| 2943 | MutableArrayRef<Register> Accum, | ||||
| 2944 | ArrayRef<Register> Src0, | ||||
| 2945 | ArrayRef<Register> Src1, | ||||
| 2946 | bool UsePartialMad64_32, | ||||
| 2947 | bool SeparateOddAlignedProducts) const { | ||||
| 2948 | // Use (possibly empty) vectors of S1 registers to represent the set of | ||||
| 2949 | // carries from one pair of positions to the next. | ||||
| 2950 | using Carry = SmallVector<Register, 2>; | ||||
| 2951 | |||||
| 2952 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
| 2953 | GISelKnownBits &KB = *Helper.getKnownBits(); | ||||
| 2954 | |||||
| 2955 | const LLT S1 = LLT::scalar(1); | ||||
| 2956 | const LLT S32 = LLT::scalar(32); | ||||
| 2957 | const LLT S64 = LLT::scalar(64); | ||||
| 2958 | |||||
| 2959 | Register Zero32; | ||||
| 2960 | Register Zero64; | ||||
| 2961 | |||||
| 2962 | auto getZero32 = [&]() -> Register { | ||||
| 2963 | if (!Zero32) | ||||
| 2964 | Zero32 = B.buildConstant(S32, 0).getReg(0); | ||||
| 2965 | return Zero32; | ||||
| 2966 | }; | ||||
| 2967 | auto getZero64 = [&]() -> Register { | ||||
| 2968 | if (!Zero64) | ||||
| 2969 | Zero64 = B.buildConstant(S64, 0).getReg(0); | ||||
| 2970 | return Zero64; | ||||
| 2971 | }; | ||||
| 2972 | |||||
| 2973 | SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; | ||||
| 2974 | for (unsigned i = 0; i < Src0.size(); ++i) { | ||||
| 2975 | Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); | ||||
| 2976 | Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); | ||||
| 2977 | } | ||||
| 2978 | |||||
| 2979 | // Merge the given carries into the 32-bit LocalAccum, which is modified | ||||
| 2980 | // in-place. | ||||
| 2981 | // | ||||
| 2982 | // Returns the carry-out, which is a single S1 register or null. | ||||
| 2983 | auto mergeCarry = | ||||
| 2984 | [&](Register &LocalAccum, const Carry &CarryIn) -> Register { | ||||
| 2985 | if (CarryIn.empty()) | ||||
| 2986 | return Register(); | ||||
| 2987 | |||||
| 2988 | bool HaveCarryOut = true; | ||||
| 2989 | Register CarryAccum; | ||||
| 2990 | if (CarryIn.size() == 1) { | ||||
| 2991 | if (!LocalAccum) { | ||||
| 2992 | LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); | ||||
| 2993 | return Register(); | ||||
| 2994 | } | ||||
| 2995 | |||||
| 2996 | CarryAccum = getZero32(); | ||||
| 2997 | } else { | ||||
| 2998 | CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); | ||||
| 2999 | for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { | ||||
| 3000 | CarryAccum = | ||||
| 3001 | B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) | ||||
| 3002 | .getReg(0); | ||||
| 3003 | } | ||||
| 3004 | |||||
| 3005 | if (!LocalAccum) { | ||||
| 3006 | LocalAccum = getZero32(); | ||||
| 3007 | HaveCarryOut = false; | ||||
| 3008 | } | ||||
| 3009 | } | ||||
| 3010 | |||||
| 3011 | auto Add = | ||||
| 3012 | B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); | ||||
| 3013 | LocalAccum = Add.getReg(0); | ||||
| 3014 | return HaveCarryOut ? Add.getReg(1) : Register(); | ||||
| 3015 | }; | ||||
| 3016 | |||||
| 3017 | // Build a multiply-add chain to compute | ||||
| 3018 | // | ||||
| 3019 | // LocalAccum + (partial products at DstIndex) | ||||
| 3020 | // + (opportunistic subset of CarryIn) | ||||
| 3021 | // | ||||
| 3022 | // LocalAccum is an array of one or two 32-bit registers that are updated | ||||
| 3023 | // in-place. The incoming registers may be null. | ||||
| 3024 | // | ||||
| 3025 | // In some edge cases, carry-ins can be consumed "for free". In that case, | ||||
| 3026 | // the consumed carry bits are removed from CarryIn in-place. | ||||
| 3027 | auto buildMadChain = | ||||
| 3028 | [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) | ||||
| 3029 | -> Carry { | ||||
| 3030 | assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || | ||||
| 3031 | (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); | ||||
| 3032 | |||||
| 3033 | Carry CarryOut; | ||||
| 3034 | unsigned j0 = 0; | ||||
| 3035 | |||||
| 3036 | // Use plain 32-bit multiplication for the most significant part of the | ||||
| 3037 | // result by default. | ||||
| 3038 | if (LocalAccum.size() == 1 && | ||||
| 3039 | (!UsePartialMad64_32 || !CarryIn.empty())) { | ||||
| 3040 | do { | ||||
| 3041 | // Skip multiplication if one of the operands is 0 | ||||
| 3042 | unsigned j1 = DstIndex - j0; | ||||
| 3043 | if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { | ||||
| 3044 | ++j0; | ||||
| 3045 | continue; | ||||
| 3046 | } | ||||
| 3047 | auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); | ||||
| 3048 | if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { | ||||
| 3049 | LocalAccum[0] = Mul.getReg(0); | ||||
| 3050 | } else { | ||||
| 3051 | if (CarryIn.empty()) { | ||||
| 3052 | LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); | ||||
| 3053 | } else { | ||||
| 3054 | LocalAccum[0] = | ||||
| 3055 | B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) | ||||
| 3056 | .getReg(0); | ||||
| 3057 | CarryIn.pop_back(); | ||||
| 3058 | } | ||||
| 3059 | } | ||||
| 3060 | ++j0; | ||||
| 3061 | } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); | ||||
| 3062 | } | ||||
| 3063 | |||||
| 3064 | // Build full 64-bit multiplies. | ||||
| 3065 | if (j0 <= DstIndex) { | ||||
| 3066 | bool HaveSmallAccum = false; | ||||
| 3067 | Register Tmp; | ||||
| 3068 | |||||
| 3069 | if (LocalAccum[0]) { | ||||
| 3070 | if (LocalAccum.size() == 1) { | ||||
| 3071 | Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); | ||||
| 3072 | HaveSmallAccum = true; | ||||
| 3073 | } else if (LocalAccum[1]) { | ||||
| 3074 | Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); | ||||
| 3075 | HaveSmallAccum = false; | ||||
| 3076 | } else { | ||||
| 3077 | Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); | ||||
| 3078 | HaveSmallAccum = true; | ||||
| 3079 | } | ||||
| 3080 | } else { | ||||
| 3081 | assert(LocalAccum.size() == 1 || !LocalAccum[1]); | ||||
| 3082 | Tmp = getZero64(); | ||||
| 3083 | HaveSmallAccum = true; | ||||
| 3084 | } | ||||
| 3085 | |||||
| 3086 | do { | ||||
| 3087 | unsigned j1 = DstIndex - j0; | ||||
| 3088 | if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { | ||||
| 3089 | ++j0; | ||||
| 3090 | continue; | ||||
| 3091 | } | ||||
| 3092 | auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, | ||||
| 3093 | {Src0[j0], Src1[j1], Tmp}); | ||||
| 3094 | Tmp = Mad.getReg(0); | ||||
| 3095 | if (!HaveSmallAccum) | ||||
| 3096 | CarryOut.push_back(Mad.getReg(1)); | ||||
| 3097 | HaveSmallAccum = false; | ||||
| 3098 | |||||
| 3099 | ++j0; | ||||
| 3100 | } while (j0 <= DstIndex); | ||||
| 3101 | |||||
| 3102 | auto Unmerge = B.buildUnmerge(S32, Tmp); | ||||
| 3103 | LocalAccum[0] = Unmerge.getReg(0); | ||||
| 3104 | if (LocalAccum.size() > 1) | ||||
| 3105 | LocalAccum[1] = Unmerge.getReg(1); | ||||
| 3106 | } | ||||
| 3107 | |||||
| 3108 | return CarryOut; | ||||
| 3109 | }; | ||||
| 3110 | |||||
| 3111 | // Outer multiply loop, iterating over destination parts from least | ||||
| 3112 | // significant to most significant parts. | ||||
| 3113 | // | ||||
| 3114 | // The columns of the following diagram correspond to the destination parts | ||||
| 3115 | // affected by one iteration of the outer loop (ignoring boundary | ||||
| 3116 | // conditions). | ||||
| 3117 | // | ||||
| 3118 | // Dest index relative to 2 * i: 1 0 -1 | ||||
| 3119 | // ------ | ||||
| 3120 | // Carries from previous iteration: e o | ||||
| 3121 | // Even-aligned partial product sum: E E . | ||||
| 3122 | // Odd-aligned partial product sum: O O | ||||
| 3123 | // | ||||
| 3124 | // 'o' is OddCarry, 'e' is EvenCarry. | ||||
| 3125 | // EE and OO are computed from partial products via buildMadChain and use | ||||
| 3126 | // accumulation where possible and appropriate. | ||||
| 3127 | // | ||||
| 3128 | Register SeparateOddCarry; | ||||
| 3129 | Carry EvenCarry; | ||||
| 3130 | Carry OddCarry; | ||||
| 3131 | |||||
| 3132 | for (unsigned i = 0; i <= Accum.size() / 2; ++i) { | ||||
| 3133 | Carry OddCarryIn = std::move(OddCarry); | ||||
| 3134 | Carry EvenCarryIn = std::move(EvenCarry); | ||||
| 3135 | OddCarry.clear(); | ||||
| 3136 | EvenCarry.clear(); | ||||
| 3137 | |||||
| 3138 | // Partial products at offset 2 * i. | ||||
| 3139 | if (2 * i < Accum.size()) { | ||||
| 3140 | auto LocalAccum = Accum.drop_front(2 * i).take_front(2); | ||||
| 3141 | EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); | ||||
| 3142 | } | ||||
| 3143 | |||||
| 3144 | // Partial products at offset 2 * i - 1. | ||||
| 3145 | if (i > 0) { | ||||
| 3146 | if (!SeparateOddAlignedProducts) { | ||||
| 3147 | auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); | ||||
| 3148 | OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); | ||||
| 3149 | } else { | ||||
| 3150 | bool IsHighest = 2 * i >= Accum.size(); | ||||
| 3151 | Register SeparateOddOut[2]; | ||||
| 3152 | auto LocalAccum = MutableArrayRef(SeparateOddOut) | ||||
| 3153 | .take_front(IsHighest ? 1 : 2); | ||||
| 3154 | OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); | ||||
| 3155 | |||||
| 3156 | MachineInstr *Lo; | ||||
| 3157 | |||||
| 3158 | if (i == 1) { | ||||
| 3159 | if (!IsHighest) | ||||
| 3160 | Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); | ||||
| 3161 | else | ||||
| 3162 | Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); | ||||
| 3163 | } else { | ||||
| 3164 | Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], | ||||
| 3165 | SeparateOddCarry); | ||||
| 3166 | } | ||||
| 3167 | Accum[2 * i - 1] = Lo->getOperand(0).getReg(); | ||||
| 3168 | |||||
| 3169 | if (!IsHighest) { | ||||
| 3170 | auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], | ||||
| 3171 | Lo->getOperand(1).getReg()); | ||||
| 3172 | Accum[2 * i] = Hi.getReg(0); | ||||
| 3173 | SeparateOddCarry = Hi.getReg(1); | ||||
| 3174 | } | ||||
| 3175 | } | ||||
| 3176 | } | ||||
| 3177 | |||||
| 3178 | // Add in the carries from the previous iteration | ||||
| 3179 | if (i > 0) { | ||||
| 3180 | if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) | ||||
| 3181 | EvenCarryIn.push_back(CarryOut); | ||||
| 3182 | |||||
| 3183 | if (2 * i < Accum.size()) { | ||||
| 3184 | if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) | ||||
| 3185 | OddCarry.push_back(CarryOut); | ||||
| 3186 | } | ||||
| 3187 | } | ||||
| 3188 | } | ||||
| 3189 | } | ||||
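| | // A plain schoolbook reference for what the mad chains above compute, using | ||||
| | // 32-bit limbs and a 64-bit multiply-add as the scalar analogue of | ||||
| | // MAD_U64_U32. Carries are folded immediately instead of being tracked as | ||||
| | // S1 registers, so this is a behavioral sketch, not the emitted sequence. | ||||
| | #include <cstdint> | ||||
| | #include <vector> | ||||
| | static std::vector<uint32_t> mulLimbs(const std::vector<uint32_t> &A, | ||||
| |                                       const std::vector<uint32_t> &B, | ||||
| |                                       unsigned NumDstLimbs) { | ||||
| |   std::vector<uint32_t> Acc(NumDstLimbs, 0); | ||||
| |   for (unsigned j = 0; j < A.size(); ++j) { | ||||
| |     uint64_t Carry = 0; | ||||
| |     for (unsigned k = 0; k < B.size() && j + k < NumDstLimbs; ++k) { | ||||
| |       uint64_t Mad = (uint64_t)A[j] * B[k] + Acc[j + k] + Carry; | ||||
| |       Acc[j + k] = (uint32_t)Mad; | ||||
| |       Carry = Mad >> 32; | ||||
| |     } | ||||
| |     if (j + B.size() < NumDstLimbs) | ||||
| |       Acc[j + B.size()] = (uint32_t)Carry; | ||||
| |   } | ||||
| |   return Acc;  // for the G_MUL below, only the low Size/32 limbs are kept | ||||
| | } | ||||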
| 3190 | |||||
| 3191 | // Custom narrowing of wide multiplies using wide multiply-add instructions. | ||||
| 3192 | // | ||||
| 3193 | // TODO: If the multiply is followed by an addition, we should attempt to | ||||
| 3194 | // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. | ||||
| 3195 | bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, | ||||
| 3196 | MachineInstr &MI) const { | ||||
| 3197 | assert(ST.hasMad64_32()); | ||||
| 3198 | assert(MI.getOpcode() == TargetOpcode::G_MUL); | ||||
| 3199 | |||||
| 3200 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
| 3201 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
| 3202 | |||||
| 3203 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 3204 | Register Src0 = MI.getOperand(1).getReg(); | ||||
| 3205 | Register Src1 = MI.getOperand(2).getReg(); | ||||
| 3206 | |||||
| 3207 | LLT Ty = MRI.getType(DstReg); | ||||
| 3208 | assert(Ty.isScalar()); | ||||
| 3209 | |||||
| 3210 | unsigned Size = Ty.getSizeInBits(); | ||||
| 3211 | unsigned NumParts = Size / 32; | ||||
| 3212 | assert((Size % 32) == 0); | ||||
| 3213 | assert(NumParts >= 2); | ||||
| 3214 | |||||
| 3215 | // Whether to use MAD_64_32 for partial products whose high half is | ||||
| 3216 | // discarded. This avoids some ADD instructions but risks false dependency | ||||
| 3217 | // stalls on some subtargets in some cases. | ||||
| 3218 | const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; | ||||
| 3219 | |||||
| 3220 | // Whether to compute odd-aligned partial products separately. This is | ||||
| 3221 | // advisable on subtargets where the accumulator of MAD_64_32 must be placed | ||||
| 3222 | // in an even-aligned VGPR. | ||||
| 3223 | const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); | ||||
| 3224 | |||||
| 3225 | LLT S32 = LLT::scalar(32); | ||||
| 3226 | SmallVector<Register, 2> Src0Parts, Src1Parts; | ||||
| 3227 | for (unsigned i = 0; i < NumParts; ++i) { | ||||
| 3228 | Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); | ||||
| 3229 | Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); | ||||
| 3230 | } | ||||
| 3231 | B.buildUnmerge(Src0Parts, Src0); | ||||
| 3232 | B.buildUnmerge(Src1Parts, Src1); | ||||
| 3233 | |||||
| 3234 | SmallVector<Register, 2> AccumRegs(NumParts); | ||||
| 3235 | buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, | ||||
| 3236 | SeparateOddAlignedProducts); | ||||
| 3237 | |||||
| 3238 | B.buildMergeLikeInstr(DstReg, AccumRegs); | ||||
| 3239 | MI.eraseFromParent(); | ||||
| 3240 | return true; | ||||
| 3241 | } | ||||
| 3242 | |||||
| 3243 | // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to | ||||
| 3244 | // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input | ||||
| 3245 | // case with a single min instruction instead of a compare+select. | ||||
| 3246 | bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, | ||||
| 3247 | MachineRegisterInfo &MRI, | ||||
| 3248 | MachineIRBuilder &B) const { | ||||
| 3249 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 3250 | Register Src = MI.getOperand(1).getReg(); | ||||
| 3251 | LLT DstTy = MRI.getType(Dst); | ||||
| 3252 | LLT SrcTy = MRI.getType(Src); | ||||
| 3253 | |||||
| 3254 | unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ | ||||
| 3255 | ? AMDGPU::G_AMDGPU_FFBH_U32 | ||||
| 3256 | : AMDGPU::G_AMDGPU_FFBL_B32; | ||||
| 3257 | auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); | ||||
| 3258 | B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); | ||||
| 3259 | |||||
| 3260 | MI.eraseFromParent(); | ||||
| 3261 | return true; | ||||
| 3262 | } | ||||
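| | // Scalar shape of the lowering above: the ffbh/ffbl style operations return | ||||
| | // -1 for a zero input, so a single umin against the bit width recovers the | ||||
| | // ctlz/cttz result for zero. Illustrative 32-bit ctlz sketch: | ||||
| | #include <cstdint> | ||||
| | static uint32_t ctlz32ViaFfbh(uint32_t X) { | ||||
| |   uint32_t Ffbh = 0xffffffffu;                 // ffbh(0) == -1 | ||||
| |   for (int Bit = 31; Bit >= 0; --Bit) | ||||
| |     if (X & (1u << Bit)) { Ffbh = 31 - Bit; break; } | ||||
| |   return Ffbh < 32 ? Ffbh : 32;                // umin(ffbh(x), 32) | ||||
| | } | ||||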
| 3263 | |||||
| 3264 | // Check that this is a G_XOR x, -1 | ||||
| 3265 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { | ||||
| 3266 | if (MI.getOpcode() != TargetOpcode::G_XOR) | ||||
| 3267 | return false; | ||||
| 3268 | auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); | ||||
| 3269 | return ConstVal && *ConstVal == -1; | ||||
| 3270 | } | ||||
| 3271 | |||||
| 3272 | // Return the use branch instruction, otherwise null if the usage is invalid. | ||||
| 3273 | static MachineInstr * | ||||
| 3274 | verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, | ||||
| 3275 | MachineBasicBlock *&UncondBrTarget, bool &Negated) { | ||||
| 3276 | Register CondDef = MI.getOperand(0).getReg(); | ||||
| 3277 | if (!MRI.hasOneNonDBGUse(CondDef)) | ||||
| 3278 | return nullptr; | ||||
| 3279 | |||||
| 3280 | MachineBasicBlock *Parent = MI.getParent(); | ||||
| 3281 | MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); | ||||
| 3282 | |||||
| 3283 | if (isNot(MRI, *UseMI)) { | ||||
| 3284 | Register NegatedCond = UseMI->getOperand(0).getReg(); | ||||
| 3285 | if (!MRI.hasOneNonDBGUse(NegatedCond)) | ||||
| 3286 | return nullptr; | ||||
| 3287 | |||||
| 3288 | // We're deleting the def of this value, so we need to remove it. | ||||
| 3289 | eraseInstr(*UseMI, MRI); | ||||
| 3290 | |||||
| 3291 | UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); | ||||
| 3292 | Negated = true; | ||||
| 3293 | } | ||||
| 3294 | |||||
| 3295 | if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) | ||||
| 3296 | return nullptr; | ||||
| 3297 | |||||
| 3298 | // Make sure the cond br is followed by a G_BR, or is the last instruction. | ||||
| 3299 | MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); | ||||
| 3300 | if (Next == Parent->end()) { | ||||
| 3301 | MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); | ||||
| 3302 | if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. | ||||
| 3303 | return nullptr; | ||||
| 3304 | UncondBrTarget = &*NextMBB; | ||||
| 3305 | } else { | ||||
| 3306 | if (Next->getOpcode() != AMDGPU::G_BR) | ||||
| 3307 | return nullptr; | ||||
| 3308 | Br = &*Next; | ||||
| 3309 | UncondBrTarget = Br->getOperand(0).getMBB(); | ||||
| 3310 | } | ||||
| 3311 | |||||
| 3312 | return UseMI; | ||||
| 3313 | } | ||||
| 3314 | |||||
| 3315 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, | ||||
| 3316 | const ArgDescriptor *Arg, | ||||
| 3317 | const TargetRegisterClass *ArgRC, | ||||
| 3318 | LLT ArgTy) const { | ||||
| 3319 | MCRegister SrcReg = Arg->getRegister(); | ||||
| 3320 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); | ||||
| 3321 | assert(DstReg.isVirtual() && "Virtual register expected"); | ||||
| 3322 | |||||
| 3323 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, | ||||
| 3324 | *ArgRC, B.getDebugLoc(), ArgTy); | ||||
| 3325 | if (Arg->isMasked()) { | ||||
| 3326 | // TODO: Should we try to emit this once in the entry block? | ||||
| 3327 | const LLT S32 = LLT::scalar(32); | ||||
| 3328 | const unsigned Mask = Arg->getMask(); | ||||
| 3329 | const unsigned Shift = llvm::countr_zero<unsigned>(Mask); | ||||
| 3330 | |||||
| 3331 | Register AndMaskSrc = LiveIn; | ||||
| 3332 | |||||
| 3333 | // TODO: Avoid clearing the high bits if we know workitem id y/z are always | ||||
| 3334 | // 0. | ||||
| 3335 | if (Shift != 0) { | ||||
| 3336 | auto ShiftAmt = B.buildConstant(S32, Shift); | ||||
| 3337 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); | ||||
| 3338 | } | ||||
| 3339 | |||||
| 3340 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); | ||||
| 3341 | } else { | ||||
| 3342 | B.buildCopy(DstReg, LiveIn); | ||||
| 3343 | } | ||||
| 3344 | |||||
| 3345 | return true; | ||||
| 3346 | } | ||||
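| | // Scalar form of the masked-argument decode above, assuming a nonzero mask | ||||
| | // so the computed shift stays below 32 and both shifts are well defined | ||||
| | // (illustrative helper; e.g. Mask 0x3ff00000 selects bits [29:20]): | ||||
| | #include <cstdint> | ||||
| | static uint32_t decodeMaskedArg(uint32_t LiveIn, uint32_t Mask) { | ||||
| |   if (Mask == 0)            // the MIR path above expects a nonzero mask | ||||
| |     return 0; | ||||
| |   unsigned Shift = 0;       // countr_zero(Mask) | ||||
| |   while (!(Mask & 1u)) { | ||||
| |     Mask >>= 1; | ||||
| |     ++Shift; | ||||
| |   } | ||||
| |   return (LiveIn >> Shift) & Mask;  // Mask is already shifted down here | ||||
| | } | ||||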
| 3347 | |||||
| 3348 | bool AMDGPULegalizerInfo::loadInputValue( | ||||
| 3349 | Register DstReg, MachineIRBuilder &B, | ||||
| 3350 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||
| 3351 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
| 3352 | const ArgDescriptor *Arg; | ||||
| 3353 | const TargetRegisterClass *ArgRC; | ||||
| 3354 | LLT ArgTy; | ||||
| 3355 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||
| 3356 | |||||
| 3357 | if (!Arg) { | ||||
| 3358 | if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { | ||||
| 3359 | // The intrinsic may appear when we have a 0 sized kernarg segment, in which | ||||
| 3360 | // case the pointer argument may be missing and we use null. | ||||
| 3361 | B.buildConstant(DstReg, 0); | ||||
| 3362 | return true; | ||||
| 3363 | } | ||||
| 3364 | |||||
| 3365 | // It's undefined behavior if a function marked with the amdgpu-no-* | ||||
| 3366 | // attributes uses the corresponding intrinsic. | ||||
| 3367 | B.buildUndef(DstReg); | ||||
| 3368 | return true; | ||||
| 3369 | } | ||||
| 3370 | |||||
| 3371 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) | ||||
| 3372 | return false; // TODO: Handle these | ||||
| 3373 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); | ||||
| 3374 | } | ||||
| 3375 | |||||
| 3376 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( | ||||
| 3377 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||
| 3378 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||
| 3379 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) | ||||
| 3380 | return false; | ||||
| 3381 | |||||
| 3382 | MI.eraseFromParent(); | ||||
| 3383 | return true; | ||||
| 3384 | } | ||||
| 3385 | |||||
| 3386 | static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, | ||||
| 3387 | int64_t C) { | ||||
| 3388 | B.buildConstant(MI.getOperand(0).getReg(), C); | ||||
| 3389 | MI.eraseFromParent(); | ||||
| 3390 | return true; | ||||
| 3391 | } | ||||
| 3392 | |||||
| 3393 | bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( | ||||
| 3394 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, | ||||
| 3395 | unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { | ||||
| 3396 | unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); | ||||
| 3397 | if (MaxID == 0) | ||||
| 3398 | return replaceWithConstant(B, MI, 0); | ||||
| 3399 | |||||
| 3400 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
| 3401 | const ArgDescriptor *Arg; | ||||
| 3402 | const TargetRegisterClass *ArgRC; | ||||
| 3403 | LLT ArgTy; | ||||
| 3404 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); | ||||
| 3405 | |||||
| 3406 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 3407 | if (!Arg) { | ||||
| 3408 | // It's undefined behavior if a function marked with the amdgpu-no-* | ||||
| 3409 | // attributes uses the corresponding intrinsic. | ||||
| 3410 | B.buildUndef(DstReg); | ||||
| 3411 | MI.eraseFromParent(); | ||||
| 3412 | return true; | ||||
| 3413 | } | ||||
| 3414 | |||||
| 3415 | if (Arg->isMasked()) { | ||||
| 3416 | // Don't bother inserting AssertZext for packed IDs since we're emitting the | ||||
| 3417 | // masking operations anyway. | ||||
| 3418 | // | ||||
| 3419 | // TODO: We could assert the top bit is 0 for the source copy. | ||||
| 3420 | if (!loadInputValue(DstReg, B, ArgType)) | ||||
| 3421 | return false; | ||||
| 3422 | } else { | ||||
| 3423 | Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); | ||||
| 3424 | if (!loadInputValue(TmpReg, B, ArgType)) | ||||
| 3425 | return false; | ||||
| 3426 | B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); | ||||
| 3427 | } | ||||
| 3428 | |||||
| 3429 | MI.eraseFromParent(); | ||||
| 3430 | return true; | ||||
| 3431 | } | ||||
| 3432 | |||||
| 3433 | Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, | ||||
| 3434 | int64_t Offset) const { | ||||
| 3435 | LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); | ||||
| 3436 | Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); | ||||
| 3437 | |||||
| 3438 | // TODO: If we passed in the base kernel offset we could have a better | ||||
| 3439 | // alignment than 4, but we don't really need it. | ||||
| 3440 | if (!loadInputValue(KernArgReg, B, | ||||
| 3441 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
| 3442 | llvm_unreachable("failed to find kernarg segment ptr")::llvm::llvm_unreachable_internal("failed to find kernarg segment ptr" , "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp", 3442); | ||||
| 3443 | |||||
| 3444 | auto COffset = B.buildConstant(LLT::scalar(64), Offset); | ||||
| 3445 | // TODO: Should get nuw | ||||
| 3446 | return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); | ||||
| 3447 | } | ||||
| 3448 | |||||
| 3449 | /// Legalize a value that's loaded from kernel arguments. This is only used by | ||||
| 3450 | /// legacy intrinsics. | ||||
| 3451 | bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, | ||||
| 3452 | MachineIRBuilder &B, | ||||
| 3453 | uint64_t Offset, | ||||
| 3454 | Align Alignment) const { | ||||
| 3455 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 3456 | |||||
| 3457 | assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && | ||||
| 3458 | "unexpected kernarg parameter type"); | ||||
| 3459 | |||||
| 3460 | Register Ptr = getKernargParameterPtr(B, Offset); | ||||
| 3461 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||
| 3462 | B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), | ||||
| 3463 | MachineMemOperand::MODereferenceable | | ||||
| 3464 | MachineMemOperand::MOInvariant); | ||||
| 3465 | MI.eraseFromParent(); | ||||
| 3466 | return true; | ||||
| 3467 | } | ||||
| 3468 | |||||
| 3469 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, | ||||
| 3470 | MachineRegisterInfo &MRI, | ||||
| 3471 | MachineIRBuilder &B) const { | ||||
| 3472 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 3473 | LLT DstTy = MRI.getType(Dst); | ||||
| 3474 | LLT S16 = LLT::scalar(16); | ||||
| 3475 | LLT S32 = LLT::scalar(32); | ||||
| 3476 | LLT S64 = LLT::scalar(64); | ||||
| 3477 | |||||
| 3478 | if (DstTy == S16) | ||||
| 3479 | return legalizeFDIV16(MI, MRI, B); | ||||
| 3480 | if (DstTy == S32) | ||||
| 3481 | return legalizeFDIV32(MI, MRI, B); | ||||
| 3482 | if (DstTy == S64) | ||||
| 3483 | return legalizeFDIV64(MI, MRI, B); | ||||
| 3484 | |||||
| 3485 | return false; | ||||
| 3486 | } | ||||
| 3487 | |||||
| 3488 | void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, | ||||
| 3489 | Register DstDivReg, | ||||
| 3490 | Register DstRemReg, | ||||
| 3491 | Register X, | ||||
| 3492 | Register Y) const { | ||||
| 3493 | const LLT S1 = LLT::scalar(1); | ||||
| 3494 | const LLT S32 = LLT::scalar(32); | ||||
| 3495 | |||||
| 3496 | // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the | ||||
| 3497 | // algorithm used here. | ||||
| 3498 | |||||
| 3499 | // Initial estimate of inv(y). | ||||
| 3500 | auto FloatY = B.buildUITOFP(S32, Y); | ||||
| 3501 | auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); | ||||
| 3502 | auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); | ||||
| 3503 | auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); | ||||
| 3504 | auto Z = B.buildFPTOUI(S32, ScaledY); | ||||
| 3505 | |||||
| 3506 | // One round of UNR (unsigned integer Newton-Raphson) refinement. | ||||
| 3507 | auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); | ||||
| 3508 | auto NegYZ = B.buildMul(S32, NegY, Z); | ||||
| 3509 | Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); | ||||
| 3510 | |||||
| 3511 | // Quotient/remainder estimate. | ||||
| 3512 | auto Q = B.buildUMulH(S32, X, Z); | ||||
| 3513 | auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); | ||||
| 3514 | |||||
| 3515 | // First quotient/remainder refinement. | ||||
| 3516 | auto One = B.buildConstant(S32, 1); | ||||
| 3517 | auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | ||||
| 3518 | if (DstDivReg) | ||||
| 3519 | Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); | ||||
| 3520 | R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); | ||||
| 3521 | |||||
| 3522 | // Second quotient/remainder refinement. | ||||
| 3523 | Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); | ||||
| 3524 | if (DstDivReg) | ||||
| 3525 | B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); | ||||
| 3526 | |||||
| 3527 | if (DstRemReg) | ||||
| 3528 | B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); | ||||
| 3529 | } | ||||
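| | // [Editor's note] Illustrative scalar model of the expansion above, not part | ||||
| | // of the original source. It assumes the hardware RCP behaves like 1.0f / y | ||||
| | // and uses a hypothetical MulHi32(a, b) = (uint32_t)(((uint64_t)a * b) >> 32): | ||||
| | // | ||||
| | //   float scale = llvm::bit_cast<float>(0x4f7ffffeu);     // just below 2^32 | ||||
| | //   uint32_t z  = (uint32_t)(scale * (1.0f / (float)y));  // initial ~2^32 / y | ||||
| | //   z += MulHi32(z, (0u - y) * z);                        // one UNR step | ||||
| | //   uint32_t q  = MulHi32(x, z);                          // quotient estimate | ||||
| | //   uint32_t r  = x - q * y;                              // remainder estimate | ||||
| | //   if (r >= y) { ++q; r -= y; }                          // first refinement | ||||
| | //   if (r >= y) { ++q; r -= y; }                          // second refinement | ||||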
| 3530 | |||||
| 3531 | // Build integer reciprocal sequence around V_RCP_IFLAG_F32 | ||||
| 3532 | // | ||||
| 3533 | // Return lo, hi of result | ||||
| 3534 | // | ||||
| 3535 | // %cvt.lo = G_UITOFP Val.lo | ||||
| 3536 | // %cvt.hi = G_UITOFP Val.hi | ||||
| 3537 | // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo | ||||
| 3538 | // %rcp = G_AMDGPU_RCP_IFLAG %mad | ||||
| 3539 | // %mul1 = G_FMUL %rcp, 0x5f7ffffc | ||||
| 3540 | // %mul2 = G_FMUL %mul1, 2**(-32) | ||||
| 3541 | // %trunc = G_INTRINSIC_TRUNC %mul2 | ||||
| 3542 | // %mad2 = G_FMAD %trunc, -(2**32), %mul1 | ||||
| 3543 | // return {G_FPTOUI %mad2, G_FPTOUI %trunc} | ||||
| 3544 | static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, | ||||
| 3545 | Register Val) { | ||||
| 3546 | const LLT S32 = LLT::scalar(32); | ||||
| 3547 | auto Unmerge = B.buildUnmerge(S32, Val); | ||||
| 3548 | |||||
| 3549 | auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); | ||||
| 3550 | auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); | ||||
| 3551 | |||||
| 3552 | auto Mad = B.buildFMAD( | ||||
| 3553 | S32, CvtHi, // 2**32 | ||||
| 3554 | B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); | ||||
| 3555 | |||||
| 3556 | auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); | ||||
| 3557 | auto Mul1 = B.buildFMul( | ||||
| 3558 | S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); | ||||
| 3559 | |||||
| 3560 | // 2**(-32) | ||||
| 3561 | auto Mul2 = B.buildFMul( | ||||
| 3562 | S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); | ||||
| 3563 | auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); | ||||
| 3564 | |||||
| 3565 | // -(2**32) | ||||
| 3566 | auto Mad2 = B.buildFMAD( | ||||
| 3567 | S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), | ||||
| 3568 | Mul1); | ||||
| 3569 | |||||
| 3570 | auto ResultLo = B.buildFPTOUI(S32, Mad2); | ||||
| 3571 | auto ResultHi = B.buildFPTOUI(S32, Trunc); | ||||
| 3572 | |||||
| 3573 | return {ResultLo.getReg(0), ResultHi.getReg(0)}; | ||||
| 3574 | } | ||||
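| | // [Editor's note] Not part of the original source: the sequence above | ||||
| | // approximates floor(2^64 / Val) and returns it split into 32-bit halves. | ||||
| | // Rough scalar model (assuming RCP behaves like 1.0f / x): | ||||
| | // | ||||
| | //   float f  = (float)val_hi * 0x1p32f + (float)val_lo;        // ~= (float)Val | ||||
| | //   float m1 = (1.0f / f) * llvm::bit_cast<float>(0x5f7ffffcu); // ~= 2^64 / Val | ||||
| | //   float hi = truncf(m1 * 0x1p-32f);                           // upper 32 bits | ||||
| | //   float lo = m1 - hi * 0x1p32f;                               // lower 32 bits | ||||
| | //   return {(uint32_t)lo, (uint32_t)hi}; | ||||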
| 3575 | |||||
| 3576 | void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, | ||||
| 3577 | Register DstDivReg, | ||||
| 3578 | Register DstRemReg, | ||||
| 3579 | Register Numer, | ||||
| 3580 | Register Denom) const { | ||||
| 3581 | const LLT S32 = LLT::scalar(32); | ||||
| 3582 | const LLT S64 = LLT::scalar(64); | ||||
| 3583 | const LLT S1 = LLT::scalar(1); | ||||
| 3584 | Register RcpLo, RcpHi; | ||||
| 3585 | |||||
| 3586 | std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); | ||||
| 3587 | |||||
| 3588 | auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); | ||||
| 3589 | |||||
| 3590 | auto Zero64 = B.buildConstant(S64, 0); | ||||
| 3591 | auto NegDenom = B.buildSub(S64, Zero64, Denom); | ||||
| 3592 | |||||
| 3593 | auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); | ||||
| 3594 | auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); | ||||
| 3595 | |||||
| 3596 | auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); | ||||
| 3597 | Register MulHi1_Lo = UnmergeMulHi1.getReg(0); | ||||
| 3598 | Register MulHi1_Hi = UnmergeMulHi1.getReg(1); | ||||
| 3599 | |||||
| 3600 | auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); | ||||
| 3601 | auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); | ||||
| 3602 | auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); | ||||
| 3603 | |||||
| 3604 | auto MulLo2 = B.buildMul(S64, NegDenom, Add1); | ||||
| 3605 | auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); | ||||
| 3606 | auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); | ||||
| 3607 | Register MulHi2_Lo = UnmergeMulHi2.getReg(0); | ||||
| 3608 | Register MulHi2_Hi = UnmergeMulHi2.getReg(1); | ||||
| 3609 | |||||
| 3610 | auto Zero32 = B.buildConstant(S32, 0); | ||||
| 3611 | auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); | ||||
| 3612 | auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); | ||||
| 3613 | auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); | ||||
| 3614 | |||||
| 3615 | auto UnmergeNumer = B.buildUnmerge(S32, Numer); | ||||
| 3616 | Register NumerLo = UnmergeNumer.getReg(0); | ||||
| 3617 | Register NumerHi = UnmergeNumer.getReg(1); | ||||
| 3618 | |||||
| 3619 | auto MulHi3 = B.buildUMulH(S64, Numer, Add2); | ||||
| 3620 | auto Mul3 = B.buildMul(S64, Denom, MulHi3); | ||||
| 3621 | auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); | ||||
| 3622 | Register Mul3_Lo = UnmergeMul3.getReg(0); | ||||
| 3623 | Register Mul3_Hi = UnmergeMul3.getReg(1); | ||||
| 3624 | auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); | ||||
| 3625 | auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); | ||||
| 3626 | auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); | ||||
| 3627 | auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); | ||||
| 3628 | |||||
| 3629 | auto UnmergeDenom = B.buildUnmerge(S32, Denom); | ||||
| 3630 | Register DenomLo = UnmergeDenom.getReg(0); | ||||
| 3631 | Register DenomHi = UnmergeDenom.getReg(1); | ||||
| 3632 | |||||
| 3633 | auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); | ||||
| 3634 | auto C1 = B.buildSExt(S32, CmpHi); | ||||
| 3635 | |||||
| 3636 | auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); | ||||
| 3637 | auto C2 = B.buildSExt(S32, CmpLo); | ||||
| 3638 | |||||
| 3639 | auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); | ||||
| 3640 | auto C3 = B.buildSelect(S32, CmpEq, C2, C1); | ||||
| 3641 | |||||
| 3642 | // TODO: Here and below, portions of the code could be enclosed in if/endif | ||||
| 3643 | // blocks. Currently the control flow is unconditional and we have 4 selects | ||||
| 3644 | // after the potential endif to substitute PHIs. | ||||
| 3645 | |||||
| 3646 | // if C3 != 0 ... | ||||
| 3647 | auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); | ||||
| 3648 | auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); | ||||
| 3649 | auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); | ||||
| 3650 | auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); | ||||
| 3651 | |||||
| 3652 | auto One64 = B.buildConstant(S64, 1); | ||||
| 3653 | auto Add3 = B.buildAdd(S64, MulHi3, One64); | ||||
| 3654 | |||||
| 3655 | auto C4 = | ||||
| 3656 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); | ||||
| 3657 | auto C5 = | ||||
| 3658 | B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); | ||||
| 3659 | auto C6 = B.buildSelect( | ||||
| 3660 | S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); | ||||
| 3661 | |||||
| 3662 | // if (C6 != 0) | ||||
| 3663 | auto Add4 = B.buildAdd(S64, Add3, One64); | ||||
| 3664 | auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); | ||||
| 3665 | |||||
| 3666 | auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); | ||||
| 3667 | auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); | ||||
| 3668 | auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); | ||||
| 3669 | |||||
| 3670 | // endif C6 | ||||
| 3671 | // endif C3 | ||||
| 3672 | |||||
| 3673 | if (DstDivReg) { | ||||
| 3674 | auto Sel1 = B.buildSelect( | ||||
| 3675 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); | ||||
| 3676 | B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), | ||||
| 3677 | Sel1, MulHi3); | ||||
| 3678 | } | ||||
| 3679 | |||||
| 3680 | if (DstRemReg) { | ||||
| 3681 | auto Sel2 = B.buildSelect( | ||||
| 3682 | S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); | ||||
| 3683 | B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), | ||||
| 3684 | Sel2, Sub1); | ||||
| 3685 | } | ||||
| 3686 | } | ||||
| 3687 | |||||
| 3688 | bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, | ||||
| 3689 | MachineRegisterInfo &MRI, | ||||
| 3690 | MachineIRBuilder &B) const { | ||||
| 3691 | Register DstDivReg, DstRemReg; | ||||
| 3692 | switch (MI.getOpcode()) { | ||||
| 3693 | default: | ||||
| 3694 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3694); | ||||
| 3695 | case AMDGPU::G_UDIV: { | ||||
| 3696 | DstDivReg = MI.getOperand(0).getReg(); | ||||
| 3697 | break; | ||||
| 3698 | } | ||||
| 3699 | case AMDGPU::G_UREM: { | ||||
| 3700 | DstRemReg = MI.getOperand(0).getReg(); | ||||
| 3701 | break; | ||||
| 3702 | } | ||||
| 3703 | case AMDGPU::G_UDIVREM: { | ||||
| 3704 | DstDivReg = MI.getOperand(0).getReg(); | ||||
| 3705 | DstRemReg = MI.getOperand(1).getReg(); | ||||
| 3706 | break; | ||||
| 3707 | } | ||||
| 3708 | } | ||||
| 3709 | |||||
| 3710 | const LLT S64 = LLT::scalar(64); | ||||
| 3711 | const LLT S32 = LLT::scalar(32); | ||||
| 3712 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||
| 3713 | Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||
| 3714 | Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||
| 3715 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||
| 3716 | |||||
| 3717 | if (Ty == S32) | ||||
| 3718 | legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||
| 3719 | else if (Ty == S64) | ||||
| 3720 | legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); | ||||
| 3721 | else | ||||
| 3722 | return false; | ||||
| 3723 | |||||
| 3724 | MI.eraseFromParent(); | ||||
| 3725 | return true; | ||||
| 3726 | } | ||||
| 3727 | |||||
| 3728 | bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, | ||||
| 3729 | MachineRegisterInfo &MRI, | ||||
| 3730 | MachineIRBuilder &B) const { | ||||
| 3731 | const LLT S64 = LLT::scalar(64); | ||||
| 3732 | const LLT S32 = LLT::scalar(32); | ||||
| 3733 | |||||
| 3734 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); | ||||
| 3735 | if (Ty != S32 && Ty != S64) | ||||
| 3736 | return false; | ||||
| 3737 | |||||
| 3738 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); | ||||
| 3739 | Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); | ||||
| 3740 | Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); | ||||
| 3741 | |||||
| 3742 | auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); | ||||
| 3743 | auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); | ||||
| 3744 | auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); | ||||
| 3745 | |||||
| 3746 | LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); | ||||
| 3747 | RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); | ||||
| 3748 | |||||
| 3749 | LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); | ||||
| 3750 | RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); | ||||
| 3751 | |||||
| 3752 | Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; | ||||
| 3753 | switch (MI.getOpcode()) { | ||||
| 3754 | default: | ||||
| 3755 | llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 3755); | ||||
| 3756 | case AMDGPU::G_SDIV: { | ||||
| 3757 | DstDivReg = MI.getOperand(0).getReg(); | ||||
| 3758 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||
| 3759 | break; | ||||
| 3760 | } | ||||
| 3761 | case AMDGPU::G_SREM: { | ||||
| 3762 | DstRemReg = MI.getOperand(0).getReg(); | ||||
| 3763 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||
| 3764 | break; | ||||
| 3765 | } | ||||
| 3766 | case AMDGPU::G_SDIVREM: { | ||||
| 3767 | DstDivReg = MI.getOperand(0).getReg(); | ||||
| 3768 | DstRemReg = MI.getOperand(1).getReg(); | ||||
| 3769 | TmpDivReg = MRI.createGenericVirtualRegister(Ty); | ||||
| 3770 | TmpRemReg = MRI.createGenericVirtualRegister(Ty); | ||||
| 3771 | break; | ||||
| 3772 | } | ||||
| 3773 | } | ||||
| 3774 | |||||
| 3775 | if (Ty == S32) | ||||
| 3776 | legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||
| 3777 | else | ||||
| 3778 | legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); | ||||
| 3779 | |||||
| 3780 | if (DstDivReg) { | ||||
| 3781 | auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); | ||||
| 3782 | auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); | ||||
| 3783 | B.buildSub(DstDivReg, SignXor, Sign); | ||||
| 3784 | } | ||||
| 3785 | |||||
| 3786 | if (DstRemReg) { | ||||
| 3787 | auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS | ||||
| 3788 | auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); | ||||
| 3789 | B.buildSub(DstRemReg, SignXor, Sign); | ||||
| 3790 | } | ||||
| 3791 | |||||
| 3792 | MI.eraseFromParent(); | ||||
| 3793 | return true; | ||||
| 3794 | } | ||||
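| | // [Editor's note] Illustrative sketch of the sign handling above, not part of | ||||
| | // the original source (32-bit case, arithmetic shift as sign splat, wrapping | ||||
| | // arithmetic assumed for INT_MIN): | ||||
| | // | ||||
| | //   int32_t  sx = x >> 31, sy = y >> 31;        // all-ones when negative | ||||
| | //   uint32_t ax = (uint32_t)((x + sx) ^ sx);    // |x| | ||||
| | //   uint32_t ay = (uint32_t)((y + sy) ^ sy);    // |y| | ||||
| | //   int32_t  qs = sx ^ sy;                      // quotient sign | ||||
| | //   int32_t  q  = (int32_t)((ax / ay) ^ qs) - qs; | ||||
| | //   int32_t  r  = (int32_t)((ax % ay) ^ sx) - sx;  // remainder follows x | ||||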
| 3795 | |||||
| 3796 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, | ||||
| 3797 | MachineRegisterInfo &MRI, | ||||
| 3798 | MachineIRBuilder &B) const { | ||||
| 3799 | Register Res = MI.getOperand(0).getReg(); | ||||
| 3800 | Register LHS = MI.getOperand(1).getReg(); | ||||
| 3801 | Register RHS = MI.getOperand(2).getReg(); | ||||
| 3802 | uint16_t Flags = MI.getFlags(); | ||||
| 3803 | LLT ResTy = MRI.getType(Res); | ||||
| 3804 | |||||
| 3805 | const MachineFunction &MF = B.getMF(); | ||||
| 3806 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||
| 3807 | MI.getFlag(MachineInstr::FmAfn); | ||||
| 3808 | |||||
| 3809 | if (!AllowInaccurateRcp) | ||||
| 3810 | return false; | ||||
| 3811 | |||||
| 3812 | if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { | ||||
| 3813 | // 1 / x -> RCP(x) | ||||
| 3814 | if (CLHS->isExactlyValue(1.0)) { | ||||
| 3815 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||
| 3816 | .addUse(RHS) | ||||
| 3817 | .setMIFlags(Flags); | ||||
| 3818 | |||||
| 3819 | MI.eraseFromParent(); | ||||
| 3820 | return true; | ||||
| 3821 | } | ||||
| 3822 | |||||
| 3823 | // -1 / x -> RCP( FNEG(x) ) | ||||
| 3824 | if (CLHS->isExactlyValue(-1.0)) { | ||||
| 3825 | auto FNeg = B.buildFNeg(ResTy, RHS, Flags); | ||||
| 3826 | B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) | ||||
| 3827 | .addUse(FNeg.getReg(0)) | ||||
| 3828 | .setMIFlags(Flags); | ||||
| 3829 | |||||
| 3830 | MI.eraseFromParent(); | ||||
| 3831 | return true; | ||||
| 3832 | } | ||||
| 3833 | } | ||||
| 3834 | |||||
| 3835 | // x / y -> x * (1.0 / y) | ||||
| 3836 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||
| 3837 | .addUse(RHS) | ||||
| 3838 | .setMIFlags(Flags); | ||||
| 3839 | B.buildFMul(Res, LHS, RCP, Flags); | ||||
| 3840 | |||||
| 3841 | MI.eraseFromParent(); | ||||
| 3842 | return true; | ||||
| 3843 | } | ||||
| 3844 | |||||
| 3845 | bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, | ||||
| 3846 | MachineRegisterInfo &MRI, | ||||
| 3847 | MachineIRBuilder &B) const { | ||||
| 3848 | Register Res = MI.getOperand(0).getReg(); | ||||
| 3849 | Register X = MI.getOperand(1).getReg(); | ||||
| 3850 | Register Y = MI.getOperand(2).getReg(); | ||||
| 3851 | uint16_t Flags = MI.getFlags(); | ||||
| 3852 | LLT ResTy = MRI.getType(Res); | ||||
| 3853 | |||||
| 3854 | const MachineFunction &MF = B.getMF(); | ||||
| 3855 | bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || | ||||
| 3856 | MI.getFlag(MachineInstr::FmAfn); | ||||
| 3857 | |||||
| 3858 | if (!AllowInaccurateRcp) | ||||
| 3859 | return false; | ||||
| 3860 | |||||
| 3861 | auto NegY = B.buildFNeg(ResTy, Y); | ||||
| 3862 | auto One = B.buildFConstant(ResTy, 1.0); | ||||
| 3863 | |||||
| 3864 | auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) | ||||
| 3865 | .addUse(Y) | ||||
| 3866 | .setMIFlags(Flags); | ||||
| 3867 | |||||
| 3868 | auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); | ||||
| 3869 | R = B.buildFMA(ResTy, Tmp0, R, R); | ||||
| 3870 | |||||
| 3871 | auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); | ||||
| 3872 | R = B.buildFMA(ResTy, Tmp1, R, R); | ||||
| 3873 | |||||
| 3874 | auto Ret = B.buildFMul(ResTy, X, R); | ||||
| 3875 | auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); | ||||
| 3876 | |||||
| 3877 | B.buildFMA(Res, Tmp2, R, Ret); | ||||
| 3878 | MI.eraseFromParent(); | ||||
| 3879 | return true; | ||||
| 3880 | } | ||||
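| | // [Editor's note] Scalar model of the refinement above, not part of the | ||||
| | // original source. With a hypothetical approx_rcp() as the initial 1/y | ||||
| | // estimate, each FMA pair is one Newton-Raphson step, followed by a residual | ||||
| | // correction of the quotient: | ||||
| | // | ||||
| | //   double r = approx_rcp(y); | ||||
| | //   r = r + r * (1.0 - y * r);         // step 1: error term fma(-y, r, 1) | ||||
| | //   r = r + r * (1.0 - y * r);         // step 2 | ||||
| | //   double q = x * r; | ||||
| | //   return q + r * (x - y * q);        // fma(fma(-y, q, x), r, q) | ||||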
| 3881 | |||||
| 3882 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, | ||||
| 3883 | MachineRegisterInfo &MRI, | ||||
| 3884 | MachineIRBuilder &B) const { | ||||
| 3885 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||
| 3886 | return true; | ||||
| 3887 | |||||
| 3888 | Register Res = MI.getOperand(0).getReg(); | ||||
| 3889 | Register LHS = MI.getOperand(1).getReg(); | ||||
| 3890 | Register RHS = MI.getOperand(2).getReg(); | ||||
| 3891 | |||||
| 3892 | uint16_t Flags = MI.getFlags(); | ||||
| 3893 | |||||
| 3894 | LLT S16 = LLT::scalar(16); | ||||
| 3895 | LLT S32 = LLT::scalar(32); | ||||
| 3896 | |||||
| 3897 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); | ||||
| 3898 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); | ||||
| 3899 | |||||
| 3900 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||
| 3901 | .addUse(RHSExt.getReg(0)) | ||||
| 3902 | .setMIFlags(Flags); | ||||
| 3903 | |||||
| 3904 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); | ||||
| 3905 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); | ||||
| 3906 | |||||
| 3907 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||
| 3908 | .addUse(RDst.getReg(0)) | ||||
| 3909 | .addUse(RHS) | ||||
| 3910 | .addUse(LHS) | ||||
| 3911 | .setMIFlags(Flags); | ||||
| 3912 | |||||
| 3913 | MI.eraseFromParent(); | ||||
| 3914 | return true; | ||||
| 3915 | } | ||||
| 3916 | |||||
| 3917 | // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions | ||||
| 3918 | // that enable denormals; when false, restore the function's default FP32 mode. | ||||
| 3919 | static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, | ||||
| 3920 | const GCNSubtarget &ST, | ||||
| 3921 | SIModeRegisterDefaults Mode) { | ||||
| 3922 | // Set SP denorm mode to this value. | ||||
| 3923 | unsigned SPDenormMode = | ||||
| 3924 | Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); | ||||
| 3925 | |||||
| 3926 | if (ST.hasDenormModeInst()) { | ||||
| 3927 | // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode. | ||||
| 3928 | uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); | ||||
| 3929 | |||||
| 3930 | uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); | ||||
| 3931 | B.buildInstr(AMDGPU::S_DENORM_MODE) | ||||
| 3932 | .addImm(NewDenormModeValue); | ||||
| 3933 | |||||
| 3934 | } else { | ||||
| 3935 | // Select FP32 bit field in mode register. | ||||
| 3936 | unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | | ||||
| 3937 | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | | ||||
| 3938 | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); | ||||
| 3939 | |||||
| 3940 | B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) | ||||
| 3941 | .addImm(SPDenormMode) | ||||
| 3942 | .addImm(SPDenormModeBitField); | ||||
| 3943 | } | ||||
| 3944 | } | ||||
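| | // [Editor's note] Not part of the original source: as encoded above, the | ||||
| | // S_DENORM_MODE immediate packs the FP32 denorm mode into bits [1:0] and the | ||||
| | // preserved FP64/FP16 mode into bits [3:2]: | ||||
| | // | ||||
| | //   unsigned Imm = SPDenormMode | (DPDenormModeDefault << 2);   // 4-bit field | ||||
| | // | ||||
| | // while the S_SETREG fallback writes only the two FP32 denorm bits, selecting | ||||
| | // hwreg(MODE) at bit offset 4 with width 2 (WIDTH_M1 == 1). | ||||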
| 3945 | |||||
| 3946 | bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, | ||||
| 3947 | MachineRegisterInfo &MRI, | ||||
| 3948 | MachineIRBuilder &B) const { | ||||
| 3949 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) | ||||
| 3950 | return true; | ||||
| 3951 | |||||
| 3952 | Register Res = MI.getOperand(0).getReg(); | ||||
| 3953 | Register LHS = MI.getOperand(1).getReg(); | ||||
| 3954 | Register RHS = MI.getOperand(2).getReg(); | ||||
| 3955 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
| 3956 | SIModeRegisterDefaults Mode = MFI->getMode(); | ||||
| 3957 | |||||
| 3958 | uint16_t Flags = MI.getFlags(); | ||||
| 3959 | |||||
| 3960 | LLT S32 = LLT::scalar(32); | ||||
| 3961 | LLT S1 = LLT::scalar(1); | ||||
| 3962 | |||||
| 3963 | auto One = B.buildFConstant(S32, 1.0f); | ||||
| 3964 | |||||
| 3965 | auto DenominatorScaled = | ||||
| 3966 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||
| 3967 | .addUse(LHS) | ||||
| 3968 | .addUse(RHS) | ||||
| 3969 | .addImm(0) | ||||
| 3970 | .setMIFlags(Flags); | ||||
| 3971 | auto NumeratorScaled = | ||||
| 3972 | B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) | ||||
| 3973 | .addUse(LHS) | ||||
| 3974 | .addUse(RHS) | ||||
| 3975 | .addImm(1) | ||||
| 3976 | .setMIFlags(Flags); | ||||
| 3977 | |||||
| 3978 | auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||
| 3979 | .addUse(DenominatorScaled.getReg(0)) | ||||
| 3980 | .setMIFlags(Flags); | ||||
| 3981 | auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); | ||||
| 3982 | |||||
| 3983 | // FIXME: Doesn't correctly model the FP mode switch, and the FP operations | ||||
| 3984 | // aren't modeled as reading it. | ||||
| 3985 | if (!Mode.allFP32Denormals()) | ||||
| 3986 | toggleSPDenormMode(true, B, ST, Mode); | ||||
| 3987 | |||||
| 3988 | auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); | ||||
| 3989 | auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); | ||||
| 3990 | auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); | ||||
| 3991 | auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); | ||||
| 3992 | auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); | ||||
| 3993 | auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); | ||||
| 3994 | |||||
| 3995 | if (!Mode.allFP32Denormals()) | ||||
| 3996 | toggleSPDenormMode(false, B, ST, Mode); | ||||
| 3997 | |||||
| 3998 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) | ||||
| 3999 | .addUse(Fma4.getReg(0)) | ||||
| 4000 | .addUse(Fma1.getReg(0)) | ||||
| 4001 | .addUse(Fma3.getReg(0)) | ||||
| 4002 | .addUse(NumeratorScaled.getReg(1)) | ||||
| 4003 | .setMIFlags(Flags); | ||||
| 4004 | |||||
| 4005 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) | ||||
| 4006 | .addUse(Fmas.getReg(0)) | ||||
| 4007 | .addUse(RHS) | ||||
| 4008 | .addUse(LHS) | ||||
| 4009 | .setMIFlags(Flags); | ||||
| 4010 | |||||
| 4011 | MI.eraseFromParent(); | ||||
| 4012 | return true; | ||||
| 4013 | } | ||||
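| | // [Editor's note] Rough scalar model of the FMA chain above, not part of the | ||||
| | // original source; the div_scale scaling, denorm-mode toggling and the final | ||||
| | // div_fixup for special cases are omitted, and approx_rcp() is hypothetical: | ||||
| | // | ||||
| | //   float r = approx_rcp(d);            // ApproxRcp | ||||
| | //   r = r + (1.0f - d * r) * r;         // Fma0, Fma1 | ||||
| | //   float q = n * r;                    // Mul | ||||
| | //   q = q + (n - d * q) * r;            // Fma2, Fma3 | ||||
| | //   return q + (n - d * q) * r;         // Fma4 folded in by div_fmas | ||||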
| 4014 | |||||
| 4015 | bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, | ||||
| 4016 | MachineRegisterInfo &MRI, | ||||
| 4017 | MachineIRBuilder &B) const { | ||||
| 4018 | if (legalizeFastUnsafeFDIV64(MI, MRI, B)) | ||||
| 4019 | return true; | ||||
| 4020 | |||||
| 4021 | Register Res = MI.getOperand(0).getReg(); | ||||
| 4022 | Register LHS = MI.getOperand(1).getReg(); | ||||
| 4023 | Register RHS = MI.getOperand(2).getReg(); | ||||
| 4024 | |||||
| 4025 | uint16_t Flags = MI.getFlags(); | ||||
| 4026 | |||||
| 4027 | LLT S64 = LLT::scalar(64); | ||||
| 4028 | LLT S1 = LLT::scalar(1); | ||||
| 4029 | |||||
| 4030 | auto One = B.buildFConstant(S64, 1.0); | ||||
| 4031 | |||||
| 4032 | auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||
| 4033 | .addUse(LHS) | ||||
| 4034 | .addUse(RHS) | ||||
| 4035 | .addImm(0) | ||||
| 4036 | .setMIFlags(Flags); | ||||
| 4037 | |||||
| 4038 | auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); | ||||
| 4039 | |||||
| 4040 | auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) | ||||
| 4041 | .addUse(DivScale0.getReg(0)) | ||||
| 4042 | .setMIFlags(Flags); | ||||
| 4043 | |||||
| 4044 | auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); | ||||
| 4045 | auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); | ||||
| 4046 | auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); | ||||
| 4047 | |||||
| 4048 | auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) | ||||
| 4049 | .addUse(LHS) | ||||
| 4050 | .addUse(RHS) | ||||
| 4051 | .addImm(1) | ||||
| 4052 | .setMIFlags(Flags); | ||||
| 4053 | |||||
| 4054 | auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); | ||||
| 4055 | auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); | ||||
| 4056 | auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); | ||||
| 4057 | |||||
| 4058 | Register Scale; | ||||
| 4059 | if (!ST.hasUsableDivScaleConditionOutput()) { | ||||
| 4060 | // Workaround a hardware bug on SI where the condition output from div_scale | ||||
| 4061 | // is not usable. | ||||
| 4062 | |||||
| 4063 | LLT S32 = LLT::scalar(32); | ||||
| 4064 | |||||
| 4065 | auto NumUnmerge = B.buildUnmerge(S32, LHS); | ||||
| 4066 | auto DenUnmerge = B.buildUnmerge(S32, RHS); | ||||
| 4067 | auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); | ||||
| 4068 | auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); | ||||
| 4069 | |||||
| 4070 | auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), | ||||
| 4071 | Scale1Unmerge.getReg(1)); | ||||
| 4072 | auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), | ||||
| 4073 | Scale0Unmerge.getReg(1)); | ||||
| 4074 | Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); | ||||
| 4075 | } else { | ||||
| 4076 | Scale = DivScale1.getReg(1); | ||||
| 4077 | } | ||||
| 4078 | |||||
| 4079 | auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) | ||||
| 4080 | .addUse(Fma4.getReg(0)) | ||||
| 4081 | .addUse(Fma3.getReg(0)) | ||||
| 4082 | .addUse(Mul.getReg(0)) | ||||
| 4083 | .addUse(Scale) | ||||
| 4084 | .setMIFlags(Flags); | ||||
| 4085 | |||||
| 4086 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false) | ||||
| 4087 | .addUse(Fmas.getReg(0)) | ||||
| 4088 | .addUse(RHS) | ||||
| 4089 | .addUse(LHS) | ||||
| 4090 | .setMIFlags(Flags); | ||||
| 4091 | |||||
| 4092 | MI.eraseFromParent(); | ||||
| 4093 | return true; | ||||
| 4094 | } | ||||
| 4095 | |||||
| 4096 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, | ||||
| 4097 | MachineRegisterInfo &MRI, | ||||
| 4098 | MachineIRBuilder &B) const { | ||||
| 4099 | Register Res = MI.getOperand(0).getReg(); | ||||
| 4100 | Register LHS = MI.getOperand(2).getReg(); | ||||
| 4101 | Register RHS = MI.getOperand(3).getReg(); | ||||
| 4102 | uint16_t Flags = MI.getFlags(); | ||||
| 4103 | |||||
| 4104 | LLT S32 = LLT::scalar(32); | ||||
| 4105 | LLT S1 = LLT::scalar(1); | ||||
| 4106 | |||||
| 4107 | auto Abs = B.buildFAbs(S32, RHS, Flags); | ||||
| 4108 | const APFloat C0Val(1.0f); | ||||
| 4109 | |||||
| 4110 | auto C0 = B.buildConstant(S32, 0x6f800000); | ||||
| 4111 | auto C1 = B.buildConstant(S32, 0x2f800000); | ||||
| 4112 | auto C2 = B.buildConstant(S32, llvm::bit_cast<uint32_t>(1.0f)); | ||||
| 4113 | |||||
| 4114 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); | ||||
| 4115 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); | ||||
| 4116 | |||||
| 4117 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); | ||||
| 4118 | |||||
| 4119 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) | ||||
| 4120 | .addUse(Mul0.getReg(0)) | ||||
| 4121 | .setMIFlags(Flags); | ||||
| 4122 | |||||
| 4123 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); | ||||
| 4124 | |||||
| 4125 | B.buildFMul(Res, Sel, Mul1, Flags); | ||||
| 4126 | |||||
| 4127 | MI.eraseFromParent(); | ||||
| 4128 | return true; | ||||
| 4129 | } | ||||
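| | // [Editor's note] Not part of the original source: a scalar model of the | ||||
| | // range scaling above. 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit | ||||
| | // patterns, so huge denominators are scaled down before the rcp and the scale | ||||
| | // is folded back into the result (approx_rcp() is hypothetical): | ||||
| | // | ||||
| | //   float s = (fabsf(y) > 0x1p96f) ? 0x1p-32f : 1.0f; | ||||
| | //   return s * (x * approx_rcp(y * s));   // ~= x / y | ||||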
| 4130 | |||||
| 4131 | // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. | ||||
| 4132 | // FIXME: Why do we handle this one but not other removed instructions? | ||||
| 4133 | // | ||||
| 4134 | // Reciprocal square root. The clamp prevents infinite results, clamping | ||||
| 4135 | // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to | ||||
| 4136 | // +-max_float. | ||||
| 4137 | bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, | ||||
| 4138 | MachineRegisterInfo &MRI, | ||||
| 4139 | MachineIRBuilder &B) const { | ||||
| 4140 | if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) | ||||
| 4141 | return true; | ||||
| 4142 | |||||
| 4143 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 4144 | Register Src = MI.getOperand(2).getReg(); | ||||
| 4145 | auto Flags = MI.getFlags(); | ||||
| 4146 | |||||
| 4147 | LLT Ty = MRI.getType(Dst); | ||||
| 4148 | |||||
| 4149 | const fltSemantics *FltSemantics; | ||||
| 4150 | if (Ty == LLT::scalar(32)) | ||||
| 4151 | FltSemantics = &APFloat::IEEEsingle(); | ||||
| 4152 | else if (Ty == LLT::scalar(64)) | ||||
| 4153 | FltSemantics = &APFloat::IEEEdouble(); | ||||
| 4154 | else | ||||
| 4155 | return false; | ||||
| 4156 | |||||
| 4157 | auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) | ||||
| 4158 | .addUse(Src) | ||||
| 4159 | .setMIFlags(Flags); | ||||
| 4160 | |||||
| 4161 | // We don't need to concern ourselves with the snan handling difference, since | ||||
| 4162 | // the rsq already quieted (or not) any snan; use the variant that selects directly. | ||||
| 4163 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
| 4164 | const bool UseIEEE = MFI->getMode().IEEE; | ||||
| 4165 | |||||
| 4166 | auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); | ||||
| 4167 | auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : | ||||
| 4168 | B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); | ||||
| 4169 | |||||
| 4170 | auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); | ||||
| 4171 | |||||
| 4172 | if (UseIEEE) | ||||
| 4173 | B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); | ||||
| 4174 | else | ||||
| 4175 | B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); | ||||
| 4176 | MI.eraseFromParent(); | ||||
| 4177 | return true; | ||||
| 4178 | } | ||||
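| | // [Editor's note] Illustrative sketch, not part of the original source: the | ||||
| | // clamp above is roughly (approx_rsq() is hypothetical) | ||||
| | // | ||||
| | //   float r = approx_rsq(x);                      // 1 / sqrt(x) | ||||
| | //   return fmaxf(fminf(r, FLT_MAX), -FLT_MAX);    // clamp +/-inf to max_float | ||||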
| 4179 | |||||
| 4180 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { | ||||
| 4181 | switch (IID) { | ||||
| 4182 | case Intrinsic::amdgcn_ds_fadd: | ||||
| 4183 | return AMDGPU::G_ATOMICRMW_FADD; | ||||
| 4184 | case Intrinsic::amdgcn_ds_fmin: | ||||
| 4185 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; | ||||
| 4186 | case Intrinsic::amdgcn_ds_fmax: | ||||
| 4187 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; | ||||
| 4188 | default: | ||||
| 4189 | llvm_unreachable("not a DS FP intrinsic")::llvm::llvm_unreachable_internal("not a DS FP intrinsic", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4189); | ||||
| 4190 | } | ||||
| 4191 | } | ||||
| 4192 | |||||
| 4193 | bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, | ||||
| 4194 | MachineInstr &MI, | ||||
| 4195 | Intrinsic::ID IID) const { | ||||
| 4196 | GISelChangeObserver &Observer = Helper.Observer; | ||||
| 4197 | Observer.changingInstr(MI); | ||||
| 4198 | |||||
| 4199 | MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); | ||||
| 4200 | |||||
| 4201 | // The remaining operands were used to set fields in the MemOperand on | ||||
| 4202 | // construction. | ||||
| 4203 | for (int I = 6; I > 3; --I) | ||||
| 4204 | MI.removeOperand(I); | ||||
| 4205 | |||||
| 4206 | MI.removeOperand(1); // Remove the intrinsic ID. | ||||
| 4207 | Observer.changedInstr(MI); | ||||
| 4208 | return true; | ||||
| 4209 | } | ||||
| 4210 | |||||
| 4211 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, | ||||
| 4212 | MachineRegisterInfo &MRI, | ||||
| 4213 | MachineIRBuilder &B) const { | ||||
| 4214 | uint64_t Offset = | ||||
| 4215 | ST.getTargetLowering()->getImplicitParameterOffset( | ||||
| 4216 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); | ||||
| 4217 | LLT DstTy = MRI.getType(DstReg); | ||||
| 4218 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); | ||||
| 4219 | |||||
| 4220 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); | ||||
| 4221 | if (!loadInputValue(KernargPtrReg, B, | ||||
| 4222 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
| 4223 | return false; | ||||
| 4224 | |||||
| 4225 | // FIXME: This should be nuw | ||||
| 4226 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); | ||||
| 4227 | return true; | ||||
| 4228 | } | ||||
| 4229 | |||||
| 4230 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, | ||||
| 4231 | MachineRegisterInfo &MRI, | ||||
| 4232 | MachineIRBuilder &B) const { | ||||
| 4233 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
| 4234 | if (!MFI->isEntryFunction()) { | ||||
| 4235 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 4236 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); | ||||
| 4237 | } | ||||
| 4238 | |||||
| 4239 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 4240 | if (!getImplicitArgPtr(DstReg, MRI, B)) | ||||
| 4241 | return false; | ||||
| 4242 | |||||
| 4243 | MI.eraseFromParent(); | ||||
| 4244 | return true; | ||||
| 4245 | } | ||||
| 4246 | |||||
| 4247 | bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, | ||||
| 4248 | MachineRegisterInfo &MRI, | ||||
| 4249 | MachineIRBuilder &B) const { | ||||
| 4250 | Function &F = B.getMF().getFunction(); | ||||
| 4251 | std::optional<uint32_t> KnownSize = | ||||
| 4252 | AMDGPUMachineFunction::getLDSKernelIdMetadata(F); | ||||
| 4253 | if (KnownSize.has_value()) | ||||
| 4254 | B.buildConstant(DstReg, *KnownSize); | ||||
| 4255 | return false; | ||||
| 4256 | } | ||||
| 4257 | |||||
| 4258 | bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, | ||||
| 4259 | MachineRegisterInfo &MRI, | ||||
| 4260 | MachineIRBuilder &B) const { | ||||
| 4261 | |||||
| 4262 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); | ||||
| 4263 | if (!MFI->isEntryFunction()) { | ||||
| 4264 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 4265 | AMDGPUFunctionArgInfo::LDS_KERNEL_ID); | ||||
| 4266 | } | ||||
| 4267 | |||||
| 4268 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 4269 | if (!getLDSKernelId(DstReg, MRI, B)) | ||||
| 4270 | return false; | ||||
| 4271 | |||||
| 4272 | MI.eraseFromParent(); | ||||
| 4273 | return true; | ||||
| 4274 | } | ||||
| 4275 | |||||
| 4276 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, | ||||
| 4277 | MachineRegisterInfo &MRI, | ||||
| 4278 | MachineIRBuilder &B, | ||||
| 4279 | unsigned AddrSpace) const { | ||||
| 4280 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); | ||||
| 4281 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); | ||||
| 4282 | Register Hi32 = Unmerge.getReg(1); | ||||
| 4283 | |||||
| 4284 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); | ||||
| 4285 | MI.eraseFromParent(); | ||||
| 4286 | return true; | ||||
| 4287 | } | ||||
| 4288 | |||||
| 4289 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: | ||||
| 4290 | // offset (the offset that is included in bounds checking and swizzling, to be | ||||
| 4291 | // split between the instruction's voffset and immoffset fields) and soffset | ||||
| 4292 | // (the offset that is excluded from bounds checking and swizzling, to go in | ||||
| 4293 | // the instruction's soffset field). This function takes the first kind of | ||||
| 4294 | // offset and figures out how to split it between voffset and immoffset. | ||||
| 4295 | std::pair<Register, unsigned> | ||||
| 4296 | AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, | ||||
| 4297 | Register OrigOffset) const { | ||||
| 4298 | const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); | ||||
| 4299 | Register BaseReg; | ||||
| 4300 | unsigned ImmOffset; | ||||
| 4301 | const LLT S32 = LLT::scalar(32); | ||||
| 4302 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
| 4303 | |||||
| 4304 | std::tie(BaseReg, ImmOffset) = | ||||
| 4305 | AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); | ||||
| 4306 | |||||
| 4307 | // If BaseReg is a pointer, convert it to int. | ||||
| 4308 | if (MRI.getType(BaseReg).isPointer()) | ||||
| 4309 | BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); | ||||
| 4310 | |||||
| 4311 | // If the immediate value is too big for the immoffset field, put only bits | ||||
| 4312 | // that would normally fit in the immoffset field. The remaining value that | ||||
| 4313 | // is copied/added for the voffset field is a large power of 2, and it | ||||
| 4314 | // stands more chance of being CSEd with the copy/add for another similar | ||||
| 4315 | // load/store. | ||||
| 4316 | // However, do not do that rounding down if the part left for the vgpr is a | ||||
| 4317 | // negative number, as it appears to be illegal to have a negative offset in | ||||
| 4318 | // the vgpr, even if adding the immediate offset makes it positive. | ||||
| 4319 | unsigned Overflow = ImmOffset & ~MaxImm; | ||||
| 4320 | ImmOffset -= Overflow; | ||||
| 4321 | if ((int32_t)Overflow < 0) { | ||||
| 4322 | Overflow += ImmOffset; | ||||
| 4323 | ImmOffset = 0; | ||||
| 4324 | } | ||||
| 4325 | |||||
| 4326 | if (Overflow != 0) { | ||||
| 4327 | if (!BaseReg) { | ||||
| 4328 | BaseReg = B.buildConstant(S32, Overflow).getReg(0); | ||||
| 4329 | } else { | ||||
| 4330 | auto OverflowVal = B.buildConstant(S32, Overflow); | ||||
| 4331 | BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); | ||||
| 4332 | } | ||||
| 4333 | } | ||||
| 4334 | |||||
| 4335 | if (!BaseReg) | ||||
| 4336 | BaseReg = B.buildConstant(S32, 0).getReg(0); | ||||
| 4337 | |||||
| 4338 | return std::pair(BaseReg, ImmOffset); | ||||
| 4339 | } | ||||
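| | // [Editor's note] Worked example, not part of the original source, assuming | ||||
| | // MaxImm == 4095 (the typical 12-bit MUBUF immediate limit): | ||||
| | // | ||||
| | //   OrigOffset = %base + 4100 | ||||
| | //     Overflow  = 4100 & ~4095 = 4096  -> added into the voffset register | ||||
| | //     ImmOffset = 4100 - 4096  = 4     -> goes in the instruction immediate | ||||
| | // | ||||
| | // Keeping the voffset addend a large power of two makes it more likely to be | ||||
| | // CSEd across neighbouring loads/stores. | ||||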
| 4340 | |||||
| 4341 | /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. | ||||
| 4342 | void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, | ||||
| 4343 | Register VOffset, Register SOffset, | ||||
| 4344 | unsigned ImmOffset, Register VIndex, | ||||
| 4345 | MachineRegisterInfo &MRI) const { | ||||
| 4346 | std::optional<ValueAndVReg> MaybeVOffsetVal = | ||||
| 4347 | getIConstantVRegValWithLookThrough(VOffset, MRI); | ||||
| 4348 | std::optional<ValueAndVReg> MaybeSOffsetVal = | ||||
| 4349 | getIConstantVRegValWithLookThrough(SOffset, MRI); | ||||
| 4350 | std::optional<ValueAndVReg> MaybeVIndexVal = | ||||
| 4351 | getIConstantVRegValWithLookThrough(VIndex, MRI); | ||||
| 4352 | // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, | ||||
| 4353 | // update the MMO with that offset. The stride is unknown so we can only do | ||||
| 4354 | // this if VIndex is constant 0. | ||||
| 4355 | if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && | ||||
| 4356 | MaybeVIndexVal->Value == 0) { | ||||
| 4357 | uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + | ||||
| 4358 | MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; | ||||
| 4359 | MMO->setOffset(TotalOffset); | ||||
| 4360 | } else { | ||||
| 4361 | // We don't have a constant combined offset to use in the MMO. Give up. | ||||
| 4362 | MMO->setValue((Value *)nullptr); | ||||
| 4363 | } | ||||
| 4364 | } | ||||
| 4365 | |||||
| 4366 | /// Handle register layout difference for f16 images for some subtargets. | ||||
| 4367 | Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, | ||||
| 4368 | MachineRegisterInfo &MRI, | ||||
| 4369 | Register Reg, | ||||
| 4370 | bool ImageStore) const { | ||||
| 4371 | const LLT S16 = LLT::scalar(16); | ||||
| 4372 | const LLT S32 = LLT::scalar(32); | ||||
| 4373 | LLT StoreVT = MRI.getType(Reg); | ||||
| 4374 | assert(StoreVT.isVector() && StoreVT.getElementType() == S16); | ||||
| 4375 | |||||
| 4376 | if (ST.hasUnpackedD16VMem()) { | ||||
| 4377 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||
| 4378 | |||||
| 4379 | SmallVector<Register, 4> WideRegs; | ||||
| 4380 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||
| 4381 | WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); | ||||
| 4382 | |||||
| 4383 | int NumElts = StoreVT.getNumElements(); | ||||
| 4384 | |||||
| 4385 | return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) | ||||
| 4386 | .getReg(0); | ||||
| 4387 | } | ||||
| 4388 | |||||
| 4389 | if (ImageStore && ST.hasImageStoreD16Bug()) { | ||||
| 4390 | if (StoreVT.getNumElements() == 2) { | ||||
| 4391 | SmallVector<Register, 4> PackedRegs; | ||||
| 4392 | Reg = B.buildBitcast(S32, Reg).getReg(0); | ||||
| 4393 | PackedRegs.push_back(Reg); | ||||
| 4394 | PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); | ||||
| 4395 | return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) | ||||
| 4396 | .getReg(0); | ||||
| 4397 | } | ||||
| 4398 | |||||
| 4399 | if (StoreVT.getNumElements() == 3) { | ||||
| 4400 | SmallVector<Register, 4> PackedRegs; | ||||
| 4401 | auto Unmerge = B.buildUnmerge(S16, Reg); | ||||
| 4402 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||
| 4403 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||
| 4404 | PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); | ||||
| 4405 | Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); | ||||
| 4406 | return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); | ||||
| 4407 | } | ||||
| 4408 | |||||
| 4409 | if (StoreVT.getNumElements() == 4) { | ||||
| 4410 | SmallVector<Register, 4> PackedRegs; | ||||
| 4411 | Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); | ||||
| 4412 | auto Unmerge = B.buildUnmerge(S32, Reg); | ||||
| 4413 | for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) | ||||
| 4414 | PackedRegs.push_back(Unmerge.getReg(I)); | ||||
| 4415 | PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); | ||||
| 4416 | return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) | ||||
| 4417 | .getReg(0); | ||||
| 4418 | } | ||||
| 4419 | |||||
| 4420 | llvm_unreachable("invalid data type")::llvm::llvm_unreachable_internal("invalid data type", "llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp" , 4420); | ||||
| 4421 | } | ||||
| 4422 | |||||
| 4423 | if (StoreVT == LLT::fixed_vector(3, S16)) { | ||||
| 4424 | Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) | ||||
| 4425 | .getReg(0); | ||||
| 4426 | } | ||||
| 4427 | return Reg; | ||||
| 4428 | } | ||||
| 4429 | |||||
| 4430 | Register AMDGPULegalizerInfo::fixStoreSourceType( | ||||
| 4431 | MachineIRBuilder &B, Register VData, bool IsFormat) const { | ||||
| 4432 | MachineRegisterInfo *MRI = B.getMRI(); | ||||
| 4433 | LLT Ty = MRI->getType(VData); | ||||
| 4434 | |||||
| 4435 | const LLT S16 = LLT::scalar(16); | ||||
| 4436 | |||||
| 4437 | // Fix up illegal register types for 8-bit and 16-bit stores. | ||||
| 4438 | if (Ty == LLT::scalar(8) || Ty == S16) { | ||||
| 4439 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); | ||||
| 4440 | return AnyExt; | ||||
| 4441 | } | ||||
| 4442 | |||||
| 4443 | if (Ty.isVector()) { | ||||
| 4444 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { | ||||
| 4445 | if (IsFormat) | ||||
| 4446 | return handleD16VData(B, *MRI, VData); | ||||
| 4447 | } | ||||
| 4448 | } | ||||
| 4449 | |||||
| 4450 | return VData; | ||||
| 4451 | } | ||||
| 4452 | |||||
| 4453 | bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, | ||||
| 4454 | MachineRegisterInfo &MRI, | ||||
| 4455 | MachineIRBuilder &B, | ||||
| 4456 | bool IsTyped, | ||||
| 4457 | bool IsFormat) const { | ||||
| 4458 | Register VData = MI.getOperand(1).getReg(); | ||||
| 4459 | LLT Ty = MRI.getType(VData); | ||||
| 4460 | LLT EltTy = Ty.getScalarType(); | ||||
| 4461 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||
| 4462 | const LLT S32 = LLT::scalar(32); | ||||
| 4463 | |||||
| 4464 | VData = fixStoreSourceType(B, VData, IsFormat); | ||||
| 4465 | Register RSrc = MI.getOperand(2).getReg(); | ||||
| 4466 | |||||
| 4467 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||
| 4468 | const int MemSize = MMO->getSize(); | ||||
| 4469 | |||||
| 4470 | unsigned ImmOffset; | ||||
| 4471 | |||||
| 4472 | // The typed intrinsics add an immediate after the registers. | ||||
| 4473 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||
| 4474 | |||||
| 4475 | // The struct intrinsic variants add one additional operand over raw. | ||||
| 4476 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||
| 4477 | Register VIndex; | ||||
| 4478 | int OpOffset = 0; | ||||
| 4479 | if (HasVIndex) { | ||||
| 4480 | VIndex = MI.getOperand(3).getReg(); | ||||
| 4481 | OpOffset = 1; | ||||
| 4482 | } else { | ||||
| 4483 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||
| 4484 | } | ||||
| 4485 | |||||
| 4486 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||
| 4487 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||
| 4488 | |||||
| 4489 | unsigned Format = 0; | ||||
| 4490 | if (IsTyped) { | ||||
| 4491 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||
| 4492 | ++OpOffset; | ||||
| 4493 | } | ||||
| 4494 | |||||
| 4495 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||
| 4496 | |||||
| 4497 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||
| 4498 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); | ||||
| 4499 | |||||
| 4500 | unsigned Opc; | ||||
| 4501 | if (IsTyped) { | ||||
| 4502 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : | ||||
| 4503 | AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; | ||||
| 4504 | } else if (IsFormat) { | ||||
| 4505 | Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : | ||||
| 4506 | AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; | ||||
| 4507 | } else { | ||||
| 4508 | switch (MemSize) { | ||||
| 4509 | case 1: | ||||
| 4510 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; | ||||
| 4511 | break; | ||||
| 4512 | case 2: | ||||
| 4513 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; | ||||
| 4514 | break; | ||||
| 4515 | default: | ||||
| 4516 | Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; | ||||
| 4517 | break; | ||||
| 4518 | } | ||||
| 4519 | } | ||||
| 4520 | |||||
| 4521 | auto MIB = B.buildInstr(Opc) | ||||
| 4522 | .addUse(VData) // vdata | ||||
| 4523 | .addUse(RSrc) // rsrc | ||||
| 4524 | .addUse(VIndex) // vindex | ||||
| 4525 | .addUse(VOffset) // voffset | ||||
| 4526 | .addUse(SOffset) // soffset | ||||
| 4527 | .addImm(ImmOffset); // offset(imm) | ||||
| 4528 | |||||
| 4529 | if (IsTyped) | ||||
| 4530 | MIB.addImm(Format); | ||||
| 4531 | |||||
| 4532 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||
| 4533 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||
| 4534 | .addMemOperand(MMO); | ||||
| 4535 | |||||
| 4536 | MI.eraseFromParent(); | ||||
| 4537 | return true; | ||||
| 4538 | } | ||||
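| | // [Editor's note] Not part of the original source: operand layout consumed | ||||
| | // above, with index 0 being the intrinsic ID: | ||||
| | //   raw:    id, vdata, rsrc,         voffset, soffset, [format,] aux | ||||
| | //   struct: id, vdata, rsrc, vindex, voffset, soffset, [format,] aux | ||||
| | // so the struct typed form has NumVIndexOps (8) operands and the raw untyped | ||||
| | // form has 6. | ||||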
| 4539 | |||||
| 4540 | static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, | ||||
| 4541 | Register VIndex, Register VOffset, Register SOffset, | ||||
| 4542 | unsigned ImmOffset, unsigned Format, | ||||
| 4543 | unsigned AuxiliaryData, MachineMemOperand *MMO, | ||||
| 4544 | bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { | ||||
| 4545 | auto MIB = B.buildInstr(Opc) | ||||
| 4546 | .addDef(LoadDstReg) // vdata | ||||
| 4547 | .addUse(RSrc) // rsrc | ||||
| 4548 | .addUse(VIndex) // vindex | ||||
| 4549 | .addUse(VOffset) // voffset | ||||
| 4550 | .addUse(SOffset) // soffset | ||||
| 4551 | .addImm(ImmOffset); // offset(imm) | ||||
| 4552 | |||||
| 4553 | if (IsTyped) | ||||
| 4554 | MIB.addImm(Format); | ||||
| 4555 | |||||
| 4556 | MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||
| 4557 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||
| 4558 | .addMemOperand(MMO); | ||||
| 4559 | } | ||||
| 4560 | |||||
| 4561 | bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, | ||||
| 4562 | MachineRegisterInfo &MRI, | ||||
| 4563 | MachineIRBuilder &B, | ||||
| 4564 | bool IsFormat, | ||||
| 4565 | bool IsTyped) const { | ||||
| 4566 | // FIXME: Verifier should enforce 1 MMO for these intrinsics. | ||||
| 4567 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||
| 4568 | const LLT MemTy = MMO->getMemoryType(); | ||||
| 4569 | const LLT S32 = LLT::scalar(32); | ||||
| 4570 | |||||
| 4571 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 4572 | |||||
| 4573 | Register StatusDst; | ||||
| 4574 | int OpOffset = 0; | ||||
| 4575 | assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); | ||||
| 4576 | bool IsTFE = MI.getNumExplicitDefs() == 2; | ||||
| 4577 | if (IsTFE) { | ||||
| 4578 | StatusDst = MI.getOperand(1).getReg(); | ||||
| 4579 | ++OpOffset; | ||||
| 4580 | } | ||||
| 4581 | |||||
| 4582 | Register RSrc = MI.getOperand(2 + OpOffset).getReg(); | ||||
| 4583 | |||||
| 4584 | // The typed intrinsics add an immediate after the registers. | ||||
| 4585 | const unsigned NumVIndexOps = IsTyped ? 8 : 7; | ||||
| 4586 | |||||
| 4587 | // The struct intrinsic variants add one additional operand over raw. | ||||
| 4588 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; | ||||
| 4589 | Register VIndex; | ||||
| 4590 | if (HasVIndex) { | ||||
| 4591 | VIndex = MI.getOperand(3 + OpOffset).getReg(); | ||||
| 4592 | ++OpOffset; | ||||
| 4593 | } else { | ||||
| 4594 | VIndex = B.buildConstant(S32, 0).getReg(0); | ||||
| 4595 | } | ||||
| 4596 | |||||
| 4597 | Register VOffset = MI.getOperand(3 + OpOffset).getReg(); | ||||
| 4598 | Register SOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||
| 4599 | |||||
| 4600 | unsigned Format = 0; | ||||
| 4601 | if (IsTyped) { | ||||
| 4602 | Format = MI.getOperand(5 + OpOffset).getImm(); | ||||
| 4603 | ++OpOffset; | ||||
| 4604 | } | ||||
| 4605 | |||||
| 4606 | unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); | ||||
| 4607 | unsigned ImmOffset; | ||||
| 4608 | |||||
| 4609 | LLT Ty = MRI.getType(Dst); | ||||
| 4610 | LLT EltTy = Ty.getScalarType(); | ||||
| 4611 | const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); | ||||
| 4612 | const bool Unpacked = ST.hasUnpackedD16VMem(); | ||||
| 4613 | |||||
| 4614 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||
| 4615 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); | ||||
| 4616 | |||||
| 4617 | unsigned Opc; | ||||
| 4618 | |||||
| 4619 | // TODO: Support TFE for typed and narrow loads. | ||||
| 4620 | if (IsTyped) { | ||||
| 4621 | if (IsTFE) | ||||
| 4622 | return false; | ||||
| 4623 | Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : | ||||
| 4624 | AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; | ||||
| 4625 | } else if (IsFormat) { | ||||
| 4626 | if (IsD16) { | ||||
| 4627 | if (IsTFE) | ||||
| 4628 | return false; | ||||
| 4629 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; | ||||
| 4630 | } else { | ||||
| 4631 | Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE | ||||
| 4632 | : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; | ||||
| 4633 | } | ||||
| 4634 | } else { | ||||
| 4635 | if (IsTFE) | ||||
| 4636 | return false; | ||||
| 4637 | switch (MemTy.getSizeInBits()) { | ||||
| 4638 | case 8: | ||||
| 4639 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; | ||||
| 4640 | break; | ||||
| 4641 | case 16: | ||||
| 4642 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; | ||||
| 4643 | break; | ||||
| 4644 | default: | ||||
| 4645 | Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; | ||||
| 4646 | break; | ||||
| 4647 | } | ||||
| 4648 | } | ||||
| 4649 | |||||
| 4650 | if (IsTFE) { | ||||
| 4651 | unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); | ||||
| 4652 | unsigned NumLoadDWords = NumValueDWords + 1; | ||||
| 4653 | LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); | ||||
| 4654 | Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); | ||||
| 4655 | buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, | ||||
| 4656 | Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); | ||||
| 4657 | if (NumValueDWords == 1) { | ||||
| 4658 | B.buildUnmerge({Dst, StatusDst}, LoadDstReg); | ||||
| 4659 | } else { | ||||
| 4660 | SmallVector<Register, 5> LoadElts; | ||||
| 4661 | for (unsigned I = 0; I != NumValueDWords; ++I) | ||||
| 4662 | LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); | ||||
| 4663 | LoadElts.push_back(StatusDst); | ||||
| 4664 | B.buildUnmerge(LoadElts, LoadDstReg); | ||||
| 4665 | LoadElts.truncate(NumValueDWords); | ||||
| 4666 | B.buildMergeLikeInstr(Dst, LoadElts); | ||||
| 4667 | } | ||||
| 4668 | } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || | ||||
| 4669 | (IsD16 && !Ty.isVector())) { | ||||
| 4670 | Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); | ||||
| 4671 | buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, | ||||
| 4672 | Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); | ||||
| 4673 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||
| 4674 | B.buildTrunc(Dst, LoadDstReg); | ||||
| 4675 | } else if (Unpacked && IsD16 && Ty.isVector()) { | ||||
| 4676 | LLT UnpackedTy = Ty.changeElementSize(32); | ||||
| 4677 | Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); | ||||
| 4678 | buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, | ||||
| 4679 | Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); | ||||
| 4680 | B.setInsertPt(B.getMBB(), ++B.getInsertPt()); | ||||
| 4681 | // FIXME: G_TRUNC should work, but legalization currently fails | ||||
| 4682 | auto Unmerge = B.buildUnmerge(S32, LoadDstReg); | ||||
| 4683 | SmallVector<Register, 4> Repack; | ||||
| 4684 | for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) | ||||
| 4685 | Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); | ||||
| 4686 | B.buildMergeLikeInstr(Dst, Repack); | ||||
| 4687 | } else { | ||||
| 4688 | buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, | ||||
| 4689 | AuxiliaryData, MMO, IsTyped, HasVIndex, B); | ||||
| 4690 | } | ||||
| 4691 | |||||
| 4692 | MI.eraseFromParent(); | ||||
| 4693 | return true; | ||||
| 4694 | } | ||||
| 4695 | |||||
| 4696 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, | ||||
| 4697 | MachineIRBuilder &B, | ||||
| 4698 | bool IsInc) const { | ||||
| 4699 | unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP : | ||||
| 4700 | AMDGPU::G_ATOMICRMW_UDEC_WRAP; | ||||
| 4701 | B.buildInstr(Opc) | ||||
| 4702 | .addDef(MI.getOperand(0).getReg()) | ||||
| 4703 | .addUse(MI.getOperand(2).getReg()) | ||||
| 4704 | .addUse(MI.getOperand(3).getReg()) | ||||
| 4705 | .cloneMemRefs(MI); | ||||
| 4706 | MI.eraseFromParent(); | ||||
| 4707 | return true; | ||||
| 4708 | } | ||||
| 4709 | |||||
| 4710 | static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { | ||||
| 4711 | switch (IntrID) { | ||||
| 4712 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||
| 4713 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||
| 4714 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; | ||||
| 4715 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||
| 4716 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||
| 4717 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; | ||||
| 4718 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||
| 4719 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||
| 4720 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; | ||||
| 4721 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||
| 4722 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||
| 4723 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; | ||||
| 4724 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||
| 4725 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||
| 4726 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; | ||||
| 4727 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||
| 4728 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||
| 4729 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; | ||||
| 4730 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||
| 4731 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||
| 4732 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; | ||||
| 4733 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||
| 4734 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||
| 4735 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; | ||||
| 4736 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||
| 4737 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||
| 4738 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; | ||||
| 4739 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||
| 4740 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||
| 4741 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; | ||||
| 4742 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||
| 4743 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||
| 4744 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; | ||||
| 4745 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||
| 4746 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||
| 4747 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; | ||||
| 4748 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||
| 4749 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||
| 4750 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; | ||||
| 4751 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||
| 4752 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||
| 4753 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; | ||||
| 4754 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||
| 4755 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||
| 4756 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; | ||||
| 4757 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||
| 4758 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||
| 4759 | return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; | ||||
| 4760 | default: | ||||
| 4761 | llvm_unreachable("unhandled atomic opcode"); | ||||
| 4762 | } | ||||
| 4763 | } | ||||
| 4764 | |||||
| 4765 | bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, | ||||
| 4766 | MachineIRBuilder &B, | ||||
| 4767 | Intrinsic::ID IID) const { | ||||
| 4768 | const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || | ||||
| 4769 | IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; | ||||
| 4770 | const bool HasReturn = MI.getNumExplicitDefs() != 0; | ||||
| 4771 | |||||
| 4772 | Register Dst; | ||||
| 4773 | |||||
| 4774 | int OpOffset = 0; | ||||
| 4775 | if (HasReturn) { | ||||
| 4776 | // A few FP atomics do not support return values. | ||||
| 4777 | Dst = MI.getOperand(0).getReg(); | ||||
| 4778 | } else { | ||||
| 4779 | OpOffset = -1; | ||||
| 4780 | } | ||||
| 4781 | |||||
| 4782 | Register VData = MI.getOperand(2 + OpOffset).getReg(); | ||||
| 4783 | Register CmpVal; | ||||
| 4784 | |||||
| 4785 | if (IsCmpSwap) { | ||||
| 4786 | CmpVal = MI.getOperand(3 + OpOffset).getReg(); | ||||
| 4787 | ++OpOffset; | ||||
| 4788 | } | ||||
| 4789 | |||||
| 4790 | Register RSrc = MI.getOperand(3 + OpOffset).getReg(); | ||||
| 4791 | const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; | ||||
| 4792 | |||||
| 4793 | // The struct intrinsic variants add one additional operand over raw. | ||||
| 4794 | const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; | ||||
| 4795 | Register VIndex; | ||||
| 4796 | if (HasVIndex) { | ||||
| 4797 | VIndex = MI.getOperand(4 + OpOffset).getReg(); | ||||
| 4798 | ++OpOffset; | ||||
| 4799 | } else { | ||||
| 4800 | VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); | ||||
| 4801 | } | ||||
| 4802 | |||||
| 4803 | Register VOffset = MI.getOperand(4 + OpOffset).getReg(); | ||||
| 4804 | Register SOffset = MI.getOperand(5 + OpOffset).getReg(); | ||||
| 4805 | unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); | ||||
| 4806 | |||||
| 4807 | MachineMemOperand *MMO = *MI.memoperands_begin(); | ||||
| 4808 | |||||
| 4809 | unsigned ImmOffset; | ||||
| 4810 | std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); | ||||
| 4811 | updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); | ||||
| 4812 | |||||
| 4813 | auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); | ||||
| 4814 | |||||
| 4815 | if (HasReturn) | ||||
| 4816 | MIB.addDef(Dst); | ||||
| 4817 | |||||
| 4818 | MIB.addUse(VData); // vdata | ||||
| 4819 | |||||
| 4820 | if (IsCmpSwap) | ||||
| 4821 | MIB.addReg(CmpVal); | ||||
| 4822 | |||||
| 4823 | MIB.addUse(RSrc) // rsrc | ||||
| 4824 | .addUse(VIndex) // vindex | ||||
| 4825 | .addUse(VOffset) // voffset | ||||
| 4826 | .addUse(SOffset) // soffset | ||||
| 4827 | .addImm(ImmOffset) // offset(imm) | ||||
| 4828 | .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) | ||||
| 4829 | .addImm(HasVIndex ? -1 : 0) // idxen(imm) | ||||
| 4830 | .addMemOperand(MMO); | ||||
| 4831 | |||||
| 4832 | MI.eraseFromParent(); | ||||
| 4833 | return true; | ||||
| 4834 | } | ||||
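| | // The pseudo built above takes, in order: [dst,] vdata, [cmpval,] rsrc, vindex, | ||||
| | // voffset, soffset, the split-out immediate offset, the cachepolicy/swizzle bits, | ||||
| | // an idxen flag, and the updated memory operand. | ||||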
| 4835 | |||||
| 4836 | /// Pack the s16 typed address operands of \p MI into dword sized vectors | ||||
| 4837 | /// with s16 typed elements, appended to \p PackedAddrs. | ||||
| 4838 | static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, | ||||
| 4839 | SmallVectorImpl<Register> &PackedAddrs, | ||||
| 4840 | unsigned ArgOffset, | ||||
| 4841 | const AMDGPU::ImageDimIntrinsicInfo *Intr, | ||||
| 4842 | bool IsA16, bool IsG16) { | ||||
| 4843 | const LLT S16 = LLT::scalar(16); | ||||
| 4844 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
| 4845 | auto EndIdx = Intr->VAddrEnd; | ||||
| 4846 | |||||
| 4847 | for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { | ||||
| 4848 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||
| 4849 | if (!SrcOp.isReg()) | ||||
| 4850 | continue; // _L to _LZ may have eliminated this. | ||||
| 4851 | |||||
| 4852 | Register AddrReg = SrcOp.getReg(); | ||||
| 4853 | |||||
| 4854 | if ((I < Intr->GradientStart) || | ||||
| 4855 | (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || | ||||
| 4856 | (I >= Intr->CoordStart && !IsA16)) { | ||||
| 4857 | if ((I < Intr->GradientStart) && IsA16 && | ||||
| 4858 | (B.getMRI()->getType(AddrReg) == S16)) { | ||||
| 4859 | assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); | ||||
| 4860 | // Special handling of bias when A16 is on. Bias is of type half but | ||||
| 4861 | // occupies a full 32 bits. | ||||
| 4862 | PackedAddrs.push_back( | ||||
| 4863 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | ||||
| 4864 | .getReg(0)); | ||||
| 4865 | } else { | ||||
| 4866 | assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && | ||||
| 4867 | "Bias needs to be converted to 16 bit in A16 mode"); | ||||
| 4868 | // Handle any gradient or coordinate operands that should not be packed | ||||
| 4869 | AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); | ||||
| 4870 | PackedAddrs.push_back(AddrReg); | ||||
| 4871 | } | ||||
| 4872 | } else { | ||||
| 4873 | // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, | ||||
| 4874 | // derivatives dx/dh and dx/dv are packed with undef. | ||||
| 4875 | if (((I + 1) >= EndIdx) || | ||||
| 4876 | ((Intr->NumGradients / 2) % 2 == 1 && | ||||
| 4877 | (I == static_cast<unsigned>(Intr->GradientStart + | ||||
| 4878 | (Intr->NumGradients / 2) - 1) || | ||||
| 4879 | I == static_cast<unsigned>(Intr->GradientStart + | ||||
| 4880 | Intr->NumGradients - 1))) || | ||||
| 4881 | // Check for _L to _LZ optimization | ||||
| 4882 | !MI.getOperand(ArgOffset + I + 1).isReg()) { | ||||
| 4883 | PackedAddrs.push_back( | ||||
| 4884 | B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) | ||||
| 4885 | .getReg(0)); | ||||
| 4886 | } else { | ||||
| 4887 | PackedAddrs.push_back( | ||||
| 4888 | B.buildBuildVector( | ||||
| 4889 | V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) | ||||
| 4890 | .getReg(0)); | ||||
| 4891 | ++I; | ||||
| 4892 | } | ||||
| 4893 | } | ||||
| 4894 | } | ||||
| 4895 | } | ||||
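| | // E.g. for a 2D sample with A16, the two s16 coordinates become a single | ||||
| | // <2 x s16> build_vector dword; an odd trailing coordinate or gradient is | ||||
| | // instead padded with an undef s16. | ||||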
| 4896 | |||||
| 4897 | /// Convert from separate vaddr components to a single vector address register, | ||||
| 4898 | /// and replace the remaining operands with $noreg. | ||||
| 4899 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, | ||||
| 4900 | int DimIdx, int NumVAddrs) { | ||||
| 4901 | const LLT S32 = LLT::scalar(32); | ||||
| 4902 | (void)S32; | ||||
| 4903 | SmallVector<Register, 8> AddrRegs; | ||||
| 4904 | for (int I = 0; I != NumVAddrs; ++I) { | ||||
| 4905 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||
| 4906 | if (SrcOp.isReg()) { | ||||
| 4907 | AddrRegs.push_back(SrcOp.getReg()); | ||||
| 4908 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32); | ||||
| 4909 | } | ||||
| 4910 | } | ||||
| 4911 | |||||
| 4912 | int NumAddrRegs = AddrRegs.size(); | ||||
| 4913 | if (NumAddrRegs != 1) { | ||||
| 4914 | auto VAddr = | ||||
| 4915 | B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); | ||||
| 4916 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); | ||||
| 4917 | } | ||||
| 4918 | |||||
| 4919 | for (int I = 1; I != NumVAddrs; ++I) { | ||||
| 4920 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); | ||||
| 4921 | if (SrcOp.isReg()) | ||||
| 4922 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); | ||||
| 4923 | } | ||||
| 4924 | } | ||||
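| | // E.g. three s32 vaddr components (x, y, z) collapse into one <3 x s32> | ||||
| | // build_vector in the first vaddr slot, and the two remaining slots are | ||||
| | // replaced with $noreg. | ||||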
| 4925 | |||||
| 4926 | /// Rewrite image intrinsics to use register layouts expected by the subtarget. | ||||
| 4927 | /// | ||||
| 4928 | /// Depending on the subtarget, loads and stores with 16-bit element data need to be | ||||
| 4929 | /// rewritten to use the low half of 32-bit registers, or directly use a packed | ||||
| 4930 | /// layout. 16-bit addresses should also sometimes be packed into 32-bit | ||||
| 4931 | /// registers. | ||||
| 4932 | /// | ||||
| 4933 | /// We don't want to directly select image instructions just yet, but also want | ||||
| 4934 | /// to expose all register repacking to the legalizer/combiners. We also don't | ||||
| 4935 | /// want a selected instruction entering RegBankSelect. In order to avoid | ||||
| 4936 | /// defining a multitude of intermediate image instructions, directly hack on | ||||
| 4937 | /// the intrinsic's arguments. In cases like a16 addresses, this requires | ||||
| 4938 | /// padding now-unnecessary arguments with $noreg. | ||||
| 4939 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( | ||||
| 4940 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, | ||||
| 4941 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { | ||||
| 4942 | |||||
| 4943 | const MachineFunction &MF = *MI.getMF(); | ||||
| 4944 | const unsigned NumDefs = MI.getNumExplicitDefs(); | ||||
| 4945 | const unsigned ArgOffset = NumDefs + 1; | ||||
| 4946 | bool IsTFE = NumDefs == 2; | ||||
| 4947 | // We are only processing the operands of d16 image operations on subtargets | ||||
| 4948 | // that use the unpacked register layout, or need to repack the TFE result. | ||||
| 4949 | |||||
| 4950 | // TODO: Do we need to guard against already legalized intrinsics? | ||||
| 4951 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = | ||||
| 4952 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); | ||||
| 4953 | |||||
| 4954 | MachineRegisterInfo *MRI = B.getMRI(); | ||||
| 4955 | const LLT S32 = LLT::scalar(32); | ||||
| 4956 | const LLT S16 = LLT::scalar(16); | ||||
| 4957 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
| 4958 | |||||
| 4959 | unsigned DMask = 0; | ||||
| 4960 | Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); | ||||
| 4961 | LLT Ty = MRI->getType(VData); | ||||
| 4962 | |||||
| 4963 | // Check for 16 bit addresses and pack if true. | ||||
| 4964 | LLT GradTy = | ||||
| 4965 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); | ||||
| 4966 | LLT AddrTy = | ||||
| 4967 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); | ||||
| 4968 | const bool IsG16 = | ||||
| 4969 | ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; | ||||
| 4970 | const bool IsA16 = AddrTy == S16; | ||||
| 4971 | const bool IsD16 = Ty.getScalarType() == S16; | ||||
| 4972 | |||||
| 4973 | int DMaskLanes = 0; | ||||
| 4974 | if (!BaseOpcode->Atomic) { | ||||
| 4975 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); | ||||
| 4976 | if (BaseOpcode->Gather4) { | ||||
| 4977 | DMaskLanes = 4; | ||||
| 4978 | } else if (DMask != 0) { | ||||
| 4979 | DMaskLanes = llvm::popcount(DMask); | ||||
| 4980 | } else if (!IsTFE && !BaseOpcode->Store) { | ||||
| 4981 | // If dmask is 0, this is a no-op load. This can be eliminated. | ||||
| 4982 | B.buildUndef(MI.getOperand(0)); | ||||
| 4983 | MI.eraseFromParent(); | ||||
| 4984 | return true; | ||||
| 4985 | } | ||||
| 4986 | } | ||||
| 4987 | |||||
| 4988 | Observer.changingInstr(MI); | ||||
| 4989 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); | ||||
| 4990 | |||||
| 4991 | const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 | ||||
| 4992 | : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; | ||||
| 4993 | const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 | ||||
| 4994 | : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; | ||||
| 4995 | unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; | ||||
| 4996 | |||||
| 4997 | // Track that we legalized this | ||||
| 4998 | MI.setDesc(B.getTII().get(NewOpcode)); | ||||
| 4999 | |||||
| 5000 | // Expecting to get an error flag since TFE is on and dmask is 0. Force | ||||
| 5001 | // dmask to be at least 1, otherwise the instruction will fail. | ||||
| 5002 | if (IsTFE && DMask == 0) { | ||||
| 5003 | DMask = 0x1; | ||||
| 5004 | DMaskLanes = 1; | ||||
| 5005 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); | ||||
| 5006 | } | ||||
| 5007 | |||||
| 5008 | if (BaseOpcode->Atomic) { | ||||
| 5009 | Register VData0 = MI.getOperand(2).getReg(); | ||||
| 5010 | LLT Ty = MRI->getType(VData0); | ||||
| 5011 | |||||
| 5012 | // TODO: Allow atomic swap and bit ops for v2s16/v4s16 | ||||
| 5013 | if (Ty.isVector()) | ||||
| 5014 | return false; | ||||
| 5015 | |||||
| 5016 | if (BaseOpcode->AtomicX2) { | ||||
| 5017 | Register VData1 = MI.getOperand(3).getReg(); | ||||
| 5018 | // The two values are packed in one register. | ||||
| 5019 | LLT PackedTy = LLT::fixed_vector(2, Ty); | ||||
| 5020 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); | ||||
| 5021 | MI.getOperand(2).setReg(Concat.getReg(0)); | ||||
| 5022 | MI.getOperand(3).setReg(AMDGPU::NoRegister); | ||||
| 5023 | } | ||||
| 5024 | } | ||||
| 5025 | |||||
| 5026 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; | ||||
| 5027 | |||||
| 5028 | // Rewrite the addressing register layout before doing anything else. | ||||
| 5029 | if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { | ||||
| 5030 | // 16 bit gradients are supported, but are tied to the A16 control | ||||
| 5031 | // so both gradients and addresses must be 16 bit | ||||
| 5032 | return false; | ||||
| 5033 | } | ||||
| 5034 | |||||
| 5035 | if (IsA16 && !ST.hasA16()) { | ||||
| 5036 | // A16 not supported | ||||
| 5037 | return false; | ||||
| 5038 | } | ||||
| 5039 | |||||
| 5040 | const unsigned NSAMaxSize = ST.getNSAMaxSize(); | ||||
| 5041 | const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); | ||||
| 5042 | |||||
| 5043 | if (IsA16 || IsG16) { | ||||
| 5044 | if (Intr->NumVAddrs > 1) { | ||||
| 5045 | SmallVector<Register, 4> PackedRegs; | ||||
| 5046 | |||||
| 5047 | packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, | ||||
| 5048 | IsG16); | ||||
| 5049 | |||||
| 5050 | // See also below in the non-a16 branch | ||||
| 5051 | const bool UseNSA = ST.hasNSAEncoding() && | ||||
| 5052 | PackedRegs.size() >= ST.getNSAThreshold(MF) && | ||||
| 5053 | (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); | ||||
| 5054 | const bool UsePartialNSA = | ||||
| 5055 | UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; | ||||
| 5056 | |||||
| 5057 | if (UsePartialNSA) { | ||||
| 5058 | // Pack registers that would go over NSAMaxSize into last VAddr register | ||||
| 5059 | LLT PackedAddrTy = | ||||
| 5060 | LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); | ||||
| 5061 | auto Concat = B.buildConcatVectors( | ||||
| 5062 | PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); | ||||
| 5063 | PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); | ||||
| 5064 | PackedRegs.resize(NSAMaxSize); | ||||
| 5065 | } else if (!UseNSA && PackedRegs.size() > 1) { | ||||
| 5066 | LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); | ||||
| 5067 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); | ||||
| 5068 | PackedRegs[0] = Concat.getReg(0); | ||||
| 5069 | PackedRegs.resize(1); | ||||
| 5070 | } | ||||
| 5071 | |||||
| 5072 | const unsigned NumPacked = PackedRegs.size(); | ||||
| 5073 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { | ||||
| 5074 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); | ||||
| 5075 | if (!SrcOp.isReg()) { | ||||
| 5076 | assert(SrcOp.isImm() && SrcOp.getImm() == 0); | ||||
| 5077 | continue; | ||||
| 5078 | } | ||||
| 5079 | |||||
| 5080 | assert(SrcOp.getReg() != AMDGPU::NoRegister); | ||||
| 5081 | |||||
| 5082 | if (I - Intr->VAddrStart < NumPacked) | ||||
| 5083 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); | ||||
| 5084 | else | ||||
| 5085 | SrcOp.setReg(AMDGPU::NoRegister); | ||||
| 5086 | } | ||||
| 5087 | } | ||||
| 5088 | } else { | ||||
| 5089 | // If the register allocator cannot place the address registers contiguously | ||||
| 5090 | // without introducing moves, then using the non-sequential address encoding | ||||
| 5091 | // is always preferable, since it saves VALU instructions and is usually a | ||||
| 5092 | // wash in terms of code size or even better. | ||||
| 5093 | // | ||||
| 5094 | // However, we currently have no way of hinting to the register allocator | ||||
| 5095 | // that MIMG addresses should be placed contiguously when it is possible to | ||||
| 5096 | // do so, so force non-NSA for the common 2-address case as a heuristic. | ||||
| 5097 | // | ||||
| 5098 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register | ||||
| 5099 | // allocation when possible. | ||||
| 5100 | // | ||||
| 5101 | // Partial NSA is allowed on GFX11 where the final register is a contiguous | ||||
| 5102 | // set of the remaining addresses. | ||||
| 5103 | const bool UseNSA = ST.hasNSAEncoding() && | ||||
| 5104 | CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && | ||||
| 5105 | (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); | ||||
| 5106 | const bool UsePartialNSA = | ||||
| 5107 | UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; | ||||
| 5108 | |||||
| 5109 | if (UsePartialNSA) { | ||||
| 5110 | convertImageAddrToPacked(B, MI, | ||||
| 5111 | ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, | ||||
| 5112 | Intr->NumVAddrs - NSAMaxSize + 1); | ||||
| 5113 | } else if (!UseNSA && Intr->NumVAddrs > 1) { | ||||
| 5114 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, | ||||
| 5115 | Intr->NumVAddrs); | ||||
| 5116 | } | ||||
| 5117 | } | ||||
| 5118 | |||||
| 5119 | int Flags = 0; | ||||
| 5120 | if (IsA16) | ||||
| 5121 | Flags |= 1; | ||||
| 5122 | if (IsG16) | ||||
| 5123 | Flags |= 2; | ||||
| 5124 | MI.addOperand(MachineOperand::CreateImm(Flags)); | ||||
| 5125 | |||||
| 5126 | if (BaseOpcode->Store) { // No TFE for stores? | ||||
| 5127 | // TODO: Handle dmask trim | ||||
| 5128 | if (!Ty.isVector() || !IsD16) | ||||
| 5129 | return true; | ||||
| 5130 | |||||
| 5131 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); | ||||
| 5132 | if (RepackedReg != VData) { | ||||
| 5133 | MI.getOperand(1).setReg(RepackedReg); | ||||
| 5134 | } | ||||
| 5135 | |||||
| 5136 | return true; | ||||
| 5137 | } | ||||
| 5138 | |||||
| 5139 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 5140 | const LLT EltTy = Ty.getScalarType(); | ||||
| 5141 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; | ||||
| 5142 | |||||
| 5143 | // Confirm that the return type is large enough for the dmask specified | ||||
| 5144 | if (NumElts < DMaskLanes) | ||||
| 5145 | return false; | ||||
| 5146 | |||||
| 5147 | if (NumElts > 4 || DMaskLanes > 4) | ||||
| 5148 | return false; | ||||
| 5149 | |||||
| 5150 | const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; | ||||
| 5151 | const LLT AdjustedTy = | ||||
| 5152 | Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); | ||||
| 5153 | |||||
| 5154 | // The raw dword-aligned data component of the load. The only legal cases | ||||
| 5155 | // where this matters should be when using the packed D16 format, for | ||||
| 5156 | // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. | ||||
| 5157 | LLT RoundedTy; | ||||
| 5158 | |||||
| 5159 | // S32 vector to cover all data, plus TFE result element. | ||||
| 5160 | LLT TFETy; | ||||
| 5161 | |||||
| 5162 | // Register type to use for each loaded component. Will be S32 or V2S16. | ||||
| 5163 | LLT RegTy; | ||||
| 5164 | |||||
| 5165 | if (IsD16 && ST.hasUnpackedD16VMem()) { | ||||
| 5166 | RoundedTy = | ||||
| 5167 | LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); | ||||
| 5168 | TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); | ||||
| 5169 | RegTy = S32; | ||||
| 5170 | } else { | ||||
| 5171 | unsigned EltSize = EltTy.getSizeInBits(); | ||||
| 5172 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; | ||||
| 5173 | unsigned RoundedSize = 32 * RoundedElts; | ||||
| 5174 | RoundedTy = LLT::scalarOrVector( | ||||
| 5175 | ElementCount::getFixed(RoundedSize / EltSize), EltSize); | ||||
| 5176 | TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); | ||||
| 5177 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; | ||||
| 5178 | } | ||||
| 5179 | |||||
| 5180 | // The return type does not need adjustment. | ||||
| 5181 | // TODO: Should we change s16 case to s32 or <2 x s16>? | ||||
| 5182 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) | ||||
| 5183 | return true; | ||||
| 5184 | |||||
| 5185 | Register Dst1Reg; | ||||
| 5186 | |||||
| 5187 | // Insert after the instruction. | ||||
| 5188 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); | ||||
| 5189 | |||||
| 5190 | // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x | ||||
| 5191 | // s16> instead of s32, we would only need 1 bitcast instead of multiple. | ||||
| 5192 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; | ||||
| 5193 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; | ||||
| 5194 | |||||
| 5195 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); | ||||
| 5196 | |||||
| 5197 | MI.getOperand(0).setReg(NewResultReg); | ||||
| 5198 | |||||
| 5199 | // In the IR, TFE is supposed to be used with a 2 element struct return | ||||
| 5200 | // type. The instruction really returns these two values in one contiguous | ||||
| 5201 | // register, with one additional dword beyond the loaded data. Rewrite the | ||||
| 5202 | // return type to use a single register result. | ||||
| 5203 | |||||
| 5204 | if (IsTFE) { | ||||
| 5205 | Dst1Reg = MI.getOperand(1).getReg(); | ||||
| 5206 | if (MRI->getType(Dst1Reg) != S32) | ||||
| 5207 | return false; | ||||
| 5208 | |||||
| 5209 | // TODO: Make sure the TFE operand bit is set. | ||||
| 5210 | MI.removeOperand(1); | ||||
| 5211 | |||||
| 5212 | // Handle the easy case that requires no repack instructions. | ||||
| 5213 | if (Ty == S32) { | ||||
| 5214 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); | ||||
| 5215 | return true; | ||||
| 5216 | } | ||||
| 5217 | } | ||||
| 5218 | |||||
| 5219 | // Now figure out how to copy the new result register back into the old | ||||
| 5220 | // result. | ||||
| 5221 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); | ||||
| 5222 | |||||
| 5223 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; | ||||
| 5224 | |||||
| 5225 | if (ResultNumRegs == 1) { | ||||
| 5226 | assert(!IsTFE); | ||||
| 5227 | ResultRegs[0] = NewResultReg; | ||||
| 5228 | } else { | ||||
| 5229 | // We have to repack into a new vector of some kind. | ||||
| 5230 | for (int I = 0; I != NumDataRegs; ++I) | ||||
| 5231 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); | ||||
| 5232 | B.buildUnmerge(ResultRegs, NewResultReg); | ||||
| 5233 | |||||
| 5234 | // Drop the final TFE element to get the data part. The TFE result is | ||||
| 5235 | // directly written to the right place already. | ||||
| 5236 | if (IsTFE) | ||||
| 5237 | ResultRegs.resize(NumDataRegs); | ||||
| 5238 | } | ||||
| 5239 | |||||
| 5240 | // For an s16 scalar result, we form an s32 result with a truncate regardless | ||||
| 5241 | // of packed vs. unpacked. | ||||
| 5242 | if (IsD16 && !Ty.isVector()) { | ||||
| 5243 | B.buildTrunc(DstReg, ResultRegs[0]); | ||||
| 5244 | return true; | ||||
| 5245 | } | ||||
| 5246 | |||||
| 5247 | // Avoid a build/concat_vector of 1 entry. | ||||
| 5248 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { | ||||
| 5249 | B.buildBitcast(DstReg, ResultRegs[0]); | ||||
| 5250 | return true; | ||||
| 5251 | } | ||||
| 5252 | |||||
| 5253 | assert(Ty.isVector()); | ||||
| 5254 | |||||
| 5255 | if (IsD16) { | ||||
| 5256 | // For packed D16 results with TFE enabled, all the data components are | ||||
| 5257 | // S32. Cast back to the expected type. | ||||
| 5258 | // | ||||
| 5259 | // TODO: We don't really need to load s32 elements. We would only need one | ||||
| 5260 | // cast for the TFE result if a multiple of v2s16 was used. | ||||
| 5261 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { | ||||
| 5262 | for (Register &Reg : ResultRegs) | ||||
| 5263 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); | ||||
| 5264 | } else if (ST.hasUnpackedD16VMem()) { | ||||
| 5265 | for (Register &Reg : ResultRegs) | ||||
| 5266 | Reg = B.buildTrunc(S16, Reg).getReg(0); | ||||
| 5267 | } | ||||
| 5268 | } | ||||
| 5269 | |||||
| 5270 | auto padWithUndef = [&](LLT Ty, int NumElts) { | ||||
| 5271 | if (NumElts == 0) | ||||
| 5272 | return; | ||||
| 5273 | Register Undef = B.buildUndef(Ty).getReg(0); | ||||
| 5274 | for (int I = 0; I != NumElts; ++I) | ||||
| 5275 | ResultRegs.push_back(Undef); | ||||
| 5276 | }; | ||||
| 5277 | |||||
| 5278 | // Pad out any elements eliminated due to the dmask. | ||||
| 5279 | LLT ResTy = MRI->getType(ResultRegs[0]); | ||||
| 5280 | if (!ResTy.isVector()) { | ||||
| 5281 | padWithUndef(ResTy, NumElts - ResultRegs.size()); | ||||
| 5282 | B.buildBuildVector(DstReg, ResultRegs); | ||||
| 5283 | return true; | ||||
| 5284 | } | ||||
| 5285 | |||||
| 5286 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); | ||||
| 5287 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; | ||||
| 5288 | |||||
| 5289 | // Deal with the one annoying legal case. | ||||
| 5290 | const LLT V3S16 = LLT::fixed_vector(3, 16); | ||||
| 5291 | if (Ty == V3S16) { | ||||
| 5292 | if (IsTFE) { | ||||
| 5293 | if (ResultRegs.size() == 1) { | ||||
| 5294 | NewResultReg = ResultRegs[0]; | ||||
| 5295 | } else if (ResultRegs.size() == 2) { | ||||
| 5296 | LLT V4S16 = LLT::fixed_vector(4, 16); | ||||
| 5297 | NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); | ||||
| 5298 | } else { | ||||
| 5299 | return false; | ||||
| 5300 | } | ||||
| 5301 | } | ||||
| 5302 | |||||
| 5303 | if (MRI->getType(DstReg).getNumElements() < | ||||
| 5304 | MRI->getType(NewResultReg).getNumElements()) { | ||||
| 5305 | B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); | ||||
| 5306 | } else { | ||||
| 5307 | B.buildPadVectorWithUndefElements(DstReg, NewResultReg); | ||||
| 5308 | } | ||||
| 5309 | return true; | ||||
| 5310 | } | ||||
| 5311 | |||||
| 5312 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); | ||||
| 5313 | B.buildConcatVectors(DstReg, ResultRegs); | ||||
| 5314 | return true; | ||||
| 5315 | } | ||||
| 5316 | |||||
| 5317 | bool AMDGPULegalizerInfo::legalizeSBufferLoad( | ||||
| 5318 | LegalizerHelper &Helper, MachineInstr &MI) const { | ||||
| 5319 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
| 5320 | GISelChangeObserver &Observer = Helper.Observer; | ||||
| 5321 | |||||
| 5322 | Register Dst = MI.getOperand(0).getReg(); | ||||
| 5323 | LLT Ty = B.getMRI()->getType(Dst); | ||||
| 5324 | unsigned Size = Ty.getSizeInBits(); | ||||
| 5325 | MachineFunction &MF = B.getMF(); | ||||
| 5326 | |||||
| 5327 | Observer.changingInstr(MI); | ||||
| 5328 | |||||
| 5329 | if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { | ||||
| 5330 | Ty = getBitcastRegisterType(Ty); | ||||
| 5331 | Helper.bitcastDst(MI, Ty, 0); | ||||
| 5332 | Dst = MI.getOperand(0).getReg(); | ||||
| 5333 | B.setInsertPt(B.getMBB(), MI); | ||||
| 5334 | } | ||||
| 5335 | |||||
| 5336 | // FIXME: We don't really need this intermediate instruction. The intrinsic | ||||
| 5337 | // should be fixed to have a memory operand. Since it's readnone, we're not | ||||
| 5338 | // allowed to add one. | ||||
| 5339 | MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); | ||||
| 5340 | MI.removeOperand(1); // Remove intrinsic ID | ||||
| 5341 | |||||
| 5342 | // FIXME: When intrinsic definition is fixed, this should have an MMO already. | ||||
| 5343 | // TODO: Should this use datalayout alignment? | ||||
| 5344 | const unsigned MemSize = (Size + 7) / 8; | ||||
| 5345 | const Align MemAlign(4); | ||||
| 5346 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
| 5347 | MachinePointerInfo(), | ||||
| 5348 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
| 5349 | MachineMemOperand::MOInvariant, | ||||
| 5350 | MemSize, MemAlign); | ||||
| 5351 | MI.addMemOperand(MF, MMO); | ||||
| 5352 | |||||
| 5353 | // There are no 96-bit result scalar loads, but widening to 128-bit should | ||||
| 5354 | // always be legal. We may need to restore this to a 96-bit result if it turns | ||||
| 5355 | // out this needs to be converted to a vector load during RegBankSelect. | ||||
| 5356 | if (!isPowerOf2_32(Size)) { | ||||
| 5357 | if (Ty.isVector()) | ||||
| 5358 | Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); | ||||
| 5359 | else | ||||
| 5360 | Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); | ||||
| 5361 | } | ||||
| 5362 | |||||
| 5363 | Observer.changedInstr(MI); | ||||
| 5364 | return true; | ||||
| 5365 | } | ||||
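| | // E.g. a 96-bit (<3 x s32>) s_buffer_load result is widened to <4 x s32> here, | ||||
| | // since scalar buffer loads only come in power-of-two result sizes. | ||||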
| 5366 | |||||
| 5367 | // TODO: Move to selection | ||||
| 5368 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, | ||||
| 5369 | MachineRegisterInfo &MRI, | ||||
| 5370 | MachineIRBuilder &B) const { | ||||
| 5371 | if (!ST.isTrapHandlerEnabled() || | ||||
| 5372 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) | ||||
| 5373 | return legalizeTrapEndpgm(MI, MRI, B); | ||||
| 5374 | |||||
| 5375 | const Module *M = B.getMF().getFunction().getParent(); | ||||
| 5376 | unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); | ||||
| 5377 | if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) | ||||
| 5378 | return legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||
| 5379 | |||||
| 5380 | return ST.supportsGetDoorbellID() ? | ||||
| 5381 | legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); | ||||
| 5382 | } | ||||
| 5383 | |||||
| 5384 | bool AMDGPULegalizerInfo::legalizeTrapEndpgm( | ||||
| 5385 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
| 5386 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); | ||||
| 5387 | MI.eraseFromParent(); | ||||
| 5388 | return true; | ||||
| 5389 | } | ||||
| 5390 | |||||
| 5391 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( | ||||
| 5392 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
| 5393 | MachineFunction &MF = B.getMF(); | ||||
| 5394 | const LLT S64 = LLT::scalar(64); | ||||
| 5395 | |||||
| 5396 | Register SGPR01(AMDGPU::SGPR0_SGPR1); | ||||
| 5397 | // For code object version 5, queue_ptr is passed through implicit kernarg. | ||||
| 5398 | if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= | ||||
| 5399 | AMDGPU::AMDHSA_COV5) { | ||||
| 5400 | AMDGPUTargetLowering::ImplicitParameter Param = | ||||
| 5401 | AMDGPUTargetLowering::QUEUE_PTR; | ||||
| 5402 | uint64_t Offset = | ||||
| 5403 | ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); | ||||
| 5404 | |||||
| 5405 | Register KernargPtrReg = MRI.createGenericVirtualRegister( | ||||
| 5406 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
| 5407 | |||||
| 5408 | if (!loadInputValue(KernargPtrReg, B, | ||||
| 5409 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) | ||||
| 5410 | return false; | ||||
| 5411 | |||||
| 5412 | // TODO: can we be smarter about machine pointer info? | ||||
| 5413 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); | ||||
| 5414 | MachineMemOperand *MMO = MF.getMachineMemOperand( | ||||
| 5415 | PtrInfo, | ||||
| 5416 | MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | | ||||
| 5417 | MachineMemOperand::MOInvariant, | ||||
| 5418 | LLT::scalar(64), commonAlignment(Align(64), Offset)); | ||||
| 5419 | |||||
| 5420 | // Pointer address | ||||
| 5421 | Register LoadAddr = MRI.createGenericVirtualRegister( | ||||
| 5422 | LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
| 5423 | B.buildPtrAdd(LoadAddr, KernargPtrReg, | ||||
| 5424 | B.buildConstant(LLT::scalar(64), Offset).getReg(0)); | ||||
| 5425 | // Load address | ||||
| 5426 | Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); | ||||
| 5427 | B.buildCopy(SGPR01, Temp); | ||||
| 5428 | B.buildInstr(AMDGPU::S_TRAP) | ||||
| 5429 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||
| 5430 | .addReg(SGPR01, RegState::Implicit); | ||||
| 5431 | MI.eraseFromParent(); | ||||
| 5432 | return true; | ||||
| 5433 | } | ||||
| 5434 | |||||
| 5435 | // Pass queue pointer to trap handler as input, and insert trap instruction | ||||
| 5436 | // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi | ||||
| 5437 | Register LiveIn = | ||||
| 5438 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); | ||||
| 5439 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) | ||||
| 5440 | return false; | ||||
| 5441 | |||||
| 5442 | B.buildCopy(SGPR01, LiveIn); | ||||
| 5443 | B.buildInstr(AMDGPU::S_TRAP) | ||||
| 5444 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) | ||||
| 5445 | .addReg(SGPR01, RegState::Implicit); | ||||
| 5446 | |||||
| 5447 | MI.eraseFromParent(); | ||||
| 5448 | return true; | ||||
| 5449 | } | ||||
| 5450 | |||||
| 5451 | bool AMDGPULegalizerInfo::legalizeTrapHsa( | ||||
| 5452 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
| 5453 | B.buildInstr(AMDGPU::S_TRAP) | ||||
| 5454 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); | ||||
| 5455 | MI.eraseFromParent(); | ||||
| 5456 | return true; | ||||
| 5457 | } | ||||
| 5458 | |||||
| 5459 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( | ||||
| 5460 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { | ||||
| 5461 | // If this is a non-HSA path or the trap handler is disabled, report a | ||||
| 5462 | // warning accordingly. | ||||
| 5463 | if (!ST.isTrapHandlerEnabled() || | ||||
| 5464 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { | ||||
| 5465 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), | ||||
| 5466 | "debugtrap handler not supported", | ||||
| 5467 | MI.getDebugLoc(), DS_Warning); | ||||
| 5468 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); | ||||
| 5469 | Ctx.diagnose(NoTrap); | ||||
| 5470 | } else { | ||||
| 5471 | // Insert debug-trap instruction | ||||
| 5472 | B.buildInstr(AMDGPU::S_TRAP) | ||||
| 5473 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); | ||||
| 5474 | } | ||||
| 5475 | |||||
| 5476 | MI.eraseFromParent(); | ||||
| 5477 | return true; | ||||
| 5478 | } | ||||
| 5479 | |||||
| 5480 | bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, | ||||
| 5481 | MachineIRBuilder &B) const { | ||||
| 5482 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
| 5483 | const LLT S16 = LLT::scalar(16); | ||||
| 5484 | const LLT S32 = LLT::scalar(32); | ||||
| 5485 | const LLT V2S16 = LLT::fixed_vector(2, 16); | ||||
| 5486 | const LLT V3S32 = LLT::fixed_vector(3, 32); | ||||
| 5487 | |||||
| 5488 | Register DstReg = MI.getOperand(0).getReg(); | ||||
| 5489 | Register NodePtr = MI.getOperand(2).getReg(); | ||||
| 5490 | Register RayExtent = MI.getOperand(3).getReg(); | ||||
| 5491 | Register RayOrigin = MI.getOperand(4).getReg(); | ||||
| 5492 | Register RayDir = MI.getOperand(5).getReg(); | ||||
| 5493 | Register RayInvDir = MI.getOperand(6).getReg(); | ||||
| 5494 | Register TDescr = MI.getOperand(7).getReg(); | ||||
| 5495 | |||||
| 5496 | if (!ST.hasGFX10_AEncoding()) { | ||||
| 5497 | DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), | ||||
| 5498 | "intrinsic not supported on subtarget", | ||||
| 5499 | MI.getDebugLoc()); | ||||
| 5500 | B.getMF().getFunction().getContext().diagnose(BadIntrin); | ||||
| 5501 | return false; | ||||
| 5502 | } | ||||
| 5503 | |||||
| 5504 | const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); | ||||
| 5505 | const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; | ||||
| 5506 | const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; | ||||
| 5507 | const unsigned NumVDataDwords = 4; | ||||
| 5508 | const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); | ||||
| 5509 | const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; | ||||
| 5510 | const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); | ||||
| 5511 | const unsigned BaseOpcodes[2][2] = { | ||||
| 5512 | {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, | ||||
| 5513 | {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, | ||||
| 5514 | AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; | ||||
| 5515 | int Opcode; | ||||
| 5516 | if (UseNSA) { | ||||
| 5517 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], | ||||
| 5518 | IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA | ||||
| 5519 | : AMDGPU::MIMGEncGfx10NSA, | ||||
| 5520 | NumVDataDwords, NumVAddrDwords); | ||||
| 5521 | } else { | ||||
| 5522 | Opcode = AMDGPU::getMIMGOpcode( | ||||
| 5523 | BaseOpcodes[Is64][IsA16], | ||||
| 5524 | IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, | ||||
| 5525 | NumVDataDwords, NumVAddrDwords); | ||||
| 5526 | } | ||||
| 5527 | assert(Opcode != -1); | ||||
| 5528 | |||||
| 5529 | SmallVector<Register, 12> Ops; | ||||
| 5530 | if (UseNSA && IsGFX11Plus) { | ||||
| 5531 | auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { | ||||
| 5532 | auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); | ||||
| 5533 | auto Merged = B.buildMergeLikeInstr( | ||||
| 5534 | V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); | ||||
| 5535 | Ops.push_back(Merged.getReg(0)); | ||||
| 5536 | }; | ||||
| 5537 | |||||
| 5538 | Ops.push_back(NodePtr); | ||||
| 5539 | Ops.push_back(RayExtent); | ||||
| 5540 | packLanes(RayOrigin); | ||||
| 5541 | |||||
| 5542 | if (IsA16) { | ||||
| 5543 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); | ||||
| 5544 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); | ||||
| 5545 | auto MergedDir = B.buildMergeLikeInstr( | ||||
| 5546 | V3S32, | ||||
| 5547 | {B.buildBitcast( | ||||
| 5548 | S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), | ||||
| 5549 | UnmergeRayDir.getReg(0)})) | ||||
| 5550 | .getReg(0), | ||||
| 5551 | B.buildBitcast( | ||||
| 5552 | S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), | ||||
| 5553 | UnmergeRayDir.getReg(1)})) | ||||
| 5554 | .getReg(0), | ||||
| 5555 | B.buildBitcast( | ||||
| 5556 | S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), | ||||
| 5557 | UnmergeRayDir.getReg(2)})) | ||||
| 5558 | .getReg(0)}); | ||||
| 5559 | Ops.push_back(MergedDir.getReg(0)); | ||||
| 5560 | } else { | ||||
| 5561 | packLanes(RayDir); | ||||
| 5562 | packLanes(RayInvDir); | ||||
| 5563 | } | ||||
| 5564 | } else { | ||||
| 5565 | if (Is64) { | ||||
| 5566 | auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); | ||||
| 5567 | Ops.push_back(Unmerge.getReg(0)); | ||||
| 5568 | Ops.push_back(Unmerge.getReg(1)); | ||||
| 5569 | } else { | ||||
| 5570 | Ops.push_back(NodePtr); | ||||
| 5571 | } | ||||
| 5572 | Ops.push_back(RayExtent); | ||||
| 5573 | |||||
| 5574 | auto packLanes = [&Ops, &S32, &B](Register Src) { | ||||
| 5575 | auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); | ||||
| 5576 | Ops.push_back(Unmerge.getReg(0)); | ||||
| 5577 | Ops.push_back(Unmerge.getReg(1)); | ||||
| 5578 | Ops.push_back(Unmerge.getReg(2)); | ||||
| 5579 | }; | ||||
| 5580 | |||||
| 5581 | packLanes(RayOrigin); | ||||
| 5582 | if (IsA16) { | ||||
| 5583 | auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); | ||||
| 5584 | auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); | ||||
| 5585 | Register R1 = MRI.createGenericVirtualRegister(S32); | ||||
| 5586 | Register R2 = MRI.createGenericVirtualRegister(S32); | ||||
| 5587 | Register R3 = MRI.createGenericVirtualRegister(S32); | ||||
| 5588 | B.buildMergeLikeInstr(R1, | ||||
| 5589 | {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); | ||||
| 5590 | B.buildMergeLikeInstr( | ||||
| 5591 | R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); | ||||
| 5592 | B.buildMergeLikeInstr( | ||||
| 5593 | R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); | ||||
| 5594 | Ops.push_back(R1); | ||||
| 5595 | Ops.push_back(R2); | ||||
| 5596 | Ops.push_back(R3); | ||||
| 5597 | } else { | ||||
| 5598 | packLanes(RayDir); | ||||
| 5599 | packLanes(RayInvDir); | ||||
| 5600 | } | ||||
| 5601 | } | ||||
| 5602 | |||||
| 5603 | if (!UseNSA) { | ||||
| 5604 | // Build a single vector containing all the operands prepared so far. | ||||
| 5605 | LLT OpTy = LLT::fixed_vector(Ops.size(), 32); | ||||
| 5606 | Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); | ||||
| 5607 | Ops.clear(); | ||||
| 5608 | Ops.push_back(MergedOps); | ||||
| 5609 | } | ||||
| 5610 | |||||
| 5611 | auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) | ||||
| 5612 | .addDef(DstReg) | ||||
| 5613 | .addImm(Opcode); | ||||
| 5614 | |||||
| 5615 | for (Register R : Ops) { | ||||
| 5616 | MIB.addUse(R); | ||||
| 5617 | } | ||||
| 5618 | |||||
| 5619 | MIB.addUse(TDescr) | ||||
| 5620 | .addImm(IsA16 ? 1 : 0) | ||||
| 5621 | .cloneMemRefs(MI); | ||||
| 5622 | |||||
| 5623 | MI.eraseFromParent(); | ||||
| 5624 | return true; | ||||
| 5625 | } | ||||
| 5626 | |||||
| 5627 | bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, | ||||
| 5628 | MachineIRBuilder &B) const { | ||||
| 5629 | unsigned Opc; | ||||
| 5630 | int RoundMode = MI.getOperand(2).getImm(); | ||||
| 5631 | |||||
| 5632 | if (RoundMode == (int)RoundingMode::TowardPositive) | ||||
| 5633 | Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; | ||||
| 5634 | else if (RoundMode == (int)RoundingMode::TowardNegative) | ||||
| 5635 | Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; | ||||
| 5636 | else | ||||
| 5637 | return false; | ||||
| 5638 | |||||
| 5639 | B.buildInstr(Opc) | ||||
| 5640 | .addDef(MI.getOperand(0).getReg()) | ||||
| 5641 | .addUse(MI.getOperand(1).getReg()); | ||||
| 5642 | |||||
| 5643 | MI.eraseFromParent(); | ||||
| 5644 | |||||
| 5645 | return true; | ||||
| 5646 | } | ||||
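| | // Only the two statically known directed rounding modes map to pseudos here; | ||||
| | // any other rounding-mode immediate is rejected and the intrinsic fails to | ||||
| | // legalize. | ||||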
| 5647 | |||||
| 5648 | bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | ||||
| 5649 | MachineInstr &MI) const { | ||||
| 5650 | MachineIRBuilder &B = Helper.MIRBuilder; | ||||
| 5651 | MachineRegisterInfo &MRI = *B.getMRI(); | ||||
| 5652 | |||||
| 5653 | // Replace the use of G_BRCOND with the exec manipulation and branch pseudos. | ||||
| 5654 | auto IntrID = MI.getIntrinsicID(); | ||||
| 5655 | switch (IntrID) { | ||||
| 5656 | case Intrinsic::amdgcn_if: | ||||
| 5657 | case Intrinsic::amdgcn_else: { | ||||
| 5658 | MachineInstr *Br = nullptr; | ||||
| 5659 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||
| 5660 | bool Negated = false; | ||||
| 5661 | if (MachineInstr *BrCond = | ||||
| 5662 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||
| 5663 | const SIRegisterInfo *TRI | ||||
| 5664 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||
| 5665 | |||||
| 5666 | Register Def = MI.getOperand(1).getReg(); | ||||
| 5667 | Register Use = MI.getOperand(3).getReg(); | ||||
| 5668 | |||||
| 5669 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||
| 5670 | |||||
| 5671 | if (Negated) | ||||
| 5672 | std::swap(CondBrTarget, UncondBrTarget); | ||||
| 5673 | |||||
| 5674 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||
| 5675 | if (IntrID == Intrinsic::amdgcn_if) { | ||||
| 5676 | B.buildInstr(AMDGPU::SI_IF) | ||||
| 5677 | .addDef(Def) | ||||
| 5678 | .addUse(Use) | ||||
| 5679 | .addMBB(UncondBrTarget); | ||||
| 5680 | } else { | ||||
| 5681 | B.buildInstr(AMDGPU::SI_ELSE) | ||||
| 5682 | .addDef(Def) | ||||
| 5683 | .addUse(Use) | ||||
| 5684 | .addMBB(UncondBrTarget); | ||||
| 5685 | } | ||||
| 5686 | |||||
| 5687 | if (Br) { | ||||
| 5688 | Br->getOperand(0).setMBB(CondBrTarget); | ||||
| 5689 | } else { | ||||
| 5690 | // The IRTranslator skips inserting the G_BR for fallthrough cases, but | ||||
| 5691 | // since we're swapping branch targets it needs to be reinserted. | ||||
| 5692 | // FIXME: IRTranslator should probably not do this | ||||
| 5693 | B.buildBr(*CondBrTarget); | ||||
| 5694 | } | ||||
| 5695 | |||||
| 5696 | MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); | ||||
| 5697 | MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); | ||||
| 5698 | MI.eraseFromParent(); | ||||
| 5699 | BrCond->eraseFromParent(); | ||||
| 5700 | return true; | ||||
| 5701 | } | ||||
| 5702 | |||||
| 5703 | return false; | ||||
| 5704 | } | ||||
| 5705 | case Intrinsic::amdgcn_loop: { | ||||
| 5706 | MachineInstr *Br = nullptr; | ||||
| 5707 | MachineBasicBlock *UncondBrTarget = nullptr; | ||||
| 5708 | bool Negated = false; | ||||
| 5709 | if (MachineInstr *BrCond = | ||||
| 5710 | verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { | ||||
| 5711 | const SIRegisterInfo *TRI | ||||
| 5712 | = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); | ||||
| 5713 | |||||
| 5714 | MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); | ||||
| 5715 | Register Reg = MI.getOperand(2).getReg(); | ||||
| 5716 | |||||
| 5717 | if (Negated) | ||||
| 5718 | std::swap(CondBrTarget, UncondBrTarget); | ||||
| 5719 | |||||
| 5720 | B.setInsertPt(B.getMBB(), BrCond->getIterator()); | ||||
| 5721 | B.buildInstr(AMDGPU::SI_LOOP) | ||||
| 5722 | .addUse(Reg) | ||||
| 5723 | .addMBB(UncondBrTarget); | ||||
| 5724 | |||||
| 5725 | if (Br) | ||||
| 5726 | Br->getOperand(0).setMBB(CondBrTarget); | ||||
| 5727 | else | ||||
| 5728 | B.buildBr(*CondBrTarget); | ||||
| 5729 | |||||
| 5730 | MI.eraseFromParent(); | ||||
| 5731 | BrCond->eraseFromParent(); | ||||
| 5732 | MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); | ||||
| 5733 | return true; | ||||
| 5734 | } | ||||
| 5735 | |||||
| 5736 | return false; | ||||
| 5737 | } | ||||
| 5738 | case Intrinsic::amdgcn_kernarg_segment_ptr: | ||||
| 5739 | if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { | ||||
| 5740 | // This only makes sense to call in a kernel, so just lower to null. | ||||
| 5741 | B.buildConstant(MI.getOperand(0).getReg(), 0); | ||||
| 5742 | MI.eraseFromParent(); | ||||
| 5743 | return true; | ||||
| 5744 | } | ||||
| 5745 | |||||
| 5746 | return legalizePreloadedArgIntrin( | ||||
| 5747 | MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); | ||||
| 5748 | case Intrinsic::amdgcn_implicitarg_ptr: | ||||
| 5749 | return legalizeImplicitArgPtr(MI, MRI, B); | ||||
| 5750 | case Intrinsic::amdgcn_workitem_id_x: | ||||
| 5751 | return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, | ||||
| 5752 | AMDGPUFunctionArgInfo::WORKITEM_ID_X); | ||||
| 5753 | case Intrinsic::amdgcn_workitem_id_y: | ||||
| 5754 | return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, | ||||
| 5755 | AMDGPUFunctionArgInfo::WORKITEM_ID_Y); | ||||
| 5756 | case Intrinsic::amdgcn_workitem_id_z: | ||||
| 5757 | return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, | ||||
| 5758 | AMDGPUFunctionArgInfo::WORKITEM_ID_Z); | ||||
| 5759 | case Intrinsic::amdgcn_workgroup_id_x: | ||||
| 5760 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 5761 | AMDGPUFunctionArgInfo::WORKGROUP_ID_X); | ||||
| 5762 | case Intrinsic::amdgcn_workgroup_id_y: | ||||
| 5763 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 5764 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); | ||||
| 5765 | case Intrinsic::amdgcn_workgroup_id_z: | ||||
| 5766 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 5767 | AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); | ||||
| 5768 | case Intrinsic::amdgcn_lds_kernel_id: | ||||
| 5769 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 5770 | AMDGPUFunctionArgInfo::LDS_KERNEL_ID); | ||||
| 5771 | case Intrinsic::amdgcn_dispatch_ptr: | ||||
| 5772 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 5773 | AMDGPUFunctionArgInfo::DISPATCH_PTR); | ||||
| 5774 | case Intrinsic::amdgcn_queue_ptr: | ||||
| 5775 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 5776 | AMDGPUFunctionArgInfo::QUEUE_PTR); | ||||
| 5777 | case Intrinsic::amdgcn_implicit_buffer_ptr: | ||||
| 5778 | return legalizePreloadedArgIntrin( | ||||
| 5779 | MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); | ||||
| 5780 | case Intrinsic::amdgcn_dispatch_id: | ||||
| 5781 | return legalizePreloadedArgIntrin(MI, MRI, B, | ||||
| 5782 | AMDGPUFunctionArgInfo::DISPATCH_ID); | ||||
| 5783 | case Intrinsic::r600_read_ngroups_x: | ||||
| 5784 | // TODO: Emit error for hsa | ||||
| 5785 | return legalizeKernargMemParameter(MI, B, | ||||
| 5786 | SI::KernelInputOffsets::NGROUPS_X); | ||||
| 5787 | case Intrinsic::r600_read_ngroups_y: | ||||
| 5788 | return legalizeKernargMemParameter(MI, B, | ||||
| 5789 | SI::KernelInputOffsets::NGROUPS_Y); | ||||
| 5790 | case Intrinsic::r600_read_ngroups_z: | ||||
| 5791 | return legalizeKernargMemParameter(MI, B, | ||||
| 5792 | SI::KernelInputOffsets::NGROUPS_Z); | ||||
| 5793 | case Intrinsic::r600_read_local_size_x: | ||||
| 5794 | // TODO: Could insert G_ASSERT_ZEXT from s16 | ||||
| 5795 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); | ||||
| 5796 | case Intrinsic::r600_read_local_size_y: | ||||
| 5797 | // TODO: Could insert G_ASSERT_ZEXT from s16 | ||||
| 5798 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); | ||||
| 5799 | // TODO: Could insert G_ASSERT_ZEXT from s16 | ||||
| 5800 | case Intrinsic::r600_read_local_size_z: | ||||
| 5801 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); | ||||
| 5802 | case Intrinsic::r600_read_global_size_x: | ||||
| 5803 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); | ||||
| 5804 | case Intrinsic::r600_read_global_size_y: | ||||
| 5805 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); | ||||
| 5806 | case Intrinsic::r600_read_global_size_z: | ||||
| 5807 | return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); | ||||
| 5808 | case Intrinsic::amdgcn_fdiv_fast: | ||||
| 5809 | return legalizeFDIVFastIntrin(MI, MRI, B); | ||||
| 5810 | case Intrinsic::amdgcn_is_shared: | ||||
| 5811 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); | ||||
| 5812 | case Intrinsic::amdgcn_is_private: | ||||
| 5813 | return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); | ||||
| 5814 | case Intrinsic::amdgcn_wavefrontsize: { | ||||
| 5815 | B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); | ||||
| 5816 | MI.eraseFromParent(); | ||||
| 5817 | return true; | ||||
| 5818 | } | ||||
| 5819 | case Intrinsic::amdgcn_s_buffer_load: | ||||
| 5820 | return legalizeSBufferLoad(Helper, MI); | ||||
| 5821 | case Intrinsic::amdgcn_raw_buffer_store: | ||||
| 5822 | case Intrinsic::amdgcn_struct_buffer_store: | ||||
| 5823 | return legalizeBufferStore(MI, MRI, B, false, false); | ||||
| 5824 | case Intrinsic::amdgcn_raw_buffer_store_format: | ||||
| 5825 | case Intrinsic::amdgcn_struct_buffer_store_format: | ||||
| 5826 | return legalizeBufferStore(MI, MRI, B, false, true); | ||||
| 5827 | case Intrinsic::amdgcn_raw_tbuffer_store: | ||||
| 5828 | case Intrinsic::amdgcn_struct_tbuffer_store: | ||||
| 5829 | return legalizeBufferStore(MI, MRI, B, true, true); | ||||
| 5830 | case Intrinsic::amdgcn_raw_buffer_load: | ||||
| 5831 | case Intrinsic::amdgcn_struct_buffer_load: | ||||
| 5832 | return legalizeBufferLoad(MI, MRI, B, false, false); | ||||
| 5833 | case Intrinsic::amdgcn_raw_buffer_load_format: | ||||
| 5834 | case Intrinsic::amdgcn_struct_buffer_load_format: | ||||
| 5835 | return legalizeBufferLoad(MI, MRI, B, true, false); | ||||
| 5836 | case Intrinsic::amdgcn_raw_tbuffer_load: | ||||
| 5837 | case Intrinsic::amdgcn_struct_tbuffer_load: | ||||
| 5838 | return legalizeBufferLoad(MI, MRI, B, true, true); | ||||
| 5839 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: | ||||
| 5840 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: | ||||
| 5841 | case Intrinsic::amdgcn_raw_buffer_atomic_add: | ||||
| 5842 | case Intrinsic::amdgcn_struct_buffer_atomic_add: | ||||
| 5843 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: | ||||
| 5844 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: | ||||
| 5845 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: | ||||
| 5846 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: | ||||
| 5847 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: | ||||
| 5848 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: | ||||
| 5849 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: | ||||
| 5850 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: | ||||
| 5851 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: | ||||
| 5852 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: | ||||
| 5853 | case Intrinsic::amdgcn_raw_buffer_atomic_and: | ||||
| 5854 | case Intrinsic::amdgcn_struct_buffer_atomic_and: | ||||
| 5855 | case Intrinsic::amdgcn_raw_buffer_atomic_or: | ||||
| 5856 | case Intrinsic::amdgcn_struct_buffer_atomic_or: | ||||
| 5857 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: | ||||
| 5858 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: | ||||
| 5859 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: | ||||
| 5860 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: | ||||
| 5861 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: | ||||
| 5862 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: | ||||
| 5863 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: | ||||
| 5864 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: | ||||
| 5865 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: | ||||
| 5866 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: | ||||
| 5867 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: | ||||
| 5868 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: | ||||
| 5869 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: | ||||
| 5870 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: | ||||
| 5871 | return legalizeBufferAtomic(MI, B, IntrID); | ||||
| 5872 | case Intrinsic::amdgcn_atomic_inc: | ||||
| 5873 | return legalizeAtomicIncDec(MI, B, true); | ||||
| 5874 | case Intrinsic::amdgcn_atomic_dec: | ||||
| 5875 | return legalizeAtomicIncDec(MI, B, false); | ||||
| 5876 | case Intrinsic::trap: | ||||
| 5877 | return legalizeTrapIntrinsic(MI, MRI, B); | ||||
| 5878 | case Intrinsic::debugtrap: | ||||
| 5879 | return legalizeDebugTrapIntrinsic(MI, MRI, B); | ||||
| 5880 | case Intrinsic::amdgcn_rsq_clamp: | ||||
| 5881 | return legalizeRsqClampIntrinsic(MI, MRI, B); | ||||
| 5882 | case Intrinsic::amdgcn_ds_fadd: | ||||
| 5883 | case Intrinsic::amdgcn_ds_fmin: | ||||
| 5884 | case Intrinsic::amdgcn_ds_fmax: | ||||
| 5885 | return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); | ||||
| 5886 | case Intrinsic::amdgcn_image_bvh_intersect_ray: | ||||
| 5887 | return legalizeBVHIntrinsic(MI, B); | ||||
| 5888 | default: { | ||||
| 5889 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | ||||
| 5890 | AMDGPU::getImageDimIntrinsicInfo(IntrID)) | ||||
| 5891 | return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); | ||||
| 5892 | return true; | ||||
| 5893 | } | ||||
| 5894 | } | ||||
| 5895 | |||||
| 5896 | return true; | ||||
| 5897 | } |
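The amdgcn_wavefrontsize case above is the simplest legalization in the switch: it builds a constant for the intrinsic's result and erases the instruction. Below is a minimal standalone sketch of the same "fold to constant, erase the call" idea expressed at the LLVM IR level with IRBuilder rather than the GlobalISel MachineIRBuilder path shown here; the hard-coded wave size of 64 is an assumption for the sketch (the real legalizer queries ST.getWavefrontSize()).

  // Sketch only: mirrors the pattern B.buildConstant(...) + MI.eraseFromParent()
  // from the legalizer above, but on LLVM IR instead of generic MIR.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/IntrinsicsAMDGPU.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("wavefrontsize-demo", Ctx);
    IRBuilder<> B(Ctx);

    // Build: i32 @f() { %ws = call i32 @llvm.amdgcn.wavefrontsize(); ret i32 %ws }
    Function *F = Function::Create(FunctionType::get(B.getInt32Ty(), false),
                                   Function::ExternalLinkage, "f", M);
    B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));
    Function *WS = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_wavefrontsize);
    CallInst *Call = B.CreateCall(WS);
    B.CreateRet(Call);

    // "Legalize": replace every use of the call with a constant and erase it.
    // 64 (wave64) is assumed here purely for illustration.
    Call->replaceAllUsesWith(B.getInt32(64));
    Call->eraseFromParent();

    M.print(outs(), nullptr); // the function body is now just: ret i32 64
    return 0;
  }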
| 1 | //===-- llvm/ADT/bit.h - C++20 <bit> ----------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file implements the C++20 <bit> header. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #ifndef LLVM_ADT_BIT_H |
| 15 | #define LLVM_ADT_BIT_H |
| 16 | |
| 17 | #include "llvm/Support/Compiler.h" |
| 18 | #include <cstdint> |
| 19 | #include <limits> |
| 20 | #include <type_traits> |
| 21 | |
| 22 | #if !__has_builtin(__builtin_bit_cast) |
| 23 | #include <cstring> |
| 24 | #endif |
| 25 | |
| 26 | #if defined(_MSC_VER) && !defined(_DEBUG) |
| 27 | #include <cstdlib> // for _byteswap_{ushort,ulong,uint64} |
| 28 | #endif |
| 29 | |
| 30 | #ifdef _MSC_VER |
| 31 | // Declare these intrinsics manually rather than including intrin.h. It's very |
| 32 | // expensive, and bit.h is popular via MathExtras.h. |
| 33 | // #include <intrin.h> |
| 34 | extern "C" { |
| 35 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); |
| 36 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); |
| 37 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); |
| 38 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); |
| 39 | } |
| 40 | #endif |
| 41 | |
| 42 | namespace llvm { |
| 43 | |
| 44 | // This implementation of bit_cast is different from the C++20 one in two ways: |
| 45 | // - It isn't constexpr because that requires compiler support. |
| 46 | // - It requires trivially-constructible To, to avoid UB in the implementation. |
| 47 | template < |
| 48 | typename To, typename From, |
| 49 | typename = std::enable_if_t<sizeof(To) == sizeof(From)>, |
| 50 | typename = std::enable_if_t<std::is_trivially_constructible<To>::value>, |
| 51 | typename = std::enable_if_t<std::is_trivially_copyable<To>::value>, |
| 52 | typename = std::enable_if_t<std::is_trivially_copyable<From>::value>> |
| 53 | [[nodiscard]] inline To bit_cast(const From &from) noexcept { |
| 54 | #if __has_builtin(__builtin_bit_cast) |
| 55 | return __builtin_bit_cast(To, from); |
| 56 | #else |
| 57 | To to; |
| 58 | std::memcpy(&to, &from, sizeof(To)); |
| 59 | return to; |
| 60 | #endif |
| 61 | } |
| 62 | |
| 63 | /// Reverses the bytes in the given integer value V. |
| 64 | template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>> |
| 65 | [[nodiscard]] constexpr T byteswap(T V) noexcept { |
| 66 | if constexpr (sizeof(T) == 1) { |
| 67 | return V; |
| 68 | } else if constexpr (sizeof(T) == 2) { |
| 69 | uint16_t UV = V; |
| 70 | #if defined(_MSC_VER) && !defined(_DEBUG) |
| 71 | // The DLL version of the runtime lacks these functions (bug!?), but in a |
| 72 | // release build they're replaced with BSWAP instructions anyway. |
| 73 | return _byteswap_ushort(UV); |
| 74 | #else |
| 75 | uint16_t Hi = UV << 8; |
| 76 | uint16_t Lo = UV >> 8; |
| 77 | return Hi | Lo; |
| 78 | #endif |
| 79 | } else if constexpr (sizeof(T) == 4) { |
| 80 | uint32_t UV = V; |
| 81 | #if __has_builtin(__builtin_bswap32) |
| 82 | return __builtin_bswap32(UV); |
| 83 | #elif defined(_MSC_VER) && !defined(_DEBUG) |
| 84 | return _byteswap_ulong(UV); |
| 85 | #else |
| 86 | uint32_t Byte0 = UV & 0x000000FF; |
| 87 | uint32_t Byte1 = UV & 0x0000FF00; |
| 88 | uint32_t Byte2 = UV & 0x00FF0000; |
| 89 | uint32_t Byte3 = UV & 0xFF000000; |
| 90 | return (Byte0 << 24) | (Byte1 << 8) | (Byte2 >> 8) | (Byte3 >> 24); |
| 91 | #endif |
| 92 | } else if constexpr (sizeof(T) == 8) { |
| 93 | uint64_t UV = V; |
| 94 | #if __has_builtin(__builtin_bswap64) |
| 95 | return __builtin_bswap64(UV); |
| 96 | #elif defined(_MSC_VER) && !defined(_DEBUG) |
| 97 | return _byteswap_uint64(UV); |
| 98 | #else |
| 99 | uint64_t Hi = llvm::byteswap<uint32_t>(UV); |
| 100 | uint32_t Lo = llvm::byteswap<uint32_t>(UV >> 32); |
| 101 | return (Hi << 32) | Lo; |
| 102 | #endif |
| 103 | } else { |
| 104 | static_assert(!sizeof(T *), "Don't know how to handle the given type."); |
| 105 | return 0; |
| 106 | } |
| 107 | } |
| 108 | |
| 109 | template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>> |
| 110 | [[nodiscard]] constexpr inline bool has_single_bit(T Value) noexcept { |
| 111 | return (Value != 0) && ((Value & (Value - 1)) == 0); |
| 112 | } |
| 113 | |
| 114 | namespace detail { |
| 115 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { |
| 116 | static unsigned count(T Val) { |
| 117 | if (!Val) |
| 118 | return std::numeric_limits<T>::digits; |
| 119 | if (Val & 0x1) |
| 120 | return 0; |
| 121 | |
| 122 | // Bisection method. |
| 123 | unsigned ZeroBits = 0; |
| 124 | T Shift = std::numeric_limits<T>::digits >> 1; |
| 125 | T Mask = std::numeric_limits<T>::max() >> Shift; |
| 126 | while (Shift) { |
| 127 | if ((Val & Mask) == 0) { |
| 128 | Val >>= Shift; |
| 129 | ZeroBits |= Shift; |
| 130 | } |
| 131 | Shift >>= 1; |
| 132 | Mask >>= Shift; |
| 133 | } |
| 134 | return ZeroBits; |
| 135 | } |
| 136 | }; |
| 137 | |
| 138 | #if defined(__GNUC__) || defined(_MSC_VER) |
| 139 | template <typename T> struct TrailingZerosCounter<T, 4> { |
| 140 | static unsigned count(T Val) { |
| 141 | if (Val == 0) |
| 142 | return 32; |
| 143 | |
| 144 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__) |
| 145 | return __builtin_ctz(Val); |
| 146 | #elif defined(_MSC_VER) |
| 147 | unsigned long Index; |
| 148 | _BitScanForward(&Index, Val); |
| 149 | return Index; |
| 150 | #endif |
| 151 | } |
| 152 | }; |
| 153 | |
| 154 | #if !defined(_MSC_VER) || defined(_M_X64) |
| 155 | template <typename T> struct TrailingZerosCounter<T, 8> { |
| 156 | static unsigned count(T Val) { |
| 157 | if (Val == 0) |
| 158 | return 64; |
| 159 | |
| 160 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__) |
| 161 | return __builtin_ctzll(Val); |
| 162 | #elif defined(_MSC_VER) |
| 163 | unsigned long Index; |
| 164 | _BitScanForward64(&Index, Val); |
| 165 | return Index; |
| 166 | #endif |
| 167 | } |
| 168 | }; |
| 169 | #endif |
| 170 | #endif |
| 171 | } // namespace detail |
| 172 | |
| 173 | /// Count the number of 0's from the least significant bit to the most |
| 174 | /// significant bit, stopping at the first 1. |
| 175 | /// |
| 176 | /// Only unsigned integral types are allowed. |
| 177 | /// |
| 178 | /// Returns std::numeric_limits<T>::digits on an input of 0. |
| 179 | template <typename T> [[nodiscard]] int countr_zero(T Val) { |
| 180 | static_assert(std::is_unsigned_v<T>, |
| 181 | "Only unsigned integral types are allowed."); |
| 182 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val); |
| 183 | } |
| 184 | |
| 185 | namespace detail { |
| 186 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { |
| 187 | static unsigned count(T Val) { |
| 188 | if (!Val) |
| 189 | return std::numeric_limits<T>::digits; |
| 190 | |
| 191 | // Bisection method. |
| 192 | unsigned ZeroBits = 0; |
| 193 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { |
| 194 | T Tmp = Val >> Shift; |
| 195 | if (Tmp) |
| 196 | Val = Tmp; |
| 197 | else |
| 198 | ZeroBits |= Shift; |
| 199 | } |
| 200 | return ZeroBits; |
| 201 | } |
| 202 | }; |
| 203 | |
| 204 | #if defined(__GNUC__) || defined(_MSC_VER) |
| 205 | template <typename T> struct LeadingZerosCounter<T, 4> { |
| 206 | static unsigned count(T Val) { |
| 207 | if (Val == 0) |
| 208 | return 32; |
| 209 | |
| 210 | #if __has_builtin(__builtin_clz) || defined(__GNUC__) |
| 211 | return __builtin_clz(Val); |
| 212 | #elif defined(_MSC_VER) |
| 213 | unsigned long Index; |
| 214 | _BitScanReverse(&Index, Val); |
| 215 | return Index ^ 31; |
| 216 | #endif |
| 217 | } |
| 218 | }; |
| 219 | |
| 220 | #if !defined(_MSC_VER) || defined(_M_X64) |
| 221 | template <typename T> struct LeadingZerosCounter<T, 8> { |
| 222 | static unsigned count(T Val) { |
| 223 | if (Val == 0) |
| 224 | return 64; |
| 225 | |
| 226 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__) |
| 227 | return __builtin_clzll(Val); |
| 228 | #elif defined(_MSC_VER) |
| 229 | unsigned long Index; |
| 230 | _BitScanReverse64(&Index, Val); |
| 231 | return Index ^ 63; |
| 232 | #endif |
| 233 | } |
| 234 | }; |
| 235 | #endif |
| 236 | #endif |
| 237 | } // namespace detail |
| 238 | |
| 239 | /// Count the number of 0's from the most significant bit to the least |
| 240 | /// significant bit, stopping at the first 1. |
| 241 | /// |
| 242 | /// Only unsigned integral types are allowed. |
| 243 | /// |
| 244 | /// Returns std::numeric_limits<T>::digits on an input of 0. |
| 245 | template <typename T> [[nodiscard]] int countl_zero(T Val) { |
| 246 | static_assert(std::is_unsigned_v<T>, |
| 247 | "Only unsigned integral types are allowed."); |
| 248 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val); |
| 249 | } |
| 250 | |
| 251 | /// Count the number of ones from the most significant bit to the first |
| 252 | /// zero bit. |
| 253 | /// |
| 254 | /// Ex. countl_one(0xFF0FFF00) == 8. |
| 255 | /// Only unsigned integral types are allowed. |
| 256 | /// |
| 257 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
| 258 | template <typename T> [[nodiscard]] int countl_one(T Value) { |
| 259 | static_assert(std::is_unsigned_v<T>, |
| 260 | "Only unsigned integral types are allowed."); |
| 261 | return llvm::countl_zero<T>(~Value); |
| 262 | } |
| 263 | |
| 264 | /// Count the number of ones from the least significant bit to the first |
| 265 | /// zero bit. |
| 266 | /// |
| 267 | /// Ex. countr_one(0x00FF00FF) == 8. |
| 268 | /// Only unsigned integral types are allowed. |
| 269 | /// |
| 270 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
| 271 | template <typename T> [[nodiscard]] int countr_one(T Value) { |
| 272 | static_assert(std::is_unsigned_v<T>, |
| 273 | "Only unsigned integral types are allowed."); |
| 274 | return llvm::countr_zero<T>(~Value); |
| 275 | } |
| 276 | |
| 277 | /// Returns the number of bits needed to represent Value if Value is nonzero. |
| 278 | /// Returns 0 otherwise. |
| 279 | /// |
| 280 | /// Ex. bit_width(5) == 3. |
| 281 | template <typename T> [[nodiscard]] int bit_width(T Value) { |
| 282 | static_assert(std::is_unsigned_v<T>, |
| 283 | "Only unsigned integral types are allowed."); |
| 284 | return std::numeric_limits<T>::digits - llvm::countl_zero(Value); |
| 285 | } |
| 286 | |
| 287 | /// Returns the largest integral power of two no greater than Value if Value is |
| 288 | /// nonzero. Returns 0 otherwise. |
| 289 | /// |
| 290 | /// Ex. bit_floor(5) == 4. |
| 291 | template <typename T> [[nodiscard]] T bit_floor(T Value) { |
| 292 | static_assert(std::is_unsigned_v<T>, |
| 293 | "Only unsigned integral types are allowed."); |
| 294 | if (!Value) |
| 295 | return 0; |
| 296 | return T(1) << (llvm::bit_width(Value) - 1); |
| 297 | } |
| 298 | |
| 299 | /// Returns the smallest integral power of two no smaller than Value if Value is |
| 300 | /// nonzero. Returns 1 otherwise. |
| 301 | /// |
| 302 | /// Ex. bit_ceil(5) == 8. |
| 303 | /// |
| 304 | /// The return value is undefined if the input is larger than the largest power |
| 305 | /// of two representable in T. |
| 306 | template <typename T> [[nodiscard]] T bit_ceil(T Value) { |
| 307 | static_assert(std::is_unsigned_v<T>, |
| 308 | "Only unsigned integral types are allowed."); |
| 309 | if (Value < 2) |
| 310 | return 1; |
| 311 | return T(1) << llvm::bit_width<T>(Value - 1u); |
| 312 | } |
| 313 | |
| 314 | namespace detail { |
| 315 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { |
| 316 | static int count(T Value) { |
| 317 | // Generic version, forward to 32 bits. |
| 318 | static_assert(SizeOfT <= 4, "Not implemented!"); |
| 319 | #if defined(__GNUC__) |
| 320 | return (int)__builtin_popcount(Value); |
| 321 | #else |
| 322 | uint32_t v = Value; |
| 323 | v = v - ((v >> 1) & 0x55555555); |
| 324 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); |
| 325 | return int(((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24); |
| 326 | #endif |
| 327 | } |
| 328 | }; |
| 329 | |
| 330 | template <typename T> struct PopulationCounter<T, 8> { |
| 331 | static int count(T Value) { |
| 332 | #if defined(__GNUC__) |
| 333 | return (int)__builtin_popcountll(Value); |
| 334 | #else |
| 335 | uint64_t v = Value; |
| 336 | v = v - ((v >> 1) & 0x5555555555555555ULL); |
| 337 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); |
| 338 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; |
| 339 | return int((uint64_t)(v * 0x0101010101010101ULL) >> 56); |
| 340 | #endif |
| 341 | } |
| 342 | }; |
| 343 | } // namespace detail |
| 344 | |
| 345 | /// Count the number of set bits in a value. |
| 346 | /// Ex. popcount(0xF000F000) = 8 |
| 347 | /// Returns 0 if the word is zero. |
| 348 | template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>> |
| 349 | [[nodiscard]] inline int popcount(T Value) noexcept { |
| 350 | return detail::PopulationCounter<T, sizeof(T)>::count(Value); |
| 351 | } |
| 352 | |
| 353 | // Forward-declare rotr so that rotl can use it. |
| 354 | template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>> |
| 355 | [[nodiscard]] constexpr T rotr(T V, int R); |
| 356 | |
| 357 | template <typename T, typename = std::enable_if_t<std::is_unsigned_v<T>>> |
| 358 | [[nodiscard]] constexpr T rotl(T V, int R) { |
| 359 | unsigned N = std::numeric_limits<T>::digits; |
| 360 | |
| 361 | R = R % N; |
| 362 | if (!R) |
| 363 | return V; |
| 364 | |
| 365 | if (R < 0) |
| 366 | return llvm::rotr(V, -R); |
| 367 | |
| 368 | return (V << R) | (V >> (N - R)); |
| 369 | } |
| 370 | |
| 371 | template <typename T, typename> [[nodiscard]] constexpr T rotr(T V, int R) { |
| 372 | unsigned N = std::numeric_limits<T>::digits; |
| 373 | |
| 374 | R = R % N; |
| 375 | if (!R) |
| 376 | return V; |
| 377 | |
| 378 | if (R < 0) |
| 379 | return llvm::rotl(V, -R); |
| 380 | |
| 381 | return (V >> R) | (V << (N - R)); |
| 382 | } |
| 383 | |
| 384 | } // namespace llvm |
| 385 | |
| 386 | #endif |
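As a quick sanity check on the helpers above, here is a minimal usage sketch built against the LLVM headers. The expected values are the ones quoted in the doxygen comments (e.g. countl_one(0xFF0FFF00) == 8, bit_width(5) == 3, bit_ceil(5) == 8, popcount(0xF000F000) == 8); the remaining values follow directly from the definitions.

  // Usage sketch for llvm/ADT/bit.h; all asserted values come from the
  // documented examples above or from elementary bit arithmetic.
  #include "llvm/ADT/bit.h"
  #include <cassert>
  #include <cstdint>

  int main() {
    assert(llvm::has_single_bit(64u));
    assert(!llvm::has_single_bit(65u));

    assert(llvm::byteswap<uint32_t>(0x12345678u) == 0x78563412u);

    assert(llvm::countr_zero(0x00000010u) == 4);   // four trailing zeros
    assert(llvm::countl_zero(0x00000010u) == 27);  // bit 4 set in a 32-bit value
    assert(llvm::countl_one(0xFF0FFF00u) == 8);    // example from the comment
    assert(llvm::countr_one(0x00FF00FFu) == 8);    // example from the comment

    assert(llvm::bit_width(5u) == 3);
    assert(llvm::bit_floor(5u) == 4u);
    assert(llvm::bit_ceil(5u) == 8u);

    assert(llvm::popcount(0xF000F000u) == 8);

    // Rotations are inverses of each other for the same shift amount.
    assert(llvm::rotl<uint32_t>(0x80000001u, 1) == 0x00000003u);
    assert(llvm::rotr<uint32_t>(0x00000003u, 1) == 0x80000001u);

    // IEEE-754 single precision: 1.0f has the bit pattern 0x3F800000.
    assert(llvm::bit_cast<uint32_t>(1.0f) == 0x3F800000u);
    return 0;
  }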