NVPTXISelLowering.cpp (LLVM 20.0.0git)
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/StringRef.h"
36#include "llvm/IR/Argument.h"
37#include "llvm/IR/Attributes.h"
38#include "llvm/IR/Constants.h"
39#include "llvm/IR/DataLayout.h"
42#include "llvm/IR/FPEnv.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/GlobalValue.h"
45#include "llvm/IR/Instruction.h"
47#include "llvm/IR/IntrinsicsNVPTX.h"
48#include "llvm/IR/Module.h"
49#include "llvm/IR/Type.h"
50#include "llvm/IR/Value.h"
60#include <algorithm>
61#include <cassert>
62#include <cmath>
63#include <cstdint>
64#include <iterator>
65#include <optional>
66#include <string>
67#include <utility>
68#include <vector>
69
70#define DEBUG_TYPE "nvptx-lower"
71
72using namespace llvm;
73
74static std::atomic<unsigned> GlobalUniqueCallSite;
75
77 "nvptx-sched4reg",
78 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
79
81 "nvptx-fma-level", cl::Hidden,
82 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
83 " 1: do it 2: do it aggressively"),
84 cl::init(2));
85
87 "nvptx-prec-divf32", cl::Hidden,
88 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
89 " IEEE Compliant F32 div.rnd if available."),
90 cl::init(2));
91
93 "nvptx-prec-sqrtf32", cl::Hidden,
94 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
95 cl::init(true));
96
98 "nvptx-force-min-byval-param-align", cl::Hidden,
99 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
100 " params of device functions."),
101 cl::init(false));
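// Illustrative usage (not part of the original source): these cl::opt flags
// are consumed from the llc command line, e.g.
//   llc -march=nvptx64 -mcpu=sm_80 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 kernel.ll
// selects div.approx.f32 and sqrt.approx.f32 in the emitted PTX.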
102
103 int NVPTXTargetLowering::getDivF32Level() const {
104 if (UsePrecDivF32.getNumOccurrences() > 0) {
105 // If nvptx-prec-divf32=N is used on the command-line, always honor it
106 return UsePrecDivF32;
107 } else {
108 // Otherwise, use div.approx if fast math is enabled
109 if (getTargetMachine().Options.UnsafeFPMath)
110 return 0;
111 else
112 return 2;
113 }
114}
115
116 bool NVPTXTargetLowering::usePrecSqrtF32() const {
117 if (UsePrecSqrtF32.getNumOccurrences() > 0) {
118 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
119 return UsePrecSqrtF32;
120 } else {
121 // Otherwise, use sqrt.approx if fast math is enabled
122 return !getTargetMachine().Options.UnsafeFPMath;
123 }
124}
125
126 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
127 return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
128 DenormalMode::PreserveSign;
129 }
130
131static bool IsPTXVectorType(MVT VT) {
132 switch (VT.SimpleTy) {
133 default:
134 return false;
135 case MVT::v2i1:
136 case MVT::v4i1:
137 case MVT::v2i8:
138 case MVT::v4i8:
139 case MVT::v8i8: // <2 x i8x4>
140 case MVT::v16i8: // <4 x i8x4>
141 case MVT::v2i16:
142 case MVT::v4i16:
143 case MVT::v8i16: // <4 x i16x2>
144 case MVT::v2i32:
145 case MVT::v4i32:
146 case MVT::v2i64:
147 case MVT::v2f16:
148 case MVT::v4f16:
149 case MVT::v8f16: // <4 x f16x2>
150 case MVT::v2bf16:
151 case MVT::v4bf16:
152 case MVT::v8bf16: // <4 x bf16x2>
153 case MVT::v2f32:
154 case MVT::v4f32:
155 case MVT::v2f64:
156 return true;
157 }
158}
159
160static bool Is16bitsType(MVT VT) {
161 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
162 VT.SimpleTy == MVT::i16);
163}
164
165// When legalizing vector loads/stores, this function is called, which does two
166// things:
167 // 1. Determines whether the vector is something we want to custom lower,
168// std::nullopt is returned if we do not want to custom lower it.
169// 2. If we do want to handle it, returns two parameters:
170// - unsigned int NumElts - The number of elements in the final vector
171// - EVT EltVT - The type of the elements in the final vector
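// For example (illustrative only): an MVT::v8f16 operand is reported here as
// {4, MVT::v2f16} (four packed f16x2 words), MVT::v4f32 stays {4, MVT::f32},
// and an unlisted type such as MVT::v4f64 yields std::nullopt.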
172 static std::optional<std::pair<unsigned int, EVT>>
173 getVectorLoweringShape(EVT VectorVT) {
174 if (!VectorVT.isVector() || !VectorVT.isSimple())
175 return std::nullopt;
176
177 EVT EltVT = VectorVT.getVectorElementType();
178 unsigned NumElts = VectorVT.getVectorNumElements();
179
180 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
181 // legal. We can (and should) split that into 2 stores of <2 x double> here
182 // but I'm leaving that as a TODO for now.
183 switch (VectorVT.getSimpleVT().SimpleTy) {
184 default:
185 return std::nullopt;
186 case MVT::v2i8:
187 case MVT::v2i16:
188 case MVT::v2i32:
189 case MVT::v2i64:
190 case MVT::v2f16:
191 case MVT::v2bf16:
192 case MVT::v2f32:
193 case MVT::v2f64:
194 case MVT::v4i8:
195 case MVT::v4i16:
196 case MVT::v4i32:
197 case MVT::v4f16:
198 case MVT::v4bf16:
199 case MVT::v4f32:
200 // This is a "native" vector type
201 return std::pair(NumElts, EltVT);
202 case MVT::v8i8: // <2 x i8x4>
203 case MVT::v8f16: // <4 x f16x2>
204 case MVT::v8bf16: // <4 x bf16x2>
205 case MVT::v8i16: // <4 x i16x2>
206 case MVT::v16i8: // <4 x i8x4>
207 // This can be upsized into a "native" vector type.
208 // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
209 // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
210 // vectorized loads/stores with the actual element type for i8/i16 as that
211 // would require v8/v16 variants that do not exist.
212 // In order to load/store such vectors efficiently, here in Type
213 // Legalization, we split the vector into word-sized chunks (v2x16/v4i8).
214 // Later, we will lower to PTX as vectors of b32.
215
216 // Number of elements to pack in one word.
217 unsigned NPerWord = 32 / EltVT.getSizeInBits();
218
219 return std::pair(NumElts / NPerWord,
220 MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord));
221 }
222
223 llvm_unreachable("All cases in switch should return.");
224}
225
226/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
227/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
228/// into their primitive components.
229/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
230/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
231/// LowerCall, and LowerReturn.
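/// For example (illustrative only): for the IR type { i128, <4 x half> } this
/// produces the EVTs {i64, i64, v2f16, v2f16} with byte offsets {0, 8, 16, 20},
/// whereas plain ComputeValueVTs would report {i128, v4f16}.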
232static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
233 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
234 SmallVectorImpl<uint64_t> *Offsets = nullptr,
235 uint64_t StartingOffset = 0) {
236 SmallVector<EVT, 16> TempVTs;
237 SmallVector<uint64_t, 16> TempOffsets;
238
239 // Special case for i128 - decompose to (i64, i64)
240 if (Ty->isIntegerTy(128)) {
241 ValueVTs.push_back(EVT(MVT::i64));
242 ValueVTs.push_back(EVT(MVT::i64));
243
244 if (Offsets) {
245 Offsets->push_back(StartingOffset + 0);
246 Offsets->push_back(StartingOffset + 8);
247 }
248
249 return;
250 }
251
252 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
253 if (StructType *STy = dyn_cast<StructType>(Ty)) {
254 auto const *SL = DL.getStructLayout(STy);
255 auto ElementNum = 0;
256 for(auto *EI : STy->elements()) {
257 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
258 StartingOffset + SL->getElementOffset(ElementNum));
259 ++ElementNum;
260 }
261 return;
262 }
263
264 // Given an array type, recursively traverse the elements with custom ComputePTXValueVTs.
265 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
266 Type *EltTy = ATy->getElementType();
267 uint64_t EltSize = DL.getTypeAllocSize(EltTy);
268 for (int I : llvm::seq<int>(ATy->getNumElements()))
269 ComputePTXValueVTs(TLI, DL, EltTy, ValueVTs, Offsets, StartingOffset + I * EltSize);
270 return;
271 }
272
273 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
274 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
275 EVT VT = TempVTs[i];
276 uint64_t Off = TempOffsets[i];
277 // Split vectors into individual elements, except for v2f16, which
278 // we will pass as a single scalar.
279 if (VT.isVector()) {
280 unsigned NumElts = VT.getVectorNumElements();
281 EVT EltVT = VT.getVectorElementType();
282 // We require power-of-2 sized vectors because
283 // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in
284 // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized
285 // vectors.
286 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0 &&
287 isPowerOf2_32(NumElts)) {
288 // Vectors with an even number of f16 elements will be passed to
289 // us as an array of v2f16/v2bf16 elements. We must match this so we
290 // stay in sync with Ins/Outs.
291 switch (EltVT.getSimpleVT().SimpleTy) {
292 case MVT::f16:
293 EltVT = MVT::v2f16;
294 break;
295 case MVT::bf16:
296 EltVT = MVT::v2bf16;
297 break;
298 case MVT::i16:
299 EltVT = MVT::v2i16;
300 break;
301 default:
302 llvm_unreachable("Unexpected type");
303 }
304 NumElts /= 2;
305 } else if (EltVT.getSimpleVT() == MVT::i8 &&
306 ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) ||
307 NumElts == 3)) {
308 // v*i8 are formally lowered as v4i8
309 EltVT = MVT::v4i8;
310 NumElts = (NumElts + 3) / 4;
311 } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
312 // v2i8 is promoted to v2i16
313 NumElts = 1;
314 EltVT = MVT::v2i16;
315 }
316 for (unsigned j = 0; j != NumElts; ++j) {
317 ValueVTs.push_back(EltVT);
318 if (Offsets)
319 Offsets->push_back(Off + j * EltVT.getStoreSize());
320 }
321 } else {
322 ValueVTs.push_back(VT);
323 if (Offsets)
324 Offsets->push_back(Off);
325 }
326 }
327}
328
329/// PromoteScalarIntegerPTX
330/// Used to make sure the arguments/returns are suitable for passing
331/// and promote them to a larger size if they're not.
332///
333 /// The promoted type is placed in \p PromotedVT if the function returns true.
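/// For example (illustrative only): i1 and i8 are left unchanged (the function
/// returns false), while odd sizes are rounded up, e.g. i3 -> i8, i24 -> i32,
/// i48 -> i64.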
334static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
335 if (VT.isScalarInteger()) {
336 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
337 default:
339 "Promotion is not suitable for scalars of size larger than 64-bits");
340 case 1:
341 *PromotedVT = MVT::i1;
342 break;
343 case 2:
344 case 4:
345 case 8:
346 *PromotedVT = MVT::i8;
347 break;
348 case 16:
349 *PromotedVT = MVT::i16;
350 break;
351 case 32:
352 *PromotedVT = MVT::i32;
353 break;
354 case 64:
355 *PromotedVT = MVT::i64;
356 break;
357 }
358 return EVT(*PromotedVT) != VT;
359 }
360 return false;
361}
362
363// Check whether we can merge loads/stores of some of the pieces of a
364// flattened function parameter or return value into a single vector
365// load/store.
366//
367// The flattened parameter is represented as a list of EVTs and
368// offsets, and the whole structure is aligned to ParamAlignment. This
369// function determines whether we can load/store pieces of the
370// parameter starting at index Idx using a single vectorized op of
371// size AccessSize. If so, it returns the number of param pieces
372// covered by the vector op. Otherwise, it returns 1.
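// For example (illustrative only): with ValueVTs = {f32, f32, f32, f32},
// Offsets = {0, 4, 8, 12} and ParamAlignment = 16, a query at Idx = 0 with
// AccessSize = 16 returns 4 (one 128-bit vector op covers all four pieces);
// the same query with ParamAlignment = 8 returns 1.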
373 static unsigned CanMergeParamLoadStoresStartingAt(
374 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
375 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
376
377 // Can't vectorize if param alignment is not sufficient.
378 if (ParamAlignment < AccessSize)
379 return 1;
380 // Can't vectorize if offset is not aligned.
381 if (Offsets[Idx] & (AccessSize - 1))
382 return 1;
383
384 EVT EltVT = ValueVTs[Idx];
385 unsigned EltSize = EltVT.getStoreSize();
386
387 // Element is too large to vectorize.
388 if (EltSize >= AccessSize)
389 return 1;
390
391 unsigned NumElts = AccessSize / EltSize;
392 // Can't vectorize if AccessSize is not a multiple of EltSize.
393 if (AccessSize != EltSize * NumElts)
394 return 1;
395
396 // We don't have enough elements to vectorize.
397 if (Idx + NumElts > ValueVTs.size())
398 return 1;
399
400 // PTX ISA can only deal with 2- and 4-element vector ops.
401 if (NumElts != 4 && NumElts != 2)
402 return 1;
403
404 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
405 // Types do not match.
406 if (ValueVTs[j] != EltVT)
407 return 1;
408
409 // Elements are not contiguous.
410 if (Offsets[j] - Offsets[j - 1] != EltSize)
411 return 1;
412 }
413 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
414 return NumElts;
415}
416
417// Flags for tracking per-element vectorization state of loads/stores
418// of a flattened function parameter or return value.
419 enum ParamVectorizationFlags {
420 PVF_INNER = 0x0, // Middle elements of a vector.
421 PVF_FIRST = 0x1, // First element of the vector.
422 PVF_LAST = 0x2, // Last element of the vector.
423 // Scalar is effectively a 1-element vector.
424 PVF_SCALAR = PVF_FIRST | PVF_LAST,
425 };
426 
427// Computes whether and how we can vectorize the loads/stores of a
428// flattened function parameter or return value.
429//
430// The flattened parameter is represented as the list of ValueVTs and
431// Offsets, and is aligned to ParamAlignment bytes. We return a vector
432// of the same size as ValueVTs indicating how each piece should be
433// loaded/stored (i.e. as a scalar, or as part of a vector
434// load/store).
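// For example (illustrative only): for the {f32, f32, f32, f32} / 16-byte
// aligned case above this returns {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST},
// i.e. a single v4 access; with only 4-byte alignment every element stays
// PVF_SCALAR.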
435 static SmallVector<ParamVectorizationFlags, 16>
436 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
437 const SmallVectorImpl<uint64_t> &Offsets,
438 Align ParamAlignment, bool IsVAArg = false) {
439 // Set vector size to match ValueVTs and mark all elements as
440 // scalars by default.
441 SmallVector<ParamVectorizationFlags, 16> VectorInfo;
442 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
443
444 if (IsVAArg)
445 return VectorInfo;
446
447 // Check what we can vectorize using 128/64/32-bit accesses.
448 for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
449 // Skip elements we've already processed.
450 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
451 for (unsigned AccessSize : {16, 8, 4, 2}) {
452 unsigned NumElts = CanMergeParamLoadStoresStartingAt(
453 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
454 // Mark vectorized elements.
455 switch (NumElts) {
456 default:
457 llvm_unreachable("Unexpected return value");
458 case 1:
459 // Can't vectorize using this size, try next smaller size.
460 continue;
461 case 2:
462 assert(I + 1 < E && "Not enough elements.");
463 VectorInfo[I] = PVF_FIRST;
464 VectorInfo[I + 1] = PVF_LAST;
465 I += 1;
466 break;
467 case 4:
468 assert(I + 3 < E && "Not enough elements.");
469 VectorInfo[I] = PVF_FIRST;
470 VectorInfo[I + 1] = PVF_INNER;
471 VectorInfo[I + 2] = PVF_INNER;
472 VectorInfo[I + 3] = PVF_LAST;
473 I += 3;
474 break;
475 }
476 // Break out of the inner loop because we've already succeeded
477 // using largest possible AccessSize.
478 break;
479 }
480 }
481 return VectorInfo;
482}
483
484 static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT,
485 SDValue Value) {
486 if (Value->getValueType(0) == VT)
487 return Value;
488 return DAG.getNode(ISD::BITCAST, DL, VT, Value);
489}
490
491// NVPTXTargetLowering Constructor.
492 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
493 const NVPTXSubtarget &STI)
494 : TargetLowering(TM), nvTM(&TM), STI(STI) {
495 // Always lower memset, memcpy, and memmove intrinsics to load/store
496 // instructions, rather than generating calls to memset, memcpy, or
497 // memmove.
501
504
505 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
506 // condition branches.
507 setJumpIsExpensive(true);
508
509 // Wide divides are _very_ slow. Try to reduce the width of the divide if
510 // possible.
511 addBypassSlowDiv(64, 32);
512
513 // By default, use the Source scheduling
514 if (sched4reg)
516 else
518
519 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
520 LegalizeAction NoF16Action) {
521 bool IsOpSupported = STI.allowFP16Math();
522 switch (Op) {
523 // Several FP16 instructions are available on sm_80 only.
524 case ISD::FMINNUM:
525 case ISD::FMAXNUM:
528 case ISD::FMAXIMUM:
529 case ISD::FMINIMUM:
530 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
531 break;
532 }
533 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
534 };
535
536 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
537 LegalizeAction NoBF16Action) {
538 bool IsOpSupported = STI.hasBF16Math();
539 switch (Op) {
540 // Several BF16 instructions are available on sm_90 only.
541 case ISD::FADD:
542 case ISD::FMUL:
543 case ISD::FSUB:
544 case ISD::SELECT:
545 case ISD::SELECT_CC:
546 case ISD::SETCC:
547 case ISD::FEXP2:
548 case ISD::FCEIL:
549 case ISD::FFLOOR:
550 case ISD::FNEARBYINT:
551 case ISD::FRINT:
552 case ISD::FROUNDEVEN:
553 case ISD::FTRUNC:
554 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
555 break;
556 // Several BF16 instructions are available on sm_80 only.
557 case ISD::FMINNUM:
558 case ISD::FMAXNUM:
561 case ISD::FMAXIMUM:
562 case ISD::FMINIMUM:
563 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
564 break;
565 }
567 Op, VT, IsOpSupported ? Action : NoBF16Action);
568 };
569
570 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
571 LegalizeAction NoI16x2Action) {
572 bool IsOpSupported = false;
573 // i16x2 instructions are available on sm_90 only
574 switch (Op) {
575 case ISD::ADD:
576 case ISD::SMAX:
577 case ISD::SMIN:
578 case ISD::UMIN:
579 case ISD::UMAX:
580 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
581 break;
582 }
583 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
584 };
585
586 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
587 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
588 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
589 addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
590 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
591 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
592 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
593 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
594 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
595 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
596 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
597 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
598
599 // Conversion to/from FP16/FP16x2 is always legal.
604
606 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
608
609 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
610 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
611
612 // Conversion to/from BF16/BF16x2 is always legal.
617
618 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
619 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
620 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
621 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
622
623 // Conversion to/from i16/i16x2 is always legal.
628
633
634 // Custom conversions to/from v2i8.
636
637 // Only logical ops can be done on v4i8 directly, others must be done
638 // elementwise.
655 MVT::v4i8, Expand);
656
657 // Operations not directly supported by NVPTX.
658 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
659 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
660 MVT::i32, MVT::i64}) {
663 }
664
665 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
666 // For others we will expand to a SHL/SRA pair.
673
680
683
685 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
686 Expand);
687
688 if (STI.hasHWROT32())
690
692
695
698
699 // We want to legalize constant-related memmove and memcpy
700 // intrinsics.
702
703 // Turn FP extload into load/fpextend
704 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
705 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
706 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
707 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
708 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
709 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
710 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
711 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
712 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
713 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
714 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
715 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
716 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
717 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
718 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
719 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
720 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
721 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
722 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
723 // Turn FP truncstore into trunc + store.
724 // FIXME: vector types should also be expanded
725 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
726 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
727 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
728 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
729 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
730
731 // PTX does not support load / store predicate registers
734
735 for (MVT VT : MVT::integer_valuetypes()) {
739 setTruncStoreAction(VT, MVT::i1, Expand);
740 }
741
745 MVT::i1, Expand);
746
747 // expand extload of vector of integers.
749 MVT::v2i8, Expand);
750 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
751
752 // This is legal in NVPTX
757
758 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
760
761 // TRAP can be lowered to PTX trap
762 setOperationAction(ISD::TRAP, MVT::Other, Legal);
763 // DEBUGTRAP can be lowered to PTX brkpt
765
766 // Register custom handling for vector loads/stores
768 if (IsPTXVectorType(VT)) {
772 }
773 }
774
775 // Support varargs.
780
781 // Custom handling for i8 intrinsics
783
784 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
790
793 }
794
795 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
796 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
797 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
798 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
799 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
800 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
801 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
802
803 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
804 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
805 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
806 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
807 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
808 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
809
810 // Other arithmetic and logic ops are unsupported.
814 MVT::v2i16, Expand);
815
820 if (STI.getPTXVersion() >= 43) {
825 }
826
828 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
831
832 // PTX does not directly support SELP of i1, so promote to i32 first
834
835 // PTX cannot multiply two i64s in a single instruction.
838
839 // We have some custom DAG combine patterns for these nodes
843
844 // setcc for f16x2 and bf16x2 needs special handling to prevent
845 // legalizer's attempt to scalarize it due to v2i1 not being legal.
846 if (STI.allowFP16Math() || STI.hasBF16Math())
848
849 // Promote fp16 arithmetic if fp16 hardware isn't available or the
850 // user passed --nvptx-no-fp16-math. The flag is useful because,
851 // although sm_53+ GPUs have some sort of FP16 support in
852 // hardware, only sm_53 and sm_60 have full implementation. Others
853 // only have a token amount of hardware and are likely to run faster
854 // by using fp32 units instead.
855 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
856 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
857 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
858 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
859 // bf16 must be promoted to f32.
860 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
861 if (getOperationAction(Op, MVT::bf16) == Promote)
862 AddPromotedToType(Op, MVT::bf16, MVT::f32);
863 }
864
865 // f16/f16x2 neg was introduced in PTX 60, SM_53.
866 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
867 STI.getPTXVersion() >= 60 &&
868 STI.allowFP16Math();
869 for (const auto &VT : {MVT::f16, MVT::v2f16})
871 IsFP16FP16x2NegAvailable ? Legal : Expand);
872
873 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
874 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
875 // (would be) Library functions.
876
877 // These map to conversion instructions for scalar FP types.
878 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
880 setOperationAction(Op, MVT::f16, Legal);
881 setOperationAction(Op, MVT::f32, Legal);
882 setOperationAction(Op, MVT::f64, Legal);
883 setOperationAction(Op, MVT::v2f16, Expand);
884 setOperationAction(Op, MVT::v2bf16, Expand);
885 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
886 if (getOperationAction(Op, MVT::bf16) == Promote)
887 AddPromotedToType(Op, MVT::bf16, MVT::f32);
888 }
889
890 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
892 }
893 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
894 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
897 }
898 }
899
900 // sm_80 only has conversions between f32 and bf16. Custom lower all other
901 // bf16 conversions.
902 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
903 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
906 VT, Custom);
907 }
910 MVT::bf16, Custom);
911 }
912
919 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
920
921 // 'Expand' implements FCOPYSIGN without calling an external library.
928
929 // These map to corresponding instructions for f32/f64. f16 must be
930 // promoted to f32. v2f16 is expanded to f16, which is then promoted
931 // to f32.
932 for (const auto &Op :
934 setOperationAction(Op, MVT::f16, Promote);
935 setOperationAction(Op, MVT::f32, Legal);
936 setOperationAction(Op, MVT::f64, Legal);
937 setOperationAction(Op, MVT::v2f16, Expand);
938 setOperationAction(Op, MVT::v2bf16, Expand);
939 setOperationAction(Op, MVT::bf16, Promote);
940 AddPromotedToType(Op, MVT::bf16, MVT::f32);
941 }
942
943 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
944 if (STI.getPTXVersion() >= 65) {
945 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
946 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
947 } else {
949 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
950 }
951 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
952 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
953 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
954 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
955
956 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
957 setOperationAction(Op, MVT::f32, Legal);
958 setOperationAction(Op, MVT::f64, Legal);
959 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
960 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
961 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
962 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
963 if (getOperationAction(Op, MVT::bf16) == Promote)
964 AddPromotedToType(Op, MVT::bf16, MVT::f32);
965 }
966 bool SupportsF32MinMaxNaN =
967 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
968 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
969 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
970 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
971 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
972 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
973 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
974 }
975
976 // Custom lowering for inline asm with 128-bit operands
979
980 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
981 // No FPOW or FREM in PTX.
982
983 // Now deduce the information based on the above-mentioned
984 // actions.
986
987 setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
990}
991
992const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
993
994#define MAKE_CASE(V) \
995 case V: \
996 return #V;
997
998 switch ((NVPTXISD::NodeType)Opcode) {
1000 break;
1001
1065 }
1066 return nullptr;
1067
1068#undef MAKE_CASE
1069}
1070
1071 TargetLoweringBase::LegalizeTypeAction
1072 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1073 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1074 VT.getScalarType() == MVT::i1)
1075 return TypeSplitVector;
1076 return TargetLoweringBase::getPreferredVectorAction(VT);
1077 }
1078
1079 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1080 int Enabled, int &ExtraSteps,
1081 bool &UseOneConst,
1082 bool Reciprocal) const {
1083 if (!(Enabled == ReciprocalEstimate::Enabled ||
1084 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1085 return SDValue();
1086
1087 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1088 ExtraSteps = 0;
1089
1090 SDLoc DL(Operand);
1091 EVT VT = Operand.getValueType();
1092 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1093
1094 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1095 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1096 DAG.getConstant(IID, DL, MVT::i32), Operand);
1097 };
1098
1099 // The sqrt and rsqrt refinement processes assume we always start out with an
1100 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1101 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1102 // any refinement, we must return a regular sqrt.
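  // Illustrative summary (the intrinsic names below are the ones used in the
  // calls that follow):
  //   f32, refined or reciprocal -> nvvm_rsqrt_approx_f / nvvm_rsqrt_approx_ftz_f
  //   f32, plain sqrt            -> nvvm_sqrt_approx_f / nvvm_sqrt_approx_ftz_f
  //   f64, refined or reciprocal -> nvvm_rsqrt_approx_d
  //   f64, plain sqrt            -> nvvm_rcp_approx_ftz_d(nvvm_rsqrt_approx_d(x))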
1103 if (Reciprocal || ExtraSteps > 0) {
1104 if (VT == MVT::f32)
1105 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1106 : Intrinsic::nvvm_rsqrt_approx_f);
1107 else if (VT == MVT::f64)
1108 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1109 else
1110 return SDValue();
1111 } else {
1112 if (VT == MVT::f32)
1113 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1114 : Intrinsic::nvvm_sqrt_approx_f);
1115 else {
1116 // There's no sqrt.approx.f64 instruction, so we emit
1117 // reciprocal(rsqrt(x)). This is faster than
1118 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1119 // x * rsqrt(x).)
1120 return DAG.getNode(
1122 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1123 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1124 }
1125 }
1126}
1127
1128SDValue
1129 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1130 SDLoc dl(Op);
1131 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1132 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1133 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1134 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1135}
1136
1137static bool IsTypePassedAsArray(const Type *Ty) {
1138 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1139 Ty->isHalfTy() || Ty->isBFloatTy();
1140}
1141
1142 std::string NVPTXTargetLowering::getPrototype(
1143 const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1144 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1145 std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1146 const CallBase &CB, unsigned UniqueCallSite) const {
1147 auto PtrVT = getPointerTy(DL);
1148
1149 bool isABI = (STI.getSmVersion() >= 20);
1150 assert(isABI && "Non-ABI compilation is not supported");
1151 if (!isABI)
1152 return "";
1153
1154 std::string Prototype;
1155 raw_string_ostream O(Prototype);
1156 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1157
1158 if (retTy->getTypeID() == Type::VoidTyID) {
1159 O << "()";
1160 } else {
1161 O << "(";
1162 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1163 !IsTypePassedAsArray(retTy)) {
1164 unsigned size = 0;
1165 if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1166 size = ITy->getBitWidth();
1167 } else {
1168 assert(retTy->isFloatingPointTy() &&
1169 "Floating point type expected here");
1170 size = retTy->getPrimitiveSizeInBits();
1171 }
1172 // PTX ABI requires all scalar return values to be at least 32
1173 // bits in size. fp16 normally uses .b16 as its storage type in
1174 // PTX, so its size must be adjusted here, too.
1175 size = promoteScalarArgumentSize(size);
1176 
1177 O << ".param .b" << size << " _";
1178 } else if (isa<PointerType>(retTy)) {
1179 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1180 } else if (IsTypePassedAsArray(retTy)) {
1181 O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1182 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1183 } else {
1184 llvm_unreachable("Unknown return type");
1185 }
1186 O << ") ";
1187 }
1188 O << "_ (";
1189
1190 bool first = true;
1191
1192 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1193 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1194 Type *Ty = Args[i].Ty;
1195 if (!first) {
1196 O << ", ";
1197 }
1198 first = false;
1199
1200 if (!Outs[OIdx].Flags.isByVal()) {
1201 if (IsTypePassedAsArray(Ty)) {
1202 Align ParamAlign =
1203 getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);
1204 O << ".param .align " << ParamAlign.value() << " .b8 ";
1205 O << "_";
1206 O << "[" << DL.getTypeAllocSize(Ty) << "]";
1207 // update the index for Outs
1208 SmallVector<EVT, 16> vtparts;
1209 ComputeValueVTs(*this, DL, Ty, vtparts);
1210 if (unsigned len = vtparts.size())
1211 OIdx += len - 1;
1212 continue;
1213 }
1214 // i8 types in IR will be i16 types in SDAG
1215 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1216 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1217 "type mismatch between callee prototype and arguments");
1218 // scalar type
1219 unsigned sz = 0;
1220 if (isa<IntegerType>(Ty)) {
1221 sz = cast<IntegerType>(Ty)->getBitWidth();
1222 sz = promoteScalarArgumentSize(sz);
1223 } else if (isa<PointerType>(Ty)) {
1224 sz = PtrVT.getSizeInBits();
1225 } else {
1226 sz = Ty->getPrimitiveSizeInBits();
1227 }
1228 O << ".param .b" << sz << " ";
1229 O << "_";
1230 continue;
1231 }
1232
1233 // Indirect calls need strict ABI alignment so we disable optimizations by
1234 // not providing a function to optimize.
1235 Type *ETy = Args[i].IndirectType;
1236 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1237 Align ParamByValAlign =
1238 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1239
1240 O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1241 O << "_";
1242 O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1243 }
1244
1245 if (VAInfo)
1246 O << (first ? "" : ",") << " .param .align " << VAInfo->second
1247 << " .b8 _[]\n";
1248 O << ")";
1250 O << " .noreturn";
1251 O << ";";
1252
1253 return Prototype;
1254}
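// Illustrative example (not emitted verbatim anywhere in this file): for an
// indirect call to a function of IR type `float (i32, ptr)` on a 64-bit
// target, getPrototype() produces a string along the lines of
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// where the call-site number, 1 here, comes from UniqueCallSite.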
1255
1257 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1258 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1259}
1260
1261Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1262 unsigned Idx,
1263 const DataLayout &DL) const {
1264 if (!CB) {
1265 // CallSite is zero, fallback to ABI type alignment
1266 return DL.getABITypeAlign(Ty);
1267 }
1268
1269 const Function *DirectCallee = CB->getCalledFunction();
1270
1271 if (!DirectCallee) {
1272 // We don't have a direct function symbol, but that may be because of
1273 // constant cast instructions in the call.
1274
1275 // With bitcast'd call targets, the instruction will be the call
1276 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1277 // Check if we have call alignment metadata
1278 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1279 return StackAlign.value();
1280 }
1281 DirectCallee = getMaybeBitcastedCallee(CB);
1282 }
1283
1284 // Check for function alignment information if we found that the
1285 // ultimate target is a Function
1286 if (DirectCallee)
1287 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1288
1289 // Call is indirect, fall back to the ABI type alignment
1290 return DL.getABITypeAlign(Ty);
1291}
1292
1293static bool adjustElementType(EVT &ElementType) {
1294 switch (ElementType.getSimpleVT().SimpleTy) {
1295 default:
1296 return false;
1297 case MVT::f16:
1298 case MVT::bf16:
1299 ElementType = MVT::i16;
1300 return true;
1301 case MVT::f32:
1302 case MVT::v2f16:
1303 case MVT::v2bf16:
1304 ElementType = MVT::i32;
1305 return true;
1306 case MVT::f64:
1307 ElementType = MVT::i64;
1308 return true;
1309 }
1310}
1311
1312// Use byte-store when the param address of the argument value is unaligned.
1313 // This may happen when the argument value is a field of a packed structure.
1314//
1315// This is called in LowerCall() when passing the param values.
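// For example (illustrative only): storing an i32 argument at a param offset
// that is only 1-byte aligned becomes four st.param.b8 stores, each writing
// (Value >> (8 * i)) at Offset + i.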
1316 static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
1317 uint64_t Offset, EVT ElementType,
1318 SDValue StVal, SDValue &InGlue,
1319 unsigned ArgID, const SDLoc &dl) {
1320 // Bit logic only works on integer types
1321 if (adjustElementType(ElementType))
1322 StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
1323
1324 // Store each byte
1325 SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1326 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1327 // Shift the byte to the last byte position
1328 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1329 DAG.getConstant(i * 8, dl, MVT::i32));
1330 SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1331 DAG.getConstant(Offset + i, dl, MVT::i32),
1332 ShiftVal, InGlue};
1333 // Trunc store only the last byte by using
1334 // st.param.b8
1335 // The register type can be larger than b8.
1336 Chain = DAG.getMemIntrinsicNode(
1337 NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1339 InGlue = Chain.getValue(1);
1340 }
1341 return Chain;
1342}
1343
1344 // Use byte-load when the param address of the returned value is unaligned.
1345// This may happen when the returned value is a field of a packed structure.
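// For example (illustrative only): an unaligned i32 return piece is
// re-assembled from four ld.param.b8 loads; each byte is zero-extended,
// masked to 8 bits, shifted into place and OR'd into the result.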
1346static SDValue
1347 LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
1348 EVT ElementType, SDValue &InGlue,
1349 SmallVectorImpl<SDValue> &TempProxyRegOps,
1350 const SDLoc &dl) {
1351 // Bit logic only works on integer types
1352 EVT MergedType = ElementType;
1353 adjustElementType(MergedType);
1354
1355 // Load each byte and construct the whole value. Initial value to 0
1356 SDValue RetVal = DAG.getConstant(0, dl, MergedType);
1357 // LoadParamMemI8 loads into i16 register only
1358 SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
1359 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1360 SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1361 DAG.getConstant(Offset + i, dl, MVT::i32),
1362 InGlue};
1363 // This will be selected to LoadParamMemI8
1364 SDValue LdVal =
1365 DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
1366 MVT::i8, MachinePointerInfo(), Align(1));
1367 SDValue TmpLdVal = LdVal.getValue(0);
1368 Chain = LdVal.getValue(1);
1369 InGlue = LdVal.getValue(2);
1370
1371 TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
1372 TmpLdVal.getSimpleValueType(), TmpLdVal);
1373 TempProxyRegOps.push_back(TmpLdVal);
1374
1375 SDValue CMask = DAG.getConstant(255, dl, MergedType);
1376 SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
1377 // Need to extend the i16 register to the whole width.
1378 TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
1379 // Mask off the high bits. Leave only the lower 8 bits.
1380 // Do this because we are using loadparam.b8.
1381 TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
1382 // Shift and merge
1383 TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
1384 RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
1385 }
1386 if (ElementType != MergedType)
1387 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
1388
1389 return RetVal;
1390}
1391
1392 static bool shouldConvertToIndirectCall(const CallBase *CB,
1393 const GlobalAddressSDNode *Func) {
1394 if (!Func)
1395 return false;
1396 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1397 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1398 return false;
1399}
1400
1401 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1402 SmallVectorImpl<SDValue> &InVals) const {
1403
1404 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1406 "Support for variadic functions (unsized array parameter) introduced "
1407 "in PTX ISA version 6.0 and requires target sm_30.");
1408
1409 SelectionDAG &DAG = CLI.DAG;
1410 SDLoc dl = CLI.DL;
1412 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1414 SDValue Chain = CLI.Chain;
1415 SDValue Callee = CLI.Callee;
1416 bool &isTailCall = CLI.IsTailCall;
1417 ArgListTy &Args = CLI.getArgs();
1418 Type *RetTy = CLI.RetTy;
1419 const CallBase *CB = CLI.CB;
1420 const DataLayout &DL = DAG.getDataLayout();
1421
1422 bool isABI = (STI.getSmVersion() >= 20);
1423 assert(isABI && "Non-ABI compilation is not supported");
1424 if (!isABI)
1425 return Chain;
1426
1427 // Variadic arguments.
1428 //
1429 // Normally, for each argument, we declare a param scalar or a param
1430 // byte array in the .param space, and store the argument value to that
1431 // param scalar or array starting at offset 0.
1432 //
1433 // In the case of the first variadic argument, we declare a vararg byte array
1434 // with size 0. The exact size of this array isn't known at this point, so
1435 // it'll be patched later. All the variadic arguments will be stored to this
1436 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1437 // initially set to 0, so it can be used for non-variadic arguments (which use
1438 // 0 offset) to simplify the code.
1439 //
1440 // After all variadic arguments have been processed, 'VAOffset' holds the
1441 // size of the vararg byte array.
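// Illustrative example (not from the original source): for a call like
// printf(fmt, x) with one fixed argument, the variadic part is declared as a
// single byte-array param, roughly
//   .param .align <MaxRequiredAlignment> .b8 param1[<VAOffset>];
// whose size is patched in after the argument loop below has run.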
1442
1443 SDValue VADeclareParam; // vararg byte array
1444 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1445 unsigned VAOffset = 0; // current offset in the param array
1446
1447 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1448 SDValue TempChain = Chain;
1449 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1450 SDValue InGlue = Chain.getValue(1);
1451
1452 unsigned ParamCount = 0;
1453 // Args.size() and Outs.size() need not match.
1454 // Outs.size() will be larger
1455 // * if there is an aggregate argument with multiple fields (each field
1456 // showing up separately in Outs)
1457 // * if there is a vector argument with more than typical vector-length
1458 // elements (generally if more than 4) where each vector element is
1459 // individually present in Outs.
1460 // So a different index should be used for indexing into Outs/OutVals.
1461 // See similar issue in LowerFormalArguments.
1462 unsigned OIdx = 0;
1463 // Declare the .param slots or registers needed to pass values
1464 // to the function.
1465 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1466 EVT VT = Outs[OIdx].VT;
1467 Type *Ty = Args[i].Ty;
1468 bool IsVAArg = (i >= CLI.NumFixedArgs);
1469 bool IsByVal = Outs[OIdx].Flags.isByVal();
1470
1471 SmallVector<EVT, 16> VTs;
1472 SmallVector<uint64_t, 16> Offsets;
1473 
1474 assert((!IsByVal || Args[i].IndirectType) &&
1475 "byval arg must have indirect type");
1476 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1477 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1478
1479 Align ArgAlign;
1480 if (IsByVal) {
1481 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1482 // so we don't need to worry whether it's naturally aligned or not.
1483 // See TargetLowering::LowerCallTo().
1484 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1485 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1486 InitialAlign, DL);
1487 if (IsVAArg)
1488 VAOffset = alignTo(VAOffset, ArgAlign);
1489 } else {
1490 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1491 }
1492
1493 unsigned TypeSize =
1494 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1495 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1496
1497 bool NeedAlign; // Does argument declaration specify alignment?
1498 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1499 if (IsVAArg) {
1500 if (ParamCount == FirstVAArg) {
1501 SDValue DeclareParamOps[] = {
1502 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1503 DAG.getConstant(ParamCount, dl, MVT::i32),
1504 DAG.getConstant(1, dl, MVT::i32), InGlue};
1505 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1506 DeclareParamVTs, DeclareParamOps);
1507 }
1508 NeedAlign = PassAsArray;
1509 } else if (PassAsArray) {
1510 // declare .param .align <align> .b8 .param<n>[<size>];
1511 SDValue DeclareParamOps[] = {
1512 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1513 DAG.getConstant(ParamCount, dl, MVT::i32),
1514 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1515 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1516 DeclareParamOps);
1517 NeedAlign = true;
1518 } else {
1519 // declare .param .b<size> .param<n>;
1520 if (VT.isInteger() || VT.isFloatingPoint()) {
1521 // PTX ABI requires integral types to be at least 32 bits in
1522 // size. FP16 is loaded/stored using i16, so it's handled
1523 // here as well.
1525 }
1526 SDValue DeclareScalarParamOps[] = {
1527 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1528 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1529 DAG.getConstant(0, dl, MVT::i32), InGlue};
1530 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1531 DeclareScalarParamOps);
1532 NeedAlign = false;
1533 }
1534 InGlue = Chain.getValue(1);
1535
1536 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1537 // than 32-bits are sign extended or zero extended, depending on
1538 // whether they are signed or unsigned types. This case applies
1539 // only to scalar parameters and not to aggregate values.
1540 bool ExtendIntegerParam =
1541 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1542
1543 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1544 SmallVector<SDValue, 6> StoreOperands;
1545 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1546 EVT EltVT = VTs[j];
1547 int CurOffset = Offsets[j];
1548 MaybeAlign PartAlign;
1549 if (NeedAlign)
1550 PartAlign = commonAlignment(ArgAlign, CurOffset);
1551
1552 SDValue StVal = OutVals[OIdx];
1553
1554 MVT PromotedVT;
1555 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1556 EltVT = EVT(PromotedVT);
1557 }
1558 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1560 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1561 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1562 }
1563
1564 if (IsByVal) {
1565 auto PtrVT = getPointerTy(DL);
1566 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1567 DAG.getConstant(CurOffset, dl, PtrVT));
1568 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1569 PartAlign);
1570 } else if (ExtendIntegerParam) {
1571 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1572 // zext/sext to i32
1573 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1574 : ISD::ZERO_EXTEND,
1575 dl, MVT::i32, StVal);
1576 }
1577
1578 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1579 // Use 16-bit registers for small stores as it's the
1580 // smallest general purpose register size supported by NVPTX.
1581 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1582 }
1583
1584 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1585 // scalar store. In such cases, fall back to byte stores.
1586 if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1587 PartAlign.value() <
1588 DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1589 assert(StoreOperands.empty() && "Unfinished preceding store.");
1590 Chain = LowerUnalignedStoreParam(
1591 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1592 StVal, InGlue, ParamCount, dl);
1593
1594 // LowerUnalignedStoreParam took care of inserting the necessary nodes
1595 // into the SDAG, so just move on to the next element.
1596 if (!IsByVal)
1597 ++OIdx;
1598 continue;
1599 }
1600
1601 // New store.
1602 if (VectorInfo[j] & PVF_FIRST) {
1603 assert(StoreOperands.empty() && "Unfinished preceding store.");
1604 StoreOperands.push_back(Chain);
1605 StoreOperands.push_back(
1606 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1607
1608 StoreOperands.push_back(DAG.getConstant(
1609 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1610 dl, MVT::i32));
1611 }
1612
1613 // Record the value to store.
1614 StoreOperands.push_back(StVal);
1615
1616 if (VectorInfo[j] & PVF_LAST) {
1617 unsigned NumElts = StoreOperands.size() - 3;
1618 NVPTXISD::NodeType Op;
1619 switch (NumElts) {
1620 case 1:
1621 Op = NVPTXISD::StoreParam;
1622 break;
1623 case 2:
1624 Op = NVPTXISD::StoreParamV2;
1625 break;
1626 case 4:
1627 Op = NVPTXISD::StoreParamV4;
1628 break;
1629 default:
1630 llvm_unreachable("Invalid vector info.");
1631 }
1632
1633 StoreOperands.push_back(InGlue);
1634
1635 // Adjust type of the store op if we've extended the scalar
1636 // return value.
1637 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1638
1639 Chain = DAG.getMemIntrinsicNode(
1640 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1641 TheStoreType, MachinePointerInfo(), PartAlign,
1643 InGlue = Chain.getValue(1);
1644
1645 // Cleanup.
1646 StoreOperands.clear();
1647
1648 // TODO: We may need to support vector types that can be passed
1649 // as scalars in variadic arguments.
1650 if (!IsByVal && IsVAArg) {
1651 assert(NumElts == 1 &&
1652 "Vectorization is expected to be disabled for variadics.");
1653 VAOffset += DL.getTypeAllocSize(
1654 TheStoreType.getTypeForEVT(*DAG.getContext()));
1655 }
1656 }
1657 if (!IsByVal)
1658 ++OIdx;
1659 }
1660 assert(StoreOperands.empty() && "Unfinished parameter store.");
1661 if (!IsByVal && VTs.size() > 0)
1662 --OIdx;
1663 ++ParamCount;
1664 if (IsByVal && IsVAArg)
1665 VAOffset += TypeSize;
1666 }
1667
1668 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1669 MaybeAlign retAlignment = std::nullopt;
1670
1671 // Handle Result
1672 if (Ins.size() > 0) {
1673 SmallVector<EVT, 16> resvtparts;
1674 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1675
1676 // Declare
1677 // .param .align N .b8 retval0[<size-in-bytes>], or
1678 // .param .b<size-in-bits> retval0
1679 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1680 if (!IsTypePassedAsArray(RetTy)) {
1681 resultsz = promoteScalarArgumentSize(resultsz);
1682 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1683 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1684 DAG.getConstant(resultsz, dl, MVT::i32),
1685 DAG.getConstant(0, dl, MVT::i32), InGlue };
1686 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1687 DeclareRetOps);
1688 InGlue = Chain.getValue(1);
1689 } else {
1690 retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1691 assert(retAlignment && "retAlignment is guaranteed to be set");
1692 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1693 SDValue DeclareRetOps[] = {
1694 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1695 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1696 DAG.getConstant(0, dl, MVT::i32), InGlue};
1697 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1698 DeclareRetOps);
1699 InGlue = Chain.getValue(1);
1700 }
1701 }
1702
1703 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1704 // Set the size of the vararg param byte array if the callee is a variadic
1705 // function and the variadic part is not empty.
1706 if (HasVAArgs) {
1707 SDValue DeclareParamOps[] = {
1708 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1709 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1710 VADeclareParam.getOperand(4)};
1711 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1712 VADeclareParam->getVTList(), DeclareParamOps);
1713 }
1714
1715 // If the type of the callsite does not match that of the function, convert
1716 // the callsite to an indirect call.
1717 bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1718
1719 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1720 // between them we must rely on the call site value which is valid for
1721 // indirect calls but is always null for libcalls.
1722 bool isIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1723
1724 if (isa<ExternalSymbolSDNode>(Callee)) {
1725 Function* CalleeFunc = nullptr;
1726
1727 // Try to find the callee in the current module.
1728 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1729 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1730
1731 // Set the "libcall callee" attribute to indicate that the function
1732 // must always have a declaration.
1733 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1734 }
1735
1736 if (isIndirectCall) {
1737 // This is the indirect function call case: PTX requires a prototype of
1738 // the form
1739 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1740 // to be emitted, and the label has to be used as the last arg of the call
1741 // instruction.
1742 // The prototype is embedded in a string and put as the operand for a
1743 // CallPrototype SDNode which will print out as the value of the string.
1744 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1745 std::string Proto = getPrototype(
1746 DL, RetTy, Args, Outs, retAlignment,
1747 HasVAArgs
1748 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1749 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1750 : std::nullopt,
1751 *CB, UniqueCallSite);
1752 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1753 SDValue ProtoOps[] = {
1754 Chain,
1755 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1756 InGlue,
1757 };
1758 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1759 InGlue = Chain.getValue(1);
1760 }
1761 // Op to just print "call"
1762 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1763 SDValue PrintCallOps[] = {
1764 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1765 };
1766 // We model convergent calls as separate opcodes.
1768 if (CLI.IsConvergent)
1771 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1772 InGlue = Chain.getValue(1);
1773
1774 if (ConvertToIndirectCall) {
1775 // Copy the function ptr to a ptx register and use the register to call the
1776 // function.
1777 EVT DestVT = Callee.getValueType();
1779 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
1780 unsigned DestReg =
1781 RegInfo.createVirtualRegister(TLI.getRegClassFor(DestVT.getSimpleVT()));
1782 auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee);
1783 Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT);
1784 }
1785
1786 // Ops to print out the function name
1787 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1788 SDValue CallVoidOps[] = { Chain, Callee, InGlue };
1789 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1790 InGlue = Chain.getValue(1);
1791
1792 // Ops to print out the param list
1793 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1794 SDValue CallArgBeginOps[] = { Chain, InGlue };
1795 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1796 CallArgBeginOps);
1797 InGlue = Chain.getValue(1);
1798
1799 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
1800 ++i) {
1801 unsigned opcode;
1802 if (i == (e - 1))
1803 opcode = NVPTXISD::LastCallArg;
1804 else
1805 opcode = NVPTXISD::CallArg;
1806 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1807 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1808 DAG.getConstant(i, dl, MVT::i32), InGlue };
1809 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1810 InGlue = Chain.getValue(1);
1811 }
1812 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1813 SDValue CallArgEndOps[] = { Chain,
1814 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1815 InGlue };
1816 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1817 InGlue = Chain.getValue(1);
1818
1819 if (isIndirectCall) {
1820 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1821 SDValue PrototypeOps[] = {
1822 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
1823 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1824 InGlue = Chain.getValue(1);
1825 }
1826
1827 SmallVector<SDValue, 16> ProxyRegOps;
1828 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
1829 // An item of the vector is filled if the element does not need a ProxyReg
1830 // operation on it and should be added to InVals as is. ProxyRegOps and
1831 // ProxyRegTruncates contain empty/none items at the same index.
1832 SmallVector<SDValue, 16> RetElts;
1833 // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
1834 // to consume the values of `LoadParam`s; they are replaced later, once
1835 // `CALLSEQ_END` has been added.
1836 SmallVector<SDValue, 16> TempProxyRegOps;
1837
1838 // Generate loads from param memory/moves from registers for result
1839 if (Ins.size() > 0) {
1840 SmallVector<EVT, 16> VTs;
1841 SmallVector<uint64_t, 16> Offsets;
1842 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1843 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1844
1845 Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1846 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1847
1848 SmallVector<EVT, 6> LoadVTs;
1849 int VecIdx = -1; // Index of the first element of the vector.
1850
1851 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1852 // 32-bits are sign extended or zero extended, depending on whether
1853 // they are signed or unsigned types.
1854 bool ExtendIntegerRetVal =
1855 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1856
1857 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1858 bool needTruncate = false;
1859 EVT TheLoadType = VTs[i];
1860 EVT EltType = Ins[i].VT;
1861 Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
1862 MVT PromotedVT;
1863
1864 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
1865 TheLoadType = EVT(PromotedVT);
1866 EltType = EVT(PromotedVT);
1867 needTruncate = true;
1868 }
1869
1870 if (ExtendIntegerRetVal) {
1871 TheLoadType = MVT::i32;
1872 EltType = MVT::i32;
1873 needTruncate = true;
1874 } else if (TheLoadType.getSizeInBits() < 16) {
1875 if (VTs[i].isInteger())
1876 needTruncate = true;
1877 EltType = MVT::i16;
1878 }
1879
1880 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1881 // scalar load. In such cases, fall back to byte loads.
1882 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
1883 EltAlign < DL.getABITypeAlign(
1884 TheLoadType.getTypeForEVT(*DAG.getContext()))) {
1885 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1887 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
1888 ProxyRegOps.push_back(SDValue());
1889 ProxyRegTruncates.push_back(std::optional<MVT>());
1890 RetElts.resize(i);
1891 RetElts.push_back(Ret);
1892
1893 continue;
1894 }
1895
1896 // Record index of the very first element of the vector.
1897 if (VectorInfo[i] & PVF_FIRST) {
1898 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1899 VecIdx = i;
1900 }
1901
1902 LoadVTs.push_back(EltType);
1903
1904 if (VectorInfo[i] & PVF_LAST) {
1905 unsigned NumElts = LoadVTs.size();
1906 LoadVTs.push_back(MVT::Other);
1907 LoadVTs.push_back(MVT::Glue);
1909 switch (NumElts) {
1910 case 1:
1912 break;
1913 case 2:
1915 break;
1916 case 4:
1918 break;
1919 default:
1920 llvm_unreachable("Invalid vector info.");
1921 }
1922
1923 SDValue LoadOperands[] = {
1924 Chain, DAG.getConstant(1, dl, MVT::i32),
1925 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
1926 SDValue RetVal = DAG.getMemIntrinsicNode(
1927 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1928 MachinePointerInfo(), EltAlign,
1930
1931 for (unsigned j = 0; j < NumElts; ++j) {
1932 ProxyRegOps.push_back(RetVal.getValue(j));
1933
1934 if (needTruncate)
1935 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
1936 else
1937 ProxyRegTruncates.push_back(std::optional<MVT>());
1938 }
1939
1940 Chain = RetVal.getValue(NumElts);
1941 InGlue = RetVal.getValue(NumElts + 1);
1942
1943 // Cleanup
1944 VecIdx = -1;
1945 LoadVTs.clear();
1946 }
1947 }
1948 }
1949
1950 Chain =
1951 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
1952 InGlue = Chain.getValue(1);
1953
1954 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1955 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1956 // dangling.
1957 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1958 if (i < RetElts.size() && RetElts[i]) {
1959 InVals.push_back(RetElts[i]);
1960 continue;
1961 }
1962
1963 SDValue Ret = DAG.getNode(
1965 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1966 { Chain, ProxyRegOps[i], InGlue }
1967 );
1968
1969 Chain = Ret.getValue(1);
1970 InGlue = Ret.getValue(2);
1971
1972 if (ProxyRegTruncates[i]) {
1973 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
1974 }
1975
1976 InVals.push_back(Ret);
1977 }
1978
1979 for (SDValue &T : TempProxyRegOps) {
1980 SDValue Repl = DAG.getNode(
1982 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
1983 {Chain, T.getOperand(0), InGlue});
1984 DAG.ReplaceAllUsesWith(T, Repl);
1985 DAG.RemoveDeadNode(T.getNode());
1986
1987 Chain = Repl.getValue(1);
1988 InGlue = Repl.getValue(2);
1989 }
1990
1991 // set isTailCall to false for now, until we figure out how to express
1992 // tail call optimization in PTX
1993 isTailCall = false;
1994 return Chain;
1995}
1996
1998 SelectionDAG &DAG) const {
1999
2000 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2001 const Function &Fn = DAG.getMachineFunction().getFunction();
2002
2003 DiagnosticInfoUnsupported NoDynamicAlloca(
2004 Fn,
2005 "Support for dynamic alloca requires PTX ISA version >= 7.3 and target "
2006 ">= sm_52.",
2007 SDLoc(Op).getDebugLoc());
2008 DAG.getContext()->diagnose(NoDynamicAlloca);
2009 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2010 Op.getOperand(0)};
2011 return DAG.getMergeValues(Ops, SDLoc());
2012 }
2013
2014 SDValue Chain = Op.getOperand(0);
2015 SDValue Size = Op.getOperand(1);
2016 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2017 SDLoc DL(Op.getNode());
2018
2019 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
2020 MVT ValueSizeTy = nvTM->is64Bit() ? MVT::i64 : MVT::i32;
2021
2022 SDValue AllocOps[] = {Chain, DAG.getZExtOrTrunc(Size, DL, ValueSizeTy),
2023 DAG.getTargetConstant(Align, DL, MVT::i32)};
2024 EVT RetTypes[] = {ValueSizeTy, MVT::Other};
2025 return DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, RetTypes, AllocOps);
2026}
2027
2029 SelectionDAG &DAG) const {
2030 SDLoc DL(Op.getNode());
2031 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2032 const Function &Fn = DAG.getMachineFunction().getFunction();
2033
2034 DiagnosticInfoUnsupported NoStackRestore(
2035 Fn,
2036 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
2037 ">= sm_52.",
2038 DL.getDebugLoc());
2039 DAG.getContext()->diagnose(NoStackRestore);
2040 return Op.getOperand(0);
2041 }
2042
2043 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
2044 SDValue Chain = Op.getOperand(0);
2045 SDValue Ptr = Op.getOperand(1);
2048 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
2049}
2050
2052 SelectionDAG &DAG) const {
2053 SDLoc DL(Op.getNode());
2054 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2055 const Function &Fn = DAG.getMachineFunction().getFunction();
2056
2057 DiagnosticInfoUnsupported NoStackSave(
2058 Fn,
2059 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
2060 "sm_52.",
2061 DL.getDebugLoc());
2062 DAG.getContext()->diagnose(NoStackSave);
2063 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
2064 return DAG.getMergeValues(Ops, DL);
2065 }
2066
2067 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
2068 SDValue Chain = Op.getOperand(0);
2069 SDValue SS =
2070 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
2071 SDValue ASC = DAG.getAddrSpaceCast(
2072 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
2073 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
2074}
2075
2076// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2077// (see LegalizeDAG.cpp). This is slow and uses local memory.
2078// We use extract/insert/build_vector just as LegalizeOp() did in LLVM 2.5.
2079SDValue
2080NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2081 SDNode *Node = Op.getNode();
2082 SDLoc dl(Node);
2084 unsigned NumOperands = Node->getNumOperands();
2085 for (unsigned i = 0; i < NumOperands; ++i) {
2086 SDValue SubOp = Node->getOperand(i);
2087 EVT VVT = SubOp.getNode()->getValueType(0);
2088 EVT EltVT = VVT.getVectorElementType();
2089 unsigned NumSubElem = VVT.getVectorNumElements();
2090 for (unsigned j = 0; j < NumSubElem; ++j) {
2091 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2092 DAG.getIntPtrConstant(j, dl)));
2093 }
2094 }
2095 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2096}
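
A standalone sketch (plain C++, helper name illustrative, independent of the SelectionDAG API) of the flattening performed by the loop above: every element of every operand is pulled out into one flat list, and the result vector is rebuilt from it.

#include <cstdint>
#include <vector>

// Flatten the operands of a concat into a single element list, mirroring the
// extract-then-build_vector loop above; i16 elements are just an example.
std::vector<int16_t>
concatVectors(const std::vector<std::vector<int16_t>> &Operands) {
  std::vector<int16_t> Elements;
  for (const std::vector<int16_t> &Sub : Operands)
    for (int16_t Element : Sub)
      Elements.push_back(Element);
  return Elements;
}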
2097
2098SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2099 // Handle bitcasting from v2i8 without hitting the default promotion
2100 // strategy which goes through stack memory.
2101 EVT FromVT = Op->getOperand(0)->getValueType(0);
2102 if (FromVT != MVT::v2i8) {
2103 return Op;
2104 }
2105
2106 // Pack vector elements into i16 and bitcast to final type
2107 SDLoc DL(Op);
2108 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2109 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2110 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2111 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2112 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2113 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2114 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2115 SDValue AsInt = DAG.getNode(
2116 ISD::OR, DL, MVT::i16,
2117 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2118 EVT ToVT = Op->getValueType(0);
2119 return MaybeBitcast(DAG, DL, ToVT, AsInt);
2120}
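
A standalone sketch of the packing step above (plain C++, helper name illustrative): the two i8 lanes are zero-extended, the high lane is shifted left by 8, and the results are OR-ed into one i16 that can then be reinterpreted as the destination type.

#include <cstdint>

// Pack a <2 x i8> value into one 16-bit word, low lane in the low byte,
// mirroring the zext/shl/or sequence built above.
uint16_t packV2I8(uint8_t Lane0, uint8_t Lane1) {
  return static_cast<uint16_t>(Lane0 | (static_cast<uint16_t>(Lane1) << 8));
}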
2121
2122// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2123// would get lowered as two constant loads and vector-packing move.
2124// Instead we want just a constant move:
2125// mov.b32 %r2, 0x40003C00
2126SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2127 SelectionDAG &DAG) const {
2128 EVT VT = Op->getValueType(0);
2129 if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2130 return Op;
2131 SDLoc DL(Op);
2132
2133 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2134 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2135 isa<ConstantFPSDNode>(Operand);
2136 })) {
2137 if (VT != MVT::v4i8)
2138 return Op;
2139 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2140 // to optimize calculation of constant parts.
2141 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2142 uint64_t SelectionValue) -> SDValue {
2143 SDValue L = Left;
2144 SDValue R = Right;
2145 if (Cast) {
2146 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2147 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2148 }
2149 return DAG.getNode(
2150 NVPTXISD::PRMT, DL, MVT::v4i8,
2151 {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32),
2152 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
2153 };
2154 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2155 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2156 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2157 return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210);
2158 }
2159
2160 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2161 auto GetOperand = [](SDValue Op, int N) -> APInt {
2162 const SDValue &Operand = Op->getOperand(N);
2163 EVT VT = Op->getValueType(0);
2164 if (Operand->isUndef())
2165 return APInt(32, 0);
2166 APInt Value;
2167 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2168 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2169 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2170 Value = Operand->getAsAPIntVal();
2171 else
2172 llvm_unreachable("Unsupported type");
2173 // i8 values are carried around as i16, so we need to zero out upper bits,
2174 // so they do not get in the way of combining individual byte values
2175 if (VT == MVT::v4i8)
2176 Value = Value.trunc(8);
2177 return Value.zext(32);
2178 };
2179 APInt Value;
2180 if (Isv2x16VT(VT)) {
2181 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2182 } else if (VT == MVT::v4i8) {
2183 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2184 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2185 } else {
2186 llvm_unreachable("Unsupported type");
2187 }
2188 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2189 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2190}
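
A sketch of the constant folding above (helper name illustrative): each lane's bit pattern is widened and shifted into place, so <half 1.0, half 2.0> (lane encodings 0x3C00 and 0x4000) folds to the single 0x40003C00 immediate shown in the comment before this function.

#include <cstdint>

// Pack two 16-bit lane encodings into one .b32 immediate, low lane first.
uint32_t packConstV2x16(uint16_t Lane0, uint16_t Lane1) {
  return static_cast<uint32_t>(Lane0) | (static_cast<uint32_t>(Lane1) << 16);
}

// Example: packConstV2x16(0x3C00 /* half 1.0 */, 0x4000 /* half 2.0 */)
// yields 0x40003C00.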
2191
2192SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2193 SelectionDAG &DAG) const {
2194 SDValue Index = Op->getOperand(1);
2195 SDValue Vector = Op->getOperand(0);
2196 SDLoc DL(Op);
2197 EVT VectorVT = Vector.getValueType();
2198
2199 if (VectorVT == MVT::v4i8) {
2200 SDValue BFE =
2201 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2202 {Vector,
2203 DAG.getNode(ISD::MUL, DL, MVT::i32,
2204 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2205 DAG.getConstant(8, DL, MVT::i32)),
2206 DAG.getConstant(8, DL, MVT::i32)});
2207 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2208 }
2209
2210 // Constant index will be matched by tablegen.
2211 if (isa<ConstantSDNode>(Index.getNode()))
2212 return Op;
2213
2214 // Extract individual elements and select one of them.
2215 assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2216 EVT EltVT = VectorVT.getVectorElementType();
2217
2218 SDLoc dl(Op.getNode());
2219 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2220 DAG.getIntPtrConstant(0, dl));
2221 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2222 DAG.getIntPtrConstant(1, dl));
2223 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2225}
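
For the v4i8 path, BFE extracts an 8-bit field starting at bit Index * 8. A scalar sketch of that extraction (in-range indices only; PTX's handling of out-of-range fields is not modeled here):

#include <cstdint>

// Extract byte `Index` (0..3) from a v4i8 value packed in a 32-bit register,
// mirroring BFE with start = Index * 8 and length = 8.
uint8_t extractV4I8(uint32_t Vec, unsigned Index) {
  return static_cast<uint8_t>((Vec >> (Index * 8)) & 0xFFu);
}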
2226
2227SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2228 SelectionDAG &DAG) const {
2229 SDValue Vector = Op->getOperand(0);
2230 EVT VectorVT = Vector.getValueType();
2231
2232 if (VectorVT != MVT::v4i8)
2233 return Op;
2234 SDLoc DL(Op);
2235 SDValue Value = Op->getOperand(1);
2236 if (Value->isUndef())
2237 return Vector;
2238
2239 SDValue Index = Op->getOperand(2);
2240
2241 SDValue BFI =
2242 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2243 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2244 DAG.getNode(ISD::MUL, DL, MVT::i32,
2245 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2246 DAG.getConstant(8, DL, MVT::i32)),
2247 DAG.getConstant(8, DL, MVT::i32)});
2248 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2249}
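
Likewise, BFI inserts the low 8 bits of Value into Vector at bit offset Index * 8. A scalar sketch of that bit-field insert (helper name illustrative):

#include <cstdint>

// Insert `Value` as byte `Index` (0..3) of a v4i8 packed in 32 bits,
// mirroring BFI with start = Index * 8 and length = 8.
uint32_t insertV4I8(uint32_t Vec, uint8_t Value, unsigned Index) {
  unsigned Shift = Index * 8;
  uint32_t Mask = 0xFFu << Shift;
  return (Vec & ~Mask) | (static_cast<uint32_t>(Value) << Shift);
}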
2250
2251SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2252 SelectionDAG &DAG) const {
2253 SDValue V1 = Op.getOperand(0);
2254 EVT VectorVT = V1.getValueType();
2255 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2256 return Op;
2257
2258 // Lower shuffle to PRMT instruction.
2259 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2260 SDValue V2 = Op.getOperand(1);
2261 uint32_t Selector = 0;
2262 for (auto I : llvm::enumerate(SVN->getMask())) {
2263 if (I.value() != -1) // -1 is a placeholder for undef.
2264 Selector |= (I.value() << (I.index() * 4));
2265 }
2266
2267 SDLoc DL(Op);
2268 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2269 DAG.getConstant(Selector, DL, MVT::i32),
2270 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2271}
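
The PRMT selector holds one 4-bit source-byte index per result byte (values 0-3 pick bytes of the first operand, 4-7 bytes of the second). A sketch of how the shuffle mask is folded into that selector; as in the loop above, undef lanes (-1) simply contribute 0.

#include <array>
#include <cstdint>

// Build a PRMT selector from a 4-lane shuffle mask over two v4i8 operands.
uint32_t buildPrmtSelector(const std::array<int, 4> &Mask) {
  uint32_t Selector = 0;
  for (unsigned I = 0; I < 4; ++I)
    if (Mask[I] != -1) // -1 is a placeholder for undef.
      Selector |= static_cast<uint32_t>(Mask[I]) << (I * 4);
  return Selector;
}
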
2272/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2273/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2274/// amount, or
2275/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2276/// amount.
2277SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2278 SelectionDAG &DAG) const {
2279 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2280 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2281
2282 EVT VT = Op.getValueType();
2283 unsigned VTBits = VT.getSizeInBits();
2284 SDLoc dl(Op);
2285 SDValue ShOpLo = Op.getOperand(0);
2286 SDValue ShOpHi = Op.getOperand(1);
2287 SDValue ShAmt = Op.getOperand(2);
2288 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2289
2290 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2291 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2292 // {dHi, dLo} = {aHi, aLo} >> Amt
2293 // dHi = aHi >> Amt
2294 // dLo = shf.r.clamp aLo, aHi, Amt
2295
2296 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2297 SDValue Lo =
2298 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2299
2300 SDValue Ops[2] = { Lo, Hi };
2301 return DAG.getMergeValues(Ops, dl);
2302 }
2303 else {
2304 // {dHi, dLo} = {aHi, aLo} >> Amt
2305 // - if (Amt>=size) then
2306 // dLo = aHi >> (Amt-size)
2307 // dHi = aHi >> Amt (this is either all 0 or all 1)
2308 // else
2309 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2310 // dHi = aHi >> Amt
2311
2312 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2313 DAG.getConstant(VTBits, dl, MVT::i32),
2314 ShAmt);
2315 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2316 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2317 DAG.getConstant(VTBits, dl, MVT::i32));
2318 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2319 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2320 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2321
2322 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2323 DAG.getConstant(VTBits, dl, MVT::i32),
2324 ISD::SETGE);
2325 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2326 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2327
2328 SDValue Ops[2] = { Lo, Hi };
2329 return DAG.getMergeValues(Ops, dl);
2330 }
2331}
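
A scalar sketch of the non-funnel-shift path, written for the logical (SRL_PARTS) case with 32-bit halves. Amt is assumed to be in [0, 64); Amt == 0 is split out only because a C++ shift by 32 would be undefined.

#include <cstdint>

// Logical right shift of {Hi, Lo} by Amt, following the Amt >= 32 / Amt < 32
// split from the comment above. For SRA_PARTS, Hi would be shifted
// arithmetically instead.
void srlParts(uint32_t Hi, uint32_t Lo, unsigned Amt, uint32_t &OutHi,
              uint32_t &OutLo) {
  if (Amt == 0) {
    OutHi = Hi;
    OutLo = Lo;
  } else if (Amt >= 32) {
    OutLo = Hi >> (Amt - 32); // dLo = aHi >> (Amt - size)
    OutHi = 0;                // dHi = aHi >> Amt, all zero for a logical shift
  } else {
    OutLo = (Lo >> Amt) | (Hi << (32 - Amt));
    OutHi = Hi >> Amt;
  }
}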
2332
2333/// LowerShiftLeftParts - Lower SHL_PARTS, which
2334/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2335/// amount, or
2336/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2337/// amount.
2338SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2339 SelectionDAG &DAG) const {
2340 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2341 assert(Op.getOpcode() == ISD::SHL_PARTS);
2342
2343 EVT VT = Op.getValueType();
2344 unsigned VTBits = VT.getSizeInBits();
2345 SDLoc dl(Op);
2346 SDValue ShOpLo = Op.getOperand(0);
2347 SDValue ShOpHi = Op.getOperand(1);
2348 SDValue ShAmt = Op.getOperand(2);
2349
2350 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2351 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2352 // {dHi, dLo} = {aHi, aLo} << Amt
2353 // dHi = shf.l.clamp aLo, aHi, Amt
2354 // dLo = aLo << Amt
2355
2356 SDValue Hi =
2357 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2358 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2359
2360 SDValue Ops[2] = { Lo, Hi };
2361 return DAG.getMergeValues(Ops, dl);
2362 }
2363 else {
2364 // {dHi, dLo} = {aHi, aLo} << Amt
2365 // - if (Amt>=size) then
2366 // dLo = aLo << Amt (all 0)
2367 // dHi = aLo << (Amt-size)
2368 // else
2369 // dLo = aLo << Amt
2370 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2371
2372 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2373 DAG.getConstant(VTBits, dl, MVT::i32),
2374 ShAmt);
2375 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2376 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2377 DAG.getConstant(VTBits, dl, MVT::i32));
2378 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2379 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2380 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2381
2382 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2383 DAG.getConstant(VTBits, dl, MVT::i32),
2384 ISD::SETGE);
2385 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2386 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2387
2388 SDValue Ops[2] = { Lo, Hi };
2389 return DAG.getMergeValues(Ops, dl);
2390 }
2391}
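
And the matching sketch for the left-shift case, under the same assumptions (Amt in [0, 64); Amt == 0 split out to avoid an undefined C++ shift by 32).

#include <cstdint>

// Left shift of {Hi, Lo} by Amt, following the Amt >= 32 / Amt < 32 split
// from the comment above.
void shlParts(uint32_t Hi, uint32_t Lo, unsigned Amt, uint32_t &OutHi,
              uint32_t &OutLo) {
  if (Amt == 0) {
    OutHi = Hi;
    OutLo = Lo;
  } else if (Amt >= 32) {
    OutHi = Lo << (Amt - 32); // dHi = aLo << (Amt - size)
    OutLo = 0;                // dLo = aLo << Amt, all zero
  } else {
    OutHi = (Hi << Amt) | (Lo >> (32 - Amt));
    OutLo = Lo << Amt;
  }
}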
2392
2393/// If the types match, convert the generic copysign to the NVPTXISD version,
2394/// otherwise bail, ensuring that mismatched cases are properly expanded.
2395SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2396 SelectionDAG &DAG) const {
2397 EVT VT = Op.getValueType();
2398 SDLoc DL(Op);
2399
2400 SDValue In1 = Op.getOperand(0);
2401 SDValue In2 = Op.getOperand(1);
2402 EVT SrcVT = In2.getValueType();
2403
2404 if (!SrcVT.bitsEq(VT))
2405 return SDValue();
2406
2407 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2408}
2409
2410SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2411 EVT VT = Op.getValueType();
2412
2413 if (VT == MVT::f32)
2414 return LowerFROUND32(Op, DAG);
2415
2416 if (VT == MVT::f64)
2417 return LowerFROUND64(Op, DAG);
2418
2419 llvm_unreachable("unhandled type");
2420}
2421
2422// This is the rounding method used in CUDA libdevice, in C-like code:
2423// float roundf(float A)
2424// {
2425// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2426// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2427// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2428// }
2429SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2430 SelectionDAG &DAG) const {
2431 SDLoc SL(Op);
2432 SDValue A = Op.getOperand(0);
2433 EVT VT = Op.getValueType();
2434
2435 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2436
2437 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2438 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2439 const unsigned SignBitMask = 0x80000000;
2440 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2441 DAG.getConstant(SignBitMask, SL, MVT::i32));
2442 const unsigned PointFiveInBits = 0x3F000000;
2443 SDValue PointFiveWithSignRaw =
2444 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2445 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2446 SDValue PointFiveWithSign =
2447 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2448 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2449 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2450
2451 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2452 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2453 SDValue IsLarge =
2454 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2455 ISD::SETOGT);
2456 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2457
2458 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2459 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2460 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2461 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2462 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2463}
2464
2465// The implementation of round(double) is similar to that of round(float) in
2466// that they both separate the value range into three regions and use a method
2467// specific to the region to round the values. However, round(double) first
2468// rounds the absolute value and then adds the sign back, while round(float)
2469// directly rounds the value with its sign.
2470SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2471 SelectionDAG &DAG) const {
2472 SDLoc SL(Op);
2473 SDValue A = Op.getOperand(0);
2474 EVT VT = Op.getValueType();
2475
2476 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2477
2478 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2479 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2480 DAG.getConstantFP(0.5, SL, VT));
2481 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2482
2483 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2484 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2485 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2486 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2487 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2488 DAG.getConstantFP(0, SL, VT),
2489 RoundedA);
2490
2491 // Add sign to rounded_A
2492 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2493 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2494
2495 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2496 SDValue IsLarge =
2497 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2498 ISD::SETOGT);
2499 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2500}
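
A C-like sketch of the double rounding built above, in the same spirit as the roundf() comment before LowerFROUND32 (function name illustrative): round the absolute value, zero the |A| < 0.5 region, restore the sign, and pass values above 2^52 (already integral) through unchanged.

#include <cmath>

// Mirrors the FADD/FTRUNC/FCOPYSIGN/select sequence constructed above.
double roundSketch(double A) {
  double AbsA = std::fabs(A);
  double RoundedA = std::trunc(AbsA + 0.5);
  if (AbsA < 0.5)
    RoundedA = 0.0;
  RoundedA = std::copysign(RoundedA, A);
  return AbsA > 0x1.0p52 ? A : RoundedA;
}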
2501
2502SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2503 SelectionDAG &DAG) const {
2504 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2505
2506 if (Op.getValueType() == MVT::bf16) {
2507 SDLoc Loc(Op);
2508 return DAG.getNode(
2509 ISD::FP_ROUND, Loc, MVT::bf16,
2510 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2511 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2512 }
2513
2514 // Everything else is considered legal.
2515 return Op;
2516}
2517
2518SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2519 SelectionDAG &DAG) const {
2520 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2521
2522 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2523 SDLoc Loc(Op);
2524 return DAG.getNode(
2525 Op.getOpcode(), Loc, Op.getValueType(),
2526 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2527 }
2528
2529 // Everything else is considered legal.
2530 return Op;
2531}
2532
2533SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2534 SelectionDAG &DAG) const {
2535 EVT NarrowVT = Op.getValueType();
2536 SDValue Wide = Op.getOperand(0);
2537 EVT WideVT = Wide.getValueType();
2538 if (NarrowVT.getScalarType() == MVT::bf16) {
2539 const TargetLowering *TLI = STI.getTargetLowering();
2540 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2541 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2542 }
2543 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2544 // This combination was the first to support f32 -> bf16.
2545 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2546 if (WideVT.getScalarType() == MVT::f32) {
2547 return Op;
2548 }
2549 if (WideVT.getScalarType() == MVT::f64) {
2550 SDLoc Loc(Op);
2551 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2552 // the hardware f32 -> bf16 instruction.
2554 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2555 : MVT::f32,
2556 Wide, Loc, DAG);
2557 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2558 }
2559 }
2560 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2561 }
2562 }
2563
2564 // Everything else is considered legal.
2565 return Op;
2566}
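
When no hardware cvt to bf16 is available, expandFP_ROUND has to emulate the conversion. For reference, a common software sketch of f32 -> bf16 with round-to-nearest-even (an illustrative assumption, not the exact expansion LLVM emits; NaN handling is omitted):

#include <cstdint>
#include <cstring>

// Truncate an f32 to bf16 with round-to-nearest-even by adding a bias that
// depends on the bit just above the discarded half. NaNs are not handled.
uint16_t f32ToBF16RNE(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  uint32_t RoundBias = 0x7FFFu + ((Bits >> 16) & 1u);
  return static_cast<uint16_t>((Bits + RoundBias) >> 16);
}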
2567
2568SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2569 SelectionDAG &DAG) const {
2570 SDValue Narrow = Op.getOperand(0);
2571 EVT NarrowVT = Narrow.getValueType();
2572 EVT WideVT = Op.getValueType();
2573 if (NarrowVT.getScalarType() == MVT::bf16) {
2574 if (WideVT.getScalarType() == MVT::f32 &&
2575 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2576 SDLoc Loc(Op);
2577 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2578 }
2579 if (WideVT.getScalarType() == MVT::f64 &&
2580 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2581 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2582 : MVT::f32;
2583 SDLoc Loc(Op);
2584 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2585 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2586 } else {
2587 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2588 }
2589 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2590 }
2591 }
2592
2593 // Everything else is considered legal.
2594 return Op;
2595}
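
The bf16 -> f32 extension that BF16_TO_FP performs is lossless, since bf16 shares f32's sign and exponent layout. A sketch of the conversion is just placing the 16 stored bits in the upper half of an f32 word:

#include <cstdint>
#include <cstring>

// Widen a bf16 bit pattern to f32 by shifting it into the top 16 bits.
float bf16ToF32(uint16_t Bits) {
  uint32_t Wide = static_cast<uint32_t>(Bits) << 16;
  float F;
  std::memcpy(&F, &Wide, sizeof(F));
  return F;
}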
2596
2598 SDLoc DL(Op);
2599 if (Op.getValueType() != MVT::v2i16)
2600 return Op;
2601 EVT EltVT = Op.getValueType().getVectorElementType();
2602 SmallVector<SDValue> VecElements;
2603 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2604 SmallVector<SDValue> ScalarArgs;
2605 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2606 [&](const SDUse &O) {
2607 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2608 O.get(), DAG.getIntPtrConstant(I, DL));
2609 });
2610 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2611 }
2612 SDValue V =
2613 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2614 return V;
2615}
2616
2617SDValue
2619 switch (Op.getOpcode()) {
2620 case ISD::RETURNADDR:
2621 return SDValue();
2622 case ISD::FRAMEADDR:
2623 return SDValue();
2624 case ISD::GlobalAddress:
2625 return LowerGlobalAddress(Op, DAG);
2627 return Op;
2628 case ISD::BUILD_VECTOR:
2629 return LowerBUILD_VECTOR(Op, DAG);
2630 case ISD::BITCAST:
2631 return LowerBITCAST(Op, DAG);
2633 return Op;
2635 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2637 return LowerINSERT_VECTOR_ELT(Op, DAG);
2639 return LowerVECTOR_SHUFFLE(Op, DAG);
2641 return LowerCONCAT_VECTORS(Op, DAG);
2642 case ISD::STORE:
2643 return LowerSTORE(Op, DAG);
2644 case ISD::LOAD:
2645 return LowerLOAD(Op, DAG);
2646 case ISD::SHL_PARTS:
2647 return LowerShiftLeftParts(Op, DAG);
2648 case ISD::SRA_PARTS:
2649 case ISD::SRL_PARTS:
2650 return LowerShiftRightParts(Op, DAG);
2651 case ISD::SELECT:
2652 return LowerSelect(Op, DAG);
2653 case ISD::FROUND:
2654 return LowerFROUND(Op, DAG);
2655 case ISD::FCOPYSIGN:
2656 return LowerFCOPYSIGN(Op, DAG);
2657 case ISD::SINT_TO_FP:
2658 case ISD::UINT_TO_FP:
2659 return LowerINT_TO_FP(Op, DAG);
2660 case ISD::FP_TO_SINT:
2661 case ISD::FP_TO_UINT:
2662 return LowerFP_TO_INT(Op, DAG);
2663 case ISD::FP_ROUND:
2664 return LowerFP_ROUND(Op, DAG);
2665 case ISD::FP_EXTEND:
2666 return LowerFP_EXTEND(Op, DAG);
2667 case ISD::BR_JT:
2668 return LowerBR_JT(Op, DAG);
2669 case ISD::VAARG:
2670 return LowerVAARG(Op, DAG);
2671 case ISD::VASTART:
2672 return LowerVASTART(Op, DAG);
2673 case ISD::ABS:
2674 case ISD::SMIN:
2675 case ISD::SMAX:
2676 case ISD::UMIN:
2677 case ISD::UMAX:
2678 case ISD::ADD:
2679 case ISD::SUB:
2680 case ISD::MUL:
2681 case ISD::SHL:
2682 case ISD::SREM:
2683 case ISD::UREM:
2684 return LowerVectorArith(Op, DAG);
2686 return LowerDYNAMIC_STACKALLOC(Op, DAG);
2687 case ISD::STACKRESTORE:
2688 return LowerSTACKRESTORE(Op, DAG);
2689 case ISD::STACKSAVE:
2690 return LowerSTACKSAVE(Op, DAG);
2691 case ISD::CopyToReg:
2692 return LowerCopyToReg_128(Op, DAG);
2693 default:
2694 llvm_unreachable("Custom lowering not defined for operation");
2695 }
2696}
2697
2698SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
2699 SDLoc DL(Op);
2700 SDValue Chain = Op.getOperand(0);
2701 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
2702 SDValue Index = Op.getOperand(2);
2703
2704 unsigned JId = JT->getIndex();
2706 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
2707
2708 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
2709
2710 // Generate BrxStart node
2711 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2712 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
2713
2714 // Generate BrxItem nodes
2715 assert(!MBBs.empty());
2716 for (MachineBasicBlock *MBB : MBBs.drop_back())
2717 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
2718 DAG.getBasicBlock(MBB), Chain.getValue(1));
2719
2720 // Generate BrxEnd nodes
2721 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
2722 IdV, Chain.getValue(1)};
2723 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
2724
2725 return BrxEnd;
2726}
2727
2728// This will prevent AsmPrinter from trying to print the jump tables itself.
2731}
2732
2733// This function is almost a copy of SelectionDAG::expandVAArg().
2734// The only diff is that this one produces loads from local address space.
2735SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2736 const TargetLowering *TLI = STI.getTargetLowering();
2737 SDLoc DL(Op);
2738
2739 SDNode *Node = Op.getNode();
2740 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2741 EVT VT = Node->getValueType(0);
2742 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2743 SDValue Tmp1 = Node->getOperand(0);
2744 SDValue Tmp2 = Node->getOperand(1);
2745 const MaybeAlign MA(Node->getConstantOperandVal(3));
2746
2747 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2748 Tmp1, Tmp2, MachinePointerInfo(V));
2749 SDValue VAList = VAListLoad;
2750
2751 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2752 VAList = DAG.getNode(
2753 ISD::ADD, DL, VAList.getValueType(), VAList,
2754 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2755
2756 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
2757 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
2758 VAList.getValueType()));
2759 }
2760
2761 // Increment the pointer, VAList, to the next vaarg
2762 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2764 DL, VAList.getValueType()));
2765
2766 // Store the incremented VAList to the legalized pointer
2767 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2769
2770 const Value *SrcV =
2772
2773 // Load the actual argument out of the pointer VAList
2774 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2775}
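
The alignment fix-up in the middle of this function is the usual round-up-to-a-power-of-two trick: add (Align - 1), then mask off the low bits (the code uses & -Align, which equals & ~(Align - 1) for powers of two). A scalar sketch:

#include <cstdint>

// Round Ptr up to the next multiple of Align; Align must be a power of two.
uint64_t alignUp(uint64_t Ptr, uint64_t Align) {
  return (Ptr + Align - 1) & ~(Align - 1);
}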
2776
2777SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2778 const TargetLowering *TLI = STI.getTargetLowering();
2779 SDLoc DL(Op);
2780 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2781
2782 // Store the address of unsized array <function>_vararg[] in the ap object.
2783 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2784 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2785
2786 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2787 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2788 MachinePointerInfo(SV));
2789}
2790
2791SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2792 SDValue Op0 = Op->getOperand(0);
2793 SDValue Op1 = Op->getOperand(1);
2794 SDValue Op2 = Op->getOperand(2);
2795 SDLoc DL(Op.getNode());
2796
2797 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2798
2799 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2800 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2801 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2802 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2803
2804 return Trunc;
2805}
2806
2807SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2808 if (Op.getValueType() == MVT::i1)
2809 return LowerLOADi1(Op, DAG);
2810
2811 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2812 // unaligned loads and have to handle it here.
2813 EVT VT = Op.getValueType();
2814 if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2815 LoadSDNode *Load = cast<LoadSDNode>(Op);
2816 EVT MemVT = Load->getMemoryVT();
2818 MemVT, *Load->getMemOperand())) {
2819 SDValue Ops[2];
2820 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2821 return DAG.getMergeValues(Ops, SDLoc(Op));
2822 }
2823 }
2824
2825 return SDValue();
2826}
2827
2828// v = ld i1* addr
2829// =>
2830// v1 = ld i8* addr (-> i16)
2831// v = trunc i16 to i1
2832SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2833 SDNode *Node = Op.getNode();
2834 LoadSDNode *LD = cast<LoadSDNode>(Node);
2835 SDLoc dl(Node);
2836 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2837 assert(Node->getValueType(0) == MVT::i1 &&
2838 "Custom lowering for i1 load only");
2839 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
2840 LD->getBasePtr(), LD->getPointerInfo(),
2841 MVT::i8, LD->getAlign(),
2842 LD->getMemOperand()->getFlags());
2843 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2844 // The legalizer (the caller) is expecting two values from the legalized
2845 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2846 // in LegalizeDAG.cpp which also uses MergeValues.
2847 SDValue Ops[] = { result, LD->getChain() };
2848 return DAG.getMergeValues(Ops, dl);
2849}
2850
2851SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2852 StoreSDNode *Store = cast<StoreSDNode>(Op);
2853 EVT VT = Store->getMemoryVT();
2854
2855 if (VT == MVT::i1)
2856 return LowerSTOREi1(Op, DAG);
2857
2858 // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2859 // stores and have to handle it here.
2860 if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2862 VT, *Store->getMemOperand()))
2863 return expandUnalignedStore(Store, DAG);
2864
2865 // v2f16, v2bf16, v2i16 and v4i8 don't need special handling.
2866 if (Isv2x16VT(VT) || VT == MVT::v4i8)
2867 return SDValue();
2868
2869 if (VT.isVector())
2870 return LowerSTOREVector(Op, DAG);
2871
2872 return SDValue();
2873}
2874
2875SDValue
2876NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2877 SDNode *N = Op.getNode();
2878 SDValue Val = N->getOperand(1);
2879 SDLoc DL(N);
2880 EVT ValVT = Val.getValueType();
2881
2882 auto NumEltsAndEltVT = getVectorLoweringShape(ValVT);
2883 if (!NumEltsAndEltVT)
2884 return SDValue();
2885 auto [NumElts, EltVT] = NumEltsAndEltVT.value();
2886
2887 MemSDNode *MemSD = cast<MemSDNode>(N);
2888 const DataLayout &TD = DAG.getDataLayout();
2889
2890 Align Alignment = MemSD->getAlign();
2891 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2892 if (Alignment < PrefAlign) {
2893 // This store is not sufficiently aligned, so bail out and let this vector
2894 // store be scalarized. Note that we may still be able to emit smaller
2895 // vector stores. For example, if we are storing a <4 x float> with an
2896 // alignment of 8, this check will fail but the legalizer will try again
2897 // with 2 x <2 x float>, which will succeed with an alignment of 8.
2898 return SDValue();
2899 }
2900
2901 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2902 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2903 // stored type to i16 and propagate the "real" type as the memory type.
2904 bool NeedExt = false;
2905 if (EltVT.getSizeInBits() < 16)
2906 NeedExt = true;
2907
2908 unsigned Opcode = 0;
2909 switch (NumElts) {
2910 default:
2911 return SDValue();
2912 case 2:
2913 Opcode = NVPTXISD::StoreV2;
2914 break;
2915 case 4:
2916 Opcode = NVPTXISD::StoreV4;
2917 break;
2918 }
2919
2921
2922 // First is the chain
2923 Ops.push_back(N->getOperand(0));
2924
2925 // Then the split values
2926 assert(NumElts <= ValVT.getVectorNumElements() &&
2927 "NumElts should not increase, only decrease or stay the same.");
2928 if (NumElts < ValVT.getVectorNumElements()) {
2929 // If the number of elements has decreased, getVectorLoweringShape has
2930 // upsized the element types
2931 assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
2932 EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
2933 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
2934 // stored as b32s
2935 unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
2936 for (unsigned i = 0; i < NumElts; ++i) {
2937 SmallVector<SDValue, 4> SubVectorElts;
2938 DAG.ExtractVectorElements(Val, SubVectorElts, i * NumEltsPerSubVector,
2939 NumEltsPerSubVector);
2940 SDValue SubVector = DAG.getBuildVector(EltVT, DL, SubVectorElts);
2941 Ops.push_back(SubVector);
2942 }
2943 } else {
2944 for (unsigned i = 0; i < NumElts; ++i) {
2945 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2946 DAG.getIntPtrConstant(i, DL));
2947 if (NeedExt)
2948 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2949 Ops.push_back(ExtVal);
2950 }
2951 }
2952
2953 // Then any remaining arguments
2954 Ops.append(N->op_begin() + 2, N->op_end());
2955
2956 SDValue NewSt =
2957 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2958 MemSD->getMemoryVT(), MemSD->getMemOperand());
2959
2960 // return DCI.CombineTo(N, NewSt, true);
2961 return NewSt;
2962}
2963
2964// st i1 v, addr
2965// =>
2966// v1 = zxt v to i16
2967// st.u8 i16, addr
2968SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2969 SDNode *Node = Op.getNode();
2970 SDLoc dl(Node);
2971 StoreSDNode *ST = cast<StoreSDNode>(Node);
2972 SDValue Tmp1 = ST->getChain();
2973 SDValue Tmp2 = ST->getBasePtr();
2974 SDValue Tmp3 = ST->getValue();
2975 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2976 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2977 SDValue Result =
2978 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2979 ST->getAlign(), ST->getMemOperand()->getFlags());
2980 return Result;
2981}
2982
2983SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
2984 SelectionDAG &DAG) const {
2985 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
2986 // operand so that it can pass the legalization.
2987
2988 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
2989 "Custom lowering for 128-bit CopyToReg only");
2990
2991 SDNode *Node = Op.getNode();
2992 SDLoc DL(Node);
2993
2994 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
2995 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2996 DAG.getIntPtrConstant(0, DL));
2997 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2998 DAG.getIntPtrConstant(1, DL));
2999
3001 SmallVector<EVT, 3> ResultsType(Node->values());
3002
3003 NewOps[0] = Op->getOperand(0); // Chain
3004 NewOps[1] = Op->getOperand(1); // Dst Reg
3005 NewOps[2] = Lo; // Lower 64-bit
3006 NewOps[3] = Hi; // Higher 64-bit
3007 if (Op.getNumOperands() == 4)
3008 NewOps[4] = Op->getOperand(3); // Glue if exists
3009
3010 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3011}
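
A sketch of the 128-bit split performed above, using the Clang/GCC __int128 extension for illustration (an assumption; the DAG code uses a v2i64 bitcast and two extracts instead). On this little-endian target, element 0 of the v2i64 is the low half.

#include <cstdint>

// Split a 128-bit value into its low and high 64-bit halves.
void splitI128(unsigned __int128 V, uint64_t &Lo, uint64_t &Hi) {
  Lo = static_cast<uint64_t>(V);
  Hi = static_cast<uint64_t>(V >> 64);
}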
3012
3013unsigned NVPTXTargetLowering::getNumRegisters(
3014 LLVMContext &Context, EVT VT,
3015 std::optional<MVT> RegisterVT = std::nullopt) const {
3016 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3017 return 1;
3018 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3019}
3020
3021bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3022 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3023 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3024 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3025 Parts[0] = Val;
3026 return true;
3027 }
3028 return false;
3029}
3030
3031// This creates a target external symbol for a function parameter.
3032// The name of the symbol is composed from its index and the function name.
3033// A negative index corresponds to the special parameter (unsized array) used
3034// for passing variable arguments.
3035SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3036 EVT v) const {
3037 StringRef SavedStr = nvTM->getStrPool().save(
3039 return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3040}
3041
3043 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3044 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3045 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3047 const DataLayout &DL = DAG.getDataLayout();
3048 auto PtrVT = getPointerTy(DAG.getDataLayout());
3049
3050 const Function *F = &MF.getFunction();
3051 const AttributeList &PAL = F->getAttributes();
3052 const TargetLowering *TLI = STI.getTargetLowering();
3053
3054 SDValue Root = DAG.getRoot();
3055 std::vector<SDValue> OutChains;
3056
3057 bool isABI = (STI.getSmVersion() >= 20);
3058 assert(isABI && "Non-ABI compilation is not supported");
3059 if (!isABI)
3060 return Chain;
3061
3062 std::vector<Type *> argTypes;
3063 std::vector<const Argument *> theArgs;
3064 for (const Argument &I : F->args()) {
3065 theArgs.push_back(&I);
3066 argTypes.push_back(I.getType());
3067 }
3068 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3069 // Ins.size() will be larger
3070 // * if there is an aggregate argument with multiple fields (each field
3071 // showing up separately in Ins)
3072 // * if there is a vector argument with more than typical vector-length
3073 // elements (generally if more than 4) where each vector element is
3074 // individually present in Ins.
3075 // So a different index should be used for indexing into Ins.
3076 // See similar issue in LowerCall.
3077 unsigned InsIdx = 0;
3078
3079 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3080 Type *Ty = argTypes[i];
3081
3082 if (theArgs[i]->use_empty()) {
3083 // argument is dead
3084 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3085 SmallVector<EVT, 16> vtparts;
3086
3087 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3088 if (vtparts.empty())
3089 report_fatal_error("Empty parameter types are not supported");
3090
3091 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3092 ++parti) {
3093 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3094 ++InsIdx;
3095 }
3096 if (vtparts.size() > 0)
3097 --InsIdx;
3098 continue;
3099 }
3100 if (Ty->isVectorTy()) {
3101 EVT ObjectVT = getValueType(DL, Ty);
3102 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3103 for (unsigned parti = 0; parti < NumRegs; ++parti) {
3104 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3105 ++InsIdx;
3106 }
3107 if (NumRegs > 0)
3108 --InsIdx;
3109 continue;
3110 }
3111 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3112 continue;
3113 }
3114
3115 // In the following cases, assign a node order of "i+1"
3116 // to newly created nodes. The SDNodes for params have to
3117 // appear in the same order as their order of appearance
3118 // in the original function. "i+1" holds that order.
3119 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3120 bool aggregateIsPacked = false;
3121 if (StructType *STy = dyn_cast<StructType>(Ty))
3122 aggregateIsPacked = STy->isPacked();
3123
3126 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3127 if (VTs.empty())
3128 report_fatal_error("Empty parameter types are not supported");
3129
3132 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3133
3134 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3135 int VecIdx = -1; // Index of the first element of the current vector.
3136 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3137 if (VectorInfo[parti] & PVF_FIRST) {
3138 assert(VecIdx == -1 && "Orphaned vector.");
3139 VecIdx = parti;
3140 }
3141
3142 // That's the last element of this store op.
3143 if (VectorInfo[parti] & PVF_LAST) {
3144 unsigned NumElts = parti - VecIdx + 1;
3145 EVT EltVT = VTs[parti];
3146 // i1 is loaded/stored as i8.
3147 EVT LoadVT = EltVT;
3148 if (EltVT == MVT::i1)
3149 LoadVT = MVT::i8;
3150 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3151 // getLoad needs a vector type, but it can't handle
3152 // vectors which contain v2f16 or v2bf16 elements. So we must load
3153 // using i32 here and then bitcast back.
3154 LoadVT = MVT::i32;
3155
3156 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3157 SDValue VecAddr =
3158 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3159 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3161 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3162
3163 const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3164 if (aggregateIsPacked)
3165 return Align(1);
3166 if (NumElts != 1)
3167 return std::nullopt;
3168 Align PartAlign =
3169 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
3170 return commonAlignment(PartAlign, Offsets[parti]);
3171 }();
3172 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3173 MachinePointerInfo(srcValue), PartAlign,
3176 if (P.getNode())
3177 P.getNode()->setIROrder(i + 1);
3178 for (unsigned j = 0; j < NumElts; ++j) {
3179 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3180 DAG.getIntPtrConstant(j, dl));
3181 // We've loaded i1 as an i8 and now must truncate it back to i1
3182 if (EltVT == MVT::i1)
3183 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3184 // v2f16 was loaded as an i32. Now we must bitcast it back.
3185 else if (EltVT != LoadVT)
3186 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3187
3188 // If a promoted integer type is used, truncate down to the original
3189 MVT PromotedVT;
3190 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3191 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3192 }
3193
3194 // Extend the element if necessary (e.g. an i8 is loaded
3195 // into an i16 register)
3196 if (Ins[InsIdx].VT.isInteger() &&
3197 Ins[InsIdx].VT.getFixedSizeInBits() >
3198 LoadVT.getFixedSizeInBits()) {
3199 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3201 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3202 }
3203 InVals.push_back(Elt);
3204 }
3205
3206 // Reset vector tracking state.
3207 VecIdx = -1;
3208 }
3209 ++InsIdx;
3210 }
3211 if (VTs.size() > 0)
3212 --InsIdx;
3213 continue;
3214 }
3215
3216 // Param has ByVal attribute
3217 // Return MoveParam(param symbol).
3218 // Ideally, the param symbol could be returned directly,
3219 // but when the SDNode builder decides to use it in a CopyToReg(),
3220 // the machine instruction fails because the TargetExternalSymbol
3221 // (not lowered) is target dependent, and CopyToReg assumes
3222 // the source is lowered.
3223 EVT ObjectVT = getValueType(DL, Ty);
3224 assert(ObjectVT == Ins[InsIdx].VT &&
3225 "Ins type did not match function type");
3226 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3227 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3228 if (p.getNode())
3229 p.getNode()->setIROrder(i + 1);
3230 InVals.push_back(p);
3231 }
3232
3233 if (!OutChains.empty())
3234 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3235
3236 return Chain;
3237}
3238
3239// Use byte stores when the param address of the return value is unaligned.
3240// This may happen when the return value is a field of a packed structure.
3242 uint64_t Offset, EVT ElementType,
3243 SDValue RetVal, const SDLoc &dl) {
3244 // Bit logic only works on integer types
3245 if (adjustElementType(ElementType))
3246 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
3247
3248 // Store each byte
3249 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3250 // Shift the byte to the last byte position
3251 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
3252 DAG.getConstant(i * 8, dl, MVT::i32));
3253 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
3254 ShiftVal};
3255 // Trunc store only the last byte by using
3256 // st.param.b8
3257 // The register type can be larger than b8.
3259 DAG.getVTList(MVT::Other), StoreOperands,
3260 MVT::i8, MachinePointerInfo(), std::nullopt,
3262 }
3263 return Chain;
3264}
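
A scalar sketch of the byte-wise fallback above (helper name illustrative): shift the value so the byte of interest lands in the low 8 bits, then store only that byte. Little-endian byte order is assumed, matching NVPTX.

#include <cstdint>

// Store the low NumBytes bytes of Val to an arbitrarily aligned address.
void storeBytewise(uint8_t *Base, uint64_t Offset, uint64_t Val,
                   unsigned NumBytes) {
  for (unsigned I = 0; I < NumBytes; ++I)
    Base[Offset + I] = static_cast<uint8_t>(Val >> (I * 8));
}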
3265
3266SDValue
3268 bool isVarArg,
3270 const SmallVectorImpl<SDValue> &OutVals,
3271 const SDLoc &dl, SelectionDAG &DAG) const {
3272 const MachineFunction &MF = DAG.getMachineFunction();
3273 const Function &F = MF.getFunction();
3275
3276 bool isABI = (STI.getSmVersion() >= 20);
3277 assert(isABI && "Non-ABI compilation is not supported");
3278 if (!isABI)
3279 return Chain;
3280
3281 const DataLayout &DL = DAG.getDataLayout();
3282 SmallVector<SDValue, 16> PromotedOutVals;
3285 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3286 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3287
3288 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3289 SDValue PromotedOutVal = OutVals[i];
3290 MVT PromotedVT;
3291 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3292 VTs[i] = EVT(PromotedVT);
3293 }
3294 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3296 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3297 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3298 }
3299 PromotedOutVals.push_back(PromotedOutVal);
3300 }
3301
3302 auto VectorInfo = VectorizePTXValueVTs(
3303 VTs, Offsets,
3305 : Align(1));
3306
3307 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3308 // 32-bits are sign extended or zero extended, depending on whether
3309 // they are signed or unsigned types.
3310 bool ExtendIntegerRetVal =
3311 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3312
3313 SmallVector<SDValue, 6> StoreOperands;
3314 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3315 SDValue OutVal = OutVals[i];
3316 SDValue RetVal = PromotedOutVals[i];
3317
3318 if (ExtendIntegerRetVal) {
3319 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3321 dl, MVT::i32, RetVal);
3322 } else if (OutVal.getValueSizeInBits() < 16) {
3323 // Use 16-bit registers for small load-stores as it's the
3324 // smallest general purpose register size supported by NVPTX.
3325 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3326 }
3327
3328 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3329 // for a scalar store. In such cases, fall back to byte stores.
3330 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
3331 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3332 Align ElementTypeAlign =
3333 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
3334 Align ElementAlign =
3335 commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
3336 if (ElementAlign < ElementTypeAlign) {
3337 assert(StoreOperands.empty() && "Orphaned operand list.");
3338 Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
3339 RetVal, dl);
3340
3341 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3342 // into the graph, so just move on to the next element.
3343 continue;
3344 }
3345 }
3346
3347 // New load/store. Record chain and offset operands.
3348 if (VectorInfo[i] & PVF_FIRST) {
3349 assert(StoreOperands.empty() && "Orphaned operand list.");
3350 StoreOperands.push_back(Chain);
3351 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3352 }
3353
3354 // Record the value to return.
3355 StoreOperands.push_back(RetVal);
3356
3357 // That's the last element of this store op.
3358 if (VectorInfo[i] & PVF_LAST) {
3360 unsigned NumElts = StoreOperands.size() - 2;
3361 switch (NumElts) {
3362 case 1:
3364 break;
3365 case 2:
3367 break;
3368 case 4:
3370 break;
3371 default:
3372 llvm_unreachable("Invalid vector info.");
3373 }
3374
3375 // Adjust type of load/store op if we've extended the scalar
3376 // return value.
3377 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3378 Chain = DAG.getMemIntrinsicNode(
3379 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3381 // Cleanup vector state.
3382 StoreOperands.clear();
3383 }
3384 }
3385
3386 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3387}
3388
3390 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3391 SelectionDAG &DAG) const {
3392 if (Constraint.size() > 1)
3393 return;
3394 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3395}
3396
3397// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3398// TgtMemIntrinsic
3399// because we need the information that is only available in the "Value" type
3400// of destination
3401// pointer. In particular, the address space information.
3403 IntrinsicInfo &Info, const CallInst &I,
3404 MachineFunction &MF, unsigned Intrinsic) const {
3405 switch (Intrinsic) {
3406 default:
3407 return false;
3408 case Intrinsic::nvvm_match_all_sync_i32p:
3409 case Intrinsic::nvvm_match_all_sync_i64p:
3411 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3412 // in order to model data exchange with other threads, but perform no real
3413 // memory accesses.
3414 Info.memVT = MVT::i1;
3415
3416 // Our result depends on both our and other thread's arguments.
3418 return true;
3419 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3420 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3421 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3422 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3423 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3424 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3425 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3426 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3427 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3428 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3429 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3430 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3431 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3432 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3433 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3434 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3435 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3436 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3437 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3438 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3439 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3440 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3441 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3442 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3444 Info.memVT = MVT::v8f16;
3445 Info.ptrVal = I.getArgOperand(0);
3446 Info.offset = 0;
3448 Info.align = Align(16);
3449 return true;
3450 }
3451 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3452 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3453 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3454 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3455 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3456 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3457 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3458 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3459 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3460 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3461 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3462 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3463 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3464 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3465 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3466 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3467 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3468 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3469 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3470 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3471 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3472 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3473 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3474 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3476 Info.memVT = MVT::v2i32;
3477 Info.ptrVal = I.getArgOperand(0);
3478 Info.offset = 0;
3480 Info.align = Align(8);
3481 return true;
3482 }
3483
3484 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3485 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3486 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3487 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3488 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3489 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3490 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3491 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3492 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3493 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3494 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3495 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3496 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3497 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3498 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3499 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3500
3501 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3502 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3503 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3504 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3505 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3506 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3507 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3508 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3509 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3510 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3511 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3512 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3513 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3514 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3515 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3516 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3517 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3518 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
3520 Info.memVT = MVT::v4i32;
3521 Info.ptrVal = I.getArgOperand(0);
3522 Info.offset = 0;
3524 Info.align = Align(16);
3525 return true;
3526 }
3527
3528 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3529 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3530 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3531 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3532 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3533 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3534 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3535 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3536
3537 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3538 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3539 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3540 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3541 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3542 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3543 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3544 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3545 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3546 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3547 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3548 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3549 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3550 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3551 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3552 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3553 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3554 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3555 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3556 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3557 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3558 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
3560 Info.memVT = MVT::i32;
3561 Info.ptrVal = I.getArgOperand(0);
3562 Info.offset = 0;
3564 Info.align = Align(4);
3565 return true;
3566 }
3567
3568 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3569 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3570 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3571 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3572 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3573 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3574 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3575 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3576 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3577 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3578 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3579 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3581 Info.memVT = MVT::v4f16;
3582 Info.ptrVal = I.getArgOperand(0);
3583 Info.offset = 0;
3585 Info.align = Align(16);
3586 return true;
3587 }
3588
3589 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3590 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3591 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3592 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3593 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3594 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3595 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3596 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3597 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3598 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3599 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3600 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3601 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3602 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3603 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3604 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3606 Info.memVT = MVT::v8f32;
3607 Info.ptrVal = I.getArgOperand(0);
3608 Info.offset = 0;
3610 Info.align = Align(16);
3611 return true;
3612 }
3613
3614 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
3615 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
3616 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
3617 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
3618
3619 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
3620 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
3621 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
3622 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
3623
3624 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3625 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3626 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3627 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3628 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3629 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3630 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3631 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3632 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3633 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3634 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3635 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3637 Info.memVT = MVT::v8i32;
3638 Info.ptrVal = I.getArgOperand(0);
3639 Info.offset = 0;
3641 Info.align = Align(16);
3642 return true;
3643 }
3644
3645 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3646 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3647 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3648 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3649 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3650 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3651 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3652 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
3653 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3654 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
3656 Info.memVT = MVT::v2i32;
3657 Info.ptrVal = I.getArgOperand(0);
3658 Info.offset = 0;
3660 Info.align = Align(8);
3661 return true;
3662 }
3663
3664 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
3665 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
3666 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
3667 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
3668
3669 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
3670 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
3671 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
3672 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
3674 Info.memVT = MVT::f64;
3675 Info.ptrVal = I.getArgOperand(0);
3676 Info.offset = 0;
3678 Info.align = Align(8);
3679 return true;
3680 }
3681
3682 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
3683 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
3684 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
3685 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
3687 Info.memVT = MVT::v2f64;
3688 Info.ptrVal = I.getArgOperand(0);
3689 Info.offset = 0;
3691 Info.align = Align(16);
3692 return true;
3693 }
3694
3695 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3696 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3697 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3698 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3699 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3700 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3701 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3702 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3703 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3704 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3705 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3706 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3708 Info.memVT = MVT::v4f16;
3709 Info.ptrVal = I.getArgOperand(0);
3710 Info.offset = 0;
3712 Info.align = Align(16);
3713 return true;
3714 }
3715
3716 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3717 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3718 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3719 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3720 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3721 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3722 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3723 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3724 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3725 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3726 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3727 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
3728 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
3729 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
3730 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
3731 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
3733 Info.memVT = MVT::v8f32;
3734 Info.ptrVal = I.getArgOperand(0);
3735 Info.offset = 0;
3737 Info.align = Align(16);
3738 return true;
3739 }
3740
3741 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3742 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3743 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3744 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3745 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3746 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3747 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3748 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3749 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3750 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3751 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3752 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3754 Info.memVT = MVT::v8i32;
3755 Info.ptrVal = I.getArgOperand(0);
3756 Info.offset = 0;
3758 Info.align = Align(16);
3759 return true;
3760 }
3761
3762 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3763 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3764 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3765 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3766 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3767 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3768 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3769 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3771 Info.memVT = MVT::v2i32;
3772 Info.ptrVal = I.getArgOperand(0);
3773 Info.offset = 0;
3775 Info.align = Align(8);
3776 return true;
3777 }
3778
3779 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
3780 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
3781 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
3782 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
3784 Info.memVT = MVT::v2f64;
3785 Info.ptrVal = I.getArgOperand(0);
3786 Info.offset = 0;
3788 Info.align = Align(16);
3789 return true;
3790 }
3791
3792 case Intrinsic::nvvm_atomic_load_inc_32:
3793 case Intrinsic::nvvm_atomic_load_dec_32:
3794
3795 case Intrinsic::nvvm_atomic_add_gen_f_cta:
3796 case Intrinsic::nvvm_atomic_add_gen_f_sys:
3797 case Intrinsic::nvvm_atomic_add_gen_i_cta:
3798 case Intrinsic::nvvm_atomic_add_gen_i_sys:
3799 case Intrinsic::nvvm_atomic_and_gen_i_cta:
3800 case Intrinsic::nvvm_atomic_and_gen_i_sys:
3801 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3802 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3803 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3804 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3805 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3806 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3807 case Intrinsic::nvvm_atomic_max_gen_i_cta:
3808 case Intrinsic::nvvm_atomic_max_gen_i_sys:
3809 case Intrinsic::nvvm_atomic_min_gen_i_cta:
3810 case Intrinsic::nvvm_atomic_min_gen_i_sys:
3811 case Intrinsic::nvvm_atomic_or_gen_i_cta:
3812 case Intrinsic::nvvm_atomic_or_gen_i_sys:
3813 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3814 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3815 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3816 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3817 auto &DL = I.getDataLayout();
3819 Info.memVT = getValueType(DL, I.getType());
3820 Info.ptrVal = I.getArgOperand(0);
3821 Info.offset = 0;
3823 Info.align.reset();
3824 return true;
3825 }
3826
3827 case Intrinsic::nvvm_ldu_global_i:
3828 case Intrinsic::nvvm_ldu_global_f:
3829 case Intrinsic::nvvm_ldu_global_p: {
3830 auto &DL = I.getDataLayout();
3832 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3833 Info.memVT = getValueType(DL, I.getType());
3834 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3835 Info.memVT = getPointerTy(DL);
3836 else
3837 Info.memVT = getValueType(DL, I.getType());
3838 Info.ptrVal = I.getArgOperand(0);
3839 Info.offset = 0;
3841 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
3842
3843 return true;
3844 }
3845 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3846 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3847 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3848 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3849 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3850 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3851 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3852 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3853 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3854 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3855 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3856 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3857 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3858 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3859 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3860 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3861 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3862 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3863 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3864 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3865 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3866 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3867 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3868 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3869 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3870 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3871 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3872 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3873 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3874 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3875 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3876 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3877 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3878 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3879 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3880 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3881 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3882 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3883 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3884 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3885 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3886 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3887 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3888 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3889 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3890 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3891 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3892 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3893 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3894 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3895 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3896 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3897 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3898 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3899 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3900 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3901 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3902 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3904 Info.memVT = MVT::v4f32;
3905 Info.ptrVal = nullptr;
3906 Info.offset = 0;
3908 Info.align = Align(16);
3909 return true;
3910
3911 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3912 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3913 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3914 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3915 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3916 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3917 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3918 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3919 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3920 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3921 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3922 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3923 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3924 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3925 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3926 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3927 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3928 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3929 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3930 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3931 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3932 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3933 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3934 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3935 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3936 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3937 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3938 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3939 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3940 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3941 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3942 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3943 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3944 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3945 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3946 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3947 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3948 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3949 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3950 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3951 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3952 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3953 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3954 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3955 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3956 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3957 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3958 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3959 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3960 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3961 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3962 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3963 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3964 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3965 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3966 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3967 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3968 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3969 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3970 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3971 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3972 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3973 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3974 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3975 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3976 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3977 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3978 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3979 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3980 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3981 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3982 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3983 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3984 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3985 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3986 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3987 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3988 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3989 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3990 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3991 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3992 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3993 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3994 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3995 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3996 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3997 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3998 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3999 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4000 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4001 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4002 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4003 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4004 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4005 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4006 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4007 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4008 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4009 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4010 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4011 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4012 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4013 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4014 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4015 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4016 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4017 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4018 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4019 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4020 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4021 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4022 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4023 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4024 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4025 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4026 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4028 Info.memVT = MVT::v4i32;
4029 Info.ptrVal = nullptr;
4030 Info.offset = 0;
4032 Info.align = Align(16);
4033 return true;
4034
4035 case Intrinsic::nvvm_suld_1d_i8_clamp:
4036 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4037 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4038 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4039 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4040 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4041 case Intrinsic::nvvm_suld_2d_i8_clamp:
4042 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4043 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4044 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4045 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4046 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4047 case Intrinsic::nvvm_suld_3d_i8_clamp:
4048 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4049 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4050 case Intrinsic::nvvm_suld_1d_i8_trap:
4051 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4052 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4053 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4054 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4055 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4056 case Intrinsic::nvvm_suld_2d_i8_trap:
4057 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4058 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4059 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4060 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4061 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4062 case Intrinsic::nvvm_suld_3d_i8_trap:
4063 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4064 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4065 case Intrinsic::nvvm_suld_1d_i8_zero:
4066 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4067 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4068 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4069 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4070 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4071 case Intrinsic::nvvm_suld_2d_i8_zero:
4072 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4073 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4074 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4075 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4076 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4077 case Intrinsic::nvvm_suld_3d_i8_zero:
4078 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4079 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4081 Info.memVT = MVT::i8;
4082 Info.ptrVal = nullptr;
4083 Info.offset = 0;
4085 Info.align = Align(16);
4086 return true;
4087
4088 case Intrinsic::nvvm_suld_1d_i16_clamp:
4089 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4090 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4091 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4092 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4093 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4094 case Intrinsic::nvvm_suld_2d_i16_clamp:
4095 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4096 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4097 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4098 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4099 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4100 case Intrinsic::nvvm_suld_3d_i16_clamp:
4101 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4102 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4103 case Intrinsic::nvvm_suld_1d_i16_trap:
4104 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4105 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4106 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4107 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4108 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4109 case Intrinsic::nvvm_suld_2d_i16_trap:
4110 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4111 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4112 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4113 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4114 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4115 case Intrinsic::nvvm_suld_3d_i16_trap:
4116 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4117 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4118 case Intrinsic::nvvm_suld_1d_i16_zero:
4119 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4120 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4121 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4122 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4123 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4124 case Intrinsic::nvvm_suld_2d_i16_zero:
4125 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4126 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4127 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4128 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4129 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4130 case Intrinsic::nvvm_suld_3d_i16_zero:
4131 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4132 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4134 Info.memVT = MVT::i16;
4135 Info.ptrVal = nullptr;
4136 Info.offset = 0;
4138 Info.align = Align(16);
4139 return true;
4140
4141 case Intrinsic::nvvm_suld_1d_i32_clamp:
4142 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4143 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4144 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4145 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4146 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4147 case Intrinsic::nvvm_suld_2d_i32_clamp:
4148 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4149 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4150 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4151 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4152 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4153 case Intrinsic::nvvm_suld_3d_i32_clamp:
4154 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4155 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4156 case Intrinsic::nvvm_suld_1d_i32_trap:
4157 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4158 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4159 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4160 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4161 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4162 case Intrinsic::nvvm_suld_2d_i32_trap:
4163 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4164 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4165 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4166 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4167 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4168 case Intrinsic::nvvm_suld_3d_i32_trap:
4169 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4170 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4171 case Intrinsic::nvvm_suld_1d_i32_zero:
4172 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4173 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4174 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4175 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4176 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4177 case Intrinsic::nvvm_suld_2d_i32_zero:
4178 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4179 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4180 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4181 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4182 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4183 case Intrinsic::nvvm_suld_3d_i32_zero:
4184 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4185 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4187 Info.memVT = MVT::i32;
4188 Info.ptrVal = nullptr;
4189 Info.offset = 0;
4191 Info.align = Align(16);
4192 return true;
4193
4194 case Intrinsic::nvvm_suld_1d_i64_clamp:
4195 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4196 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4197 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4198 case Intrinsic::nvvm_suld_2d_i64_clamp:
4199 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4200 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4201 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4202 case Intrinsic::nvvm_suld_3d_i64_clamp:
4203 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4204 case Intrinsic::nvvm_suld_1d_i64_trap:
4205 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4206 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4207 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4208 case Intrinsic::nvvm_suld_2d_i64_trap:
4209 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4210 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4211 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4212 case Intrinsic::nvvm_suld_3d_i64_trap:
4213 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4214 case Intrinsic::nvvm_suld_1d_i64_zero:
4215 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4216 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4217 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4218 case Intrinsic::nvvm_suld_2d_i64_zero:
4219 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4220 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4221 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4222 case Intrinsic::nvvm_suld_3d_i64_zero:
4223 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4225 Info.memVT = MVT::i64;
4226 Info.ptrVal = nullptr;
4227 Info.offset = 0;
4229 Info.align = Align(16);
4230 return true;
4231 }
4232 return false;
4233}
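// Illustrative consequence (a sketch, not additional source): when the hook
// above reports, e.g., memVT = MVT::v8f16 with 16-byte alignment for a call to
// llvm.nvvm.wmma.m16n16k16.load.a.f16.col on some pointer %p, SelectionDAG can
// attach a MachineMemOperand describing that 16-byte read to the intrinsic
// node, which keeps alias analysis and scheduling honest around the generated
// wmma.load instruction.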
4234
4235/// getFunctionParamOptimizedAlign - Since function arguments are passed via
4236/// .param space, we may want to increase their alignment in a way that
4237/// ensures that we can effectively vectorize their loads & stores. We can
4238/// increase alignment only if the function has internal or private linkage,
4239/// since for other linkage types callers may already rely on the default
4240/// alignment. To allow 128-bit vectorized loads/stores, this function
4241/// ensures that the alignment is 16 or greater.
4243 const Function *F, Type *ArgTy, const DataLayout &DL) const {
4244  // Cap the alignment at 128 bytes, as that is the maximum alignment
4245  // supported by PTX.
4246 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
4247
4248  // If a function has linkage other than internal or private, we must use
4249  // the default ABI alignment, as external users rely on it. The same holds
4250  // for a function that may be called through a function pointer.
4251 if (!F || !F->hasLocalLinkage() ||
4252 F->hasAddressTaken(/*Users=*/nullptr,
4253 /*IgnoreCallbackUses=*/false,
4254 /*IgnoreAssumeLikeCalls=*/true,
4255 /*IgnoreLLVMUsed=*/true))
4256 return ABITypeAlign;
4257
4258 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
4259 return std::max(Align(16), ABITypeAlign);
4260}
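// Illustrative effect (assumed typical output, with "foo" as a placeholder
// name): raising a local-linkage function's parameter alignment to 16 lets the
// backend use a single 128-bit parameter load, roughly
//
//   .param .align 16 .b8 foo_param_0[32]
//   ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [foo_param_0];
//
// whereas with only the 4-byte ABI alignment the same bytes would need four
// scalar ld.param.f32 loads.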
4261
4262/// Helper for computing alignment of a device function byval parameter.
4264 const Function *F, Type *ArgTy, Align InitialAlign,
4265 const DataLayout &DL) const {
4266 Align ArgAlign = InitialAlign;
4267 // Try to increase alignment to enhance vectorization options.
4268 if (F)
4269 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
4270
4271  // Old ptxas versions have a bug: when PTX code takes the address of a
4272  // byval parameter with alignment < 4, ptxas generates code to spill the
4273  // argument into memory. Alas, on sm_50+ ptxas generates SASS code that
4274  // fails with a misaligned access. To work around the problem, make sure
4275  // that byval parameters are aligned to at least 4. This bug appears to be
4276  // fixed starting with ptxas > 9.0.
4277  // TODO: remove this after verifying that the bug does not reproduce on
4278  // non-deprecated ptxas versions.
4281 ArgAlign = std::max(ArgAlign, Align(4));
4282
4283 return ArgAlign;
4284}
4285
4286// Helper for getting a function parameter name. The name is composed from
4287// the parameter's index and the function name. A negative index corresponds
4288// to the special parameter (an unsized array) used for passing varargs.
4290 int Idx) const {
4291 std::string ParamName;
4292 raw_string_ostream ParamStr(ParamName);
4293
4294 ParamStr << getTargetMachine().getSymbol(F)->getName();
4295 if (Idx < 0)
4296 ParamStr << "_vararg";
4297 else
4298 ParamStr << "_param_" << Idx;
4299
4300 return ParamName;
4301}
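// For illustration (derived from the code above): with a function whose symbol
// is "foo", getParamName(F, 0) yields "foo_param_0" and getParamName(F, -1)
// yields "foo_vararg".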
4302
4303/// isLegalAddressingMode - Return true if the addressing mode represented
4304/// by AM is legal for this target, for a load/store of the specified type.
4305/// Used to guide target specific optimizations, like loop strength reduction
4306/// (LoopStrengthReduce.cpp) and memory optimization for address mode
4307/// (CodeGenPrepare.cpp)
4309 const AddrMode &AM, Type *Ty,
4310 unsigned AS, Instruction *I) const {
4311 // AddrMode - This represents an addressing mode of:
4312 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4313 //
4314 // The legal address modes are
4315 // - [avar]
4316 // - [areg]
4317 // - [areg+immoff]
4318 // - [immAddr]
4319
4320 // immoff must fit in a signed 32-bit int
4321 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
4322 return false;
4323
4324 if (AM.BaseGV)
4325 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4326
4327 switch (AM.Scale) {
4328 case 0: // "r", "r+i" or "i" is allowed
4329 break;
4330 case 1:
4331 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4332 return false;
4333 // Otherwise we have r+i.
4334 break;
4335 default:
4336 // No scale > 1 is allowed
4337 return false;
4338 }
4339 return true;
4340}
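// A few illustrative cases (assumed, not exhaustive): addresses of the form
// [reg], [reg+imm] with a 32-bit signed offset, or [globalvar] are accepted
// above, while [reg+reg] (Scale == 1 with a base register), any Scale > 1 such
// as [reg + 4*idx], and offsets that do not fit in a signed i32 are rejected.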
4341
4342//===----------------------------------------------------------------------===//
4343// NVPTX Inline Assembly Support
4344//===----------------------------------------------------------------------===//
4345
4346/// getConstraintType - Given a constraint letter, return the type of
4347/// constraint it is for this target.
4350 if (Constraint.size() == 1) {
4351 switch (Constraint[0]) {
4352 default:
4353 break;
4354 case 'b':
4355 case 'r':
4356 case 'h':
4357 case 'c':
4358 case 'l':
4359 case 'f':
4360 case 'd':
4361 case 'q':
4362 case '0':
4363 case 'N':
4364 return C_RegisterClass;
4365 }
4366 }
4367 return TargetLowering::getConstraintType(Constraint);
4368}
4369
4370std::pair<unsigned, const TargetRegisterClass *>
4372 StringRef Constraint,
4373 MVT VT) const {
4374 if (Constraint.size() == 1) {
4375 switch (Constraint[0]) {
4376 case 'b':
4377 return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4378 case 'c':
4379 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4380 case 'h':
4381 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4382 case 'r':
4383 return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4384 case 'l':
4385 case 'N':
4386 return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4387 case 'q': {
4388 if (STI.getSmVersion() < 70)
4389 report_fatal_error("Inline asm with 128 bit operands is only "
4390 "supported for sm_70 and higher!");
4391 return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
4392 }
4393 case 'f':
4394 return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4395 case 'd':
4396 return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4397 }
4398 }
4399 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4400}
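// Illustrative usage from CUDA source (an assumed example; x, y, z, a, b, c
// are hypothetical locals): the constraint letters map inline-asm operands
// onto NVPTX register classes, e.g.
//
//   asm("add.s32 %0, %1, %2;" : "=r"(z) : "r"(x), "r"(y));  // 'r' -> Int32Regs
//   asm("add.f64 %0, %1, %2;" : "=d"(c) : "d"(a), "d"(b));  // 'd' -> Float64Regs
//
// while 'l' selects a 64-bit integer register, 'h' a 16-bit one, and 'f' a
// 32-bit float register.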
4401
4402//===----------------------------------------------------------------------===//
4403// NVPTX DAG Combining
4404//===----------------------------------------------------------------------===//
4405
4407 CodeGenOptLevel OptLevel) const {
4408 // Always honor command-line argument
4409 if (FMAContractLevelOpt.getNumOccurrences() > 0)
4410 return FMAContractLevelOpt > 0;
4411
4412 // Do not contract if we're not optimizing the code.
4413 if (OptLevel == CodeGenOptLevel::None)
4414 return false;
4415
4416 // Honor TargetOptions flags that explicitly say fusion is okay.
4418 return true;
4419
4420 return allowUnsafeFPMath(MF);
4421}
4422
4424 // Honor TargetOptions flags that explicitly say unsafe math is okay.
4426 return true;
4427
4428 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4429 const Function &F = MF.getFunction();
4430 return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
4431}
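// For example (illustrative): a function compiled with fast math carries the
// IR attribute "unsafe-fp-math"="true", so allowUnsafeFPMath() returns true
// for it and allowFMA() above will then permit contraction even without an
// explicit nvptx-fma-level setting.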
4432
4433static bool isConstZero(const SDValue &Operand) {
4434 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
4435 return Const && Const->getZExtValue() == 0;
4436}
4437
4438/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4439/// operands N0 and N1. This is a helper for PerformADDCombine that is
4440/// called with the default operands, and if that fails, with commuted
4441/// operands.
4442static SDValue
4445 EVT VT = N0.getValueType();
4446
4447  // Since integer multiply-add costs the same as integer multiply
4448  // but is more costly than integer add, do the fusion only when
4449  // the mul is used only by the add.
4450  // TODO: this may not be true for later architectures; consider relaxing it.
4451 if (!N0.getNode()->hasOneUse())
4452 return SDValue();
4453
4454 // fold (add (mul a, b), c) -> (mad a, b, c)
4455 //
4456 if (N0.getOpcode() == ISD::MUL)
4457 return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
4458 N0.getOperand(1), N1);
4459
4460 // fold (add (select cond, 0, (mul a, b)), c)
4461 // -> (select cond, c, (mad a, b, c))
4462 //
4463 if (N0.getOpcode() == ISD::SELECT) {
4464 unsigned ZeroOpNum;
4465 if (isConstZero(N0->getOperand(1)))
4466 ZeroOpNum = 1;
4467 else if (isConstZero(N0->getOperand(2)))
4468 ZeroOpNum = 2;
4469 else
4470 return SDValue();
4471
4472 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
4473 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
4474 return SDValue();
4475
4476 SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4477 M->getOperand(0), M->getOperand(1), N1);
4478 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
4479 ((ZeroOpNum == 1) ? N1 : MAD),
4480 ((ZeroOpNum == 1) ? MAD : N1));
4481 }
4482
4483 return SDValue();
4484}
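// Sketch of the two folds above (pseudo-DAG, not verbatim output):
//
//   t = mul i32 %a, %b            ; t has a single use
//   r = add i32 t, %c             -->  r = IMAD %a, %b, %c
//
//   s = select i1 %p, i32 0, t
//   r = add i32 s, %c             -->  r = select %p, %c, (IMAD %a, %b, %c)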
4485
4486static SDValue
4489 CodeGenOptLevel OptLevel) {
4490 EVT VT = N0.getValueType();
4491 if (N0.getOpcode() == ISD::FMUL) {
4492 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4493 &DCI.DAG.getTargetLoweringInfo());
4494 if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
4495 return SDValue();
4496
4497    // For floating point:
4498    // Do the fusion only when the mul has fewer than 5 uses and all of
4499    // them are adds.
4500    // The heuristic is that if a use is not an add, then that use cannot
4501    // be fused into an fma, so the mul is still needed anyway.
4502    // If there are more than 4 uses, even if they are all adds, fusing
4503    // them will increase register pressure.
4504    //
4505 int numUses = 0;
4506 int nonAddCount = 0;
4507 for (const SDNode *User : N0.getNode()->users()) {
4508 numUses++;
4509 if (User->getOpcode() != ISD::FADD)
4510 ++nonAddCount;
4511 if (numUses >= 5)
4512 return SDValue();
4513 }
4514 if (nonAddCount) {
4515 int orderNo = N->getIROrder();
4516 int orderNo2 = N0.getNode()->getIROrder();
4517      // A simple heuristic for estimating potential register pressure: the
4518      // difference in IR order is used to measure the distance between def
4519      // and use; the longer the distance, the more likely it is to increase
4520      // register pressure.
4521 if (orderNo - orderNo2 < 500)
4522 return SDValue();
4523
4524 // Now, check if at least one of the FMUL's operands is live beyond the
4525 // node N, which guarantees that the FMA will not increase register
4526 // pressure at node N.
4527 bool opIsLive = false;
4528 const SDNode *left = N0.getOperand(0).getNode();
4529 const SDNode *right = N0.getOperand(1).getNode();
4530
4531 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4532 opIsLive = true;
4533
4534 if (!opIsLive)
4535 for (const SDNode *User : left->users()) {
4536 int orderNo3 = User->getIROrder();
4537 if (orderNo3 > orderNo) {
4538 opIsLive = true;
4539 break;
4540 }
4541 }
4542
4543 if (!opIsLive)
4544 for (const SDNode *User : right->users()) {
4545 int orderNo3 = User->getIROrder();
4546 if (orderNo3 > orderNo) {
4547 opIsLive = true;
4548 break;
4549 }
4550 }
4551
4552 if (!opIsLive)
4553 return SDValue();
4554 }
4555
4556 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
4557 N0.getOperand(1), N1);
4558 }
4559
4560 return SDValue();
4561}
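// Sketch of the intended fold (illustrative only):
//
//   t = fmul f32 %a, %b
//   r = fadd f32 t, %c            -->  r = fma f32 %a, %b, %c
//
// subject to the use-count and IR-order heuristics above, which try to avoid
// keeping both the fmul and the new fma live at the same time.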
4562
4563static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
4564 std::size_t Back) {
4565 if (all_of(N->ops().drop_front(Front).drop_back(Back),
4566 [](const SDUse &U) { return U.get()->isUndef(); }))
4567 // Operand 0 is the previous value in the chain. Cannot return EntryToken
4568 // as the previous value will become unused and eliminated later.
4569 return N->getOperand(0);
4570
4571 return SDValue();
4572}
4573
4575  // Operands from the 3rd to the second-to-last are the values to be stored.
4576 // {Chain, ArgID, Offset, Val, Glue}
4577 return PerformStoreCombineHelper(N, 3, 1);
4578}
4579
4581 // Operands from the 2nd to the last one are the values to be stored
4582 return PerformStoreCombineHelper(N, 2, 0);
4583}
4584
4585/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4586///
4589 CodeGenOptLevel OptLevel) {
4590 if (OptLevel == CodeGenOptLevel::None)
4591 return SDValue();
4592
4593 SDValue N0 = N->getOperand(0);
4594 SDValue N1 = N->getOperand(1);
4595
4596  // Bail out on vector types and on any scalar type other than i32.
4597 EVT VT = N0.getValueType();
4598 if (VT.isVector() || VT != MVT::i32)
4599 return SDValue();
4600
4601 // First try with the default operand order.
4602 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
4603 return Result;
4604
4605 // If that didn't work, try again with the operands commuted.
4606 return PerformADDCombineWithOperands(N, N1, N0, DCI);
4607}
4608
4609/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
4610///
4613 CodeGenOptLevel OptLevel) {
4614 SDValue N0 = N->getOperand(0);
4615 SDValue N1 = N->getOperand(1);
4616
4617 EVT VT = N0.getValueType();
4618 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
4619 return SDValue();
4620
4621 // First try with the default operand order.
4622 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
4623 return Result;
4624
4625 // If that didn't work, try again with the operands commuted.
4626 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
4627}
4628
4631  // The type legalizer turns a vector load of i8 values into a zextload to
4632  // i16 registers, optionally ANY_EXTENDs it (if the target type is an
4633  // integer), and ANDs off the high 8 bits. Since we turn this load into a
4634  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4635  // nodes. Do that here.
4636 SDValue Val = N->getOperand(0);
4637 SDValue Mask = N->getOperand(1);
4638
4639 if (isa<ConstantSDNode>(Val)) {
4640 std::swap(Val, Mask);
4641 }
4642
4643 SDValue AExt;
4644
4645  // Convert BFE -> truncate i16 -> and 255
4646  // to just BFE -> truncate i16, as the value already has all the bits in
4647  // the right places.
4648 if (Val.getOpcode() == ISD::TRUNCATE) {
4649 SDValue BFE = Val.getOperand(0);
4650 if (BFE.getOpcode() != NVPTXISD::BFE)
4651 return SDValue();
4652
4653 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
4654 if (!BFEBits)
4655 return SDValue();
4656 uint64_t BFEBitsVal = BFEBits->getZExtValue();
4657
4658 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4659 if (!MaskCnst) {
4660 // Not an AND with a constant
4661 return SDValue();
4662 }
4663 uint64_t MaskVal = MaskCnst->getZExtValue();
4664
4665 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
4666 return SDValue();
4667 // If we get here, the AND is unnecessary. Just replace it with the trunc
4668 DCI.CombineTo(N, Val, false);
4669 }
4670 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4671 if (Val.getOpcode() == ISD::ANY_EXTEND) {
4672 AExt = Val;
4673 Val = Val->getOperand(0);
4674 }
4675
4676 if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4677 Val->getOpcode() == NVPTXISD::LoadV4) {
4678 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4679 if (!MaskCnst) {
4680 // Not an AND with a constant
4681 return SDValue();
4682 }
4683
4684 uint64_t MaskVal = MaskCnst->getZExtValue();
4685 if (MaskVal != 0xff) {
4686 // Not an AND that chops off top 8 bits
4687 return SDValue();
4688 }
4689
4690 MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4691 if (!Mem) {
4692 // Not a MemSDNode?!?
4693 return SDValue();
4694 }
4695
4696 EVT MemVT = Mem->getMemoryVT();
4697 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4698 // We only handle the i8 case
4699 return SDValue();
4700 }
4701
4702 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
4703 if (ExtType == ISD::SEXTLOAD) {
4704 // If for some reason the load is a sextload, the and is needed to zero
4705 // out the high 8 bits
4706 return SDValue();
4707 }
4708
4709 bool AddTo = false;
4710 if (AExt.getNode() != nullptr) {
4711 // Re-insert the ext as a zext.
4712 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4713 AExt.getValueType(), Val);
4714 AddTo = true;
4715 }
4716
4717 // If we get here, the AND is unnecessary. Just replace it with the load
4718 DCI.CombineTo(N, Val, AddTo);
4719 }
4720
4721 return SDValue();
4722}
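// Illustrative pattern handled above (pseudo-DAG, not verbatim output):
//
//   v = LoadV4 ... (zextload of v4i8)
//   e = any_extend v
//   r = and e, 255                -->  r = zero_extend v
//
// The AND contributes nothing because the zextload already cleared the high
// bits; the same reasoning removes the AND following a BFE + truncate pair.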
4723
4726 CodeGenOptLevel OptLevel) {
4727 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4728
4729 // Don't do anything at less than -O2.
4730 if (OptLevel < CodeGenOptLevel::Default)
4731 return SDValue();
4732
4733 SelectionDAG &DAG = DCI.DAG;
4734 SDLoc DL(N);
4735 EVT VT = N->getValueType(0);
4736 bool IsSigned = N->getOpcode() == ISD::SREM;
4737 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4738
4739 const SDValue &Num = N->getOperand(0);
4740 const SDValue &Den = N->getOperand(1);
4741
4742 for (const SDNode *U : Num->users()) {
4743 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4744 U->getOperand(1) == Den) {
4745 // Num % Den -> Num - (Num / Den) * Den
4746 return DAG.getNode(ISD::SUB, DL, VT, Num,
4747 DAG.getNode(ISD::MUL, DL, VT,
4748 DAG.getNode(DivOpc, DL, VT, Num, Den),
4749 Den));
4750 }
4751 }
4752 return SDValue();
4753}
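// Worked example (illustrative): when the DAG already contains the matching
// division, the remainder is re-expressed in terms of it, e.g.
//
//   q = udiv i32 %n, %d
//   r = urem i32 %n, %d           -->  r = sub i32 %n, (mul i32 q, %d)
//
// so the div/rem pair needs only one divide.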
4754
4755enum OperandSignedness {
4756  Signed,
4757  Unsigned,
4758  Unknown
4759};
4760
4761/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4762/// that can be demoted to \p OptSize bits without loss of information. The
4763/// signedness of the operand, if determinable, is placed in \p S.
4765 unsigned OptSize,
4766 OperandSignedness &S) {
4767 S = Unknown;
4768
4769 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4770 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4771 EVT OrigVT = Op.getOperand(0).getValueType();
4772 if (OrigVT.getFixedSizeInBits() <= OptSize) {
4773 S = Signed;
4774 return true;
4775 }
4776 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4777 EVT OrigVT = Op.getOperand(0).getValueType();
4778 if (OrigVT.getFixedSizeInBits() <= OptSize) {
4779 S = Unsigned;
4780 return true;
4781 }
4782 }
4783
4784 return false;
4785}
4786
4787/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4788/// be demoted to \p OptSize bits without loss of information. If the operands
4789/// contain a constant, it should appear as the RHS operand. The signedness of
4790/// the operands is placed in \p IsSigned.
4792 unsigned OptSize,
4793 bool &IsSigned) {
4794 OperandSignedness LHSSign;
4795
4796 // The LHS operand must be a demotable op
4797 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4798 return false;
4799
4800 // We should have been able to determine the signedness from the LHS
4801 if (LHSSign == Unknown)
4802 return false;
4803
4804 IsSigned = (LHSSign == Signed);
4805
4806 // The RHS can be a demotable op or a constant
4807 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4808 const APInt &Val = CI->getAPIntValue();
4809 if (LHSSign == Unsigned) {
4810 return Val.isIntN(OptSize);
4811 } else {
4812 return Val.isSignedIntN(OptSize);
4813 }
4814 } else {
4815 OperandSignedness RHSSign;
4816 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4817 return false;
4818
4819 return LHSSign == RHSSign;
4820 }
4821}
4822
4823/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4824/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4825/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4826/// amount.
4829 EVT MulType = N->getValueType(0);
4830 if (MulType != MVT::i32 && MulType != MVT::i64) {
4831 return SDValue();
4832 }
4833
4834 SDLoc DL(N);
4835 unsigned OptSize = MulType.getSizeInBits() >> 1;
4836 SDValue LHS = N->getOperand(0);
4837 SDValue RHS = N->getOperand(1);
4838
4839 // Canonicalize the multiply so the constant (if any) is on the right
4840 if (N->getOpcode() == ISD::MUL) {
4841 if (isa<ConstantSDNode>(LHS)) {
4842 std::swap(LHS, RHS);
4843 }
4844 }
4845
4846 // If we have a SHL, determine the actual multiply amount
4847 if (N->getOpcode() == ISD::SHL) {
4848 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4849 if (!ShlRHS) {
4850 return SDValue();
4851 }
4852
4853 APInt ShiftAmt = ShlRHS->getAPIntValue();
4854 unsigned BitWidth = MulType.getSizeInBits();
4855 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4856 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4857 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4858 } else {
4859 return SDValue();
4860 }
4861 }
4862
4863 bool Signed;
4864 // Verify that our operands are demotable
4865 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4866 return SDValue();
4867 }
4868
4869 EVT DemotedVT;
4870 if (MulType == MVT::i32) {
4871 DemotedVT = MVT::i16;
4872 } else {
4873 DemotedVT = MVT::i32;
4874 }
4875
4876 // Truncate the operands to the correct size. Note that these are just for
4877 // type consistency and will (likely) be eliminated in later phases.
4878 SDValue TruncLHS =
4879 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4880 SDValue TruncRHS =
4881 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4882
4883 unsigned Opc;
4884  if (Signed) {
4885    Opc = NVPTXISD::MUL_WIDE_SIGNED;
4886  } else {
4887    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4888  }
4889
4890 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4891}
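// Illustrative result of this combine (a sketch, not verbatim output): a
// 32-bit multiply whose operands provably fit in 16 bits, e.g.
//
//   mul i32 (sext i16 %a to i32), (sext i16 %b to i32)
//
// can be selected as a single PTX mul.wide.s16 (mul.wide.u16 for zext
// operands). The same applies to i64 multiplies of 32-bit values and to SHL by
// a constant, which is treated as a multiply by a power of two.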
4892
4893static bool isConstOne(const SDValue &Operand) {
4894 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
4895 return Const && Const->getZExtValue() == 1;
4896}
4897
4899 if (Add->getOpcode() != ISD::ADD)
4900 return SDValue();
4901
4902 if (isConstOne(Add->getOperand(0)))
4903 return Add->getOperand(1);
4904
4905 if (isConstOne(Add->getOperand(1)))
4906 return Add->getOperand(0);
4907
4908 return SDValue();
4909}
4910
4913
4915 return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);
4916
4917 return SDValue();
4918}
4919
4921 SDLoc DL,
4923 if (Select->getOpcode() != ISD::SELECT)
4924 return SDValue();
4925
4926 SDValue Cond = Select->getOperand(0);
4927
4928 unsigned ConstOpNo;
4929 if (isConstOne(Select->getOperand(1)))
4930 ConstOpNo = 1;
4931 else if (isConstOne(Select->getOperand(2)))
4932 ConstOpNo = 2;
4933 else
4934 return SDValue();
4935
4936 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
4937
4938 // Do not combine if the resulting sequence is not obviously profitable.
4940 return SDValue();
4941
4942 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
4943
4944 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
4945 (ConstOpNo == 1) ? X : NewMul,
4946 (ConstOpNo == 1) ? NewMul : X);
4947}
4948
4949static SDValue
4952
4953 EVT VT = N0.getValueType();
4954 if (VT.isVector())
4955 return SDValue();
4956
4957 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
4958 return SDValue();
4959
4960 SDLoc DL(N);
4961
4962 // (mul x, (add y, 1)) -> (mad x, y, x)
4963 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
4964 return Res;
4965 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
4966 return Res;
4967
4968 // (mul x, (select y, 1)) -> (select (mul x, y), x)
4969 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
4970 return Res;
4971 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
4972 return Res;
4973
4974 return SDValue();
4975}
4976
4977/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4978static SDValue PerformMULCombine(SDNode *N,
4979 TargetLowering::DAGCombinerInfo &DCI,
4980 CodeGenOptLevel OptLevel) {
4981 if (OptLevel == CodeGenOptLevel::None)
4982 return SDValue();
4983
4984 if (SDValue Ret = TryMULWIDECombine(N, DCI))
4985 return Ret;
4986
4987 SDValue N0 = N->getOperand(0);
4988 SDValue N1 = N->getOperand(1);
4989 return PerformMULCombineWithOperands(N, N0, N1, DCI);
4990}
4991
4992/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4993static SDValue PerformSHLCombine(SDNode *N,
4994 TargetLowering::DAGCombinerInfo &DCI,
4995 CodeGenOptLevel OptLevel) {
4996 if (OptLevel > CodeGenOptLevel::None) {
4997 // Try mul.wide combining at OptLevel > 0
4998 if (SDValue Ret = TryMULWIDECombine(N, DCI))
4999 return Ret;
5000 }
5001
5002 return SDValue();
5003}
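The SHL hook reuses TryMULWIDECombine because a left shift by a constant k is a multiply by 1 << k; once rewritten that way, the same operand-demotion reasoning applies. A standalone sketch of that equivalence (illustrative only), assuming the shift amount is in range for the type:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x1234u;
  for (unsigned K = 0; K < 32; ++K) {
    uint32_t MulVal = uint32_t(1) << K; // mirrors APInt(BitWidth, 1) << ShiftAmt above
    assert((X << K) == X * MulVal);
  }
  return 0;
}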
5004
5005static SDValue PerformSETCCCombine(SDNode *N,
5006 TargetLowering::DAGCombinerInfo &DCI,
5007 unsigned int SmVersion) {
5008 EVT CCType = N->getValueType(0);
5009 SDValue A = N->getOperand(0);
5010 SDValue B = N->getOperand(1);
5011
5012 EVT AType = A.getValueType();
5013 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5014 return SDValue();
5015
5016 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5017 return SDValue();
5018
5019 SDLoc DL(N);
5020 // setp.f16x2 returns two scalar predicates, which we need to
5021 // convert back to v2i1. The returned result will be scalarized by
5022 // the legalizer, but the comparison will remain a single vector
5023 // instruction.
5024 SDValue CCNode = DCI.DAG.getNode(
5025 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5026 : NVPTXISD::SETP_BF16X2,
5027 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5028 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5029 CCNode.getValue(1));
5030}
5031
5032static SDValue PerformEXTRACTCombine(SDNode *N,
5033 TargetLowering::DAGCombinerInfo &DCI) {
5034 SDValue Vector = N->getOperand(0);
5035 if (Vector->getOpcode() == ISD::FREEZE)
5036 Vector = Vector->getOperand(0);
5037 SDLoc DL(N);
5038 EVT VectorVT = Vector.getValueType();
5039 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5040 IsPTXVectorType(VectorVT.getSimpleVT()))
5041 return SDValue(); // Native vector loads already combine nicely w/
5042 // extract_vector_elt.
5043 // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
5044 // handle them OK.
5045 if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5046 VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
5047 return SDValue();
5048
5049 // Don't mess with undef values as sra may be simplified to 0, not undef.
5050 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5051 return SDValue();
5052
5053 uint64_t VectorBits = VectorVT.getSizeInBits();
5054 // We only handle the types we can extract in-register.
5055 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5056 return SDValue();
5057
5058 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5059 // Index == 0 is handled by generic DAG combiner.
5060 if (!Index || Index->getZExtValue() == 0)
5061 return SDValue();
5062
5063 MVT IVT = MVT::getIntegerVT(VectorBits);
5064 EVT EltVT = VectorVT.getVectorElementType();
5065 EVT EltIVT = EltVT.changeTypeToInteger();
5066 uint64_t EltBits = EltVT.getScalarSizeInBits();
5067
5068 SDValue Result = DCI.DAG.getNode(
5069 ISD::TRUNCATE, DL, EltIVT,
5070 DCI.DAG.getNode(
5071 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5072 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5073
5074 // If element has non-integer type, bitcast it back to the expected type.
5075 if (EltVT != EltIVT)
5076 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5077 // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5078 if (EltVT != N->getValueType(0))
5079 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5080
5081 return Result;
5082}
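For the vector widths handled here (16/32/64 bits total), an element extract is modelled as: bitcast the whole vector to one integer, shift right by Index * EltBits, then truncate; the combine uses an arithmetic shift, but after truncation the extra high bits are irrelevant. A standalone sketch over a <4 x i16> packed into a uint64_t, with lane 0 in the low bits (illustrative, not part of this file):

#include <cassert>
#include <cstdint>

// Extract lane `Index` of a <4 x i16> that has been bitcast to a 64-bit int.
static uint16_t extractLane(uint64_t VecBits, unsigned Index) {
  const unsigned EltBits = 16;
  return static_cast<uint16_t>(VecBits >> (Index * EltBits)); // shift + truncate
}

int main() {
  // Lanes {0x1111, 0x2222, 0x3333, 0x4444}.
  uint64_t Vec = 0x4444333322221111ull;
  assert(extractLane(Vec, 0) == 0x1111);
  assert(extractLane(Vec, 2) == 0x3333);
  return 0;
}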
5083
5084static SDValue PerformVSELECTCombine(SDNode *N,
5085 TargetLowering::DAGCombinerInfo &DCI) {
5086 SDValue VA = N->getOperand(1);
5087 EVT VectorVT = VA.getValueType();
5088 if (VectorVT != MVT::v4i8)
5089 return SDValue();
5090
5091 // We need to split vselect into individual per-element operations. Because we
5092 // use BFE/BFI instructions for byte extraction/insertion, we end up with
5093 // 32-bit values, so we may as well do the comparison as i32 to avoid conversions
5094 // to/from i16 normally used for i8 values.
5095 SmallVector<SDValue, 4> E;
5096 SDLoc DL(N);
5097 SDValue VCond = N->getOperand(0);
5098 SDValue VB = N->getOperand(2);
5099 for (int I = 0; I < 4; ++I) {
5100 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5101 DCI.DAG.getConstant(I, DL, MVT::i32));
5102 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5103 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5104 DCI.DAG.getConstant(I, DL, MVT::i32)),
5105 DL, MVT::i32);
5106 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5107 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5108 DCI.DAG.getConstant(I, DL, MVT::i32)),
5109 DL, MVT::i32);
5110 E.push_back(DCI.DAG.getAnyExtOrTrunc(
5111 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5112 }
5113 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5114}
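The net effect on a v4i8 is a lane-wise select; widening to i32 only avoids i8 <-> i16 conversions around the byte extraction/insertion. A scalar model of the lane-wise behaviour on bytes packed into a uint32_t (illustrative only):

#include <cassert>
#include <cstdint>

// Per-byte select: for each lane, pick A's byte when the condition bit is set.
static uint32_t selectBytes(uint8_t CondMask4, uint32_t A, uint32_t B) {
  uint32_t R = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t Byte = ((CondMask4 >> I) & 1) ? (A >> (8 * I)) : (B >> (8 * I));
    R |= (Byte & 0xFFu) << (8 * I);
  }
  return R;
}

int main() {
  // Condition lanes {1,0,1,0}: bytes 0 and 2 from A, bytes 1 and 3 from B.
  assert(selectBytes(0b0101, 0xAABBCCDDu, 0x11223344u) == 0x11BB33DDu);
  return 0;
}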
5115
5116static SDValue
5117PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5118 auto VT = N->getValueType(0);
5119 if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT))
5120 return SDValue();
5121
5122 auto Op0 = N->getOperand(0);
5123 auto Op1 = N->getOperand(1);
5124
5125 // Start out by assuming we want to take the lower 2 bytes of each i32
5126 // operand.
5127 uint64_t Op0Bytes = 0x10;
5128 uint64_t Op1Bytes = 0x54;
5129
5130 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
5131 {&Op1, &Op1Bytes}};
5132
5133 // Check that each operand is an i16, truncated from an i32 operand. We'll
5134 // select individual bytes from those original operands. Optionally, fold in a
5135 // shift right of that original operand.
5136 for (auto &[Op, OpBytes] : OpData) {
5137 // Eat up any bitcast
5138 if (Op->getOpcode() == ISD::BITCAST)
5139 *Op = Op->getOperand(0);
5140
5141 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
5142 Op->getOperand(0).getValueType() == MVT::i32))
5143 return SDValue();
5144
5145 // If the truncate has multiple uses, this optimization can increase
5146 // register pressure
5147 if (!Op->hasOneUse())
5148 return SDValue();
5149
5150 *Op = Op->getOperand(0);
5151
5152 // Optionally, fold in a shift-right of the original operand and let permute
5153 // pick the two higher bytes of the original value directly.
5154 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
5155 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
5156 // Shift the PRMT byte selector to pick upper bytes from each respective
5157 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
5158 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
5159 "PRMT selector values out of range");
5160 *OpBytes += 0x22;
5161 *Op = Op->getOperand(0);
5162 }
5163 }
5164 }
5165
5166 SDLoc DL(N);
5167 auto &DAG = DCI.DAG;
5168
5169 auto PRMT = DAG.getNode(
5170 NVPTXISD::PRMT, DL, MVT::v4i8,
5171 {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
5172 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
5173 return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
5174}
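PRMT treats its two i32 operands as an 8-byte pool (bytes 0-3 from the first operand, 4-7 from the second) and builds the result from four selector nibbles. Below is a deliberately simplified model of the default mode only, enough to see why the initial selector is 0x5410 (low half of each operand) and why adding 0x22 per operand (0x10 -> 0x32, 0x54 -> 0x76) switches to the high halves; it ignores the sign/replication modes of the real instruction and is not part of this file:

#include <cassert>
#include <cstdint>

// Simplified prmt.b32 (default mode): each selector nibble picks a byte of {B:A}.
static uint32_t prmt(uint32_t A, uint32_t B, uint16_t Selector) {
  uint64_t Pool = (static_cast<uint64_t>(B) << 32) | A; // bytes 0..7
  uint32_t R = 0;
  for (int I = 0; I < 4; ++I) {
    unsigned Idx = (Selector >> (4 * I)) & 0x7;
    R |= static_cast<uint32_t>((Pool >> (8 * Idx)) & 0xFF) << (8 * I);
  }
  return R;
}

int main() {
  uint32_t Op0 = 0x33221100u, Op1 = 0x77665544u;
  // (0x54 << 8) | 0x10: take the low 16 bits of Op0 and the low 16 bits of Op1.
  assert(prmt(Op0, Op1, 0x5410) == 0x55441100u);
  // After the +0x22 adjustment: the high 16 bits of each operand instead.
  assert(prmt(Op0, Op1, 0x7632) == 0x77663322u);
  return 0;
}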
5175
5176SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5177 DAGCombinerInfo &DCI) const {
5178 CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
5179 switch (N->getOpcode()) {
5180 default: break;
5181 case ISD::ADD:
5182 return PerformADDCombine(N, DCI, OptLevel);
5183 case ISD::FADD:
5184 return PerformFADDCombine(N, DCI, OptLevel);
5185 case ISD::MUL:
5186 return PerformMULCombine(N, DCI, OptLevel);
5187 case ISD::SHL:
5188 return PerformSHLCombine(N, DCI, OptLevel);
5189 case ISD::AND:
5190 return PerformANDCombine(N, DCI);
5191 case ISD::UREM:
5192 case ISD::SREM:
5193 return PerformREMCombine(N, DCI, OptLevel);
5194 case ISD::SETCC:
5195 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
5196 case NVPTXISD::StoreRetval:
5197 case NVPTXISD::StoreRetvalV2:
5198 case NVPTXISD::StoreRetvalV4:
5199 return PerformStoreRetvalCombine(N);
5200 case NVPTXISD::StoreParam:
5201 case NVPTXISD::StoreParamV2:
5202 case NVPTXISD::StoreParamV4:
5203 return PerformStoreParamCombine(N);
5204 case ISD::EXTRACT_VECTOR_ELT:
5205 return PerformEXTRACTCombine(N, DCI);
5206 case ISD::VSELECT:
5207 return PerformVSELECTCombine(N, DCI);
5208 case ISD::BUILD_VECTOR:
5209 return PerformBUILD_VECTORCombine(N, DCI);
5210 }
5211 return SDValue();
5212}
5213
5214static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
5215 SmallVectorImpl<SDValue> &Results) {
5216 // Handle bitcasting to v2i8 without hitting the default promotion
5217 // strategy which goes through stack memory.
5218 SDValue Op(Node, 0);
5219 EVT ToVT = Op->getValueType(0);
5220 if (ToVT != MVT::v2i8) {
5221 return;
5222 }
5223
5224 // Bitcast to i16 and unpack elements into a vector
5225 SDLoc DL(Node);
5226 SDValue AsInt = MaybeBitcast(DAG, DL, MVT::i16, Op->getOperand(0));
5227 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
5228 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
5229 SDValue Vec1 =
5230 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5231 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
5232 Results.push_back(
5233 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
5234}
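Expanding the v2i8 bitcast amounts to viewing the 16-bit payload as two bytes: the low byte is a plain truncate, and the high byte is a shift-right-by-8 followed by a truncate. The scalar equivalent (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  uint16_t AsInt = 0xBEEF;                         // the i16 source value
  uint8_t Vec0 = static_cast<uint8_t>(AsInt);      // TRUNCATE
  uint8_t Vec1 = static_cast<uint8_t>(AsInt >> 8); // SRL by 8, then TRUNCATE
  assert(Vec0 == 0xEF && Vec1 == 0xBE);
  return 0;
}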
5235
5236/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
5237static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
5238 SmallVectorImpl<SDValue> &Results) {
5239 EVT ResVT = N->getValueType(0);
5240 SDLoc DL(N);
5241
5242 assert(ResVT.isVector() && "Vector load must have vector type");
5243
5244 auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
5245 if (!NumEltsAndEltVT)
5246 return;
5247 auto [NumElts, EltVT] = NumEltsAndEltVT.value();
5248
5249 LoadSDNode *LD = cast<LoadSDNode>(N);
5250
5251 Align Alignment = LD->getAlign();
5252 auto &TD = DAG.getDataLayout();
5253 Align PrefAlign =
5254 TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
5255 if (Alignment < PrefAlign) {
5256 // This load is not sufficiently aligned, so bail out and let this vector
5257 // load be scalarized. Note that we may still be able to emit smaller
5258 // vector loads. For example, if we are loading a <4 x float> with an
5259 // alignment of 8, this check will fail but the legalizer will try again
5260 // with 2 x <2 x float>, which will succeed with an alignment of 8.
5261 return;
5262 }
5263
5264 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
5265 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
5266 // loaded type to i16 and propagate the "real" type as the memory type.
5267 bool NeedTrunc = false;
5268 if (EltVT.getSizeInBits() < 16) {
5269 EltVT = MVT::i16;
5270 NeedTrunc = true;
5271 }
5272
5273 unsigned Opcode = 0;
5274 SDVTList LdResVTs;
5275
5276 switch (NumElts) {
5277 default:
5278 return;
5279 case 2:
5280 Opcode = NVPTXISD::LoadV2;
5281 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5282 break;
5283 case 4: {
5284 Opcode = NVPTXISD::LoadV4;
5285 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5286 LdResVTs = DAG.getVTList(ListVTs);
5287 break;
5288 }
5289 }
5290
5291 // Copy regular operands
5292 SmallVector<SDValue, 8> OtherOps(N->ops());
5293
5294 // The select routine does not have access to the LoadSDNode instance, so
5295 // pass along the extension information
5296 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5297
5298 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5299 LD->getMemoryVT(),
5300 LD->getMemOperand());
5301
5302 SmallVector<SDValue> ScalarRes;
5303 assert(NumElts <= ResVT.getVectorNumElements() &&
5304 "NumElts should not increase, only decrease or stay the same.");
5305 if (NumElts < ResVT.getVectorNumElements()) {
5306 // If the number of elements has decreased, getVectorLoweringShape has
5307 // upsized the element types
5308 assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
5309 EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
5310 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
5311 // into individual elements.
5312 for (unsigned i = 0; i < NumElts; ++i) {
5313 SDValue SubVector = NewLD.getValue(i);
5314 DAG.ExtractVectorElements(SubVector, ScalarRes);
5315 }
5316 } else {
5317 for (unsigned i = 0; i < NumElts; ++i) {
5318 SDValue Res = NewLD.getValue(i);
5319 if (NeedTrunc)
5320 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5321 ScalarRes.push_back(Res);
5322 }
5323 }
5324
5325 SDValue LoadChain = NewLD.getValue(NumElts);
5326
5327 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
5328
5329 Results.push_back(BuildVec);
5330 Results.push_back(LoadChain);
5331}
5332
5333static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
5334 SmallVectorImpl<SDValue> &Results) {
5335 SDValue Chain = N->getOperand(0);
5336 SDValue Intrin = N->getOperand(1);
5337 SDLoc DL(N);
5338
5339 // Get the intrinsic ID
5340 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
5341 switch (IntrinNo) {
5342 default:
5343 return;
5344 case Intrinsic::nvvm_ldu_global_i:
5345 case Intrinsic::nvvm_ldu_global_f:
5346 case Intrinsic::nvvm_ldu_global_p: {
5347 EVT ResVT = N->getValueType(0);
5348
5349 if (ResVT.isVector()) {
5350 // Vector LDG/LDU
5351
5352 unsigned NumElts = ResVT.getVectorNumElements();
5353 EVT EltVT = ResVT.getVectorElementType();
5354
5355 // Since LDU/LDG are target nodes, we cannot rely on DAG type
5356 // legalization.
5357 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
5358 // loaded type to i16 and propagate the "real" type as the memory type.
5359 bool NeedTrunc = false;
5360 if (EltVT.getSizeInBits() < 16) {
5361 EltVT = MVT::i16;
5362 NeedTrunc = true;
5363 }
5364
5365 unsigned Opcode = 0;
5366 SDVTList LdResVTs;
5367
5368 switch (NumElts) {
5369 default:
5370 return;
5371 case 2:
5372 Opcode = NVPTXISD::LDUV2;
5373 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
5374 break;
5375 case 4: {
5376 Opcode = NVPTXISD::LDUV4;
5377 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
5378 LdResVTs = DAG.getVTList(ListVTs);
5379 break;
5380 }
5381 }
5382
5383 SmallVector<SDValue, 8> OtherOps;
5384
5385 // Copy regular operands
5386
5387 OtherOps.push_back(Chain); // Chain
5388 // Skip operand 1 (intrinsic ID)
5389 // Others
5390 OtherOps.append(N->op_begin() + 2, N->op_end());
5391
5392 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5393
5394 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5395 MemSD->getMemoryVT(),
5396 MemSD->getMemOperand());
5397
5398 SmallVector<SDValue, 4> ScalarRes;
5399
5400 for (unsigned i = 0; i < NumElts; ++i) {
5401 SDValue Res = NewLD.getValue(i);
5402 if (NeedTrunc)
5403 Res =
5404 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5405 ScalarRes.push_back(Res);
5406 }
5407
5408 SDValue LoadChain = NewLD.getValue(NumElts);
5409
5410 SDValue BuildVec =
5411 DAG.getBuildVector(ResVT, DL, ScalarRes);
5412
5413 Results.push_back(BuildVec);
5414 Results.push_back(LoadChain);
5415 } else {
5416 // i8 LDG/LDU
5417 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
5418 "Custom handling of non-i8 ldu/ldg?");
5419
5420 // Just copy all operands as-is
5421 SmallVector<SDValue, 4> Ops(N->ops());
5422
5423 // Force output to i16
5424 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
5425
5426 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5427
5428 // We make sure the memory type is i8, which will be used during isel
5429 // to select the proper instruction.
5430 SDValue NewLD =
5431 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
5432 MVT::i8, MemSD->getMemOperand());
5433
5434 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5435 NewLD.getValue(0)));
5436 Results.push_back(NewLD.getValue(1));
5437 }
5438 }
5439 }
5440}
5441
5442static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
5443 SmallVectorImpl<SDValue> &Results) {
5444 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
5445 // result so that it can pass the legalization
5446 SDLoc DL(N);
5447 SDValue Chain = N->getOperand(0);
5448 SDValue Reg = N->getOperand(1);
5449 SDValue Glue = N->getOperand(2);
5450
5451 assert(Reg.getValueType() == MVT::i128 &&
5452 "Custom lowering for CopyFromReg with 128-bit reg only");
5453 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
5454 N->getValueType(2)};
5455 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
5456
5457 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
5458 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
5459 {NewValue.getValue(0), NewValue.getValue(1)});
5460
5461 Results.push_back(Pair);
5462 Results.push_back(NewValue.getValue(2));
5463 Results.push_back(NewValue.getValue(3));
5464}
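The replacement keeps the value visible as an i128 pair while only ever moving 64-bit halves through registers. The splitting/recombining arithmetic, sketched with the non-standard but widely available __uint128_t extension (illustrative only, assuming a 64-bit GCC/Clang host):

#include <cassert>
#include <cstdint>

int main() {
  __uint128_t Reg = (static_cast<__uint128_t>(0x0123456789ABCDEFull) << 64) |
                    0xFEDCBA9876543210ull;
  // Two 64-bit "CopyFromReg" results...
  uint64_t Lo = static_cast<uint64_t>(Reg);
  uint64_t Hi = static_cast<uint64_t>(Reg >> 64);
  // ...recombined (BUILD_PAIR) into the original i128 value.
  __uint128_t Pair = (static_cast<__uint128_t>(Hi) << 64) | Lo;
  assert(Pair == Reg);
  return 0;
}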
5465
5466void NVPTXTargetLowering::ReplaceNodeResults(
5467 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
5468 switch (N->getOpcode()) {
5469 default:
5470 report_fatal_error("Unhandled custom legalization");
5471 case ISD::BITCAST:
5472 ReplaceBITCAST(N, DAG, Results);
5473 return;
5474 case ISD::LOAD:
5475 ReplaceLoadVector(N, DAG, Results);
5476 return;
5477 case ISD::INTRINSIC_W_CHAIN:
5478 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
5479 return;
5480 case ISD::CopyFromReg:
5481 ReplaceCopyFromReg_128(N, DAG, Results);
5482 return;
5483 }
5484}
5485
5486NVPTXTargetLowering::AtomicExpansionKind
5487NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
5488 Type *Ty = AI->getValOperand()->getType();
5489
5490 if (AI->isFloatingPointOperation()) {
5491 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
5492 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
5493 STI.getPTXVersion() >= 63)
5494 return AtomicExpansionKind::None;
5495 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
5496 STI.getPTXVersion() >= 78)
5497 return AtomicExpansionKind::None;
5498 if (Ty->isFloatTy())
5499 return AtomicExpansionKind::None;
5500 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
5501 return AtomicExpansionKind::None;
5502 }
5503 return AtomicExpansionKind::CmpXChg;
5504 }
5505
5506 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
5507 auto ITy = cast<llvm::IntegerType>(Ty);
5508
5509 switch (AI->getOperation()) {
5510 default:
5511 return AtomicExpansionKind::CmpXChg;
5512 case AtomicRMWInst::BinOp::And:
5513 case AtomicRMWInst::BinOp::Or:
5514 case AtomicRMWInst::BinOp::Xor:
5515 case AtomicRMWInst::BinOp::Xchg:
5516 switch (ITy->getBitWidth()) {
5517 case 8:
5518 case 16:
5519 return AtomicExpansionKind::CmpXChg;
5520 case 32:
5521 return AtomicExpansionKind::None;
5522 case 64:
5523 if (STI.hasAtomBitwise64())
5524 return AtomicExpansionKind::None;
5525 return AtomicExpansionKind::CmpXChg;
5526 default:
5527 llvm_unreachable("unsupported width encountered");
5528 }
5529 case AtomicRMWInst::BinOp::Add:
5530 case AtomicRMWInst::BinOp::Sub:
5531 case AtomicRMWInst::BinOp::Max:
5532 case AtomicRMWInst::BinOp::Min:
5533 case AtomicRMWInst::BinOp::UMax:
5534 case AtomicRMWInst::BinOp::UMin:
5535 switch (ITy->getBitWidth()) {
5536 case 8:
5537 case 16:
5538 return AtomicExpansionKind::CmpXChg;
5539 case 32:
5540 return AtomicExpansionKind::None;
5541 case 64:
5542 if (STI.hasAtomMinMax64())
5543 return AtomicExpansionKind::None;
5544 return AtomicExpansionKind::CmpXChg;
5545 default:
5546 llvm_unreachable("unsupported width encountered");
5547 }
5548 }
5549
5550 return AtomicExpansionKind::CmpXChg;
5551}
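Returning AtomicExpansionKind::CmpXChg asks the generic AtomicExpand pass to rewrite the RMW as a compare-exchange loop instead of selecting a native atom instruction. The rough shape of such a loop, sketched at the C++ level (an illustration of the idea, not the IR the pass actually emits):

#include <atomic>
#include <cassert>
#include <cstdint>

// Emulate 'atomicrmw add' on a 32-bit location with a cmpxchg loop.
static uint32_t atomicAddViaCmpXchg(std::atomic<uint32_t> &Loc, uint32_t V) {
  uint32_t Old = Loc.load();
  while (!Loc.compare_exchange_weak(Old, Old + V)) {
    // On failure, Old has been refreshed with the current value; retry.
  }
  return Old; // atomicrmw yields the previous value
}

int main() {
  std::atomic<uint32_t> X{40};
  assert(atomicAddViaCmpXchg(X, 2) == 40);
  assert(X.load() == 42);
  return 0;
}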
5552
5553// Pin NVPTXTargetObjectFile's vtables to this file.
5554NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
5555
5556MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
5557 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
5558 return getDataSection();
5559}
#define MAKE_CASE(V)
static const LLT F32
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
static SDValue PerformStoreParamCombine(SDNode *N)
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static bool Is16bitsType(MVT VT)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static bool IsTypePassedAsArray(const Type *Ty)
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static std::optional< std::pair< unsigned int, EVT > > getVectorLoweringShape(EVT VectorVT)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformStoreRetvalCombine(SDNode *N)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, std::size_t Back)
static bool adjustElementType(EVT &ElementType)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT, SDValue Value)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
ParamVectorizationFlags
@ PVF_FIRST
@ PVF_SCALAR
@ PVF_INNER
@ PVF_LAST
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
OperandSignedness
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
static std::atomic< unsigned > GlobalUniqueCallSite
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
unsigned SmVersion
Definition: NVVMReflect.cpp:79
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:435
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1130
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:432
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
bool isFloatingPointOperation() const
Definition: Instructions.h:882
BinOp getOperation() const
Definition: Instructions.h:805
Value * getValOperand()
Definition: Instructions.h:874
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Definition: Attributes.h:833
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1199
This class represents a function call, abstracting a target machine's calling convention.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:641
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:221
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition: MCSection.h:36
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
Machine Value Type.
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
static auto fixedlen_vector_valuetypes()
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
bool hasHWROT32() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getPTXVersion() const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool hasBF16Math() const
bool allowFP16Math() const
bool hasAtomCas16() const
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool allowUnsafeFPMath(MachineFunction &MF) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the type of the node type undefined.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition: SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:750
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition: SelectionDAG.h:801
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:503
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:856
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:827
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:497
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:700
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:492
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVMContext * getContext() const
Definition: SelectionDAG.h:510
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:586
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:580
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:144
Class to represent struct types.
Definition: DerivedTypes.h:218
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
@ VoidTyID
type with no size
Definition: Type.h:63
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:303
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
StringRef save(const char *S)
Definition: StringSaver.h:52
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
int getNumOccurrences() const
Definition: CommandLine.h:399
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1197
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1193
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1226
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:841
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:558
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1259
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:981
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1148
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:674
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:615
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1044
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1282
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:772
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition: ISDOpcodes.h:366
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1112
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:849
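The SHL/SRA pair mentioned above has a direct scalar analogue; a sketch (hypothetical helper, not from this file) that sign-extends the low 8 bits already sitting in a 32-bit value:

#include <cstdint>

// Sign-extend bits [7:0] of V across the full 32 bits, the way
// SIGN_EXTEND_INREG with an i8 "from" type is typically expanded.
int32_t signExtendInReg8(int32_t V) {
  // The left shift is done on the unsigned type to avoid signed-overflow UB;
  // the arithmetic right shift then replicates bit 7 into the upper bits.
  return static_cast<int32_t>(static_cast<uint32_t>(V) << 24) >> 24;
}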
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:973
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1253
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1279
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:223
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:539
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1217
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:794
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:530
bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
@ Bitcast
Perform the operation on a different, but equivalently sized type.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
static bool isIndirectCall(const MachineInstr &MI)
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
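A small usage sketch of llvm::all_of; the container and predicate are assumptions for illustration.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Check a property over a whole container without writing begin()/end().
static bool allNonNegative(const llvm::SmallVectorImpl<int> &Vals) {
  return llvm::all_of(Vals, [](int V) { return V >= 0; });
}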
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool Isv2x16VT(EVT VT)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
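A usage sketch of llvm::enumerate, which pairs each element with its index; the printing helper is an assumption for illustration.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

// Walk a container while keeping track of each element's index.
static void printIndexed(const llvm::SmallVectorImpl<int> &Vals) {
  for (const auto &En : llvm::enumerate(Vals))
    llvm::errs() << "element " << En.index() << " = " << En.value() << "\n";
}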
MaybeAlign getAlign(const Function &F, unsigned Index)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
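The transform, PowerOf2Ceil, and isPowerOf2_32 helpers above compose naturally; a sketch (hypothetical functions, not from this file) that rounds a list of sizes up to powers of two:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <iterator>

// Round Size up to the next power of two; sizes that already are one (and 0)
// are returned unchanged.
static uint64_t padToPow2(uint32_t Size) {
  if (Size != 0 && llvm::isPowerOf2_32(Size))
    return Size;
  return llvm::PowerOf2Ceil(Size);
}

// llvm::transform applies the rounding across a whole container in one call.
static void padAll(const llvm::SmallVectorImpl<uint32_t> &In,
                   llvm::SmallVectorImpl<uint64_t> &Out) {
  Out.clear();
  llvm::transform(In, std::back_inserter(Out), padToPow2);
}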
unsigned promoteScalarArgumentSize(unsigned size)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
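A sketch of llvm::alignTo together with the Align type also listed in this index; the function and parameter names are assumptions for illustration.

#include "llvm/Support/Alignment.h"
#include <cstdint>

// Pad a running byte offset so the next field starts at the requested
// alignment, then step past the field.
static uint64_t placeField(uint64_t CurrentOffset, uint64_t FieldSize,
                           llvm::Align A) {
  uint64_t Start = llvm::alignTo(CurrentOffset, A); // alignTo(13, Align(8)) == 16
  return Start + FieldSize;
}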
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
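A sketch of how ComputeValueVTs might be invoked from a lowering helper, using the signature shown above; the wrapper function itself is an assumption for illustration.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/TypeSize.h"

// Flatten an IR type (possibly an aggregate) into the EVTs the target will
// actually use, together with each piece's offset from the start of the type.
static void flattenType(const llvm::TargetLowering &TLI,
                        const llvm::DataLayout &DL, llvm::Type *Ty,
                        llvm::SmallVectorImpl<llvm::EVT> &ValueVTs,
                        llvm::SmallVectorImpl<llvm::TypeSize> &Offsets) {
  llvm::ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
}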
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
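commonAlignment typically appears when deriving the alignment of a slice of a larger object; a sketch (hypothetical helper, not from this file) of that pattern:

#include "llvm/Support/Alignment.h"
#include <cstdint>

// If the whole object is aligned to ObjAlign, a piece starting at byte Offset
// is aligned to whatever both the object alignment and the offset allow,
// e.g. commonAlignment(Align(16), 4) == Align(4).
static llvm::Align sliceAlignment(llvm::Align ObjAlign, uint64_t Offset) {
  return llvm::commonAlignment(ObjAlign, Offset);
}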
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:257
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:251
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
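A sketch (assumed, not code from this file) exercising a few of the EVT queries listed above: build a v4f16 vector type and inspect it.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

static void evtQueries(llvm::LLVMContext &Ctx) {
  // v4f16: four half-precision elements, 64 bits total.
  llvm::EVT VecVT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f16, 4);
  assert(VecVT.isVector() && VecVT.isFloatingPoint());
  assert(VecVT.getVectorNumElements() == 4);
  assert(VecVT.getVectorElementType() == llvm::MVT::f16);
  assert(VecVT.getFixedSizeInBits() == 64);
  // Same-width integer view, as changeTypeToInteger() describes.
  llvm::EVT IntVT = VecVT.changeTypeToInteger();
  assert(IntVT.isInteger() && IntVT.bitsEq(VecVT));
  (void)VecVT;
  (void)IntVT;
}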
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)