Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 13330, column 55
The result of the '%' expression is undefined

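The message above is produced by the analyzer's core checks for undefined binary-operator results: a '%' expression is flagged when one of its operands may carry an uninitialized (garbage) value along some feasible path, while a divisor that is plainly zero is reported by a separate division-by-zero check. Line 13330 itself lies past the excerpt reproduced below, so the snippet that follows is only a minimal, hypothetical sketch of the pattern behind this kind of report; the function pickLane and its parameters are invented for illustration and do not appear in X86ISelLowering.cpp.

// Hypothetical example of the diagnosed pattern, not the code at line 13330.
static int pickLane(bool HasHint, int Hint, int NumLanes) {
  int Idx;                 // deliberately left uninitialized
  if (HasHint)
    Idx = Hint;
  // When HasHint is false, Idx still holds a garbage value here, so the
  // analyzer reports that the result of 'Idx % NumLanes' is undefined.
  return Idx % NumLanes;
}

Running the same core checkers over a function like this reproduces a report of the same form, which can help when judging whether the path the analyzer describes for line 13330 is actually feasible in the real code.
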
Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-17/lib/clang/17 -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-04-13-070520-16501-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/ObjCARCUtil.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachineModuleInfo.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/WinEHFuncInfo.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DerivedTypes.h"
46#include "llvm/IR/DiagnosticInfo.h"
47#include "llvm/IR/EHPersonalities.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// the return registers.
110static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121/// Returns true if a CC can dynamically exclude a register from the list of
122/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
123/// the parameters.
124static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
125 return CC == CallingConv::X86_RegCall;
126}
127
128X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
129 const X86Subtarget &STI)
130 : TargetLowering(TM), Subtarget(STI) {
131 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
132 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
133
134 // Set up the TargetLowering object.
135
136 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 setBooleanContents(ZeroOrOneBooleanContent);
138 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
140
141 // For 64-bit, since we have so many registers, use the ILP scheduler.
142 // For 32-bit, use the register pressure specific scheduling.
143 // For Atom, always use ILP scheduling.
144 if (Subtarget.isAtom())
145 setSchedulingPreference(Sched::ILP);
146 else if (Subtarget.is64Bit())
147 setSchedulingPreference(Sched::ILP);
148 else
149 setSchedulingPreference(Sched::RegPressure);
150 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
151 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
152
153 // Bypass expensive divides and use cheaper ones.
154 if (TM.getOptLevel() >= CodeGenOpt::Default) {
155 if (Subtarget.hasSlowDivide32())
156 addBypassSlowDiv(32, 8);
157 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
158 addBypassSlowDiv(64, 32);
159 }
160
161 // Setup Windows compiler runtime calls.
162 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
163 static const struct {
164 const RTLIB::Libcall Op;
165 const char * const Name;
166 const CallingConv::ID CC;
167 } LibraryCalls[] = {
168 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
169 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
170 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
171 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
172 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
173 };
174
175 for (const auto &LC : LibraryCalls) {
176 setLibcallName(LC.Op, LC.Name);
177 setLibcallCallingConv(LC.Op, LC.CC);
178 }
179 }
180
181 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
182 // MSVCRT doesn't have powi; fall back to pow
183 setLibcallName(RTLIB::POWI_F32, nullptr);
184 setLibcallName(RTLIB::POWI_F64, nullptr);
185 }
186
187 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
188 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
189 // FIXME: Should we be limiting the atomic size on other configs? Default is
190 // 1024.
191 if (!Subtarget.canUseCMPXCHG8B())
192 setMaxAtomicSizeInBitsSupported(32);
193
194 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
195
196 setMaxLargeFPConvertBitWidthSupported(128);
197
198 // Set up the register classes.
199 addRegisterClass(MVT::i8, &X86::GR8RegClass);
200 addRegisterClass(MVT::i16, &X86::GR16RegClass);
201 addRegisterClass(MVT::i32, &X86::GR32RegClass);
202 if (Subtarget.is64Bit())
203 addRegisterClass(MVT::i64, &X86::GR64RegClass);
204
205 for (MVT VT : MVT::integer_valuetypes())
206 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
207
208 // We don't accept any truncstore of integer registers.
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
212 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
215
216 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
217
218 // SETOEQ and SETUNE require checking two conditions.
219 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
220 setCondCodeAction(ISD::SETOEQ, VT, Expand);
221 setCondCodeAction(ISD::SETUNE, VT, Expand);
222 }
223
224 // Integer absolute.
225 if (Subtarget.canUseCMOV()) {
226 setOperationAction(ISD::ABS , MVT::i16 , Custom);
227 setOperationAction(ISD::ABS , MVT::i32 , Custom);
228 if (Subtarget.is64Bit())
229 setOperationAction(ISD::ABS , MVT::i64 , Custom);
230 }
231
232 // Signed saturation subtraction.
233 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
234 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
235 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
238
239 // Funnel shifts.
240 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
241 // For slow shld targets we only lower for code size.
242 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
243
244 setOperationAction(ShiftOp , MVT::i8 , Custom);
245 setOperationAction(ShiftOp , MVT::i16 , Custom);
246 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
247 if (Subtarget.is64Bit())
248 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
249 }
250
251 if (!Subtarget.useSoftFloat()) {
252 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
253 // operation.
254 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
255 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
256 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
257 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
258 // We have an algorithm for SSE2, and we turn this into a 64-bit
259 // FILD or VCVTUSI2SS/SD for other targets.
260 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
261 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
262 // We have an algorithm for SSE2->double, and we turn this into a
263 // 64-bit FILD followed by conditional FADD for other targets.
264 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
265 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
266
267 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
268 // this operation.
269 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
270 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
271 // SSE has no i16 to fp conversion, only i32. We promote in the handler
272 // to allow f80 to use i16 and f64 to use i16 with sse1 only
273 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
274 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
275 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
276 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
277 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
278 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
279 // are Legal, f80 is custom lowered.
280 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
281 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
282
283 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
284 // this operation.
285 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
286 // FIXME: This doesn't generate invalid exception when it should. PR44019.
287 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
288 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
289 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
290 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
291 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
292 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
293 // are Legal, f80 is custom lowered.
294 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
295 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
296
297 // Handle FP_TO_UINT by promoting the destination to a larger signed
298 // conversion.
299 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
300 // FIXME: This doesn't generate invalid exception when it should. PR44019.
301 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
302 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
303 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
305 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
306 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
307 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
308 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
309
310 setOperationAction(ISD::LRINT, MVT::f32, Custom);
311 setOperationAction(ISD::LRINT, MVT::f64, Custom);
312 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
313 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
314
315 if (!Subtarget.is64Bit()) {
316 setOperationAction(ISD::LRINT, MVT::i64, Custom);
317 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
318 }
319 }
320
321 if (Subtarget.hasSSE2()) {
322 // Custom lowering for saturating float to int conversions.
323 // We handle promotion to larger result types manually.
324 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
325 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
326 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
327 }
328 if (Subtarget.is64Bit()) {
329 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
330 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
331 }
332 }
333
334 // Handle address space casts between mixed sized pointers.
335 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
336 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
337
338 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
339 if (!Subtarget.hasSSE2()) {
340 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
341 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
342 if (Subtarget.is64Bit()) {
343 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
344 // Without SSE, i64->f64 goes through memory.
345 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
346 }
347 } else if (!Subtarget.is64Bit())
348 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
349
350 // Scalar integer divide and remainder are lowered to use operations that
351 // produce two results, to match the available instructions. This exposes
352 // the two-result form to trivial CSE, which is able to combine x/y and x%y
353 // into a single instruction.
354 //
355 // Scalar integer multiply-high is also lowered to use two-result
356 // operations, to match the available instructions. However, plain multiply
357 // (low) operations are left as Legal, as there are single-result
358 // instructions for this in x86. Using the two-result multiply instructions
359 // when both high and low results are needed must be arranged by dagcombine.
360 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
361 setOperationAction(ISD::MULHS, VT, Expand);
362 setOperationAction(ISD::MULHU, VT, Expand);
363 setOperationAction(ISD::SDIV, VT, Expand);
364 setOperationAction(ISD::UDIV, VT, Expand);
365 setOperationAction(ISD::SREM, VT, Expand);
366 setOperationAction(ISD::UREM, VT, Expand);
367 }
368
369 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
370 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
371 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
372 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
373 setOperationAction(ISD::BR_CC, VT, Expand);
374 setOperationAction(ISD::SELECT_CC, VT, Expand);
375 }
376 if (Subtarget.is64Bit())
377 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
378 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
379 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
380 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
381
382 setOperationAction(ISD::FREM , MVT::f32 , Expand);
383 setOperationAction(ISD::FREM , MVT::f64 , Expand);
384 setOperationAction(ISD::FREM , MVT::f80 , Expand);
385 setOperationAction(ISD::FREM , MVT::f128 , Expand);
386
387 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
388 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
389 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
390 }
391
392 // Promote the i8 variants and force them on up to i32 which has a shorter
393 // encoding.
394 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
395 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
396 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
397 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
398 // promote that too.
399 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
400 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
401
402 if (!Subtarget.hasBMI()) {
403 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
404 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
405 if (Subtarget.is64Bit()) {
406 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
407 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
408 }
409 }
410
411 if (Subtarget.hasLZCNT()) {
412 // When promoting the i8 variants, force them to i32 for a shorter
413 // encoding.
414 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
415 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
416 } else {
417 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
418 if (VT == MVT::i64 && !Subtarget.is64Bit())
419 continue;
420 setOperationAction(ISD::CTLZ , VT, Custom);
421 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
422 }
423 }
424
425 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
426 ISD::STRICT_FP_TO_FP16}) {
427 // Special handling for half-precision floating point conversions.
428 // If we don't have F16C support, then lower half float conversions
429 // into library calls.
430 setOperationAction(
431 Op, MVT::f32,
432 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
433 // There's never any support for operations beyond MVT::f32.
434 setOperationAction(Op, MVT::f64, Expand);
435 setOperationAction(Op, MVT::f80, Expand);
436 setOperationAction(Op, MVT::f128, Expand);
437 }
438
439 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
440 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
441 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
442 setTruncStoreAction(VT, MVT::f16, Expand);
443 setTruncStoreAction(VT, MVT::bf16, Expand);
444
445 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
446 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
447 }
448
449 setOperationAction(ISD::PARITY, MVT::i8, Custom);
450 setOperationAction(ISD::PARITY, MVT::i16, Custom);
451 setOperationAction(ISD::PARITY, MVT::i32, Custom);
452 if (Subtarget.is64Bit())
453 setOperationAction(ISD::PARITY, MVT::i64, Custom);
454 if (Subtarget.hasPOPCNT()) {
455 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
456 // popcntw is longer to encode than popcntl and also has a false dependency
457 // on the dest that popcntl hasn't had since Cannon Lake.
458 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
459 } else {
460 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
461 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
462 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
463 if (Subtarget.is64Bit())
464 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
465 else
466 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
467 }
468
469 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
470
471 if (!Subtarget.hasMOVBE())
472 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
473
474 // X86 wants to expand cmov itself.
475 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
476 setOperationAction(ISD::SELECT, VT, Custom);
477 setOperationAction(ISD::SETCC, VT, Custom);
478 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
479 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
480 }
481 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
482 if (VT == MVT::i64 && !Subtarget.is64Bit())
483 continue;
484 setOperationAction(ISD::SELECT, VT, Custom);
485 setOperationAction(ISD::SETCC, VT, Custom);
486 }
487
488 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
489 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
490 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
491
492 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
493 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
494 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
495 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
496 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
497 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
498 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
499 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
500
501 // Darwin ABI issue.
502 for (auto VT : { MVT::i32, MVT::i64 }) {
503 if (VT == MVT::i64 && !Subtarget.is64Bit())
504 continue;
505 setOperationAction(ISD::ConstantPool , VT, Custom);
506 setOperationAction(ISD::JumpTable , VT, Custom);
507 setOperationAction(ISD::GlobalAddress , VT, Custom);
508 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
509 setOperationAction(ISD::ExternalSymbol , VT, Custom);
510 setOperationAction(ISD::BlockAddress , VT, Custom);
511 }
512
513 // 64-bit shl, sra, srl (iff 32-bit x86)
514 for (auto VT : { MVT::i32, MVT::i64 }) {
515 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 continue;
517 setOperationAction(ISD::SHL_PARTS, VT, Custom);
518 setOperationAction(ISD::SRA_PARTS, VT, Custom);
519 setOperationAction(ISD::SRL_PARTS, VT, Custom);
520 }
521
522 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
523 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
524
525 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
526
527 // Expand certain atomics
528 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
529 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
530 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
531 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
532 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
533 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
534 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
535 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
536 }
537
538 if (!Subtarget.is64Bit())
539 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
540
541 if (Subtarget.canUseCMPXCHG16B())
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
543
544 // FIXME - use subtarget debug flags
545 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
546 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
547 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
548 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
549 }
550
551 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
552 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
553
554 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
555 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
556
557 setOperationAction(ISD::TRAP, MVT::Other, Legal);
558 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
559 if (Subtarget.isTargetPS())
560 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
561 else
562 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
563
564 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
565 setOperationAction(ISD::VASTART , MVT::Other, Custom);
566 setOperationAction(ISD::VAEND , MVT::Other, Expand);
567 bool Is64Bit = Subtarget.is64Bit();
568 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
569 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
570
571 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
572 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
573
574 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
575
576 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
577 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
578 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
579
580 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
581
582 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
583 setOperationAction(ISD::FABS, VT, Action);
584 setOperationAction(ISD::FNEG, VT, Action);
585 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
586 setOperationAction(ISD::FREM, VT, Action);
587 setOperationAction(ISD::FMA, VT, Action);
588 setOperationAction(ISD::FMINNUM, VT, Action);
589 setOperationAction(ISD::FMAXNUM, VT, Action);
590 setOperationAction(ISD::FMINIMUM, VT, Action);
591 setOperationAction(ISD::FMAXIMUM, VT, Action);
592 setOperationAction(ISD::FSIN, VT, Action);
593 setOperationAction(ISD::FCOS, VT, Action);
594 setOperationAction(ISD::FSINCOS, VT, Action);
595 setOperationAction(ISD::FSQRT, VT, Action);
596 setOperationAction(ISD::FPOW, VT, Action);
597 setOperationAction(ISD::FLOG, VT, Action);
598 setOperationAction(ISD::FLOG2, VT, Action);
599 setOperationAction(ISD::FLOG10, VT, Action);
600 setOperationAction(ISD::FEXP, VT, Action);
601 setOperationAction(ISD::FEXP2, VT, Action);
602 setOperationAction(ISD::FCEIL, VT, Action);
603 setOperationAction(ISD::FFLOOR, VT, Action);
604 setOperationAction(ISD::FNEARBYINT, VT, Action);
605 setOperationAction(ISD::FRINT, VT, Action);
606 setOperationAction(ISD::BR_CC, VT, Action);
607 setOperationAction(ISD::SETCC, VT, Action);
608 setOperationAction(ISD::SELECT, VT, Custom);
609 setOperationAction(ISD::SELECT_CC, VT, Action);
610 setOperationAction(ISD::FROUND, VT, Action);
611 setOperationAction(ISD::FROUNDEVEN, VT, Action);
612 setOperationAction(ISD::FTRUNC, VT, Action);
613 };
614
615 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
616 // f16, f32 and f64 use SSE.
617 // Set up the FP register classes.
618 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
619 : &X86::FR16RegClass);
620 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
621 : &X86::FR32RegClass);
622 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
623 : &X86::FR64RegClass);
624
625 // Disable f32->f64 extload as we can only generate this in one instruction
626 // under optsize. So it's easier to pattern match (fpext (load)) for that
627 // case instead of needing to emit 2 instructions for extload in the
628 // non-optsize case.
629 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
630
631 for (auto VT : { MVT::f32, MVT::f64 }) {
632 // Use ANDPD to simulate FABS.
633 setOperationAction(ISD::FABS, VT, Custom);
634
635 // Use XORP to simulate FNEG.
636 setOperationAction(ISD::FNEG, VT, Custom);
637
638 // Use ANDPD and ORPD to simulate FCOPYSIGN.
639 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
640
641 // These might be better off as horizontal vector ops.
642 setOperationAction(ISD::FADD, VT, Custom);
643 setOperationAction(ISD::FSUB, VT, Custom);
644
645 // We don't support sin/cos/fmod
646 setOperationAction(ISD::FSIN , VT, Expand);
647 setOperationAction(ISD::FCOS , VT, Expand);
648 setOperationAction(ISD::FSINCOS, VT, Expand);
649 }
650
651 // Half type will be promoted by default.
652 setF16Action(MVT::f16, Promote);
653 setOperationAction(ISD::FADD, MVT::f16, Promote);
654 setOperationAction(ISD::FSUB, MVT::f16, Promote);
655 setOperationAction(ISD::FMUL, MVT::f16, Promote);
656 setOperationAction(ISD::FDIV, MVT::f16, Promote);
657 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
658 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
659 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
660
661 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
662 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
663 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
664 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
665 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
666 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
667 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
668 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
669 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
670 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
680 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
681 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
682 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
683 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
684 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
685 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
686 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
687 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
689
690 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
691 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
692
693 // Lower this to MOVMSK plus an AND.
694 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
695 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
696
697 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
698 (UseX87 || Is64Bit)) {
699 // Use SSE for f32, x87 for f64.
700 // Set up the FP register classes.
701 addRegisterClass(MVT::f32, &X86::FR32RegClass);
702 if (UseX87)
703 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
704
705 // Use ANDPS to simulate FABS.
706 setOperationAction(ISD::FABS , MVT::f32, Custom);
707
708 // Use XORP to simulate FNEG.
709 setOperationAction(ISD::FNEG , MVT::f32, Custom);
710
711 if (UseX87)
712 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
713
714 // Use ANDPS and ORPS to simulate FCOPYSIGN.
715 if (UseX87)
716 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
717 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
718
719 // We don't support sin/cos/fmod
720 setOperationAction(ISD::FSIN , MVT::f32, Expand);
721 setOperationAction(ISD::FCOS , MVT::f32, Expand);
722 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
723
724 if (UseX87) {
725 // Always expand sin/cos functions even though x87 has an instruction.
726 setOperationAction(ISD::FSIN, MVT::f64, Expand);
727 setOperationAction(ISD::FCOS, MVT::f64, Expand);
728 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
729 }
730 } else if (UseX87) {
731 // f32 and f64 in x87.
732 // Set up the FP register classes.
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
735
736 for (auto VT : { MVT::f32, MVT::f64 }) {
737 setOperationAction(ISD::UNDEF, VT, Expand);
738 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
739
740 // Always expand sin/cos functions even though x87 has an instruction.
741 setOperationAction(ISD::FSIN , VT, Expand);
742 setOperationAction(ISD::FCOS , VT, Expand);
743 setOperationAction(ISD::FSINCOS, VT, Expand);
744 }
745 }
746
747 // Expand FP32 immediates into loads from the stack, save special cases.
748 if (isTypeLegal(MVT::f32)) {
749 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
750 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
751 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
752 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
753 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
754 } else // SSE immediates.
755 addLegalFPImmediate(APFloat(+0.0f)); // xorps
756 }
757 // Expand FP64 immediates into loads from the stack, save special cases.
758 if (isTypeLegal(MVT::f64)) {
759 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
760 addLegalFPImmediate(APFloat(+0.0)); // FLD0
761 addLegalFPImmediate(APFloat(+1.0)); // FLD1
762 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
763 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
764 } else // SSE immediates.
765 addLegalFPImmediate(APFloat(+0.0)); // xorpd
766 }
767 // Support fp16 0 immediate.
768 if (isTypeLegal(MVT::f16))
769 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
770
771 // Handle constrained floating-point operations of scalar.
772 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
773 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
774 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
775 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
776 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
777 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
778 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
779 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
780 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
781 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
782 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
783 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
784
785 // We don't support FMA.
786 setOperationAction(ISD::FMA, MVT::f64, Expand);
787 setOperationAction(ISD::FMA, MVT::f32, Expand);
788
789 // f80 always uses X87.
790 if (UseX87) {
791 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
792 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
793 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
794 {
795 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
796 addLegalFPImmediate(TmpFlt); // FLD0
797 TmpFlt.changeSign();
798 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
799
800 bool ignored;
801 APFloat TmpFlt2(+1.0);
802 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
803 &ignored);
804 addLegalFPImmediate(TmpFlt2); // FLD1
805 TmpFlt2.changeSign();
806 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
807 }
808
809 // Always expand sin/cos functions even though x87 has an instruction.
810 setOperationAction(ISD::FSIN , MVT::f80, Expand);
811 setOperationAction(ISD::FCOS , MVT::f80, Expand);
812 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
813
814 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
815 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
816 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
817 setOperationAction(ISD::FRINT, MVT::f80, Expand);
818 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
819 setOperationAction(ISD::FMA, MVT::f80, Expand);
820 setOperationAction(ISD::LROUND, MVT::f80, Expand);
821 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
822 setOperationAction(ISD::LRINT, MVT::f80, Custom);
823 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
824
825 // Handle constrained floating-point operations of scalar.
826 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
827 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
828 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
829 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
830 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
831 if (isTypeLegal(MVT::f16)) {
832 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
833 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
834 } else {
835 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
836 }
837 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
838 // as Custom.
839 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
840 }
841
842 // f128 uses xmm registers, but most operations require libcalls.
843 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
844 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
845 : &X86::VR128RegClass);
846
847 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
848
849 setOperationAction(ISD::FADD, MVT::f128, LibCall);
850 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
851 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
852 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
853 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
854 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
855 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
856 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
857 setOperationAction(ISD::FMA, MVT::f128, LibCall);
858 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
859
860 setOperationAction(ISD::FABS, MVT::f128, Custom);
861 setOperationAction(ISD::FNEG, MVT::f128, Custom);
862 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
863
864 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
865 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
866 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
867 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
868 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
869 // No STRICT_FSINCOS
870 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
871 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
872
873 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
874 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
875 // We need to custom handle any FP_ROUND with an f128 input, but
876 // LegalizeDAG uses the result type to know when to run a custom handler.
877 // So we have to list all legal floating point result types here.
878 if (isTypeLegal(MVT::f32)) {
879 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
880 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
881 }
882 if (isTypeLegal(MVT::f64)) {
883 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
884 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
885 }
886 if (isTypeLegal(MVT::f80)) {
887 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
888 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
889 }
890
891 setOperationAction(ISD::SETCC, MVT::f128, Custom);
892
893 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
894 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
895 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
896 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
897 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
898 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
899 }
900
901 // Always use a library call for pow.
902 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
903 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
904 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
905 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
906
907 setOperationAction(ISD::FLOG, MVT::f80, Expand);
908 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
909 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
910 setOperationAction(ISD::FEXP, MVT::f80, Expand);
911 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
912 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
913 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
914
915 // Some FP actions are always expanded for vector types.
916 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
917 MVT::v4f32, MVT::v8f32, MVT::v16f32,
918 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
919 setOperationAction(ISD::FSIN, VT, Expand);
920 setOperationAction(ISD::FSINCOS, VT, Expand);
921 setOperationAction(ISD::FCOS, VT, Expand);
922 setOperationAction(ISD::FREM, VT, Expand);
923 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
924 setOperationAction(ISD::FPOW, VT, Expand);
925 setOperationAction(ISD::FLOG, VT, Expand);
926 setOperationAction(ISD::FLOG2, VT, Expand);
927 setOperationAction(ISD::FLOG10, VT, Expand);
928 setOperationAction(ISD::FEXP, VT, Expand);
929 setOperationAction(ISD::FEXP2, VT, Expand);
930 }
931
932 // First set operation action for all vector types to either promote
933 // (for widening) or expand (for scalarization). Then we will selectively
934 // turn on ones that can be effectively codegen'd.
935 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
936 setOperationAction(ISD::SDIV, VT, Expand);
937 setOperationAction(ISD::UDIV, VT, Expand);
938 setOperationAction(ISD::SREM, VT, Expand);
939 setOperationAction(ISD::UREM, VT, Expand);
940 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
941 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
942 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
943 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
944 setOperationAction(ISD::FMA, VT, Expand);
945 setOperationAction(ISD::FFLOOR, VT, Expand);
946 setOperationAction(ISD::FCEIL, VT, Expand);
947 setOperationAction(ISD::FTRUNC, VT, Expand);
948 setOperationAction(ISD::FRINT, VT, Expand);
949 setOperationAction(ISD::FNEARBYINT, VT, Expand);
950 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
951 setOperationAction(ISD::MULHS, VT, Expand);
952 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
953 setOperationAction(ISD::MULHU, VT, Expand);
954 setOperationAction(ISD::SDIVREM, VT, Expand);
955 setOperationAction(ISD::UDIVREM, VT, Expand);
956 setOperationAction(ISD::CTPOP, VT, Expand);
957 setOperationAction(ISD::CTTZ, VT, Expand);
958 setOperationAction(ISD::CTLZ, VT, Expand);
959 setOperationAction(ISD::ROTL, VT, Expand);
960 setOperationAction(ISD::ROTR, VT, Expand);
961 setOperationAction(ISD::BSWAP, VT, Expand);
962 setOperationAction(ISD::SETCC, VT, Expand);
963 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
964 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
965 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
966 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
967 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
968 setOperationAction(ISD::TRUNCATE, VT, Expand);
969 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
970 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
971 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
972 setOperationAction(ISD::SELECT_CC, VT, Expand);
973 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
974 setTruncStoreAction(InnerVT, VT, Expand);
975
976 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
977 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
978
979 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
980 // types, we have to deal with them whether we ask for Expansion or not.
981 // Setting Expand causes its own optimisation problems though, so leave
982 // them legal.
983 if (VT.getVectorElementType() == MVT::i1)
984 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
985
986 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
987 // split/scalarized right now.
988 if (VT.getVectorElementType() == MVT::f16 ||
989 VT.getVectorElementType() == MVT::bf16)
990 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
991 }
992 }
993
994 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
995 // with -msoft-float, disable use of MMX as well.
996 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
997 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
998 // No operations on x86mmx supported, everything uses intrinsics.
999 }
1000
1001 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1002 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1003 : &X86::VR128RegClass);
1004
1005 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1006 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1007 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1008 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1009 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1010 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1011 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1013
1014 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1015 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1016
1017 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1018 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1019 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1020 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1021 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1022 }
1023
1024 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1025 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1026 : &X86::VR128RegClass);
1027
1028 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1029 // registers cannot be used even for integer operations.
1030 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1031 : &X86::VR128RegClass);
1032 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1033 : &X86::VR128RegClass);
1034 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1035 : &X86::VR128RegClass);
1036 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1037 : &X86::VR128RegClass);
1038 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1039 : &X86::VR128RegClass);
1040
1041 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1042 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1043 setOperationAction(ISD::SDIV, VT, Custom);
1044 setOperationAction(ISD::SREM, VT, Custom);
1045 setOperationAction(ISD::UDIV, VT, Custom);
1046 setOperationAction(ISD::UREM, VT, Custom);
1047 }
1048
1049 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1050 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1051 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1052
1053 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1054 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1055 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1056 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1057 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1058 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1059 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1060 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1061 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1062 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1063 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1064 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1065
1066 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1067 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1068 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1069
1070 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1071 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1072 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1073
1074 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1075 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1076 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1077 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1078 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1079 }
1080
1081 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1082 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1083
1084 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1085 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1086 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1087 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1088 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1089 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1090 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1091 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1092 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1093 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1094
1095 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1096 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1097 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1098 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1099
1100 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1101 setOperationAction(ISD::SETCC, VT, Custom);
1102 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1103 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1104 setOperationAction(ISD::CTPOP, VT, Custom);
1105 setOperationAction(ISD::ABS, VT, Custom);
1106
1107 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1108 // setcc all the way to isel and prefer SETGT in some isel patterns.
1109 setCondCodeAction(ISD::SETLT, VT, Custom);
1110 setCondCodeAction(ISD::SETLE, VT, Custom);
1111 }
1112
1113 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1114 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1115 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1116 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1117 setOperationAction(ISD::VSELECT, VT, Custom);
1118 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1119 }
1120
1121 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1122 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1123 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1124 setOperationAction(ISD::VSELECT, VT, Custom);
1125
1126 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1127 continue;
1128
1129 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1130 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1131 }
1132 setF16Action(MVT::v8f16, Expand);
1133 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1134 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1135 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1136 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1137
1138 // Custom lower v2i64 and v2f64 selects.
1139 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1140 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1141 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1142 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1143 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1144 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1145
1146 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1147 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1148 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1149 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1150 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1151 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1152
1153 // Custom legalize these to avoid over promotion or custom promotion.
1154 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1155 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1156 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1157 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1158 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1159 }
1160
1161 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1162 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1163 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1164 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1165
1166 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1167 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1168
1169 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1170 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1171
1172 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1173 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1174 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1175 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1176 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1177
1178 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1179 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1180 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1181 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1182
1183 // We want to legalize this to an f64 load rather than an i64 load on
1184 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1185 // store.
1186 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1187 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1188 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1189 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1190 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1191 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1192
1193 // Add 32-bit vector stores to help vectorization opportunities.
1194 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1195 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1196
1197 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1198 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1199 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1200 if (!Subtarget.hasAVX512())
1201 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1202
1203 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1204 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1205 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1206
1207 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1208
1209 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1210 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1211 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1212 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1213 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1214 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1215
1216 // In the customized shift lowering, the legal v4i32/v2i64 cases
1217 // in AVX2 will be recognized.
1218 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1219 setOperationAction(ISD::SRL, VT, Custom);
1220 setOperationAction(ISD::SHL, VT, Custom);
1221 setOperationAction(ISD::SRA, VT, Custom);
1222 if (VT == MVT::v2i64) continue;
1223 setOperationAction(ISD::ROTL, VT, Custom);
1224 setOperationAction(ISD::ROTR, VT, Custom);
1225 setOperationAction(ISD::FSHL, VT, Custom);
1226 setOperationAction(ISD::FSHR, VT, Custom);
1227 }
1228
1229 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1230 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1231 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1232 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1233 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1234 }
1235
1236 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1237 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1238 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1239 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1240 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1241 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1242 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1243 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1244 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1245
1246 // These might be better off as horizontal vector ops.
1247 setOperationAction(ISD::ADD, MVT::i16, Custom);
1248 setOperationAction(ISD::ADD, MVT::i32, Custom);
1249 setOperationAction(ISD::SUB, MVT::i16, Custom);
1250 setOperationAction(ISD::SUB, MVT::i32, Custom);
1251 }
1252
1253 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1254 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1255 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1256 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1257 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1258 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1259 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1260 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1261 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1262 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1263 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1264 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1265 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1266 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1267
1268 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1269 }
1270
1271 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1272 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1273 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1274 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1275 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1276 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1277 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1278 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1279
1280 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1281 setOperationAction(ISD::ABDS, VT, Custom);
1282 setOperationAction(ISD::ABDU, VT, Custom);
1283 }
1284
1285 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1286 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1287 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1288
1289 // FIXME: Do we need to handle scalar-to-vector here?
1290 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1291 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1292
1293 // We directly match byte blends in the backend as they match the VSELECT
1294 // condition form.
1295 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1296
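As a hedged aside (not code from this file): a v16i8 VSELECT can stay Legal because SSE4.1's PBLENDVB already selects per byte on the sign bit of the mask. The helper name below is invented:

  #include <immintrin.h>

  // Returns a where the mask byte's MSB is set, b otherwise (pblendvb).
  static __m128i select_bytes(__m128i mask, __m128i a, __m128i b) {
    return _mm_blendv_epi8(b, a, mask);
  }
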
1297 // SSE41 brings specific instructions for doing vector sign extend even in
1298 // cases where we don't have SRA.
1299 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1300 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1301 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1302 }
1303
1304 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1305 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1306 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1307 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1308 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1309 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1310 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1311 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1312 }
1313
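For illustration only (helper names invented, SSE4.1 assumed), the PMOV[SZ]X family extends the low elements directly, and fed from memory it becomes the extending-load form marked Legal above:

  #include <immintrin.h>

  // Sign-extend the low eight i8 lanes to i16 (pmovsxbw).
  static __m128i sext_v8i8_to_v8i16(__m128i v) { return _mm_cvtepi8_epi16(v); }

  // Zero-extend the low four i16 lanes to i32 (pmovzxwd).
  static __m128i zext_v4i16_to_v4i32(__m128i v) { return _mm_cvtepu16_epi32(v); }
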
1314 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1315      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1316 // do the pre and post work in the vector domain.
1317 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1318 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1319 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1320 // so that DAG combine doesn't try to turn it into uint_to_fp.
1321 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1322 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1323 }
1324 }
1325
1326 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1327 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1328 }
1329
1330 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1331 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1332 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1333 setOperationAction(ISD::ROTL, VT, Custom);
1334 setOperationAction(ISD::ROTR, VT, Custom);
1335 }
1336
1337 // XOP can efficiently perform BITREVERSE with VPPERM.
1338 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1339 setOperationAction(ISD::BITREVERSE, VT, Custom);
1340
1341 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1342 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1343 setOperationAction(ISD::BITREVERSE, VT, Custom);
1344 }
1345
1346 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1347 bool HasInt256 = Subtarget.hasInt256();
1348
1349 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1350 : &X86::VR256RegClass);
1351 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1352 : &X86::VR256RegClass);
1353 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1354 : &X86::VR256RegClass);
1355 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1356 : &X86::VR256RegClass);
1357 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1358 : &X86::VR256RegClass);
1359 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1360 : &X86::VR256RegClass);
1361 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1362 : &X86::VR256RegClass);
1363
1364 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1365 setOperationAction(ISD::FFLOOR, VT, Legal);
1366 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1367 setOperationAction(ISD::FCEIL, VT, Legal);
1368 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1369 setOperationAction(ISD::FTRUNC, VT, Legal);
1370 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1371 setOperationAction(ISD::FRINT, VT, Legal);
1372 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1373 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1374 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1375 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1376 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1377
1378 setOperationAction(ISD::FROUND, VT, Custom);
1379
1380 setOperationAction(ISD::FNEG, VT, Custom);
1381 setOperationAction(ISD::FABS, VT, Custom);
1382 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1383 }
1384
1385 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1386 // even though v8i16 is a legal type.
1387 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1388 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1389 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1390 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1391 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1392 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1393 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1394
1395 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1396 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1397 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1398 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1399 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1400 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1401
1402 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1403 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1404 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1405 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1406 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1407 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1408 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1409 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1410 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1411 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1412 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1413
1414 if (!Subtarget.hasAVX512())
1415 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1416
1417 // In the customized shift lowering, the legal v8i32/v4i64 cases
1418 // in AVX2 will be recognized.
1419 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1420 setOperationAction(ISD::SRL, VT, Custom);
1421 setOperationAction(ISD::SHL, VT, Custom);
1422 setOperationAction(ISD::SRA, VT, Custom);
1423 setOperationAction(ISD::ABDS, VT, Custom);
1424 setOperationAction(ISD::ABDU, VT, Custom);
1425 if (VT == MVT::v4i64) continue;
1426 setOperationAction(ISD::ROTL, VT, Custom);
1427 setOperationAction(ISD::ROTR, VT, Custom);
1428 setOperationAction(ISD::FSHL, VT, Custom);
1429 setOperationAction(ISD::FSHR, VT, Custom);
1430 }
1431
1432 // These types need custom splitting if their input is a 128-bit vector.
1433 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1434 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1435 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1436 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1437
1438 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1439 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1440 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1441 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1442 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1443 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1444 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1445
1446 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1447 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1448 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1449 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1450 }
1451
1452 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1453 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1454 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1455 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1456
1457 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1458 setOperationAction(ISD::SETCC, VT, Custom);
1459 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1460 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1461 setOperationAction(ISD::CTPOP, VT, Custom);
1462 setOperationAction(ISD::CTLZ, VT, Custom);
1463
1464 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1465 // setcc all the way to isel and prefer SETGT in some isel patterns.
1466 setCondCodeAction(ISD::SETLT, VT, Custom);
1467 setCondCodeAction(ISD::SETLE, VT, Custom);
1468 }
1469
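A minimal sketch (not code from this file; AVX2 assumed for the 256-bit compare, helper name invented) of why SETLT/SETLE are Custom and SETGT is preferred: the integer compare instructions only provide "greater than", so a less-than is formed by swapping the operands:

  #include <immintrin.h>

  // a < b (signed, per i32 lane) expressed as b > a with vpcmpgtd.
  static __m256i less_than_v8i32(__m256i a, __m256i b) {
    return _mm256_cmpgt_epi32(b, a);
  }
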
1470 if (Subtarget.hasAnyFMA()) {
1471 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1472 MVT::v2f64, MVT::v4f64 }) {
1473 setOperationAction(ISD::FMA, VT, Legal);
1474 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1475 }
1476 }
1477
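For context, a hedged sketch (not code from this file; the FMA feature is assumed and the helper name is invented) of what marking FMA Legal buys: one fused instruction instead of a separate multiply and add:

  #include <immintrin.h>

  static __m256 fused_madd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c); // a vfmadd*ps form: a * b + c, single rounding
  }
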
1478 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1479 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1480 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1481 }
1482
1483 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1484 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1485 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1486 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1487
1488 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1489 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1490 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1491 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1492 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1493 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1494 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1495 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1496
1497 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1498 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1499
1500 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1501 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1502 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1503 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1504 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1505
1506 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1508 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1509 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1510 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1512 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1513 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1514 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1515 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1516 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1517 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1518
1519 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1520 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1521 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1522 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1523 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1524 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1525 }
1526
1527 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1529 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1530 }
1531
1532 if (HasInt256) {
1533 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1534      // when we have a 256-bit-wide blend with immediate.
1535 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1536 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1537
1538 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1539 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1540 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1541 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1542 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1543 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1544 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1545 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1546 }
1547 }
1548
1549 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1550 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1551 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1552 setOperationAction(ISD::MSTORE, VT, Legal);
1553 }
1554
1555 // Extract subvector is special because the value type
1556 // (result) is 128-bit but the source is 256-bit wide.
1557 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1558 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1559 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1560 }
1561
1562 // Custom lower several nodes for 256-bit types.
1563 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1564 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1565 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1566 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1567 setOperationAction(ISD::VSELECT, VT, Custom);
1568 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1569 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1570 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1571 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1572 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1573 setOperationAction(ISD::STORE, VT, Custom);
1574 }
1575 setF16Action(MVT::v16f16, Expand);
1576 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1577 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1578 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1579 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1580
1581 if (HasInt256) {
1582 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1583
1584 // Custom legalize 2x32 to get a little better code.
1585 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1586 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1587
1588 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1589 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1590 setOperationAction(ISD::MGATHER, VT, Custom);
1591 }
1592 }
1593
1594 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1595 Subtarget.hasF16C()) {
1596 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1597 setOperationAction(ISD::FP_ROUND, VT, Custom);
1598 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1599 }
1600 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1601 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1602 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1603 }
1604 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1605 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1606 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1607 }
1608
1609 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1610 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1611 }
1612
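For illustration (not code from this file; F16C assumed, helper names invented), the conversions the block above legalizes map onto VCVTPH2PS/VCVTPS2PH:

  #include <immintrin.h>

  // Widen four f16 values (in the low 64 bits) to four f32 (vcvtph2ps).
  static __m128 f16x4_to_f32x4(__m128i h) { return _mm_cvtph_ps(h); }

  // Narrow four f32 to f16 with round-to-nearest-even (vcvtps2ph).
  static __m128i f32x4_to_f16x4(__m128 f) {
    return _mm_cvtps_ph(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  }
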
1613 // This block controls legalization of the mask vector sizes that are
1614 // available with AVX512. 512-bit vectors are in a separate block controlled
1615 // by useAVX512Regs.
1616 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1617 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1618 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1619 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1620 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1621 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1622
1623 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1624 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1625 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1626
1627 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1628 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1629 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1630 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1631 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1632 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1633 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1634 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1635 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1636 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1637 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1638 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1639
1640 // There is no byte sized k-register load or store without AVX512DQ.
1641 if (!Subtarget.hasDQI()) {
1642 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1643 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1644 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1645 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1646
1647 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1648 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1649 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1650 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1651 }
1652
1653 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1654 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1655 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1656 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1657 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1658 }
1659
1660 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1661 setOperationAction(ISD::VSELECT, VT, Expand);
1662
1663 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1664 setOperationAction(ISD::SETCC, VT, Custom);
1665 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1666 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1667 setOperationAction(ISD::SELECT, VT, Custom);
1668 setOperationAction(ISD::TRUNCATE, VT, Custom);
1669
1670 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1671 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1672 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1673 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1674 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1675 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1676 }
1677
1678 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1679 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1680 }
1681
1682  // This block controls legalization for 512-bit operations with 32/64-bit
1683  // elements. 512-bit operations can be disabled based on the
1684  // prefer-vector-width and required-vector-width function attributes.
1685 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1686 bool HasBWI = Subtarget.hasBWI();
1687
1688 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1689 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1690 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1691 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1692 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1693 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1694 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1695
1696 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1697 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1698 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1699 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1700 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1701 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1702 if (HasBWI)
1703 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1704 }
1705
1706 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1707 setOperationAction(ISD::FNEG, VT, Custom);
1708 setOperationAction(ISD::FABS, VT, Custom);
1709 setOperationAction(ISD::FMA, VT, Legal);
1710 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1711 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1712 }
1713
1714 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1715 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1716 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1717 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1718 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1719 }
1720
1721 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1722 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1723 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1724 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1725 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1726 }
1727
1728 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1729 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1730 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1731 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1732 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1733 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1734
1735 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1736 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1737 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1738 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1739 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1740 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1741 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1742 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1743 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1744 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1745 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1746
1747 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1748 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1749 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1750 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1751 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1752 if (HasBWI)
1753 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1754
1755 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1756 // to 512-bit rather than use the AVX2 instructions so that we can use
1757 // k-masks.
1758 if (!Subtarget.hasVLX()) {
1759 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1760 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1761 setOperationAction(ISD::MLOAD, VT, Custom);
1762 setOperationAction(ISD::MSTORE, VT, Custom);
1763 }
1764 }
1765
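A hedged illustration of the trade-off described above (not code from this file; helper names invented, AVX and AVX-512F assumed): the AVX form keeps the mask in a vector register, while the 512-bit form takes a k-register mask:

  #include <immintrin.h>

  // AVX form: per-lane mask held in a ymm register (vmaskmovps).
  static __m256 masked_load_avx(const float *p, __m256i mask) {
    return _mm256_maskload_ps(p, mask);
  }

  // AVX-512 form: mask held in a k-register, masked-off lanes zeroed.
  static __m512 masked_load_kmask(const float *p, __mmask16 k) {
    return _mm512_maskz_loadu_ps(k, p);
  }
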
1766 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1767 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1768 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1769 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1770 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1771 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1772 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1773 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1774 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1775 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1776 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1777 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1778 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1779
1780 if (HasBWI) {
1781 // Extends from v64i1 masks to 512-bit vectors.
1782 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1783 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1784 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1785 }
1786
1787 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1788 setOperationAction(ISD::FFLOOR, VT, Legal);
1789 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1790 setOperationAction(ISD::FCEIL, VT, Legal);
1791 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1792 setOperationAction(ISD::FTRUNC, VT, Legal);
1793 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1794 setOperationAction(ISD::FRINT, VT, Legal);
1795 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1796 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1797 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1798 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1799 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1800
1801 setOperationAction(ISD::FROUND, VT, Custom);
1802 }
1803
1804 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1805 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1806 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1807 }
1808
1809 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1810 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1811 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1812 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1813
1814 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1815 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1816 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1817 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1818
1819 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1820 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1821 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1822 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1823 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1824 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1825 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1827
1828 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1829 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1830
1831 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1832
1833 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1834 setOperationAction(ISD::SRL, VT, Custom);
1835 setOperationAction(ISD::SHL, VT, Custom);
1836 setOperationAction(ISD::SRA, VT, Custom);
1837 setOperationAction(ISD::ROTL, VT, Custom);
1838 setOperationAction(ISD::ROTR, VT, Custom);
1839 setOperationAction(ISD::SETCC, VT, Custom);
1840 setOperationAction(ISD::ABDS, VT, Custom);
1841 setOperationAction(ISD::ABDU, VT, Custom);
1842
1843 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1844 // setcc all the way to isel and prefer SETGT in some isel patterns.
1845 setCondCodeAction(ISD::SETLT, VT, Custom);
1846 setCondCodeAction(ISD::SETLE, VT, Custom);
1847 }
1848 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1849 setOperationAction(ISD::SMAX, VT, Legal);
1850 setOperationAction(ISD::UMAX, VT, Legal);
1851 setOperationAction(ISD::SMIN, VT, Legal);
1852 setOperationAction(ISD::UMIN, VT, Legal);
1853 setOperationAction(ISD::ABS, VT, Legal);
1854 setOperationAction(ISD::CTPOP, VT, Custom);
1855 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1856 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1857 }
1858
1859 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1860 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1861 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1862 setOperationAction(ISD::CTLZ, VT, Custom);
1863 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1864 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1865 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1866 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1867 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1868 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1869 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1870 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1871 }
1872
1873 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1874 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1875 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1876 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1877 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1878 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1879
1880 if (Subtarget.hasDQI()) {
1881 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1882 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1883 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1884 setOperationAction(Opc, MVT::v8i64, Custom);
1885 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1886 }
1887
1888 if (Subtarget.hasCDI()) {
1889      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1890 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1891 setOperationAction(ISD::CTLZ, VT, Legal);
1892 }
1893 } // Subtarget.hasCDI()
1894
1895 if (Subtarget.hasVPOPCNTDQ()) {
1896 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1897 setOperationAction(ISD::CTPOP, VT, Legal);
1898 }
1899
1900 // Extract subvector is special because the value type
1901 // (result) is 256-bit but the source is 512-bit wide.
1902 // 128-bit was made Legal under AVX1.
1903 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1904 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1905 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1906
1907 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1908 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1909 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1910 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1911 setOperationAction(ISD::SELECT, VT, Custom);
1912 setOperationAction(ISD::VSELECT, VT, Custom);
1913 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1914 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1915 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1916 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1917 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1918 }
1919 setF16Action(MVT::v32f16, Expand);
1920 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1921 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1922 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1923 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1924 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1925 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1926 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1927 }
1928
1929 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1930 setOperationAction(ISD::MLOAD, VT, Legal);
1931 setOperationAction(ISD::MSTORE, VT, Legal);
1932 setOperationAction(ISD::MGATHER, VT, Custom);
1933 setOperationAction(ISD::MSCATTER, VT, Custom);
1934 }
1935 if (HasBWI) {
1936 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1937 setOperationAction(ISD::MLOAD, VT, Legal);
1938 setOperationAction(ISD::MSTORE, VT, Legal);
1939 }
1940 } else {
1941 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1942 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1943 }
1944
1945 if (Subtarget.hasVBMI2()) {
1946 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1947 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1948 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1949 setOperationAction(ISD::FSHL, VT, Custom);
1950 setOperationAction(ISD::FSHR, VT, Custom);
1951 }
1952
1953 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1954 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1955 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1956 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1957 }
1958 }// useAVX512Regs
1959
1960 // This block controls legalization for operations that don't have
1961 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1962 // narrower widths.
1963 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1964 // These operations are handled on non-VLX by artificially widening in
1965 // isel patterns.
1966
1967 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1968 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1969 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1970
1971 if (Subtarget.hasDQI()) {
1972 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1973 // v2f32 UINT_TO_FP is already custom under SSE2.
1974      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1975             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1976             "Unexpected operation action!");
1977 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1978 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1979 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1980 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1981 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1982 }
1983
1984 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1985 setOperationAction(ISD::SMAX, VT, Legal);
1986 setOperationAction(ISD::UMAX, VT, Legal);
1987 setOperationAction(ISD::SMIN, VT, Legal);
1988 setOperationAction(ISD::UMIN, VT, Legal);
1989 setOperationAction(ISD::ABS, VT, Legal);
1990 }
1991
1992 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1993 setOperationAction(ISD::ROTL, VT, Custom);
1994 setOperationAction(ISD::ROTR, VT, Custom);
1995 }
1996
1997 // Custom legalize 2x32 to get a little better code.
1998 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1999 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2000
2001 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2002 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2003 setOperationAction(ISD::MSCATTER, VT, Custom);
2004
2005 if (Subtarget.hasDQI()) {
2006 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2007 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2008 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2009 setOperationAction(Opc, MVT::v2i64, Custom);
2010 setOperationAction(Opc, MVT::v4i64, Custom);
2011 }
2012 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2013 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2014 }
2015
2016 if (Subtarget.hasCDI()) {
2017 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2018 setOperationAction(ISD::CTLZ, VT, Legal);
2019 }
2020 } // Subtarget.hasCDI()
2021
2022 if (Subtarget.hasVPOPCNTDQ()) {
2023 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2024 setOperationAction(ISD::CTPOP, VT, Legal);
2025 }
2026 }
2027
2028  // This block controls legalization of v32i1/v64i1, which are available with
2029 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
2030 // useBWIRegs.
2031 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2032 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2033 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2034
2035 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2036 setOperationAction(ISD::VSELECT, VT, Expand);
2037 setOperationAction(ISD::TRUNCATE, VT, Custom);
2038 setOperationAction(ISD::SETCC, VT, Custom);
2039 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2040 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2041 setOperationAction(ISD::SELECT, VT, Custom);
2042 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2043 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2044 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2045 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2046 }
2047
2048 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2049 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2050
2051 // Extends from v32i1 masks to 256-bit vectors.
2052 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2053 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2054 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2055
2056 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2057 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2058 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2059 }
2060
2061 // These operations are handled on non-VLX by artificially widening in
2062 // isel patterns.
2063 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2064
2065 if (Subtarget.hasBITALG()) {
2066 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2067 setOperationAction(ISD::CTPOP, VT, Legal);
2068 }
2069 }
2070
2071 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2072 auto setGroup = [&] (MVT VT) {
2073 setOperationAction(ISD::FADD, VT, Legal);
2074 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2075 setOperationAction(ISD::FSUB, VT, Legal);
2076 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2077 setOperationAction(ISD::FMUL, VT, Legal);
2078 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2079 setOperationAction(ISD::FDIV, VT, Legal);
2080 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2081 setOperationAction(ISD::FSQRT, VT, Legal);
2082 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2083
2084 setOperationAction(ISD::FFLOOR, VT, Legal);
2085 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2086 setOperationAction(ISD::FCEIL, VT, Legal);
2087 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2088 setOperationAction(ISD::FTRUNC, VT, Legal);
2089 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2090 setOperationAction(ISD::FRINT, VT, Legal);
2091 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2092 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2093 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2094
2095 setOperationAction(ISD::FROUND, VT, Custom);
2096
2097 setOperationAction(ISD::LOAD, VT, Legal);
2098 setOperationAction(ISD::STORE, VT, Legal);
2099
2100 setOperationAction(ISD::FMA, VT, Legal);
2101 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2102 setOperationAction(ISD::VSELECT, VT, Legal);
2103 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2104 setOperationAction(ISD::SELECT, VT, Custom);
2105
2106 setOperationAction(ISD::FNEG, VT, Custom);
2107 setOperationAction(ISD::FABS, VT, Custom);
2108 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2109 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2110 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2111 };
2112
2113 // AVX512_FP16 scalar operations
2114 setGroup(MVT::f16);
2115 setOperationAction(ISD::FREM, MVT::f16, Promote);
2116 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2117 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2118 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2119 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2120 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2121 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2122 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2123 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2124 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2125 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2126 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2127 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2128 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2129
2130 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2131 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2132
2133 if (Subtarget.useAVX512Regs()) {
2134 setGroup(MVT::v32f16);
2135 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2136 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2137 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2138 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2139 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2140 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2141 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2142 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2143 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2144 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2145 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2146 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2147
2148 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2149 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2150 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2151 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2152 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2153 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2154 MVT::v32i16);
2155 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2156 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2157 MVT::v32i16);
2158 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2159 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2160 MVT::v32i16);
2161 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2162 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2163 MVT::v32i16);
2164
2165 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2166 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2167 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2168
2169 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2170 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2171
2172 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2173 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2174 }
2175
2176 if (Subtarget.hasVLX()) {
2177 setGroup(MVT::v8f16);
2178 setGroup(MVT::v16f16);
2179
2180 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2181 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2183 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2184 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2185 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2186 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2187 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2188 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2189 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2190
2191 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2192 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2193 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2194 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2195 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2196 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2197 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2198 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2199 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2200 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2201
2202 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2203 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2204 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2205
2206 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2207 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2209
2210 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2211 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2212 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2214
2215 // Need to custom widen these to prevent scalarization.
2216 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2217 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2218 }
2219 }
2220
2221 if (!Subtarget.useSoftFloat() &&
2222 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2223 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2224 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2225 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2226    // provide the method to promote BUILD_VECTOR, so set the operation action
2227    // to Custom and do the customization later.
2228 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2229 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2230 setF16Action(VT, Expand);
2231 setOperationAction(ISD::FADD, VT, Expand);
2232 setOperationAction(ISD::FSUB, VT, Expand);
2233 setOperationAction(ISD::FMUL, VT, Expand);
2234 setOperationAction(ISD::FDIV, VT, Expand);
2235 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2236 }
2237 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2238 }
2239
2240 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2241 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2242 setF16Action(MVT::v32bf16, Expand);
2243 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2244 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2245 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2246 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2247 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2248 }
2249
2250 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2251 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2252 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2253 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2254 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2255 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2256
2257 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2258 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2259 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2261 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2262
2263 if (Subtarget.hasBWI()) {
2264 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2265 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2266 }
2267
2268 if (Subtarget.hasFP16()) {
2269 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2270 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2271 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2272 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2273 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2274 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2275 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2276 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2277 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2278 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2279 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2280 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2281 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2282 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2283 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2284 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2285 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2286 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2287 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2288 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2289 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2290 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2291 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2292 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2293 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2294 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2295 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2296 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2297 }
2298
2299 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2300 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2301 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2302 }
2303
2304 if (Subtarget.hasAMXTILE()) {
2305 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2306 }
2307
2308 // We want to custom lower some of our intrinsics.
2309 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2310 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2311 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2312 if (!Subtarget.is64Bit()) {
2313 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2314 }
2315
2316 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2317 // handle type legalization for these operations here.
2318 //
2319 // FIXME: We really should do custom legalization for addition and
2320 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2321 // than generic legalization for 64-bit multiplication-with-overflow, though.
2322 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2323 if (VT == MVT::i64 && !Subtarget.is64Bit())
2324 continue;
2325 // Add/Sub/Mul with overflow operations are custom lowered.
2326 setOperationAction(ISD::SADDO, VT, Custom);
2327 setOperationAction(ISD::UADDO, VT, Custom);
2328 setOperationAction(ISD::SSUBO, VT, Custom);
2329 setOperationAction(ISD::USUBO, VT, Custom);
2330 setOperationAction(ISD::SMULO, VT, Custom);
2331 setOperationAction(ISD::UMULO, VT, Custom);
2332
2333 // Support carry in as value rather than glue.
2334 setOperationAction(ISD::ADDCARRY, VT, Custom);
2335 setOperationAction(ISD::SUBCARRY, VT, Custom);
2336 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2337 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2338 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2339 }
2340
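For reference, a small standalone sketch (not code from this file; helper name invented, GCC/Clang builtins assumed) of the value-plus-overflow-flag pair that the SADDO/UADDO family describes:

  #include <cstdint>

  // True when the signed 64-bit addition overflows; *sum holds the wrapped
  // result either way.
  static bool sadd_overflows(int64_t a, int64_t b, int64_t *sum) {
    return __builtin_add_overflow(a, b, sum);
  }
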
2341 if (!Subtarget.is64Bit()) {
2342    // These libcalls are not available on 32-bit targets.
2343 setLibcallName(RTLIB::SHL_I128, nullptr);
2344 setLibcallName(RTLIB::SRL_I128, nullptr);
2345 setLibcallName(RTLIB::SRA_I128, nullptr);
2346 setLibcallName(RTLIB::MUL_I128, nullptr);
2347 // The MULO libcall is not part of libgcc, only compiler-rt.
2348 setLibcallName(RTLIB::MULO_I64, nullptr);
2349 }
2350 // The MULO libcall is not part of libgcc, only compiler-rt.
2351 setLibcallName(RTLIB::MULO_I128, nullptr);
2352
2353 // Combine sin / cos into _sincos_stret if it is available.
2354 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2355 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2356 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2357 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2358 }
2359
2360 if (Subtarget.isTargetWin64()) {
2361 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2362 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2363 setOperationAction(ISD::SREM, MVT::i128, Custom);
2364 setOperationAction(ISD::UREM, MVT::i128, Custom);
2365 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2366 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2367 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2368 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2369 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2370 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2371 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2372 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2373 }
2374
2375 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2376 // is. We should promote the value to 64-bits to solve this.
2377 // This is what the CRT headers do - `fmodf` is an inline header
2378 // function casting to f64 and calling `fmod`.
2379 if (Subtarget.is32Bit() &&
2380 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2381 for (ISD::NodeType Op :
2382 {ISD::FCEIL, ISD::STRICT_FCEIL,
2383 ISD::FCOS, ISD::STRICT_FCOS,
2384 ISD::FEXP, ISD::STRICT_FEXP,
2385 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2386 ISD::FREM, ISD::STRICT_FREM,
2387 ISD::FLOG, ISD::STRICT_FLOG,
2388 ISD::FLOG10, ISD::STRICT_FLOG10,
2389 ISD::FPOW, ISD::STRICT_FPOW,
2390 ISD::FSIN, ISD::STRICT_FSIN})
2391 if (isOperationExpand(Op, MVT::f32))
2392 setOperationAction(Op, MVT::f32, Promote);
2393
2394 // We have target-specific dag combine patterns for the following nodes:
2395 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2396 ISD::SCALAR_TO_VECTOR,
2397 ISD::INSERT_VECTOR_ELT,
2398 ISD::EXTRACT_VECTOR_ELT,
2399 ISD::CONCAT_VECTORS,
2400 ISD::INSERT_SUBVECTOR,
2401 ISD::EXTRACT_SUBVECTOR,
2402 ISD::BITCAST,
2403 ISD::VSELECT,
2404 ISD::SELECT,
2405 ISD::SHL,
2406 ISD::SRA,
2407 ISD::SRL,
2408 ISD::OR,
2409 ISD::AND,
2410 ISD::ADD,
2411 ISD::FADD,
2412 ISD::FSUB,
2413 ISD::FNEG,
2414 ISD::FMA,
2415 ISD::STRICT_FMA,
2416 ISD::FMINNUM,
2417 ISD::FMAXNUM,
2418 ISD::SUB,
2419 ISD::LOAD,
2420 ISD::MLOAD,
2421 ISD::STORE,
2422 ISD::MSTORE,
2423 ISD::TRUNCATE,
2424 ISD::ZERO_EXTEND,
2425 ISD::ANY_EXTEND,
2426 ISD::SIGN_EXTEND,
2427 ISD::SIGN_EXTEND_INREG,
2428 ISD::ANY_EXTEND_VECTOR_INREG,
2429 ISD::SIGN_EXTEND_VECTOR_INREG,
2430 ISD::ZERO_EXTEND_VECTOR_INREG,
2431 ISD::SINT_TO_FP,
2432 ISD::UINT_TO_FP,
2433 ISD::STRICT_SINT_TO_FP,
2434 ISD::STRICT_UINT_TO_FP,
2435 ISD::SETCC,
2436 ISD::MUL,
2437 ISD::XOR,
2438 ISD::MSCATTER,
2439 ISD::MGATHER,
2440 ISD::FP16_TO_FP,
2441 ISD::FP_EXTEND,
2442 ISD::STRICT_FP_EXTEND,
2443 ISD::FP_ROUND,
2444 ISD::STRICT_FP_ROUND});
2445
2446 computeRegisterProperties(Subtarget.getRegisterInfo());
2447
2448 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2449 MaxStoresPerMemsetOptSize = 8;
2450 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2451 MaxStoresPerMemcpyOptSize = 4;
2452 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2453 MaxStoresPerMemmoveOptSize = 4;
2454
2455 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2456  // that needs to be benchmarked and balanced with the potential use of vector
2457 // load/store types (PR33329, PR33914).
2458 MaxLoadsPerMemcmp = 2;
2459 MaxLoadsPerMemcmpOptSize = 2;
2460
2461 // Default loop alignment, which can be overridden by -align-loops.
2462 setPrefLoopAlignment(Align(16));
2463
2464 // An out-of-order CPU can speculatively execute past a predictable branch,
2465 // but a conditional move could be stalled by an expensive earlier operation.
2466 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2467 EnableExtLdPromotion = true;
2468 setPrefFunctionAlignment(Align(16));
2469
2470 verifyIntrinsicTables();
2471
2472 // Default to having -disable-strictnode-mutation on
2473 IsStrictFPEnabled = true;
2474}
2475
2476// This has so far only been implemented for 64-bit MachO.
2477bool X86TargetLowering::useLoadStackGuardNode() const {
2478 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2479}
2480
2481bool X86TargetLowering::useStackGuardXorFP() const {
2482 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2483 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2484}
2485
2486SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2487 const SDLoc &DL) const {
2488 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2489 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2490 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2491 return SDValue(Node, 0);
2492}
2493
2494TargetLoweringBase::LegalizeTypeAction
2495X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2496 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2497 !Subtarget.hasBWI())
2498 return TypeSplitVector;
2499
2500 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2501 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2502 return TypeSplitVector;
2503
2504 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2505 VT.getVectorElementType() != MVT::i1)
2506 return TypeWidenVector;
2507
2508 return TargetLoweringBase::getPreferredVectorAction(VT);
2509}
2510
2511static std::pair<MVT, unsigned>
2512handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2513 const X86Subtarget &Subtarget) {
2514 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2515 // convention is one that uses k registers.
2516 if (NumElts == 2)
2517 return {MVT::v2i64, 1};
2518 if (NumElts == 4)
2519 return {MVT::v4i32, 1};
2520 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2521 CC != CallingConv::Intel_OCL_BI)
2522 return {MVT::v8i16, 1};
2523 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2524 CC != CallingConv::Intel_OCL_BI)
2525 return {MVT::v16i8, 1};
2526 // v32i1 passes in ymm unless we have BWI and the calling convention is
2527 // regcall.
2528 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2529 return {MVT::v32i8, 1};
2530 // Split v64i1 vectors if we don't have v64i8 available.
2531 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2532 if (Subtarget.useAVX512Regs())
2533 return {MVT::v64i8, 1};
2534 return {MVT::v32i8, 2};
2535 }
2536
2537 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2538 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2539 NumElts > 64)
2540 return {MVT::i8, NumElts};
2541
2542 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2543}
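
// A minimal standalone sketch (hypothetical helpers, not part of the lowering
// itself) of the "wide or odd" fallback above. isPowerOf2_32(N) is the usual
// N != 0 && (N & (N - 1)) == 0 test, so e.g. v3i1 (odd lane count) and v128i1
// (more than 64 lanes) are broken into NumElts scalar i8 pieces, while v16i1
// stays a single v16i8-sized register for non-RegCall conventions.
static bool sketchIsPowerOf2_32(unsigned N) {
  return N != 0 && (N & (N - 1)) == 0;
}
static bool sketchBreaksMaskToScalars(unsigned NumElts, bool HasBWI) {
  return !sketchIsPowerOf2_32(NumElts) || (NumElts == 64 && !HasBWI) ||
         NumElts > 64;
}
// sketchBreaksMaskToScalars(3, true)    -> true  (odd element count)
// sketchBreaksMaskToScalars(64, false)  -> true  (no v64i8 without BWI)
// sketchBreaksMaskToScalars(16, false)  -> false (handled as one v16i8)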
2544
2545MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2546 CallingConv::ID CC,
2547 EVT VT) const {
2548 if (VT.isVector()) {
2549 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2550 unsigned NumElts = VT.getVectorNumElements();
2551
2552 MVT RegisterVT;
2553 unsigned NumRegisters;
2554 std::tie(RegisterVT, NumRegisters) =
2555 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2556 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2557 return RegisterVT;
2558 }
2559
2560 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2561 return MVT::v8f16;
2562 }
2563
2564 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2565 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2566 !Subtarget.hasX87())
2567 return MVT::i32;
2568
2569 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2570 return getRegisterTypeForCallingConv(Context, CC,
2571 VT.changeVectorElementTypeToInteger());
2572
2573 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2574}
2575
2576unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2577 CallingConv::ID CC,
2578 EVT VT) const {
2579 if (VT.isVector()) {
2580 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2581 unsigned NumElts = VT.getVectorNumElements();
2582
2583 MVT RegisterVT;
2584 unsigned NumRegisters;
2585 std::tie(RegisterVT, NumRegisters) =
2586 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2587 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2588 return NumRegisters;
2589 }
2590
2591 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2592 return 1;
2593 }
2594
2595 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2596 // x87 is disabled.
2597 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2598 if (VT == MVT::f64)
2599 return 2;
2600 if (VT == MVT::f80)
2601 return 3;
2602 }
2603
2604 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2605 return getNumRegistersForCallingConv(Context, CC,
2606 VT.changeVectorElementTypeToInteger());
2607
2608 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2609}
2610
2611unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2612 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2613 unsigned &NumIntermediates, MVT &RegisterVT) const {
2614 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2615 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2616 Subtarget.hasAVX512() &&
2617 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2618 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2619 VT.getVectorNumElements() > 64)) {
2620 RegisterVT = MVT::i8;
2621 IntermediateVT = MVT::i1;
2622 NumIntermediates = VT.getVectorNumElements();
2623 return NumIntermediates;
2624 }
2625
2626 // Split v64i1 vectors if we don't have v64i8 available.
2627 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2628 CC != CallingConv::X86_RegCall) {
2629 RegisterVT = MVT::v32i8;
2630 IntermediateVT = MVT::v32i1;
2631 NumIntermediates = 2;
2632 return 2;
2633 }
2634
2635 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2636 NumIntermediates, RegisterVT);
2637}
2638
2639EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2640 LLVMContext& Context,
2641 EVT VT) const {
2642 if (!VT.isVector())
2643 return MVT::i8;
2644
2645 if (Subtarget.hasAVX512()) {
2646 // Figure out what this type will be legalized to.
2647 EVT LegalVT = VT;
2648 while (getTypeAction(Context, LegalVT) != TypeLegal)
2649 LegalVT = getTypeToTransformTo(Context, LegalVT);
2650
2651 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2652 if (LegalVT.getSimpleVT().is512BitVector())
2653 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2654
2655 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2656 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2657 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2658 // vXi16/vXi8.
2659 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2660 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2661 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2662 }
2663 }
2664
2665 return VT.changeVectorElementTypeToInteger();
2666}
2667
2668/// Helper for getByValTypeAlignment to determine
2669/// the desired ByVal argument alignment.
2670static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2671 if (MaxAlign == 16)
2672 return;
2673 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2674 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2675 MaxAlign = Align(16);
2676 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2677 Align EltAlign;
2678 getMaxByValAlign(ATy->getElementType(), EltAlign);
2679 if (EltAlign > MaxAlign)
2680 MaxAlign = EltAlign;
2681 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2682 for (auto *EltTy : STy->elements()) {
2683 Align EltAlign;
2684 getMaxByValAlign(EltTy, EltAlign);
2685 if (EltAlign > MaxAlign)
2686 MaxAlign = EltAlign;
2687 if (MaxAlign == 16)
2688 break;
2689 }
2690 }
2691}
2692
2693/// Return the desired alignment for ByVal aggregate
2694/// function arguments in the caller parameter area. For X86, aggregates
2695/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2696/// are at 4-byte boundaries.
2697uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2698 const DataLayout &DL) const {
2699 if (Subtarget.is64Bit()) {
2700 // Max of 8 and alignment of type.
2701 Align TyAlign = DL.getABITypeAlign(Ty);
2702 if (TyAlign > 8)
2703 return TyAlign.value();
2704 return 8;
2705 }
2706
2707 Align Alignment(4);
2708 if (Subtarget.hasSSE1())
2709 getMaxByValAlign(Ty, Alignment);
2710 return Alignment.value();
2711}
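
// A standalone sketch (hypothetical helper, not part of the lowering) of the
// byval alignment rule implemented above, with plain integers standing in for
// Type/DataLayout: on x86-64 the result is max(8, ABI alignment); on 32-bit
// targets it is 16 when SSE1 is available and the aggregate (recursively)
// contains a 128-bit vector, and 4 otherwise.
static unsigned sketchByValAlign(bool Is64Bit, unsigned ABIAlign, bool HasSSE1,
                                 bool Contains128BitVector) {
  if (Is64Bit)
    return ABIAlign > 8 ? ABIAlign : 8;
  return (HasSSE1 && Contains128BitVector) ? 16 : 4;
}
// e.g. sketchByValAlign(/*Is64Bit=*/false, 4, true, true) == 16, while
//      sketchByValAlign(/*Is64Bit=*/true, 4, false, false) == 8.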
2712
2713/// It returns EVT::Other if the type should be determined using generic
2714/// target-independent logic.
2715/// For vector ops we check that the overall size isn't larger than our
2716/// preferred vector width.
2717EVT X86TargetLowering::getOptimalMemOpType(
2718 const MemOp &Op, const AttributeList &FuncAttributes) const {
2719 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2720 if (Op.size() >= 16 &&
2721 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2722 // FIXME: Check if unaligned 64-byte accesses are slow.
2723 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2724 (Subtarget.getPreferVectorWidth() >= 512)) {
2725 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2726 }
2727 // FIXME: Check if unaligned 32-byte accesses are slow.
2728 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2729 Subtarget.useLight256BitInstructions()) {
2730 // Although this isn't a well-supported type for AVX1, we'll let
2731 // legalization and shuffle lowering produce the optimal codegen. If we
2732 // choose an optimal type with a vector element larger than a byte,
2733 // getMemsetStores() may create an intermediate splat (using an integer
2734 // multiply) before we splat as a vector.
2735 return MVT::v32i8;
2736 }
2737 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2738 return MVT::v16i8;
2739 // TODO: Can SSE1 handle a byte vector?
2740 // If we have SSE1 registers we should be able to use them.
2741 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2742 (Subtarget.getPreferVectorWidth() >= 128))
2743 return MVT::v4f32;
2744 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2745 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2746 // Do not use f64 to lower memcpy if source is string constant. It's
2747 // better to use i32 to avoid the loads.
2748 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2749 // The gymnastics of splatting a byte value into an XMM register and then
2750 // only using 8-byte stores (because this is a CPU with slow unaligned
2751 // 16-byte accesses) makes that a loser.
2752 return MVT::f64;
2753 }
2754 }
2755 // This is a compromise. If we reach here, unaligned accesses may be slow on
2756 // this target. However, creating smaller, aligned accesses could be even
2757 // slower and would certainly be a lot more code.
2758 if (Subtarget.is64Bit() && Op.size() >= 8)
2759 return MVT::i64;
2760 return MVT::i32;
2761}
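
// A reduced standalone sketch (hypothetical helper, not part of the lowering)
// of the type-selection cascade above, keeping only the width/feature checks
// and dropping the NoImplicitFloat, preferred-vector-width and slow-unaligned
// guards: it shows which store width the lowering gravitates to for a given
// memory-op size.
static const char *sketchMemOpType(unsigned long long Size, bool HasAVX512,
                                   bool HasBWI, bool HasAVX, bool Light256,
                                   bool HasSSE2, bool Is64Bit) {
  if (Size >= 64 && HasAVX512)
    return HasBWI ? "v64i8" : "v16i32";
  if (Size >= 32 && HasAVX && Light256)
    return "v32i8";
  if (Size >= 16 && HasSSE2)
    return "v16i8";
  if (Is64Bit && Size >= 8)
    return "i64";
  return "i32";
}
// e.g. a 128-byte memset on an AVX-512BW target maps to "v64i8", while the
// same memset with only SSE2 maps to "v16i8".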
2762
2763bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2764 if (VT == MVT::f32)
2765 return Subtarget.hasSSE1();
2766 if (VT == MVT::f64)
2767 return Subtarget.hasSSE2();
2768 return true;
2769}
2770
2771static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2772 return (8 * Alignment.value()) % SizeInBits == 0;
2773}
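
// Worked example for isBitAligned above (a standalone sketch, not used by the
// lowering): an access is considered bit-aligned when 8 * alignment-in-bytes
// is a multiple of the access size in bits.
static bool sketchIsBitAligned(unsigned long long AlignBytes,
                               unsigned long long SizeInBits) {
  return (8 * AlignBytes) % SizeInBits == 0;
}
// sketchIsBitAligned(32, 256) == true   (32-byte aligned 256-bit access)
// sketchIsBitAligned(16, 256) == false  (falls through to the slow-unaligned
//                                        checks in isMemoryAccessFast below)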
2774
2775bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2776 if (isBitAligned(Alignment, VT.getSizeInBits()))
2777 return true;
2778 switch (VT.getSizeInBits()) {
2779 default:
2780 // 8-byte and under are always assumed to be fast.
2781 return true;
2782 case 128:
2783 return !Subtarget.isUnalignedMem16Slow();
2784 case 256:
2785 return !Subtarget.isUnalignedMem32Slow();
2786 // TODO: What about AVX-512 (512-bit) accesses?
2787 }
2788}
2789
2790bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2791 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2792 unsigned *Fast) const {
2793 if (Fast)
2794 *Fast = isMemoryAccessFast(VT, Alignment);
2795 // NonTemporal vector memory ops must be aligned.
2796 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2797     // NT loads can only be vector aligned, so if it's less aligned than the
2798 // minimum vector size (which we can split the vector down to), we might as
2799 // well use a regular unaligned vector load.
2800 // We don't have any NT loads pre-SSE41.
2801 if (!!(Flags & MachineMemOperand::MOLoad))
2802 return (Alignment < 16 || !Subtarget.hasSSE41());
2803 return false;
2804 }
2805 // Misaligned accesses of any size are always allowed.
2806 return true;
2807}
2808
2809bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2810 const DataLayout &DL, EVT VT,
2811 unsigned AddrSpace, Align Alignment,
2812 MachineMemOperand::Flags Flags,
2813 unsigned *Fast) const {
2814 if (Fast)
2815 *Fast = isMemoryAccessFast(VT, Alignment);
2816 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2817 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2818 /*Fast=*/nullptr))
2819 return true;
2820 // NonTemporal vector memory ops are special, and must be aligned.
2821 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2822 return false;
2823 switch (VT.getSizeInBits()) {
2824 case 128:
2825 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2826 return true;
2827 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2828 return true;
2829 return false;
2830 case 256:
2831 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2832 return true;
2833 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2834 return true;
2835 return false;
2836 case 512:
2837 if (Subtarget.hasAVX512())
2838 return true;
2839 return false;
2840 default:
2841 return false; // Don't have NonTemporal vector memory ops of this size.
2842 }
2843 }
2844 return true;
2845}
2846
2847/// Return the entry encoding for a jump table in the
2848/// current function. The returned value is a member of the
2849/// MachineJumpTableInfo::JTEntryKind enum.
2850unsigned X86TargetLowering::getJumpTableEncoding() const {
2851 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2852 // symbol.
2853 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2854 return MachineJumpTableInfo::EK_Custom32;
2855
2856 // Otherwise, use the normal jump table encoding heuristics.
2857 return TargetLowering::getJumpTableEncoding();
2858}
2859
2860bool X86TargetLowering::splitValueIntoRegisterParts(
2861 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2862 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2863 bool IsABIRegCopy = CC.has_value();
2864 EVT ValueVT = Val.getValueType();
2865 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2866 unsigned ValueBits = ValueVT.getSizeInBits();
2867 unsigned PartBits = PartVT.getSizeInBits();
2868 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2869 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2870 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2871 Parts[0] = Val;
2872 return true;
2873 }
2874 return false;
2875}
2876
2877SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2878 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2879 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2880 bool IsABIRegCopy = CC.has_value();
2881 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2882 unsigned ValueBits = ValueVT.getSizeInBits();
2883 unsigned PartBits = PartVT.getSizeInBits();
2884 SDValue Val = Parts[0];
2885
2886 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2887 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2888 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2889 return Val;
2890 }
2891 return SDValue();
2892}
2893
2894bool X86TargetLowering::useSoftFloat() const {
2895 return Subtarget.useSoftFloat();
2896}
2897
2898void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2899 ArgListTy &Args) const {
2900
2901 // Only relabel X86-32 for C / Stdcall CCs.
2902 if (Subtarget.is64Bit())
2903 return;
2904 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2905 return;
2906 unsigned ParamRegs = 0;
2907 if (auto *M = MF->getFunction().getParent())
2908 ParamRegs = M->getNumberRegisterParameters();
2909
2910   // Mark the first N integer arguments as being passed in registers.
2911 for (auto &Arg : Args) {
2912 Type *T = Arg.Ty;
2913 if (T->isIntOrPtrTy())
2914 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2915 unsigned numRegs = 1;
2916 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2917 numRegs = 2;
2918 if (ParamRegs < numRegs)
2919 return;
2920 ParamRegs -= numRegs;
2921 Arg.IsInReg = true;
2922 }
2923 }
2924}
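
// A standalone sketch (hypothetical helper, not part of the lowering) of the
// register-parameter accounting above: each integer or pointer argument of at
// most 4 bytes consumes one register, a 5..8 byte one consumes two, and
// marking stops as soon as the remaining budget is too small.
static void sketchMarkInRegArgs(const unsigned *ArgSizes, bool *InReg,
                                unsigned NumArgs, unsigned ParamRegs) {
  for (unsigned I = 0; I != NumArgs; ++I) {
    unsigned NumRegs = ArgSizes[I] > 4 ? 2 : 1;
    if (ParamRegs < NumRegs)
      return; // Budget exhausted; this and later arguments stay on the stack.
    ParamRegs -= NumRegs;
    InReg[I] = true;
  }
}
// With ParamRegs == 3 and argument sizes {4, 8, 4}, the first two arguments
// are marked in-reg (1 + 2 registers) and the third is left on the stack.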
2925
2926const MCExpr *
2927X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2928 const MachineBasicBlock *MBB,
2929 unsigned uid,MCContext &Ctx) const{
2930   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2931 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2932 // entries.
2933 return MCSymbolRefExpr::create(MBB->getSymbol(),
2934 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2935}
2936
2937/// Returns relocation base for the given PIC jumptable.
2938SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2939 SelectionDAG &DAG) const {
2940 if (!Subtarget.is64Bit())
2941 // This doesn't have SDLoc associated with it, but is not really the
2942 // same as a Register.
2943 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2944 getPointerTy(DAG.getDataLayout()));
2945 return Table;
2946}
2947
2948/// This returns the relocation base for the given PIC jumptable,
2949/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2950const MCExpr *X86TargetLowering::
2951getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2952 MCContext &Ctx) const {
2953 // X86-64 uses RIP relative addressing based on the jump table label.
2954 if (Subtarget.isPICStyleRIPRel())
2955 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2956
2957 // Otherwise, the reference is relative to the PIC base.
2958 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2959}
2960
2961std::pair<const TargetRegisterClass *, uint8_t>
2962X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2963 MVT VT) const {
2964 const TargetRegisterClass *RRC = nullptr;
2965 uint8_t Cost = 1;
2966 switch (VT.SimpleTy) {
2967 default:
2968 return TargetLowering::findRepresentativeClass(TRI, VT);
2969 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2970 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2971 break;
2972 case MVT::x86mmx:
2973 RRC = &X86::VR64RegClass;
2974 break;
2975 case MVT::f32: case MVT::f64:
2976 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2977 case MVT::v4f32: case MVT::v2f64:
2978 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2979 case MVT::v8f32: case MVT::v4f64:
2980 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2981 case MVT::v16f32: case MVT::v8f64:
2982 RRC = &X86::VR128XRegClass;
2983 break;
2984 }
2985 return std::make_pair(RRC, Cost);
2986}
2987
2988unsigned X86TargetLowering::getAddressSpace() const {
2989 if (Subtarget.is64Bit())
2990 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2991 return 256;
2992}
2993
2994static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2995 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2996 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2997}
2998
2999static Constant* SegmentOffset(IRBuilderBase &IRB,
3000 int Offset, unsigned AddressSpace) {
3001 return ConstantExpr::getIntToPtr(
3002 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3003 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3004}
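
// A standalone illustration (hypothetical helper, not part of the lowering) of
// what SegmentOffset builds: a constant address in one of the x86 segment
// address spaces. On X86, address space 256 corresponds to %gs, 257 to %fs and
// 258 to %ss, so the common 64-bit Linux stack guard %fs:0x28 is "offset 0x28
// in address space 257" and the i386 %gs:0x14 guard is "offset 0x14 in address
// space 256"; the real code materializes the same pair as an inttoptr
// ConstantExpr. This mirrors the defaults chosen in getIRStackGuard below.
struct SketchSegmentPtr {
  unsigned AddressSpace; // 256 = %gs, 257 = %fs, 258 = %ss
  int Offset;            // byte offset within the segment
};
static SketchSegmentPtr sketchDefaultStackGuardSlot(bool Is64Bit,
                                                    bool KernelCodeModel) {
  unsigned AS = Is64Bit ? (KernelCodeModel ? 256u : 257u) : 256u;
  int Offset = Is64Bit ? 0x28 : 0x14;
  return {AS, Offset};
}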
3005
3006Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3007 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3008 // tcbhead_t; use it instead of the usual global variable (see
3009 // sysdeps/{i386,x86_64}/nptl/tls.h)
3010 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3011 if (Subtarget.isTargetFuchsia()) {
3012 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3013 return SegmentOffset(IRB, 0x10, getAddressSpace());
3014 } else {
3015 unsigned AddressSpace = getAddressSpace();
3016 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3017      // Specifically, some users may customize the base register and offset.
3018 int Offset = M->getStackProtectorGuardOffset();
3019 // If we don't set -stack-protector-guard-offset value:
3020 // %fs:0x28, unless we're using a Kernel code model, in which case
3021 // it's %gs:0x28. gs:0x14 on i386.
3022      if (Offset == INT_MAX)
3023 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3024
3025 StringRef GuardReg = M->getStackProtectorGuardReg();
3026 if (GuardReg == "fs")
3027 AddressSpace = X86AS::FS;
3028 else if (GuardReg == "gs")
3029 AddressSpace = X86AS::GS;
3030
3031      // Use the symbol guard if the user specified one.
3032 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3033 if (!GuardSymb.empty()) {
3034 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3035 if (!GV) {
3036 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3037 : Type::getInt32Ty(M->getContext());
3038 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3039 nullptr, GuardSymb, nullptr,
3040 GlobalValue::NotThreadLocal, AddressSpace);
3041 }
3042 return GV;
3043 }
3044
3045 return SegmentOffset(IRB, Offset, AddressSpace);
3046 }
3047 }
3048 return TargetLowering::getIRStackGuard(IRB);
3049}
3050
3051void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3052 // MSVC CRT provides functionalities for stack protection.
3053 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3054 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3055 // MSVC CRT has a global variable holding security cookie.
3056 M.getOrInsertGlobal("__security_cookie",
3057 Type::getInt8PtrTy(M.getContext()));
3058
3059 // MSVC CRT has a function to validate security cookie.
3060 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3061 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3062 Type::getInt8PtrTy(M.getContext()));
3063 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3064 F->setCallingConv(CallingConv::X86_FastCall);
3065 F->addParamAttr(0, Attribute::AttrKind::InReg);
3066 }
3067 return;
3068 }
3069
3070 StringRef GuardMode = M.getStackProtectorGuard();
3071
3072 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3073 if ((GuardMode == "tls" || GuardMode.empty()) &&
3074 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3075 return;
3076 TargetLowering::insertSSPDeclarations(M);
3077}
3078
3079Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3080 // MSVC CRT has a global variable holding security cookie.
3081 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3082 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3083 return M.getGlobalVariable("__security_cookie");
3084 }
3085 return TargetLowering::getSDagStackGuard(M);
3086}
3087
3088Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3089 // MSVC CRT has a function to validate security cookie.
3090 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3091 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3092 return M.getFunction("__security_check_cookie");
3093 }
3094 return TargetLowering::getSSPStackGuardCheck(M);
3095}
3096
3097Value *
3098X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3099 if (Subtarget.getTargetTriple().isOSContiki())
3100 return getDefaultSafeStackPointerLocation(IRB, false);
3101
3102 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3103 // definition of TLS_SLOT_SAFESTACK in
3104 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3105 if (Subtarget.isTargetAndroid()) {
3106     // %fs:0x48, unless we're using a Kernel code model, in which case it's
3107     // %gs:0x48; %gs:0x24 on i386.
3108 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3109 return SegmentOffset(IRB, Offset, getAddressSpace());
3110 }
3111
3112 // Fuchsia is similar.
3113 if (Subtarget.isTargetFuchsia()) {
3114 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3115 return SegmentOffset(IRB, 0x18, getAddressSpace());
3116 }
3117
3118 return TargetLowering::getSafeStackPointerLocation(IRB);
3119}
3120
3121//===----------------------------------------------------------------------===//
3122// Return Value Calling Convention Implementation
3123//===----------------------------------------------------------------------===//
3124
3125bool X86TargetLowering::CanLowerReturn(
3126 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3127 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3128 SmallVector<CCValAssign, 16> RVLocs;
3129 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3130 return CCInfo.CheckReturn(Outs, RetCC_X86);
3131}
3132
3133const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3134 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3135 return ScratchRegs;
3136}
3137
3138ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3139 // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3140 // tests at the moment, which is not what we expected.
3141 static const MCPhysReg RCRegs[] = {X86::MXCSR};
3142 return RCRegs;
3143}
3144
3145/// Lowers mask values (v*i1) to the local register values
3146/// \returns DAG node after lowering to register type
3147static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3148 const SDLoc &Dl, SelectionDAG &DAG) {
3149 EVT ValVT = ValArg.getValueType();
3150
3151 if (ValVT == MVT::v1i1)
3152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3153 DAG.getIntPtrConstant(0, Dl));
3154
3155 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3156 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3157 // Two stage lowering might be required
3158 // bitcast: v8i1 -> i8 / v16i1 -> i16
3159 // anyextend: i8 -> i32 / i16 -> i32
3160 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3161 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3162 if (ValLoc == MVT::i32)
3163 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3164 return ValToCopy;
3165 }
3166
3167 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3168 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3169 // One stage lowering is required
3170 // bitcast: v32i1 -> i32 / v64i1 -> i64
3171 return DAG.getBitcast(ValLoc, ValArg);
3172 }
3173
3174 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3175}
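
// A standalone sketch (hypothetical helper, not part of the lowering) of what
// the v8i1 -> i8 bitcast in lowerMasksToReg means at the bit level: each mask
// lane becomes one bit of the scalar, with lane 0 in bit 0.
static unsigned sketchPackV8i1(const bool Lanes[8]) {
  unsigned Bits = 0;
  for (unsigned I = 0; I != 8; ++I)
    Bits |= static_cast<unsigned>(Lanes[I]) << I;
  return Bits; // Fits in i8; a later ANY_EXTEND to i32 leaves the high bits
               // unspecified, matching the two-stage path above.
}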
3176
3177/// Breaks v64i1 value into two registers and adds the new node to the DAG
3178static void Passv64i1ArgInRegs(
3179 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3180 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3181 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3182   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3183   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3184   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3185   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3186          "The value should reside in two registers");
3187
3188 // Before splitting the value we cast it to i64
3189 Arg = DAG.getBitcast(MVT::i64, Arg);
3190
3191 // Splitting the value into two i32 types
3192 SDValue Lo, Hi;
3193 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32);
3194
3195 // Attach the two i32 types into corresponding registers
3196 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3197 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3198}
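
// A standalone sketch (hypothetical helper, not part of the lowering) of the
// split performed by Passv64i1ArgInRegs: once the v64i1 value has been bitcast
// to i64, SplitScalar hands the low 32 bits to the first register and the high
// 32 bits to the second.
static void sketchSplitMask64(unsigned long long Mask64, unsigned &Lo,
                              unsigned &Hi) {
  Lo = static_cast<unsigned>(Mask64 & 0xffffffffu); // -> VA.getLocReg()
  Hi = static_cast<unsigned>(Mask64 >> 32);         // -> NextVA.getLocReg()
}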
3199
3200SDValue
3201X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3202 bool isVarArg,
3203 const SmallVectorImpl<ISD::OutputArg> &Outs,
3204 const SmallVectorImpl<SDValue> &OutVals,
3205 const SDLoc &dl, SelectionDAG &DAG) const {
3206 MachineFunction &MF = DAG.getMachineFunction();
3207 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3208
3209 // In some cases we need to disable registers from the default CSR list.
3210 // For example, when they are used as return registers (preserve_* and X86's
3211 // regcall) or for argument passing (X86's regcall).
3212 bool ShouldDisableCalleeSavedRegister =
3213 shouldDisableRetRegFromCSR(CallConv) ||
3214 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3215
3216 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3217 report_fatal_error("X86 interrupts may not return any value");
3218
3219 SmallVector<CCValAssign, 16> RVLocs;
3220 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3221 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3222
3223 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3224 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3225 ++I, ++OutsIndex) {
3226 CCValAssign &VA = RVLocs[I];
3227     assert(VA.isRegLoc() && "Can only return in registers!");
3228
3229 // Add the register to the CalleeSaveDisableRegs list.
3230 if (ShouldDisableCalleeSavedRegister)
3231 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3232
3233 SDValue ValToCopy = OutVals[OutsIndex];
3234 EVT ValVT = ValToCopy.getValueType();
3235
3236 // Promote values to the appropriate types.
3237 if (VA.getLocInfo() == CCValAssign::SExt)
3238 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3239 else if (VA.getLocInfo() == CCValAssign::ZExt)
3240 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3241 else if (VA.getLocInfo() == CCValAssign::AExt) {
3242 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3243 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3244 else
3245 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3246 }
3247 else if (VA.getLocInfo() == CCValAssign::BCvt)
3248 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3249
3250     assert(VA.getLocInfo() != CCValAssign::FPExt &&
3251            "Unexpected FP-extend for return value.");
3252
3253 // Report an error if we have attempted to return a value via an XMM
3254 // register and SSE was disabled.
3255 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3256 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3257 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3258 } else if (!Subtarget.hasSSE2() &&
3259 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3260 ValVT == MVT::f64) {
3261 // When returning a double via an XMM register, report an error if SSE2 is
3262 // not enabled.
3263 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3264 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3265 }
3266
3267 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3268 // the RET instruction and handled by the FP Stackifier.
3269 if (VA.getLocReg() == X86::FP0 ||
3270 VA.getLocReg() == X86::FP1) {
3271 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3272 // change the value to the FP stack register class.
3273 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3274 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3275 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3276 // Don't emit a copytoreg.
3277 continue;
3278 }
3279
3280 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3281 // which is returned in RAX / RDX.
3282 if (Subtarget.is64Bit()) {
3283 if (ValVT == MVT::x86mmx) {
3284 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3285 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3286 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3287 ValToCopy);
3288 // If we don't have SSE2 available, convert to v4f32 so the generated
3289 // register is legal.
3290 if (!Subtarget.hasSSE2())
3291 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3292 }
3293 }
3294 }
3295
3296 if (VA.needsCustom()) {
3297       assert(VA.getValVT() == MVT::v64i1 &&
3298              "Currently the only custom case is when we split v64i1 to 2 regs");
3299
3300 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3301 Subtarget);
3302
3303 // Add the second register to the CalleeSaveDisableRegs list.
3304 if (ShouldDisableCalleeSavedRegister)
3305 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3306 } else {
3307 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3308 }
3309 }
3310
3311 SDValue Glue;
3312 SmallVector<SDValue, 6> RetOps;
3313 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3314 // Operand #1 = Bytes To Pop
3315 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3316 MVT::i32));
3317
3318 // Copy the result values into the output registers.
3319 for (auto &RetVal : RetVals) {
3320 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3321 RetOps.push_back(RetVal.second);
3322 continue; // Don't emit a copytoreg.
3323 }
3324
3325 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
3326 Glue = Chain.getValue(1);
3327 RetOps.push_back(
3328 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3329 }
3330
3331 // Swift calling convention does not require we copy the sret argument
3332 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3333
3334 // All x86 ABIs require that for returning structs by value we copy
3335 // the sret argument into %rax/%eax (depending on ABI) for the return.
3336 // We saved the argument into a virtual register in the entry block,
3337 // so now we copy the value out and into %rax/%eax.
3338 //
3339 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3340 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3341 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3342 // either case FuncInfo->setSRetReturnReg() will have been called.
3343 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3344 // When we have both sret and another return value, we should use the
3345 // original Chain stored in RetOps[0], instead of the current Chain updated
3346     // in the above loop. If we only have sret, RetOps[0] equals Chain.
3347
3348 // For the case of sret and another return value, we have
3349 // Chain_0 at the function entry
3350 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3351 // If we use Chain_1 in getCopyFromReg, we will have
3352 // Val = getCopyFromReg(Chain_1)
3353 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3354
3355 // getCopyToReg(Chain_0) will be glued together with
3356 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3357 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3358 // Data dependency from Unit B to Unit A due to usage of Val in
3359 // getCopyToReg(Chain_1, Val)
3360 // Chain dependency from Unit A to Unit B
3361
3362 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3363 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3364 getPointerTy(MF.getDataLayout()));
3365
3366 Register RetValReg
3367 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3368 X86::RAX : X86::EAX;
3369 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
3370 Glue = Chain.getValue(1);
3371
3372 // RAX/EAX now acts like a return value.
3373 RetOps.push_back(
3374 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3375
3376 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3377 // this however for preserve_most/preserve_all to minimize the number of
3378 // callee-saved registers for these CCs.
3379 if (ShouldDisableCalleeSavedRegister &&
3380 CallConv != CallingConv::PreserveAll &&
3381 CallConv != CallingConv::PreserveMost)
3382 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3383 }
3384
3385 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3386 const MCPhysReg *I =
3387 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3388 if (I) {
3389 for (; *I; ++I) {
3390 if (X86::GR64RegClass.contains(*I))
3391 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3392 else
3393         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3394 }
3395 }
3396
3397 RetOps[0] = Chain; // Update chain.
3398
3399 // Add the glue if we have it.
3400 if (Glue.getNode())
3401 RetOps.push_back(Glue);
3402
3403 X86ISD::NodeType opcode = X86ISD::RET_GLUE;
3404 if (CallConv == CallingConv::X86_INTR)
3405 opcode = X86ISD::IRET;
3406 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3407}
3408
3409bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3410 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3411 return false;
3412
3413 SDValue TCChain = Chain;
3414 SDNode *Copy = *N->use_begin();
3415 if (Copy->getOpcode() == ISD::CopyToReg) {
3416 // If the copy has a glue operand, we conservatively assume it isn't safe to
3417 // perform a tail call.
3418 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3419 return false;
3420 TCChain = Copy->getOperand(0);
3421 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3422 return false;
3423
3424 bool HasRet = false;
3425 for (const SDNode *U : Copy->uses()) {
3426 if (U->getOpcode() != X86ISD::RET_GLUE)
3427 return false;
3428 // If we are returning more than one value, we can definitely
3429     // not make a tail call; see PR19530.
3430 if (U->getNumOperands() > 4)
3431 return false;
3432 if (U->getNumOperands() == 4 &&
3433 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3434 return false;
3435 HasRet = true;
3436 }
3437
3438 if (!HasRet)
3439 return false;
3440
3441 Chain = TCChain;
3442 return true;
3443}
3444
3445EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3446 ISD::NodeType ExtendKind) const {
3447 MVT ReturnMVT = MVT::i32;
3448
3449 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3450 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3451 // The ABI does not require i1, i8 or i16 to be extended.
3452 //
3453 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3454 // always extending i8/i16 return values, so keep doing that for now.
3455 // (PR26665).
3456 ReturnMVT = MVT::i8;
3457 }
3458
3459 EVT MinVT = getRegisterType(Context, ReturnMVT);
3460 return VT.bitsLT(MinVT) ? MinVT : VT;
3461}
3462
3463/// Reads two 32 bit registers and creates a 64 bit mask value.
3464/// \param VA The current 32 bit value that needs to be assigned.
3465/// \param NextVA The next 32 bit value that needs to be assigned.
3466/// \param Root The parent DAG node.
3467/// \param [in,out] InGlue Represents the SDValue in the parent DAG node for
3468///                        glue purposes. In case the DAG is already using a
3469///                        physical register instead of a virtual one, we
3470///                        should glue our new SDValue to the InGlue SDValue.
3471/// \return a new SDValue of size 64 bits.
3472static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3473 SDValue &Root, SelectionDAG &DAG,
3474 const SDLoc &Dl, const X86Subtarget &Subtarget,
3475 SDValue *InGlue = nullptr) {
3476   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3477   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3478   assert(VA.getValVT() == MVT::v64i1 &&
3479          "Expecting first location of 64 bit width type");
3480   assert(NextVA.getValVT() == VA.getValVT() &&
3481          "The locations should have the same type");
3482   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3483          "The values should reside in two registers");
3484
3485 SDValue Lo, Hi;
3486 SDValue ArgValueLo, ArgValueHi;
3487
3488 MachineFunction &MF = DAG.getMachineFunction();
3489 const TargetRegisterClass *RC = &X86::GR32RegClass;
3490
3491 // Read a 32 bit value from the registers.
3492 if (nullptr == InGlue) {
3493 // When no physical register is present,
3494 // create an intermediate virtual register.
3495 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3496 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3497 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3498 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3499 } else {
3500 // When a physical register is available read the value from it and glue
3501 // the reads together.
3502 ArgValueLo =
3503 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
3504 *InGlue = ArgValueLo.getValue(2);
3505 ArgValueHi =
3506 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
3507 *InGlue = ArgValueHi.getValue(2);
3508 }
3509
3510 // Convert the i32 type into v32i1 type.
3511 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3512
3513 // Convert the i32 type into v32i1 type.
3514 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3515
3516 // Concatenate the two values together.
3517 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3518}
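
// The inverse of the split sketched after Passv64i1ArgInRegs (again a
// hypothetical helper, not part of the lowering): getv64i1Argument's two i32
// reads recombine into one 64-lane mask, with the first register providing the
// low 32 lanes.
static unsigned long long sketchJoinMask64(unsigned Lo, unsigned Hi) {
  return (static_cast<unsigned long long>(Hi) << 32) | Lo;
}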
3519
3520/// The function will lower a register of various sizes (8/16/32/64)
3521/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3522/// \returns a DAG node containing the operand after lowering to a mask type.
3523static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3524 const EVT &ValLoc, const SDLoc &Dl,
3525 SelectionDAG &DAG) {
3526 SDValue ValReturned = ValArg;
3527
3528 if (ValVT == MVT::v1i1)
3529 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3530
3531 if (ValVT == MVT::v64i1) {
3532     // On a 32-bit machine this case is handled by getv64i1Argument.
3533     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3534     // On a 64-bit machine there is no need to truncate the value, only bitcast it.
3535 } else {
3536 MVT maskLen;
3537 switch (ValVT.getSimpleVT().SimpleTy) {
3538 case MVT::v8i1:
3539 maskLen = MVT::i8;
3540 break;
3541 case MVT::v16i1:
3542 maskLen = MVT::i16;
3543 break;
3544 case MVT::v32i1:
3545 maskLen = MVT::i32;
3546 break;
3547 default:
3548       llvm_unreachable("Expecting a vector of i1 types");
3549 }
3550
3551 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3552 }
3553 return DAG.getBitcast(ValVT, ValReturned);
3554}
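
// The inverse of the packing sketch after lowerMasksToReg (a hypothetical
// helper, not part of the lowering): lowerRegToMasks truncates the returned
// scalar and bitcasts it back to a mask, i.e. each lane is read from the
// corresponding low bit.
static void sketchUnpackToV8i1(unsigned Bits, bool Lanes[8]) {
  for (unsigned I = 0; I != 8; ++I)
    Lanes[I] = (Bits >> I) & 1u; // Bits above bit 7 are dropped, which is what
                                 // the TRUNCATE above does.
}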
3555
3556/// Lower the result values of a call into the
3557/// appropriate copies out of appropriate physical registers.
3558///
3559SDValue X86TargetLowering::LowerCallResult(
3560 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
3561 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3562 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3563 uint32_t *RegMask) const {
3564
3565 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3566 // Assign locations to each value returned by this call.
3567 SmallVector<CCValAssign, 16> RVLocs;
3568 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3569 *DAG.getContext());
3570 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3571
3572 // Copy all of the result registers out of their specified physreg.
3573 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3574 ++I, ++InsIndex) {
3575 CCValAssign &VA = RVLocs[I];
3576 EVT CopyVT = VA.getLocVT();
3577
3578 // In some calling conventions we need to remove the used registers
3579 // from the register mask.
3580 if (RegMask) {
3581 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3582 SubRegs.isValid(); ++SubRegs)
3583 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3584 }
3585
3586 // Report an error if there was an attempt to return FP values via XMM
3587 // registers.
3588 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3589 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3590 if (VA.getLocReg() == X86::XMM1)
3591 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3592 else
3593 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3594 } else if (!Subtarget.hasSSE2() &&
3595 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3596 CopyVT == MVT::f64) {
3597 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3598 if (VA.getLocReg() == X86::XMM1)
3599 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3600 else
3601 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3602 }
3603
3604 // If we prefer to use the value in xmm registers, copy it out as f80 and
3605 // use a truncate to move it from fp stack reg to xmm reg.
3606 bool RoundAfterCopy = false;
3607 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3608 isScalarFPTypeInSSEReg(VA.getValVT())) {
3609 if (!Subtarget.hasX87())
3610 report_fatal_error("X87 register return with X87 disabled");
3611 CopyVT = MVT::f80;
3612 RoundAfterCopy = (CopyVT != VA.getLocVT());
3613 }
3614
3615 SDValue Val;
3616 if (VA.needsCustom()) {
3617       assert(VA.getValVT() == MVT::v64i1 &&
3618              "Currently the only custom case is when we split v64i1 to 2 regs");
3619 Val =
3620 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
3621 } else {
3622 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
3623 .getValue(1);
3624 Val = Chain.getValue(0);
3625 InGlue = Chain.getValue(2);
3626 }
3627
3628 if (RoundAfterCopy)
3629 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3630 // This truncation won't change the value.
3631 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3632
3633 if (VA.isExtInLoc()) {
3634 if (VA.getValVT().isVector() &&
3635 VA.getValVT().getScalarType() == MVT::i1 &&
3636 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3637 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3638 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3639 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3640 } else
3641 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3642 }
3643
3644 if (VA.getLocInfo() == CCValAssign::BCvt)
3645 Val = DAG.getBitcast(VA.getValVT(), Val);
3646
3647 InVals.push_back(Val);
3648 }
3649
3650 return Chain;
3651}
3652
3653//===----------------------------------------------------------------------===//
3654// C & StdCall & Fast Calling Convention implementation
3655//===----------------------------------------------------------------------===//
3656 // The StdCall calling convention is the standard for many Windows API
3657 // routines. It differs from the C calling convention only slightly: the
3658 // callee cleans up the stack instead of the caller, and symbols are
3659 // decorated in a special way. It doesn't support any vector arguments.
3660// For info on fast calling convention see Fast Calling Convention (tail call)
3661// implementation LowerX86_32FastCCCallTo.
3662
3663/// Determines whether Args, either a set of outgoing arguments to a call, or a
3664/// set of incoming args of a call, contains an sret pointer that the callee
3665/// pops
3666template <typename T>
3667static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3668 const X86Subtarget &Subtarget) {
3669 // Not C++20 (yet), so no concepts available.
3670 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3671 std::is_same_v<T, ISD::InputArg>,
3672 "requires ISD::OutputArg or ISD::InputArg");
3673
3674 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3675 // for most compilations.
3676 if (!Subtarget.is32Bit())
3677 return false;
3678
3679 if (Args.empty())
3680 return false;
3681
3682 // Most calls do not have an sret argument, check the arg next.
3683 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3684 if (!Flags.isSRet() || Flags.isInReg())
3685 return false;
3686
3687  // The MSVC ABI does not pop the sret.
3688 if (Subtarget.getTargetTriple().isOSMSVCRT())
3689 return false;
3690
3691 // MCUs don't pop the sret
3692 if (Subtarget.isTargetMCU())
3693 return false;
3694
3695 // Callee pops argument
3696 return true;
3697}
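// For example (illustrative): on a 32-bit ELF target, a call whose first
// argument carries the sret flag (and is not inreg) makes hasCalleePopSRet
// return true, so the callee is expected to pop the 4-byte hidden pointer
// (a "ret 4"). On MSVC and MCU targets, and on any 64-bit target, it returns
// false and the pointer is left for the caller.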
3698
3699/// Make a copy of an aggregate at address specified by "Src" to address
3700/// "Dst" with size and alignment information specified by the specific
3701/// parameter attribute. The copy will be passed as a byval function parameter.
3702static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3703 SDValue Chain, ISD::ArgFlagsTy Flags,
3704 SelectionDAG &DAG, const SDLoc &dl) {
3705 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3706
3707 return DAG.getMemcpy(
3708 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3709 /*isVolatile*/ false, /*AlwaysInline=*/true,
3710 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3711}
3712
3713/// Return true if the calling convention is one that we can guarantee TCO for.
3714static bool canGuaranteeTCO(CallingConv::ID CC) {
3715 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3716 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3717 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3718}
3719
3720/// Return true if we might ever do TCO for calls with this calling convention.
3721static bool mayTailCallThisCC(CallingConv::ID CC) {
3722 switch (CC) {
3723 // C calling conventions:
3724 case CallingConv::C:
3725 case CallingConv::Win64:
3726 case CallingConv::X86_64_SysV:
3727 // Callee pop conventions:
3728 case CallingConv::X86_ThisCall:
3729 case CallingConv::X86_StdCall:
3730 case CallingConv::X86_VectorCall:
3731 case CallingConv::X86_FastCall:
3732 // Swift:
3733 case CallingConv::Swift:
3734 return true;
3735 default:
3736 return canGuaranteeTCO(CC);
3737 }
3738}
3739
3740/// Return true if the function is being made into a tailcall target by
3741/// changing its ABI.
3742static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3743 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3744 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3745}
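// Putting the predicates above together (illustrative values):
//   shouldGuaranteeTCO(CallingConv::Fast,      false) == false  // needs -tailcallopt
//   shouldGuaranteeTCO(CallingConv::Fast,      true)  == true
//   shouldGuaranteeTCO(CallingConv::Tail,      false) == true   // tailcc always
//   shouldGuaranteeTCO(CallingConv::SwiftTail, false) == true
//   shouldGuaranteeTCO(CallingConv::C,         true)  == false  // C can't guarantee TCO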
3746
3747bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3748 if (!CI->isTailCall())
3749 return false;
3750
3751 CallingConv::ID CalleeCC = CI->getCallingConv();
3752 if (!mayTailCallThisCC(CalleeCC))
3753 return false;
3754
3755 return true;
3756}
3757
3758SDValue
3759X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3760 const SmallVectorImpl<ISD::InputArg> &Ins,
3761 const SDLoc &dl, SelectionDAG &DAG,
3762 const CCValAssign &VA,
3763 MachineFrameInfo &MFI, unsigned i) const {
3764 // Create the nodes corresponding to a load from this parameter slot.
3765 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3766 bool AlwaysUseMutable = shouldGuaranteeTCO(
3767 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3768 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3769 EVT ValVT;
3770 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3771
3772  // If the value is passed by pointer, we receive the address instead of the
3773  // value itself. No need to extend if the mask value and location share the
3774  // same absolute size.
3775 bool ExtendedInMem =
3776 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3777 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3778
3779 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3780 ValVT = VA.getLocVT();
3781 else
3782 ValVT = VA.getValVT();
3783
3784 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3785 // changed with more analysis.
3786  // In case of tail call optimization, mark all arguments mutable, since they
3787  // could be overwritten by the lowering of arguments of a tail call.
3788 if (Flags.isByVal()) {
3789 unsigned Bytes = Flags.getByValSize();
3790 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3791
3792 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3793 // can be improved with deeper analysis.
3794 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3795 /*isAliased=*/true);
3796 return DAG.getFrameIndex(FI, PtrVT);
3797 }
3798
3799 EVT ArgVT = Ins[i].ArgVT;
3800
3801 // If this is a vector that has been split into multiple parts, and the
3802  // scalar size of the parts doesn't match the vector element size, then we can't
3803 // elide the copy. The parts will have padding between them instead of being
3804 // packed like a vector.
3805 bool ScalarizedAndExtendedVector =
3806 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3807 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3808
3809 // This is an argument in memory. We might be able to perform copy elision.
3810 // If the argument is passed directly in memory without any extension, then we
3811 // can perform copy elision. Large vector types, for example, may be passed
3812 // indirectly by pointer.
3813 if (Flags.isCopyElisionCandidate() &&
3814 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3815 !ScalarizedAndExtendedVector) {
3816 SDValue PartAddr;
3817 if (Ins[i].PartOffset == 0) {
3818 // If this is a one-part value or the first part of a multi-part value,
3819 // create a stack object for the entire argument value type and return a
3820 // load from our portion of it. This assumes that if the first part of an
3821 // argument is in memory, the rest will also be in memory.
3822 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3823 /*IsImmutable=*/false);
3824 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3825 return DAG.getLoad(
3826 ValVT, dl, Chain, PartAddr,
3827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3828 } else {
3829 // This is not the first piece of an argument in memory. See if there is
3830 // already a fixed stack object including this offset. If so, assume it
3831 // was created by the PartOffset == 0 branch above and create a load from
3832 // the appropriate offset into it.
3833 int64_t PartBegin = VA.getLocMemOffset();
3834 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3835 int FI = MFI.getObjectIndexBegin();
3836 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3837 int64_t ObjBegin = MFI.getObjectOffset(FI);
3838 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3839 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3840 break;
3841 }
3842 if (MFI.isFixedObjectIndex(FI)) {
3843 SDValue Addr =
3844 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3845 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3846 return DAG.getLoad(
3847 ValVT, dl, Chain, Addr,
3848 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3849 Ins[i].PartOffset));
3850 }
3851 }
3852 }
3853
3854 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3855 VA.getLocMemOffset(), isImmutable);
3856
3857 // Set SExt or ZExt flag.
3858 if (VA.getLocInfo() == CCValAssign::ZExt) {
3859 MFI.setObjectZExt(FI, true);
3860 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3861 MFI.setObjectSExt(FI, true);
3862 }
3863
3864 MaybeAlign Alignment;
3865 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3866 ValVT != MVT::f80)
3867 Alignment = MaybeAlign(4);
3868 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3869 SDValue Val = DAG.getLoad(
3870 ValVT, dl, Chain, FIN,
3871 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3872 Alignment);
3873 return ExtendedInMem
3874 ? (VA.getValVT().isVector()
3875 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3876 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3877 : Val;
3878}
3879
3880// FIXME: Get this from tablegen.
3881static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3882 const X86Subtarget &Subtarget) {
3883  assert(Subtarget.is64Bit());
3884
3885 if (Subtarget.isCallingConvWin64(CallConv)) {
3886 static const MCPhysReg GPR64ArgRegsWin64[] = {
3887 X86::RCX, X86::RDX, X86::R8, X86::R9
3888 };
3889 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3890 }
3891
3892 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3893 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3894 };
3895 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3896}
3897
3898// FIXME: Get this from tablegen.
3899static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3900 CallingConv::ID CallConv,
3901 const X86Subtarget &Subtarget) {
3902  assert(Subtarget.is64Bit());
3903 if (Subtarget.isCallingConvWin64(CallConv)) {
3904 // The XMM registers which might contain var arg parameters are shadowed
3905 // in their paired GPR. So we only need to save the GPR to their home
3906 // slots.
3907 // TODO: __vectorcall will change this.
3908 return std::nullopt;
3909 }
3910
3911 bool isSoftFloat = Subtarget.useSoftFloat();
3912 if (isSoftFloat || !Subtarget.hasSSE1())
3913 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3914 // registers.
3915 return std::nullopt;
3916
3917 static const MCPhysReg XMMArgRegs64Bit[] = {
3918 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3919 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3920 };
3921 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3922}
3923
3924#ifndef NDEBUG
3925static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3926 return llvm::is_sorted(
3927 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3928 return A.getValNo() < B.getValNo();
3929 });
3930}
3931#endif
3932
3933namespace {
3934/// This is a helper class for lowering variable arguments parameters.
3935class VarArgsLoweringHelper {
3936public:
3937 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3938 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3939 CallingConv::ID CallConv, CCState &CCInfo)
3940 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3941 TheMachineFunction(DAG.getMachineFunction()),
3942 TheFunction(TheMachineFunction.getFunction()),
3943 FrameInfo(TheMachineFunction.getFrameInfo()),
3944 FrameLowering(*Subtarget.getFrameLowering()),
3945 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3946 CCInfo(CCInfo) {}
3947
3948 // Lower variable arguments parameters.
3949 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3950
3951private:
3952 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3953
3954 void forwardMustTailParameters(SDValue &Chain);
3955
3956 bool is64Bit() const { return Subtarget.is64Bit(); }
3957 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3958
3959 X86MachineFunctionInfo *FuncInfo;
3960 const SDLoc &DL;
3961 SelectionDAG &DAG;
3962 const X86Subtarget &Subtarget;
3963 MachineFunction &TheMachineFunction;
3964 const Function &TheFunction;
3965 MachineFrameInfo &FrameInfo;
3966 const TargetFrameLowering &FrameLowering;
3967 const TargetLowering &TargLowering;
3968 CallingConv::ID CallConv;
3969 CCState &CCInfo;
3970};
3971} // namespace
3972
3973void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3974 SDValue &Chain, unsigned StackSize) {
3975  // If the function takes a variable number of arguments, make a frame index for
3976 // the start of the first vararg value... for expansion of llvm.va_start. We
3977 // can skip this if there are no va_start calls.
3978 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3979 CallConv != CallingConv::X86_ThisCall)) {
3980 FuncInfo->setVarArgsFrameIndex(
3981 FrameInfo.CreateFixedObject(1, StackSize, true));
3982 }
3983
3984 // 64-bit calling conventions support varargs and register parameters, so we
3985 // have to do extra work to spill them in the prologue.
3986 if (is64Bit()) {
3987 // Find the first unallocated argument registers.
3988 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3989 ArrayRef<MCPhysReg> ArgXMMs =
3990 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3991 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3992 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3993
3994    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3995           "SSE register cannot be used when SSE is disabled!");
3996
3997 if (isWin64()) {
3998 // Get to the caller-allocated home save location. Add 8 to account
3999 // for the return address.
4000 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4001 FuncInfo->setRegSaveFrameIndex(
4002 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4003 // Fixup to set vararg frame on shadow area (4 x i64).
4004 if (NumIntRegs < 4)
4005 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4006 } else {
4007 // For X86-64, if there are vararg parameters that are passed via
4008 // registers, then we must store them to their spots on the stack so
4009 // they may be loaded by dereferencing the result of va_next.
4010 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4011 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4012 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4013 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4014 }
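    // Worked example (SysV x86-64, illustrative): ArgGPRs has 6 entries and
    // ArgXMMs has 8, so the register save area is 6*8 + 8*16 = 176 bytes. If,
    // say, 2 GPRs and 3 XMMs were consumed by named arguments, the offsets are
    // VarArgsGPOffset = 2*8 = 16 and VarArgsFPOffset = 48 + 3*16 = 96.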
4015
4016 SmallVector<SDValue, 6>
4017 LiveGPRs; // list of SDValue for GPR registers keeping live input value
4018 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
4019 // keeping live input value
4020 SDValue ALVal; // if applicable keeps SDValue for %al register
4021
4022 // Gather all the live in physical registers.
4023 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4024 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4025 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4026 }
4027 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4028 if (!AvailableXmms.empty()) {
4029 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4030 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4031 for (MCPhysReg Reg : AvailableXmms) {
4032        // FastRegisterAllocator spills virtual registers at basic
4033        // block boundaries, which leads to uses of XMM registers
4034        // outside of the check for %al. Pass physical registers to
4035        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4036 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4037 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4038 }
4039 }
4040
4041 // Store the integer parameter registers.
4042 SmallVector<SDValue, 8> MemOps;
4043 SDValue RSFIN =
4044 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4045 TargLowering.getPointerTy(DAG.getDataLayout()));
4046 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4047 for (SDValue Val : LiveGPRs) {
4048 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4049 TargLowering.getPointerTy(DAG.getDataLayout()),
4050 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4051 SDValue Store =
4052 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4053 MachinePointerInfo::getFixedStack(
4054 DAG.getMachineFunction(),
4055 FuncInfo->getRegSaveFrameIndex(), Offset));
4056 MemOps.push_back(Store);
4057 Offset += 8;
4058 }
4059
4060 // Now store the XMM (fp + vector) parameter registers.
4061 if (!LiveXMMRegs.empty()) {
4062 SmallVector<SDValue, 12> SaveXMMOps;
4063 SaveXMMOps.push_back(Chain);
4064 SaveXMMOps.push_back(ALVal);
4065 SaveXMMOps.push_back(RSFIN);
4066 SaveXMMOps.push_back(
4067 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4068 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4069 MachineMemOperand *StoreMMO =
4070 DAG.getMachineFunction().getMachineMemOperand(
4071 MachinePointerInfo::getFixedStack(
4072 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4073 Offset),
4074 MachineMemOperand::MOStore, 128, Align(16));
4075 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4076 DL, DAG.getVTList(MVT::Other),
4077 SaveXMMOps, MVT::i8, StoreMMO));
4078 }
4079
4080 if (!MemOps.empty())
4081 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4082 }
4083}
4084
4085void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4086 // Find the largest legal vector type.
4087 MVT VecVT = MVT::Other;
4088 // FIXME: Only some x86_32 calling conventions support AVX512.
4089 if (Subtarget.useAVX512Regs() &&
4090 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4091 CallConv == CallingConv::Intel_OCL_BI)))
4092 VecVT = MVT::v16f32;
4093 else if (Subtarget.hasAVX())
4094 VecVT = MVT::v8f32;
4095 else if (Subtarget.hasSSE2())
4096 VecVT = MVT::v4f32;
4097
4098 // We forward some GPRs and some vector types.
4099 SmallVector<MVT, 2> RegParmTypes;
4100 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4101 RegParmTypes.push_back(IntVT);
4102 if (VecVT != MVT::Other)
4103 RegParmTypes.push_back(VecVT);
4104
4105 // Compute the set of forwarded registers. The rest are scratch.
4106 SmallVectorImpl<ForwardedRegister> &Forwards =
4107 FuncInfo->getForwardedMustTailRegParms();
4108 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4109
4110 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4111 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4112 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4113 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4114 }
4115
4116 // Copy all forwards from physical to virtual registers.
4117 for (ForwardedRegister &FR : Forwards) {
4118 // FIXME: Can we use a less constrained schedule?
4119 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4120 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4121 TargLowering.getRegClassFor(FR.VT));
4122 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4123 }
4124}
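// Roughly what this amounts to (illustrative, assuming SysV x86-64 with AVX and
// no named arguments consuming registers): RegParmTypes = {i64, v8f32}, so the
// unallocated RDI/RSI/RDX/RCX/R8/R9, YMM0-YMM7 and AL are copied into fresh
// virtual registers here and re-materialized at the musttail call site.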
4125
4126void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4127 unsigned StackSize) {
4128 // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
4129  // If necessary, it will be set to the correct value later.
4130 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4131 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4132
4133 if (FrameInfo.hasVAStart())
4134 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4135
4136 if (FrameInfo.hasMustTailInVarArgFunc())
4137 forwardMustTailParameters(Chain);
4138}
4139
4140SDValue X86TargetLowering::LowerFormalArguments(
4141 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4142 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4143 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4144 MachineFunction &MF = DAG.getMachineFunction();
4145 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4146
4147 const Function &F = MF.getFunction();
4148 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4149 F.getName() == "main")
4150 FuncInfo->setForceFramePointer(true);
4151
4152 MachineFrameInfo &MFI = MF.getFrameInfo();
4153 bool Is64Bit = Subtarget.is64Bit();
4154 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4155
4156  assert(
4157      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4158      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4159
4160 // Assign locations to all of the incoming arguments.
4161 SmallVector<CCValAssign, 16> ArgLocs;
4162 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4163
4164 // Allocate shadow area for Win64.
4165 if (IsWin64)
4166 CCInfo.AllocateStack(32, Align(8));
4167
4168 CCInfo.AnalyzeArguments(Ins, CC_X86);
4169
4170 // In vectorcall calling convention a second pass is required for the HVA
4171 // types.
4172 if (CallingConv::X86_VectorCall == CallConv) {
4173 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4174 }
4175
4176  // The next loop assumes that the locations are in the same order as the
4177 // input arguments.
4178  assert(isSortedByValueNo(ArgLocs) &&
4179         "Argument Location list must be sorted before lowering");
4180
4181 SDValue ArgValue;
4182 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4183 ++I, ++InsIndex) {
4184    assert(InsIndex < Ins.size() && "Invalid Ins index");
4185 CCValAssign &VA = ArgLocs[I];
4186
4187 if (VA.isRegLoc()) {
4188 EVT RegVT = VA.getLocVT();
4189 if (VA.needsCustom()) {
4190        assert(
4191            VA.getValVT() == MVT::v64i1 &&
4192            "Currently the only custom case is when we split v64i1 to 2 regs");
4193
4194 // v64i1 values, in regcall calling convention, that are
4195 // compiled to 32 bit arch, are split up into two registers.
4196 ArgValue =
4197 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4198 } else {
4199 const TargetRegisterClass *RC;
4200 if (RegVT == MVT::i8)
4201 RC = &X86::GR8RegClass;
4202 else if (RegVT == MVT::i16)
4203 RC = &X86::GR16RegClass;
4204 else if (RegVT == MVT::i32)
4205 RC = &X86::GR32RegClass;
4206 else if (Is64Bit && RegVT == MVT::i64)
4207 RC = &X86::GR64RegClass;
4208 else if (RegVT == MVT::f16)
4209 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4210 else if (RegVT == MVT::f32)
4211 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4212 else if (RegVT == MVT::f64)
4213 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4214 else if (RegVT == MVT::f80)
4215 RC = &X86::RFP80RegClass;
4216 else if (RegVT == MVT::f128)
4217 RC = &X86::VR128RegClass;
4218 else if (RegVT.is512BitVector())
4219 RC = &X86::VR512RegClass;
4220 else if (RegVT.is256BitVector())
4221 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4222 else if (RegVT.is128BitVector())
4223 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4224 else if (RegVT == MVT::x86mmx)
4225 RC = &X86::VR64RegClass;
4226 else if (RegVT == MVT::v1i1)
4227 RC = &X86::VK1RegClass;
4228 else if (RegVT == MVT::v8i1)
4229 RC = &X86::VK8RegClass;
4230 else if (RegVT == MVT::v16i1)
4231 RC = &X86::VK16RegClass;
4232 else if (RegVT == MVT::v32i1)
4233 RC = &X86::VK32RegClass;
4234 else if (RegVT == MVT::v64i1)
4235 RC = &X86::VK64RegClass;
4236 else
4237        llvm_unreachable("Unknown argument type!");
4238
4239 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4240 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4241 }
4242
4243 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4244 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4245 // right size.
4246 if (VA.getLocInfo() == CCValAssign::SExt)
4247 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4248 DAG.getValueType(VA.getValVT()));
4249 else if (VA.getLocInfo() == CCValAssign::ZExt)
4250 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4251 DAG.getValueType(VA.getValVT()));
4252 else if (VA.getLocInfo() == CCValAssign::BCvt)
4253 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4254
4255 if (VA.isExtInLoc()) {
4256 // Handle MMX values passed in XMM regs.
4257 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4258 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4259 else if (VA.getValVT().isVector() &&
4260 VA.getValVT().getScalarType() == MVT::i1 &&
4261 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4262 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4263 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4264 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4265 } else
4266 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4267 }
4268 } else {
4269      assert(VA.isMemLoc());
4270 ArgValue =
4271 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4272 }
4273
4274    // If the value is passed via a pointer, do a load.
4275 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4276 ArgValue =
4277 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4278
4279 InVals.push_back(ArgValue);
4280 }
4281
4282 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4283 if (Ins[I].Flags.isSwiftAsync()) {
4284 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4285 if (Subtarget.is64Bit())
4286 X86FI->setHasSwiftAsyncContext(true);
4287 else {
4288 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4289 X86FI->setSwiftAsyncContextFrameIdx(FI);
4290 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4291 DAG.getFrameIndex(FI, MVT::i32),
4292 MachinePointerInfo::getFixedStack(MF, FI));
4293 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4294 }
4295 }
4296
4297 // Swift calling convention does not require we copy the sret argument
4298 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4299 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4300 continue;
4301
4302 // All x86 ABIs require that for returning structs by value we copy the
4303 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4304 // the argument into a virtual register so that we can access it from the
4305 // return points.
4306 if (Ins[I].Flags.isSRet()) {
4307      assert(!FuncInfo->getSRetReturnReg() &&
4308             "SRet return has already been set");
4309 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4310 Register Reg =
4311 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4312 FuncInfo->setSRetReturnReg(Reg);
4313 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4314 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4315 break;
4316 }
4317 }
4318
4319 unsigned StackSize = CCInfo.getNextStackOffset();
4320 // Align stack specially for tail calls.
4321 if (shouldGuaranteeTCO(CallConv,
4322 MF.getTarget().Options.GuaranteedTailCallOpt))
4323 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4324
4325 if (IsVarArg)
4326 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4327 .lowerVarArgsParameters(Chain, StackSize);
4328
4329 // Some CCs need callee pop.
4330 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4331 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4332 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4333 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4334 // X86 interrupts must pop the error code (and the alignment padding) if
4335 // present.
4336 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
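    // E.g. (illustrative breakdown): in 64-bit mode the 8-byte error code plus
    // 8 bytes of alignment padding are popped (16 total); in 32-bit mode only
    // the 4-byte error code is popped.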
4337 } else {
4338 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4339 // If this is an sret function, the return should pop the hidden pointer.
4340 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4341 FuncInfo->setBytesToPopOnReturn(4);
4342 }
4343
4344 if (!Is64Bit) {
4345 // RegSaveFrameIndex is X86-64 only.
4346 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4347 }
4348
4349 FuncInfo->setArgumentStackSize(StackSize);
4350
4351 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4352 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4353 if (Personality == EHPersonality::CoreCLR) {
4354      assert(Is64Bit);
4355 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4356 // that we'd prefer this slot be allocated towards the bottom of the frame
4357 // (i.e. near the stack pointer after allocating the frame). Every
4358 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4359 // offset from the bottom of this and each funclet's frame must be the
4360 // same, so the size of funclets' (mostly empty) frames is dictated by
4361 // how far this slot is from the bottom (since they allocate just enough
4362 // space to accommodate holding this slot at the correct offset).
4363 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4364 EHInfo->PSPSymFrameIdx = PSPSymFI;
4365 }
4366 }
4367
4368 if (shouldDisableArgRegFromCSR(CallConv) ||
4369 F.hasFnAttribute("no_caller_saved_registers")) {
4370 MachineRegisterInfo &MRI = MF.getRegInfo();
4371 for (std::pair<Register, Register> Pair : MRI.liveins())
4372 MRI.disableCalleeSavedRegister(Pair.first);
4373 }
4374
4375 return Chain;
4376}
4377
4378SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4379 SDValue Arg, const SDLoc &dl,
4380 SelectionDAG &DAG,
4381 const CCValAssign &VA,
4382 ISD::ArgFlagsTy Flags,
4383 bool isByVal) const {
4384 unsigned LocMemOffset = VA.getLocMemOffset();
4385 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4386 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4387 StackPtr, PtrOff);
4388 if (isByVal)
4389 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4390
4391 MaybeAlign Alignment;
4392 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4393 Arg.getSimpleValueType() != MVT::f80)
4394 Alignment = MaybeAlign(4);
4395 return DAG.getStore(
4396 Chain, dl, Arg, PtrOff,
4397 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4398 Alignment);
4399}
4400
4401/// Emit a load of return address if tail call
4402/// optimization is performed and it is required.
4403SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4404 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4405 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4406 // Adjust the Return address stack slot.
4407 EVT VT = getPointerTy(DAG.getDataLayout());
4408 OutRetAddr = getReturnAddressFrameIndex(DAG);
4409
4410 // Load the "old" Return address.
4411 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4412 return SDValue(OutRetAddr.getNode(), 1);
4413}
4414
4415/// Emit a store of the return address if tail call
4416/// optimization is performed and it is required (FPDiff!=0).
4417static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4418 SDValue Chain, SDValue RetAddrFrIdx,
4419 EVT PtrVT, unsigned SlotSize,
4420 int FPDiff, const SDLoc &dl) {
4421 // Store the return address to the appropriate stack slot.
4422 if (!FPDiff) return Chain;
4423 // Calculate the new stack slot for the return address.
4424 int NewReturnAddrFI =
4425 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4426 false);
4427 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4428 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4429 MachinePointerInfo::getFixedStack(
4430 DAG.getMachineFunction(), NewReturnAddrFI));
4431 return Chain;
4432}
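// For instance (illustrative numbers): with an 8-byte slot and FPDiff == 16,
// the return address read by EmitTailCallLoadRetAddr is re-stored into a fixed
// object at offset 16 - 8 == 8, where the tail-called function expects to find
// it.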
4433
4434 /// Returns a vector_shuffle mask for a movs{s|d} or movd
4435 /// operation of the specified width.
4436static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4437 SDValue V2) {
4438 unsigned NumElems = VT.getVectorNumElements();
4439 SmallVector<int, 8> Mask;
4440 Mask.push_back(NumElems);
4441 for (unsigned i = 1; i != NumElems; ++i)
4442 Mask.push_back(i);
4443 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4444}
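// For example, with VT == MVT::v4f32 the mask built above is {4, 1, 2, 3}:
// element 0 of the result comes from V2 and elements 1-3 come from V1, which
// is exactly the movss pattern.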
4445
4446SDValue
4447X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4448 SmallVectorImpl<SDValue> &InVals) const {
4449 SelectionDAG &DAG = CLI.DAG;
4450 SDLoc &dl = CLI.DL;
4451 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4452 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4453 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4454 SDValue Chain = CLI.Chain;
4455 SDValue Callee = CLI.Callee;
4456 CallingConv::ID CallConv = CLI.CallConv;
4457 bool &isTailCall = CLI.IsTailCall;
4458 bool isVarArg = CLI.IsVarArg;
4459 const auto *CB = CLI.CB;
4460
4461 MachineFunction &MF = DAG.getMachineFunction();
4462 bool Is64Bit = Subtarget.is64Bit();
4463 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4464 bool IsSibcall = false;
4465 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4466 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4467 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4468 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4469 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4470 CB->hasFnAttr("no_caller_saved_registers"));
4471 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4472 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4473 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4474 const Module *M = MF.getMMI().getModule();
4475 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4476
4477 MachineFunction::CallSiteInfo CSInfo;
4478 if (CallConv == CallingConv::X86_INTR)
4479 report_fatal_error("X86 interrupts may not be called directly");
4480
4481 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4482 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4483 // If we are using a GOT, disable tail calls to external symbols with
4484 // default visibility. Tail calling such a symbol requires using a GOT
4485 // relocation, which forces early binding of the symbol. This breaks code
4486    // that requires lazy function symbol resolution. Using musttail or
4487 // GuaranteedTailCallOpt will override this.
4488 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4489 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4490 G->getGlobal()->hasDefaultVisibility()))
4491 isTailCall = false;
4492 }
4493
4494 if (isTailCall && !IsMustTail) {
4495 // Check if it's really possible to do a tail call.
4496 isTailCall = IsEligibleForTailCallOptimization(
4497 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4498 Ins, DAG);
4499
4500 // Sibcalls are automatically detected tailcalls which do not require
4501 // ABI changes.
4502 if (!IsGuaranteeTCO && isTailCall)
4503 IsSibcall = true;
4504
4505 if (isTailCall)
4506 ++NumTailCalls;
4507 }
4508
4509 if (IsMustTail && !isTailCall)
4510 report_fatal_error("failed to perform tail call elimination on a call "
4511 "site marked musttail");
4512
4513  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4514         "Var args not supported with calling convention fastcc, ghc or hipe");
4515
4516 // Analyze operands of the call, assigning locations to each operand.
4517 SmallVector<CCValAssign, 16> ArgLocs;
4518 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4519
4520 // Allocate shadow area for Win64.
4521 if (IsWin64)
4522 CCInfo.AllocateStack(32, Align(8));
4523
4524 CCInfo.AnalyzeArguments(Outs, CC_X86);
4525
4526 // In vectorcall calling convention a second pass is required for the HVA
4527 // types.
4528 if (CallingConv::X86_VectorCall == CallConv) {
4529 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4530 }
4531
4532 // Get a count of how many bytes are to be pushed on the stack.
4533 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4534 if (IsSibcall)
4535 // This is a sibcall. The memory operands are available in caller's
4536 // own caller's stack.
4537 NumBytes = 0;
4538 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4539 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4540
4541 int FPDiff = 0;
4542 if (isTailCall &&
4543 shouldGuaranteeTCO(CallConv,
4544 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4545 // Lower arguments at fp - stackoffset + fpdiff.
4546 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4547
4548 FPDiff = NumBytesCallerPushed - NumBytes;
4549
4550 // Set the delta of movement of the returnaddr stackslot.
4551 // But only set if delta is greater than previous delta.
4552 if (FPDiff < X86Info->getTCReturnAddrDelta())
4553 X86Info->setTCReturnAddrDelta(FPDiff);
4554 }
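    // Worked example (illustrative): if the caller itself pops 24 bytes of its
    // incoming arguments on return and this call only needs NumBytes == 8, then
    // FPDiff == 24 - 8 == 16, which is the delta applied to the tail-call
    // arguments and the return-address slot.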
4555
4556 unsigned NumBytesToPush = NumBytes;
4557 unsigned NumBytesToPop = NumBytes;
4558
4559 // If we have an inalloca argument, all stack space has already been allocated
4560  // for us and is right at the top of the stack. We don't support multiple
4561 // arguments passed in memory when using inalloca.
4562 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4563 NumBytesToPush = 0;
4564 if (!ArgLocs.back().isMemLoc())
4565 report_fatal_error("cannot use inalloca attribute on a register "
4566 "parameter");
4567 if (ArgLocs.back().getLocMemOffset() != 0)
4568 report_fatal_error("any parameter with the inalloca attribute must be "
4569 "the only memory argument");
4570 } else if (CLI.IsPreallocated) {
4571    assert(ArgLocs.back().isMemLoc() &&
4572           "cannot use preallocated attribute on a register "
4573           "parameter");
4574 SmallVector<size_t, 4> PreallocatedOffsets;
4575 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4576 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4577 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4578 }
4579 }
4580 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4581 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4582 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4583 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4584 NumBytesToPush = 0;
4585 }
4586
4587 if (!IsSibcall && !IsMustTail)
4588 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4589 NumBytes - NumBytesToPush, dl);
4590
4591 SDValue RetAddrFrIdx;
4592 // Load return address for tail calls.
4593 if (isTailCall && FPDiff)
4594 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4595 Is64Bit, FPDiff, dl);
4596
4597 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4598 SmallVector<SDValue, 8> MemOpChains;
4599 SDValue StackPtr;
4600
4601    // The next loop assumes that the locations are in the same order as the
4602 // input arguments.
4603  assert(isSortedByValueNo(ArgLocs) &&
4604         "Argument Location list must be sorted before lowering");
4605
4606 // Walk the register/memloc assignments, inserting copies/loads. In the case
4607  // of tail call optimization, arguments are handled later.
4608 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4609 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4610 ++I, ++OutIndex) {
4611    assert(OutIndex < Outs.size() && "Invalid Out index");
4612 // Skip inalloca/preallocated arguments, they have already been written.
4613 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4614 if (Flags.isInAlloca() || Flags.isPreallocated())
4615 continue;
4616
4617 CCValAssign &VA = ArgLocs[I];
4618 EVT RegVT = VA.getLocVT();
4619 SDValue Arg = OutVals[OutIndex];
4620 bool isByVal = Flags.isByVal();
4621
4622 // Promote the value if needed.
4623 switch (VA.getLocInfo()) {
4624    default: llvm_unreachable("Unknown loc info!");
4625 case CCValAssign::Full: break;
4626 case CCValAssign::SExt:
4627 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4628 break;
4629 case CCValAssign::ZExt:
4630 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4631 break;
4632 case CCValAssign::AExt:
4633 if (Arg.getValueType().isVector() &&
4634 Arg.getValueType().getVectorElementType() == MVT::i1)
4635 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4636 else if (RegVT.is128BitVector()) {
4637 // Special case: passing MMX values in XMM registers.
4638 Arg = DAG.getBitcast(MVT::i64, Arg);
4639 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4640 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4641 } else
4642 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4643 break;
4644 case CCValAssign::BCvt:
4645 Arg = DAG.getBitcast(RegVT, Arg);
4646 break;
4647 case CCValAssign::Indirect: {
4648 if (isByVal) {
4649 // Memcpy the argument to a temporary stack slot to prevent
4650 // the caller from seeing any modifications the callee may make
4651 // as guaranteed by the `byval` attribute.
4652 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4653 Flags.getByValSize(),
4654 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4655 SDValue StackSlot =
4656 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4657 Chain =
4658 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4659 // From now on treat this as a regular pointer
4660 Arg = StackSlot;
4661 isByVal = false;
4662 } else {
4663 // Store the argument.
4664 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4665 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4666 Chain = DAG.getStore(
4667 Chain, dl, Arg, SpillSlot,
4668 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4669 Arg = SpillSlot;
4670 }
4671 break;
4672 }
4673 }
4674
4675 if (VA.needsCustom()) {
4676      assert(VA.getValVT() == MVT::v64i1 &&
4677             "Currently the only custom case is when we split v64i1 to 2 regs");
4678 // Split v64i1 value into two registers
4679 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4680 } else if (VA.isRegLoc()) {
4681 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4682 const TargetOptions &Options = DAG.getTarget().Options;
4683 if (Options.EmitCallSiteInfo)
4684 CSInfo.emplace_back(VA.getLocReg(), I);
4685 if (isVarArg && IsWin64) {
4686        // The Win64 ABI requires an argument XMM reg to be copied to the
4687        // corresponding shadow reg if the callee is a varargs function.
4688 Register ShadowReg;
4689 switch (VA.getLocReg()) {
4690 case X86::XMM0: ShadowReg = X86::RCX; break;
4691 case X86::XMM1: ShadowReg = X86::RDX; break;
4692 case X86::XMM2: ShadowReg = X86::R8; break;
4693 case X86::XMM3: ShadowReg = X86::R9; break;
4694 }
4695 if (ShadowReg)
4696 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4697 }
4698 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4699      assert(VA.isMemLoc());
4700 if (!StackPtr.getNode())
4701 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4702 getPointerTy(DAG.getDataLayout()));
4703 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4704 dl, DAG, VA, Flags, isByVal));
4705 }
4706 }
4707
4708 if (!MemOpChains.empty())
4709 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4710
4711 if (Subtarget.isPICStyleGOT()) {
4712    // ELF / PIC requires the GOT pointer to be in the EBX register before
4713    // function calls made via the PLT (except for regcall).
4714 if (!isTailCall) {
4715      // An indirect call with the RegCall calling convention may use up all the
4716      // general-purpose registers, so it is not suitable to bind the EBX register
4717      // for the GOT address; just let the register allocator handle it.
4718 if (CallConv != CallingConv::X86_RegCall)
4719 RegsToPass.push_back(std::make_pair(
4720 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4721 getPointerTy(DAG.getDataLayout()))));
4722 } else {
4723 // If we are tail calling and generating PIC/GOT style code load the
4724 // address of the callee into ECX. The value in ecx is used as target of
4725 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4726 // for tail calls on PIC/GOT architectures. Normally we would just put the
4727 // address of GOT into ebx and then call target@PLT. But for tail calls
4728 // ebx would be restored (since ebx is callee saved) before jumping to the
4729 // target@PLT.
4730
4731 // Note: The actual moving to ECX is done further down.
4732 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4733 if (G && !G->getGlobal()->hasLocalLinkage() &&
4734 G->getGlobal()->hasDefaultVisibility())
4735 Callee = LowerGlobalAddress(Callee, DAG);
4736 else if (isa<ExternalSymbolSDNode>(Callee))
4737 Callee = LowerExternalSymbol(Callee, DAG);
4738 }
4739 }
4740
4741 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4742 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4743 // From AMD64 ABI document:
4744 // For calls that may call functions that use varargs or stdargs
4745 // (prototype-less calls or calls to functions containing ellipsis (...) in
4746    // the declaration) %al is used as a hidden argument to specify the number
4747    // of SSE registers used. The contents of %al do not need to match exactly
4748    // the number of registers, but must be an upper bound on the number of SSE
4749    // registers used and is in the range 0 - 8 inclusive.
4750
4751 // Count the number of XMM registers allocated.
4752 static const MCPhysReg XMMArgRegs[] = {
4753 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4754 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4755 };
4756 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4757    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4758           && "SSE registers cannot be used when SSE is disabled");
4759 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4760 DAG.getConstant(NumXMMRegs, dl,
4761 MVT::i8)));
4762 }
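  // E.g. (illustrative): a prototype-less call passing one double in XMM0 has
  // NumXMMRegs == 1, so the sequence above ends up materializing %al = 1 before
  // the call; a call that uses no SSE registers gets %al = 0.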
4763
4764 if (isVarArg && IsMustTail) {
4765 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4766 for (const auto &F : Forwards) {
4767 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4768 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4769 }
4770 }
4771
4772 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4773 // don't need this because the eligibility check rejects calls that require
4774 // shuffling arguments passed in memory.
4775 if (!IsSibcall && isTailCall) {
4776 // Force all the incoming stack arguments to be loaded from the stack
4777 // before any new outgoing arguments are stored to the stack, because the
4778 // outgoing stack slots may alias the incoming argument stack slots, and
4779 // the alias isn't otherwise explicit. This is slightly more conservative
4780 // than necessary, because it means that each store effectively depends
4781 // on every argument instead of just those arguments it would clobber.
4782 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4783
4784 SmallVector<SDValue, 8> MemOpChains2;
4785 SDValue FIN;
4786 int FI = 0;
4787 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4788 ++I, ++OutsIndex) {
4789 CCValAssign &VA = ArgLocs[I];
4790
4791 if (VA.isRegLoc()) {
4792 if (VA.needsCustom()) {
4793          assert((CallConv == CallingConv::X86_RegCall) &&
4794                 "Expecting custom case only in regcall calling convention");
4795 // This means that we are in a special case where one argument was
4796 // passed through two register locations - skip the next location.
4797 ++I;
4798 }
4799
4800 continue;
4801 }
4802
4803 assert(VA.isMemLoc());
4804 SDValue Arg = OutVals[OutsIndex];
4805 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4806 // Skip inalloca/preallocated arguments. They don't require any work.
4807 if (Flags.isInAlloca() || Flags.isPreallocated())
4808 continue;
4809 // Create frame index.
4810 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4811 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4812 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4813 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4814
4815 if (Flags.isByVal()) {
4816 // Copy relative to framepointer.
4817 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4818 if (!StackPtr.getNode())
4819 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4820 getPointerTy(DAG.getDataLayout()));
4821 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4822 StackPtr, Source);
4823
4824 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4825 ArgChain,
4826 Flags, DAG, dl));
4827 } else {
4828 // Store relative to framepointer.
4829 MemOpChains2.push_back(DAG.getStore(
4830 ArgChain, dl, Arg, FIN,
4831 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4832 }
4833 }
4834
4835 if (!MemOpChains2.empty())
4836 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4837
4838 // Store the return address to the appropriate stack slot.
4839 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4840 getPointerTy(DAG.getDataLayout()),
4841 RegInfo->getSlotSize(), FPDiff, dl);
4842 }
4843
4844 // Build a sequence of copy-to-reg nodes chained together with token chain
4845 // and glue operands which copy the outgoing args into registers.
4846 SDValue InGlue;
4847 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4848 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4849 RegsToPass[i].second, InGlue);
4850 InGlue = Chain.getValue(1);
4851 }
4852
4853 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4854 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4855 // In the 64-bit large code model, we have to make all calls
4856 // through a register, since the call instruction's 32-bit
4857 // pc-relative offset may not be large enough to hold the whole
4858 // address.
4859 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4860 Callee->getOpcode() == ISD::ExternalSymbol) {
4861 // Lower direct calls to global addresses and external symbols. Setting
4862 // ForCall to true here has the effect of removing WrapperRIP when possible
4863 // to allow direct calls to be selected without first materializing the
4864 // address into a register.
4865 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4866 } else if (Subtarget.isTarget64BitILP32() &&
4867 Callee.getValueType() == MVT::i32) {
4868 // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI
4869 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4870 }
4871
4872 // Returns a chain & a glue for retval copy to use.
4873 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4874 SmallVector<SDValue, 8> Ops;
4875
4876 if (!IsSibcall && isTailCall && !IsMustTail) {
4877 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
4878 InGlue = Chain.getValue(1);
4879 }
4880
4881 Ops.push_back(Chain);
4882 Ops.push_back(Callee);
4883
4884 if (isTailCall)
4885 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4886
4887 // Add argument registers to the end of the list so that they are known live
4888 // into the call.
4889 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4890 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4891 RegsToPass[i].second.getValueType()));
4892
4893 // Add a register mask operand representing the call-preserved registers.
4894 const uint32_t *Mask = [&]() {
4895 auto AdaptedCC = CallConv;
4896 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4897 // use X86_INTR calling convention because it has the same CSR mask
4898 // (same preserved registers).
4899 if (HasNCSR)
4900 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4901 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4902 // to use the CSR_NoRegs_RegMask.
4903 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4904 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4905 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4906 }();
4907 assert(Mask && "Missing call preserved mask for calling convention");
4908
4909 // If this is an invoke in a 32-bit function using a funclet-based
4910 // personality, assume the function clobbers all registers. If an exception
4911 // is thrown, the runtime will not restore CSRs.
4912 // FIXME: Model this more precisely so that we can register allocate across
4913 // the normal edge and spill and fill across the exceptional edge.
4914 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4915 const Function &CallerFn = MF.getFunction();
4916 EHPersonality Pers =
4917 CallerFn.hasPersonalityFn()
4918 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4919 : EHPersonality::Unknown;
4920 if (isFuncletEHPersonality(Pers))
4921 Mask = RegInfo->getNoPreservedMask();
4922 }
4923
4924 // Define a new register mask from the existing mask.
4925 uint32_t *RegMask = nullptr;
4926
4927 // In some calling conventions we need to remove the used physical registers
4928 // from the reg mask. Create a new RegMask for such calling conventions.
4929 // RegMask for calling conventions that disable only return registers (e.g.
4930 // preserve_most) will be modified later in LowerCallResult.
4931 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4932 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4933 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4934
4935 // Allocate a new Reg Mask and copy Mask.
4936 RegMask = MF.allocateRegMask();
4937 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4938 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4939
4940 // Make sure all sub registers of the argument registers are reset
4941 // in the RegMask.
4942 if (ShouldDisableArgRegs) {
4943 for (auto const &RegPair : RegsToPass)
4944 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4945 SubRegs.isValid(); ++SubRegs)
4946 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4947 }
4948
4949 // Create the RegMask Operand according to our updated mask.
4950 Ops.push_back(DAG.getRegisterMask(RegMask));
4951 } else {
4952 // Create the RegMask Operand according to the static mask.
4953 Ops.push_back(DAG.getRegisterMask(Mask));
4954 }
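// Standalone sketch of the bit manipulation used above to drop argument
// registers from the mask: a register mask is a packed array of uint32_t in
// which a set bit R means register R is preserved across the call, so
// clearing the bit tells the register allocator that the call may clobber it.
#include <cstdint>

static void clearRegInMask(uint32_t *Mask, unsigned Reg) {
  Mask[Reg / 32] &= ~(1u << (Reg % 32)); // select the word, then the bit in it
}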
4955
4956 if (InGlue.getNode())
4957 Ops.push_back(InGlue);
4958
4959 if (isTailCall) {
4960 // We used to do:
4961 //// If this is the first return lowered for this function, add the regs
4962 //// to the liveout set for the function.
4963 // This isn't right, although it's probably harmless on x86; liveouts
4964 // should be computed from returns not tail calls. Consider a void
4965 // function making a tail call to a function returning int.
4966 MF.getFrameInfo().setHasTailCall();
4967 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4968
4969 if (IsCFICall)
4970 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4971
4972 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4973 return Ret;
4974 }
4975
4976 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4977 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4978 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4979 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4980 // expanded to the call, directly followed by a special marker sequence and
4981 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4982 assert(!isTailCall &&
4983 "tail calls cannot be marked with clang.arc.attachedcall");
4984 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4985
4986 // Add a target global address for the retainRV/claimRV runtime function
4987 // just before the call target.
4988 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4989 auto PtrVT = getPointerTy(DAG.getDataLayout());
4990 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4991 Ops.insert(Ops.begin() + 1, GA);
4992 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4993 } else {
4994 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4995 }
4996
4997 if (IsCFICall)
4998 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4999
5000 InGlue = Chain.getValue(1);
5001 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5002 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5003
5004 // Save heapallocsite metadata.
5005 if (CLI.CB)
5006 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5007 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5008
5009 // Create the CALLSEQ_END node.
5010 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5011 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5012 DAG.getTarget().Options.GuaranteedTailCallOpt))
5013 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
5014 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5015 // If this call passes a struct-return pointer, the callee
5016 // pops that struct pointer.
5017 NumBytesForCalleeToPop = 4;
5018
5019 // Returns a glue for retval copy to use.
5020 if (!IsSibcall) {
5021 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5022 InGlue, dl);
5023 InGlue = Chain.getValue(1);
5024 }
5025
5026 // Handle result values, copying them out of physregs into vregs that we
5027 // return.
5028 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
5029 InVals, RegMask);
5030}
5031
5032//===----------------------------------------------------------------------===//
5033// Fast Calling Convention (tail call) implementation
5034//===----------------------------------------------------------------------===//
5035
5036 // Like stdcall, the callee cleans up the arguments, except that ECX is
5037// reserved for storing the tail called function address. Only 2 registers are
5038// free for argument passing (inreg). Tail call optimization is performed
5039// provided:
5040// * tailcallopt is enabled
5041// * caller/callee are fastcc
5042// On X86_64 architecture with GOT-style position independent code only local
5043// (within module) calls are supported at the moment.
5044 // To keep the stack aligned according to the platform ABI, the function
5045 // GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
5046 // of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.)
5047// If a tail called function callee has more arguments than the caller the
5048// caller needs to make sure that there is room to move the RETADDR to. This is
5049// achieved by reserving an area the size of the argument delta right after the
5050// original RETADDR, but before the saved framepointer or the spilled registers
5051// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
5052// stack layout:
5053// arg1
5054// arg2
5055// RETADDR
5056// [ new RETADDR
5057// move area ]
5058// (possible EBP)
5059// ESI
5060// EDI
5061// local1 ..
5062
5063 /// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
5064 /// requirement.
5065unsigned
5066X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5067 SelectionDAG &DAG) const {
5068 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5069 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5070 assert(StackSize % SlotSize == 0 &&
5071 "StackSize must be a multiple of SlotSize");
5072 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5073}
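// Self-contained sketch of the computation above (assumes the stack alignment
// is a power of two; the local alignTo mirrors llvm::alignTo): reserve one
// slot for the return address, round up to the alignment, then drop the slot
// again so the total including the pushed return address stays aligned.
static unsigned alignedArgStackSize(unsigned StackSize, unsigned SlotSize,
                                    unsigned StackAlignment) {
  auto alignTo = [](unsigned V, unsigned A) { return (V + A - 1) / A * A; };
  return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}
// e.g. StackSize = 20, SlotSize = 4, StackAlignment = 16 yields 28, which is
// the "16n + 12" shape mentioned in the doc comment above.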
5074
5075/// Return true if the given stack call argument is already available in the
5076/// same position (relatively) of the caller's incoming argument stack.
5077static
5078bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5079 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5080 const X86InstrInfo *TII, const CCValAssign &VA) {
5081 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5082
5083 for (;;) {
5084 // Look through nodes that don't alter the bits of the incoming value.
5085 unsigned Op = Arg.getOpcode();
5086 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5087 Arg = Arg.getOperand(0);
5088 continue;
5089 }
5090 if (Op == ISD::TRUNCATE) {
5091 const SDValue &TruncInput = Arg.getOperand(0);
5092 if (TruncInput.getOpcode() == ISD::AssertZext &&
5093 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5094 Arg.getValueType()) {
5095 Arg = TruncInput.getOperand(0);
5096 continue;
5097 }
5098 }
5099 break;
5100 }
5101
5102 int FI = INT_MAX;
5103 if (Arg.getOpcode() == ISD::CopyFromReg) {
5104 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5105 if (!VR.isVirtual())
5106 return false;
5107 MachineInstr *Def = MRI->getVRegDef(VR);
5108 if (!Def)
5109 return false;
5110 if (!Flags.isByVal()) {
5111 if (!TII->isLoadFromStackSlot(*Def, FI))
5112 return false;
5113 } else {
5114 unsigned Opcode = Def->getOpcode();
5115 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5116 Opcode == X86::LEA64_32r) &&
5117 Def->getOperand(1).isFI()) {
5118 FI = Def->getOperand(1).getIndex();
5119 Bytes = Flags.getByValSize();
5120 } else
5121 return false;
5122 }
5123 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5124 if (Flags.isByVal())
5125 // ByVal argument is passed in as a pointer but it's now being
5126 // dereferenced. e.g.
5127 // define @foo(%struct.X* %A) {
5128 // tail call @bar(%struct.X* byval %A)
5129 // }
5130 return false;
5131 SDValue Ptr = Ld->getBasePtr();
5132 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5133 if (!FINode)
5134 return false;
5135 FI = FINode->getIndex();
5136 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5137 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5138 FI = FINode->getIndex();
5139 Bytes = Flags.getByValSize();
5140 } else
5141 return false;
5142
5143 assert(FI != INT_MAX);
5144 if (!MFI.isFixedObjectIndex(FI))
5145 return false;
5146
5147 if (Offset != MFI.getObjectOffset(FI))
5148 return false;
5149
5150 // If this is not byval, check that the argument stack object is immutable.
5151 // inalloca and argument copy elision can create mutable argument stack
5152 // objects. Byval objects can be mutated, but a byval call intends to pass the
5153 // mutated memory.
5154 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5155 return false;
5156
5157 if (VA.getLocVT().getFixedSizeInBits() >
5158 Arg.getValueSizeInBits().getFixedValue()) {
5159 // If the argument location is wider than the argument type, check that any
5160 // extension flags match.
5161 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5162 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5163 return false;
5164 }
5165 }
5166
5167 return Bytes == MFI.getObjectSize(FI);
5168}
5169
5170/// Check whether the call is eligible for tail call optimization. Targets
5171/// that want to do tail call optimization should implement this function.
5172bool X86TargetLowering::IsEligibleForTailCallOptimization(
5173 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5174 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5175 const SmallVectorImpl<SDValue> &OutVals,
5176 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5177 if (!mayTailCallThisCC(CalleeCC))
5178 return false;
5179
5180 // If -tailcallopt is specified, make fastcc functions tail-callable.
5181 MachineFunction &MF = DAG.getMachineFunction();
5182 const Function &CallerF = MF.getFunction();
5183
5184 // If the function return type is x86_fp80 and the callee return type is not,
5185 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5186 // perform a tailcall optimization here.
5187 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5188 return false;
5189
5190 CallingConv::ID CallerCC = CallerF.getCallingConv();
5191 bool CCMatch = CallerCC == CalleeCC;
5192 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5193 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5194 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5195 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5196
5197 // Win64 functions have extra shadow space for argument homing. Don't do the
5198 // sibcall if the caller and callee have mismatched expectations for this
5199 // space.
5200 if (IsCalleeWin64 != IsCallerWin64)
5201 return false;
5202
5203 if (IsGuaranteeTCO) {
5204 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5205 return true;
5206 return false;
5207 }
5208
5209 // Look for obvious safe cases to perform tail call optimization that do not
5210 // require ABI changes. This is what gcc calls sibcall.
5211
5212 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5213 // emit a special epilogue.
5214 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5215 if (RegInfo->hasStackRealignment(MF))
5216 return false;
5217
5218 // Also avoid sibcall optimization if we're an sret return fn and the callee
5219 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5220 // insufficient.
5221 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5222 // For a compatible tail call the callee must return our sret pointer. So it
5223 // needs to be (a) an sret function itself and (b) we pass our sret as its
5224 // sret. Condition #b is harder to determine.
5225 return false;
5226 } else if (IsCalleePopSRet)
5227 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5228 // expect that.
5229 return false;
5230
5231 // Do not sibcall optimize vararg calls unless all arguments are passed via
5232 // registers.
5233 LLVMContext &C = *DAG.getContext();
5234 if (isVarArg && !Outs.empty()) {
5235 // Optimizing for varargs on Win64 is unlikely to be safe without
5236 // additional testing.
5237 if (IsCalleeWin64 || IsCallerWin64)
5238 return false;
5239
5240 SmallVector<CCValAssign, 16> ArgLocs;
5241 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5242
5243 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5244 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5245 if (!ArgLocs[i].isRegLoc())
5246 return false;
5247 }
5248
5249 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5250 // stack. Therefore, if it's not used by the call it is not safe to optimize
5251 // this into a sibcall.
5252 bool Unused = false;
5253 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5254 if (!Ins[i].Used) {
5255 Unused = true;
5256 break;
5257 }
5258 }
5259 if (Unused) {
5260 SmallVector<CCValAssign, 16> RVLocs;
5261 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5262 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5263 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5264 CCValAssign &VA = RVLocs[i];
5265 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5266 return false;
5267 }
5268 }
5269
5270 // Check that the call results are passed in the same way.
5271 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5272 RetCC_X86, RetCC_X86))
5273 return false;
5274 // The callee has to preserve all registers the caller needs to preserve.
5275 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5276 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5277 if (!CCMatch) {
5278 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5279 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5280 return false;
5281 }
5282
5283 unsigned StackArgsSize = 0;
5284
5285 // If the callee takes no arguments then go on to check the results of the
5286 // call.
5287 if (!Outs.empty()) {
5288 // Check if stack adjustment is needed. For now, do not do this if any
5289 // argument is passed on the stack.
5290 SmallVector<CCValAssign, 16> ArgLocs;
5291 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5292
5293 // Allocate shadow area for Win64
5294 if (IsCalleeWin64)
5295 CCInfo.AllocateStack(32, Align(8));
5296
5297 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5298 StackArgsSize = CCInfo.getNextStackOffset();
5299
5300 if (CCInfo.getNextStackOffset()) {
5301 // Check if the arguments are already laid out in the right way as
5302 // the caller's fixed stack objects.
5303 MachineFrameInfo &MFI = MF.getFrameInfo();
5304 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5305 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5306 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5307 CCValAssign &VA = ArgLocs[i];
5308 SDValue Arg = OutVals[i];
5309 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5310 if (VA.getLocInfo() == CCValAssign::Indirect)
5311 return false;
5312 if (!VA.isRegLoc()) {
5313 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5314 MFI, MRI, TII, VA))
5315 return false;
5316 }
5317 }
5318 }
5319
5320 bool PositionIndependent = isPositionIndependent();
5321 // If the tailcall address may be in a register, then make sure it's
5322 // possible to register allocate for it. In 32-bit, the call address can
5323 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5324 // callee-saved registers are restored. These happen to be the same
5325 // registers used to pass 'inreg' arguments so watch out for those.
5326 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5327 !isa<ExternalSymbolSDNode>(Callee)) ||
5328 PositionIndependent)) {
5329 unsigned NumInRegs = 0;
5330 // In PIC we need an extra register to formulate the address computation
5331 // for the callee.
5332 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5333
5334 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5335 CCValAssign &VA = ArgLocs[i];
5336 if (!VA.isRegLoc())
5337 continue;
5338 Register Reg = VA.getLocReg();
5339 switch (Reg) {
5340 default: break;
5341 case X86::EAX: case X86::EDX: case X86::ECX:
5342 if (++NumInRegs == MaxInRegs)
5343 return false;
5344 break;
5345 }
5346 }
5347 }
5348
5349 const MachineRegisterInfo &MRI = MF.getRegInfo();
5350 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5351 return false;
5352 }
5353
5354 bool CalleeWillPop =
5355 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5356 MF.getTarget().Options.GuaranteedTailCallOpt);
5357
5358 if (unsigned BytesToPop =
5359 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5360 // If we have bytes to pop, the callee must pop them.
5361 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5362 if (!CalleePopMatches)
5363 return false;
5364 } else if (CalleeWillPop && StackArgsSize > 0) {
5365 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5366 return false;
5367 }
5368
5369 return true;
5370}
5371
5372FastISel *
5373X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5374 const TargetLibraryInfo *libInfo) const {
5375 return X86::createFastISel(funcInfo, libInfo);
5376}
5377
5378//===----------------------------------------------------------------------===//
5379// Other Lowering Hooks
5380//===----------------------------------------------------------------------===//
5381
5382bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5383 bool AssumeSingleUse) {
5384 if (!AssumeSingleUse && !Op.hasOneUse())
5385 return false;
5386 if (!ISD::isNormalLoad(Op.getNode()))
5387 return false;
5388
5389 // If this is an unaligned vector, make sure the target supports folding it.
5390 auto *Ld = cast<LoadSDNode>(Op.getNode());
5391 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5392 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5393 return false;
5394
5395 // TODO: If this is a non-temporal load and the target has an instruction
5396 // for it, it should not be folded. See "useNonTemporalLoad()".
5397
5398 return true;
5399}
5400
5401bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5402 const X86Subtarget &Subtarget,
5403 bool AssumeSingleUse) {
5404 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5405 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5406 return false;
5407
5408 // We cannot replace a wide volatile load with a broadcast-from-memory,
5409 // because that would narrow the load, which isn't legal for volatiles.
5410 auto *Ld = cast<LoadSDNode>(Op.getNode());
5411 return !Ld->isVolatile() ||
5412 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5413}
5414
5415bool X86::mayFoldIntoStore(SDValue Op) {
5416 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5417}
5418
5419bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5420 if (Op.hasOneUse()) {
5421 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5422 return (ISD::ZERO_EXTEND == Opcode);
5423 }
5424 return false;
5425}
5426
5427static bool isTargetShuffle(unsigned Opcode) {
5428 switch(Opcode) {
5429 default: return false;
5430 case X86ISD::BLENDI:
5431 case X86ISD::PSHUFB:
5432 case X86ISD::PSHUFD:
5433 case X86ISD::PSHUFHW:
5434 case X86ISD::PSHUFLW:
5435 case X86ISD::SHUFP:
5436 case X86ISD::INSERTPS:
5437 case X86ISD::EXTRQI:
5438 case X86ISD::INSERTQI:
5439 case X86ISD::VALIGN:
5440 case X86ISD::PALIGNR:
5441 case X86ISD::VSHLDQ:
5442 case X86ISD::VSRLDQ:
5443 case X86ISD::MOVLHPS:
5444 case X86ISD::MOVHLPS:
5445 case X86ISD::MOVSHDUP:
5446 case X86ISD::MOVSLDUP:
5447 case X86ISD::MOVDDUP:
5448 case X86ISD::MOVSS:
5449 case X86ISD::MOVSD:
5450 case X86ISD::MOVSH:
5451 case X86ISD::UNPCKL:
5452 case X86ISD::UNPCKH:
5453 case X86ISD::VBROADCAST:
5454 case X86ISD::VPERMILPI:
5455 case X86ISD::VPERMILPV:
5456 case X86ISD::VPERM2X128:
5457 case X86ISD::SHUF128:
5458 case X86ISD::VPERMIL2:
5459 case X86ISD::VPERMI:
5460 case X86ISD::VPPERM:
5461 case X86ISD::VPERMV:
5462 case X86ISD::VPERMV3:
5463 case X86ISD::VZEXT_MOVL:
5464 return true;
5465 }
5466}
5467
5468static bool isTargetShuffleVariableMask(unsigned Opcode) {
5469 switch (Opcode) {
5470 default: return false;
5471 // Target Shuffles.
5472 case X86ISD::PSHUFB:
5473 case X86ISD::VPERMILPV:
5474 case X86ISD::VPERMIL2:
5475 case X86ISD::VPPERM:
5476 case X86ISD::VPERMV:
5477 case X86ISD::VPERMV3:
5478 return true;
5479 // 'Faux' Target Shuffles.
5480 case ISD::OR:
5481 case ISD::AND:
5482 case X86ISD::ANDNP:
5483 return true;
5484 }
5485}
5486
5487SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5488 MachineFunction &MF = DAG.getMachineFunction();
5489 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5490 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5491 int ReturnAddrIndex = FuncInfo->getRAIndex();
5492
5493 if (ReturnAddrIndex == 0) {
5494 // Set up a frame object for the return address.
5495 unsigned SlotSize = RegInfo->getSlotSize();
5496 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5497 -(int64_t)SlotSize,
5498 false);
5499 FuncInfo->setRAIndex(ReturnAddrIndex);
5500 }
5501
5502 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5503}
5504
5505bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5506 bool hasSymbolicDisplacement) {
5507 // Offset should fit into 32 bit immediate field.
5508 if (!isInt<32>(Offset))
5509 return false;
5510
5511 // If we don't have a symbolic displacement - we don't have any extra
5512 // restrictions.
5513 if (!hasSymbolicDisplacement)
5514 return true;
5515
5516 // FIXME: Some tweaks might be needed for medium code model.
5517 if (M != CodeModel::Small && M != CodeModel::Kernel)
5518 return false;
5519
5520 // For the small code model we assume that the last object is 16MB before the
5521 // end of the 31-bit boundary. We may also accept pretty large negative constants
5522 // knowing that all objects are in the positive half of the address space.
5523 if (M == CodeModel::Small && Offset < 16*1024*1024)
5524 return true;
5525
5526 // For the kernel code model we know that all objects reside in the negative half
5527 // of the 32-bit address space. We may not accept negative offsets, since they may
5528 // be just off, but we may accept pretty large positive ones.
5529 if (M == CodeModel::Kernel && Offset >= 0)
5530 return true;
5531
5532 return false;
5533}
5534
5535/// Determines whether the callee is required to pop its own arguments.
5536/// Callee pop is necessary to support tail calls.
5537bool X86::isCalleePop(CallingConv::ID CallingConv,
5538 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5539 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5540 // can guarantee TCO.
5541 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5542 return true;
5543
5544 switch (CallingConv) {
5545 default:
5546 return false;
5547 case CallingConv::X86_StdCall:
5548 case CallingConv::X86_FastCall:
5549 case CallingConv::X86_ThisCall:
5550 case CallingConv::X86_VectorCall:
5551 return !is64Bit;
5552 }
5553}
5554
5555 /// Return true if the condition is a signed comparison operation.
5556static bool isX86CCSigned(unsigned X86CC) {
5557 switch (X86CC) {
5558 default:
5559 llvm_unreachable("Invalid integer condition!");
5560 case X86::COND_E:
5561 case X86::COND_NE:
5562 case X86::COND_B:
5563 case X86::COND_A:
5564 case X86::COND_BE:
5565 case X86::COND_AE:
5566 return false;
5567 case X86::COND_G:
5568 case X86::COND_GE:
5569 case X86::COND_L:
5570 case X86::COND_LE:
5571 return true;
5572 }
5573}
5574
5575static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5576 switch (SetCCOpcode) {
5577 default: llvm_unreachable("Invalid integer condition!");
5578 case ISD::SETEQ: return X86::COND_E;
5579 case ISD::SETGT: return X86::COND_G;
5580 case ISD::SETGE: return X86::COND_GE;
5581 case ISD::SETLT: return X86::COND_L;
5582 case ISD::SETLE: return X86::COND_LE;
5583 case ISD::SETNE: return X86::COND_NE;
5584 case ISD::SETULT: return X86::COND_B;
5585 case ISD::SETUGT: return X86::COND_A;
5586 case ISD::SETULE: return X86::COND_BE;
5587 case ISD::SETUGE: return X86::COND_AE;
5588 }
5589}
5590
5591 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5592/// condition code, returning the condition code and the LHS/RHS of the
5593/// comparison to make.
5594static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5595 bool isFP, SDValue &LHS, SDValue &RHS,
5596 SelectionDAG &DAG) {
5597 if (!isFP) {
5598 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5599 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5600 // X > -1 -> X == 0, jump !sign.
5601 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5602 return X86::COND_NS;
5603 }
5604 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5605 // X < 0 -> X == 0, jump on sign.
5606 return X86::COND_S;
5607 }
5608 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5609 // X >= 0 -> X == 0, jump on !sign.
5610 return X86::COND_NS;
5611 }
5612 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5613 // X < 1 -> X <= 0
5614 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5615 return X86::COND_LE;
5616 }
5617 }
5618
5619 return TranslateIntegerX86CC(SetCCOpcode);
5620 }
5621
5622 // First determine if it is required or is profitable to flip the operands.
5623
5624 // If LHS is a foldable load, but RHS is not, flip the condition.
5625 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5626 !ISD::isNON_EXTLoad(RHS.getNode())) {
5627 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5628 std::swap(LHS, RHS);
5629 }
5630
5631 switch (SetCCOpcode) {
5632 default: break;
5633 case ISD::SETOLT:
5634 case ISD::SETOLE:
5635 case ISD::SETUGT:
5636 case ISD::SETUGE:
5637 std::swap(LHS, RHS);
5638 break;
5639 }
5640
5641 // On a floating point condition, the flags are set as follows:
5642 // ZF PF CF op
5643 // 0 | 0 | 0 | X > Y
5644 // 0 | 0 | 1 | X < Y
5645 // 1 | 0 | 0 | X == Y
5646 // 1 | 1 | 1 | unordered
5647 switch (SetCCOpcode) {
5648 default: llvm_unreachable("Condcode should be pre-legalized away");
5649 case ISD::SETUEQ:
5650 case ISD::SETEQ: return X86::COND_E;
5651 case ISD::SETOLT: // flipped
5652 case ISD::SETOGT:
5653 case ISD::SETGT: return X86::COND_A;
5654 case ISD::SETOLE: // flipped
5655 case ISD::SETOGE:
5656 case ISD::SETGE: return X86::COND_AE;
5657 case ISD::SETUGT: // flipped
5658 case ISD::SETULT:
5659 case ISD::SETLT: return X86::COND_B;
5660 case ISD::SETUGE: // flipped
5661 case ISD::SETULE:
5662 case ISD::SETLE: return X86::COND_BE;
5663 case ISD::SETONE:
5664 case ISD::SETNE: return X86::COND_NE;
5665 case ISD::SETUO: return X86::COND_P;
5666 case ISD::SETO: return X86::COND_NP;
5667 case ISD::SETOEQ:
5668 case ISD::SETUNE: return X86::COND_INVALID;
5669 }
5670}
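// Worked example (standalone C++, not LLVM code) of why SETOLT sits in the
// "flipped" group above: an ordered "x < y" cannot test CF directly, because
// UCOMISS also sets CF when the operands are unordered, so the operands are
// swapped and the strictly-above condition COND_A (CF = 0 and ZF = 0) of
// "compare y, x" is used instead. The scalar logic being modelled is:
static bool orderedLessThan(float X, float Y) {
  return Y > X; // equivalent ordered form; false when either operand is NaN
}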
5671
5672/// Is there a floating point cmov for the specific X86 condition code?
5673 /// Current x86 ISA includes the following FP cmov instructions:
5674 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5675static bool hasFPCMov(unsigned X86CC) {
5676 switch (X86CC) {
5677 default:
5678 return false;
5679 case X86::COND_B:
5680 case X86::COND_BE:
5681 case X86::COND_E:
5682 case X86::COND_P:
5683 case X86::COND_A:
5684 case X86::COND_AE:
5685 case X86::COND_NE:
5686 case X86::COND_NP:
5687 return true;
5688 }
5689}
5690
5691static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5692 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5693 VT.is512BitVector();
5694}
5695
5696bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5697 const CallInst &I,
5698 MachineFunction &MF,
5699 unsigned Intrinsic) const {
5700 Info.flags = MachineMemOperand::MONone;
5701 Info.offset = 0;
5702
5703 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5704 if (!IntrData) {
5705 switch (Intrinsic) {
5706 case Intrinsic::x86_aesenc128kl:
5707 case Intrinsic::x86_aesdec128kl:
5708 Info.opc = ISD::INTRINSIC_W_CHAIN;
5709 Info.ptrVal = I.getArgOperand(1);
5710 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5711 Info.align = Align(1);
5712 Info.flags |= MachineMemOperand::MOLoad;
5713 return true;
5714 case Intrinsic::x86_aesenc256kl:
5715 case Intrinsic::x86_aesdec256kl:
5716 Info.opc = ISD::INTRINSIC_W_CHAIN;
5717 Info.ptrVal = I.getArgOperand(1);
5718 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5719 Info.align = Align(1);
5720 Info.flags |= MachineMemOperand::MOLoad;
5721 return true;
5722 case Intrinsic::x86_aesencwide128kl:
5723 case Intrinsic::x86_aesdecwide128kl:
5724 Info.opc = ISD::INTRINSIC_W_CHAIN;
5725 Info.ptrVal = I.getArgOperand(0);
5726 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5727 Info.align = Align(1);
5728 Info.flags |= MachineMemOperand::MOLoad;
5729 return true;
5730 case Intrinsic::x86_aesencwide256kl:
5731 case Intrinsic::x86_aesdecwide256kl:
5732 Info.opc = ISD::INTRINSIC_W_CHAIN;
5733 Info.ptrVal = I.getArgOperand(0);
5734 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5735 Info.align = Align(1);
5736 Info.flags |= MachineMemOperand::MOLoad;
5737 return true;
5738 case Intrinsic::x86_cmpccxadd32:
5739 case Intrinsic::x86_cmpccxadd64:
5740 case Intrinsic::x86_atomic_bts:
5741 case Intrinsic::x86_atomic_btc:
5742 case Intrinsic::x86_atomic_btr: {
5743 Info.opc = ISD::INTRINSIC_W_CHAIN;
5744 Info.ptrVal = I.getArgOperand(0);
5745 unsigned Size = I.getType()->getScalarSizeInBits();
5746 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5747 Info.align = Align(Size);
5748 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5749 MachineMemOperand::MOVolatile;
5750 return true;
5751 }
5752 case Intrinsic::x86_atomic_bts_rm:
5753 case Intrinsic::x86_atomic_btc_rm:
5754 case Intrinsic::x86_atomic_btr_rm: {
5755 Info.opc = ISD::INTRINSIC_W_CHAIN;
5756 Info.ptrVal = I.getArgOperand(0);
5757 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5758 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5759 Info.align = Align(Size);
5760 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5761 MachineMemOperand::MOVolatile;
5762 return true;
5763 }
5764 case Intrinsic::x86_aadd32:
5765 case Intrinsic::x86_aadd64:
5766 case Intrinsic::x86_aand32:
5767 case Intrinsic::x86_aand64:
5768 case Intrinsic::x86_aor32:
5769 case Intrinsic::x86_aor64:
5770 case Intrinsic::x86_axor32:
5771 case Intrinsic::x86_axor64:
5772 case Intrinsic::x86_atomic_add_cc:
5773 case Intrinsic::x86_atomic_sub_cc:
5774 case Intrinsic::x86_atomic_or_cc:
5775 case Intrinsic::x86_atomic_and_cc:
5776 case Intrinsic::x86_atomic_xor_cc: {
5777 Info.opc = ISD::INTRINSIC_W_CHAIN;
5778 Info.ptrVal = I.getArgOperand(0);
5779 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5780 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5781 Info.align = Align(Size);
5782 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5783 MachineMemOperand::MOVolatile;
5784 return true;
5785 }
5786 }
5787 return false;
5788 }
5789
5790 switch (IntrData->Type) {
5791 case TRUNCATE_TO_MEM_VI8:
5792 case TRUNCATE_TO_MEM_VI16:
5793 case TRUNCATE_TO_MEM_VI32: {
5794 Info.opc = ISD::INTRINSIC_VOID;
5795 Info.ptrVal = I.getArgOperand(0);
5796 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5797 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5798 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5799 ScalarVT = MVT::i8;
5800 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5801 ScalarVT = MVT::i16;
5802 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5803 ScalarVT = MVT::i32;
5804
5805 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5806 Info.align = Align(1);
5807 Info.flags |= MachineMemOperand::MOStore;
5808 break;
5809 }
5810 case GATHER:
5811 case GATHER_AVX2: {
5812 Info.opc = ISD::INTRINSIC_W_CHAIN;
5813 Info.ptrVal = nullptr;
5814 MVT DataVT = MVT::getVT(I.getType());
5815 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5816 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5817 IndexVT.getVectorNumElements());
5818 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5819 Info.align = Align(1);
5820 Info.flags |= MachineMemOperand::MOLoad;
5821 break;
5822 }
5823 case SCATTER: {
5824 Info.opc = ISD::INTRINSIC_VOID;
5825 Info.ptrVal = nullptr;
5826 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5827 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5828 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5829 IndexVT.getVectorNumElements());
5830 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5831 Info.align = Align(1);
5832 Info.flags |= MachineMemOperand::MOStore;
5833 break;
5834 }
5835 default:
5836 return false;
5837 }
5838
5839 return true;
5840}
5841
5842/// Returns true if the target can instruction select the
5843/// specified FP immediate natively. If false, the legalizer will
5844/// materialize the FP immediate as a load from a constant pool.
5845bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5846 bool ForCodeSize) const {
5847 for (const APFloat &FPImm : LegalFPImmediates)
5848 if (Imm.bitwiseIsEqual(FPImm))
5849 return true;
5850 return false;
5851}
5852
5853bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5854 ISD::LoadExtType ExtTy,
5855 EVT NewVT) const {
5856 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5857
5858 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5859 // relocation must target a movq or addq instruction: don't let the load shrink.
5860 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5861 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5862 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5863 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5864
5865 // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
5866 // those uses are extracted directly into a store, then the extract + store
5867 // can be store-folded. Therefore, it's probably not worth splitting the load.
5868 EVT VT = Load->getValueType(0);
5869 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5870 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5871 // Skip uses of the chain value. Result 0 of the node is the load value.
5872 if (UI.getUse().getResNo() != 0)
5873 continue;
5874
5875 // If this use is not an extract + store, it's probably worth splitting.
5876 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5877 UI->use_begin()->getOpcode() != ISD::STORE)
5878 return true;
5879 }
5880 // All non-chain uses are extract + store.
5881 return false;
5882 }
5883
5884 return true;
5885}
5886
5887/// Returns true if it is beneficial to convert a load of a constant
5888/// to just the constant itself.
5889bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5890 Type *Ty) const {
5891 assert(Ty->isIntegerTy());
5892
5893 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5894 if (BitSize == 0 || BitSize > 64)
5895 return false;
5896 return true;
5897}
5898
5899bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5900 // If we are using XMM registers in the ABI and the condition of the select is
5901 // a floating-point compare and we have blendv or conditional move, then it is
5902 // cheaper to select instead of doing a cross-register move and creating a
5903 // load that depends on the compare result.
5904 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5905 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5906}
5907
5908bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5909 // TODO: It might be a win to ease or lift this restriction, but the generic
5910 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5911 if (VT.isVector() && Subtarget.hasAVX512())
5912 return false;
5913
5914 return true;
5915}
5916
5917bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5918 SDValue C) const {
5919 // TODO: We handle scalars using custom code, but generic combining could make
5920 // that unnecessary.
5921 APInt MulC;
5922 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5923 return false;
5924
5925 // Find the type this will be legalized to. Otherwise we might prematurely
5926 // convert this to shl+add/sub and then still have to type legalize those ops.
5927 // Another choice would be to defer the decision for illegal types until
5928 // after type legalization. But constant splat vectors of i64 can't make it
5929 // through type legalization on 32-bit targets so we would need to special
5930 // case vXi64.
5931 while (getTypeAction(Context, VT) != TypeLegal)
5932 VT = getTypeToTransformTo(Context, VT);
5933
5934 // If vector multiply is legal, assume that's faster than shl + add/sub.
5935 // Multiply is a complex op with higher latency and lower throughput in
5936 // most implementations; sub-vXi32 vector multiplies are always fast,
5937 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
5938 // is always going to be slow.
5939 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5940 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5941 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5942 return false;
5943
5944 // shl+add, shl+sub, shl+add+neg
5945 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5946 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5947}
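// Illustrative decompositions (not LLVM code) of the constant multiplies the
// check above keeps around: a multiply by C is worth turning into shift plus
// add/sub when C - 1, C + 1, 1 - C or -(C + 1) is a power of two.
#include <cstdint>

static uint64_t mulBy5(uint64_t X) { return (X << 2) + X; }            // C - 1 == 4
static uint64_t mulBy7(uint64_t X) { return (X << 3) - X; }            // C + 1 == 8
static uint64_t mulByMinus3(uint64_t X) { return 0 - ((X << 2) - X); } // 1 - C == 4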
5948
5949bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5950 unsigned Index) const {
5951 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5952 return false;
5953
5954 // Mask vectors support all subregister combinations and operations that
5955 // extract half of a vector.
5956 if (ResVT.getVectorElementType() == MVT::i1)
5957 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5958 (Index == ResVT.getVectorNumElements()));
5959
5960 return (Index % ResVT.getVectorNumElements()) == 0;
5961}
5962
5963bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5964 unsigned Opc = VecOp.getOpcode();
5965
5966 // Assume target opcodes can't be scalarized.
5967 // TODO - do we have any exceptions?
5968 if (Opc >= ISD::BUILTIN_OP_END)
5969 return false;
5970
5971 // If the vector op is not supported, try to convert to scalar.
5972 EVT VecVT = VecOp.getValueType();
5973 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5974 return true;
5975
5976 // If the vector op is supported, but the scalar op is not, the transform may
5977 // not be worthwhile.
5978 EVT ScalarVT = VecVT.getScalarType();
5979 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5980}
5981
5982bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5983 bool) const {
5984 // TODO: Allow vectors?
5985 if (VT.isVector())
5986 return false;
5987 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5988}
5989
5990bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
5991 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
5992 return Subtarget.hasBMI() ||
5993 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
5994}
5995
5996bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
5997 // Speculate ctlz only if we can directly use LZCNT.
5998 return Subtarget.hasLZCNT();
5999}
6000
6001bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6002 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6003 // expensive than a straight movsd. On the other hand, it's important to
6004 // shrink long double fp constants since fldt is very slow.
6005 return !Subtarget.hasSSE2() || VT == MVT::f80;
6006}
6007
6008bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6009 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6010 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6011}
6012
6013bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6014 const SelectionDAG &DAG,
6015 const MachineMemOperand &MMO) const {
6016 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6017 BitcastVT.getVectorElementType() == MVT::i1)
6018 return false;
6019
6020 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6021 return false;
6022
6023 // If both types are legal vectors, it's always ok to convert them.
6024 if (LoadVT.isVector() && BitcastVT.isVector() &&
6025 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6026 return true;
6027
6028 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6029}
6030
6031bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6032 const MachineFunction &MF) const {
6033 // Do not merge to float value size (128 bytes) if no implicit
6034 // float attribute is set.
6035 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6036
6037 if (NoFloat) {
6038 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6039 return (MemVT.getSizeInBits() <= MaxIntSize);
6040 }
6041 // Make sure we don't merge greater than our preferred vector
6042 // width.
6043 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6044 return false;
6045
6046 return true;
6047}
6048
6049bool X86TargetLowering::isCtlzFast() const {
6050 return Subtarget.hasFastLZCNT();
6051}
6052
6053bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6054 const Instruction &AndI) const {
6055 return true;
6056}
6057
6058bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6059 EVT VT = Y.getValueType();
6060
6061 if (VT.isVector())
6062 return false;
6063
6064 if (!Subtarget.hasBMI())
6065 return false;
6066
6067 // There are only 32-bit and 64-bit forms for 'andn'.
6068 if (VT != MVT::i32 && VT != MVT::i64)
6069 return false;
6070
6071 return !isa<ConstantSDNode>(Y);
6072}
6073
6074bool X86TargetLowering::hasAndNot(SDValue Y) const {
6075 EVT VT = Y.getValueType();
6076
6077 if (!VT.isVector())
6078 return hasAndNotCompare(Y);
6079
6080 // Vector.
6081
6082 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6083 return false;
6084
6085 if (VT == MVT::v4i32)
6086 return true;
6087
6088 return Subtarget.hasSSE2();
6089}
6090
6091bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6092 return X.getValueType().isScalarInteger(); // 'bt'
6093}
6094
6095bool X86TargetLowering::
6096 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6097 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6098 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6099 SelectionDAG &DAG) const {
6100 // Does baseline recommend not to perform the fold by default?
6101 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6102 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6103 return false;
6104 // For scalars this transform is always beneficial.
6105 if (X.getValueType().isScalarInteger())
6106 return true;
6107 // If all the shift amounts are identical, then transform is beneficial even
6108 // with rudimentary SSE2 shifts.
6109 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6110 return true;
6111 // If we have AVX2 with its powerful shift operations, then it's also good.
6112 if (Subtarget.hasAVX2())
6113 return true;
6114 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6115 return NewShiftOpcode == ISD::SHL;
6116}
6117
6118bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
6119 return N->getOpcode() != ISD::FP_EXTEND;
6120}
6121
6122bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6123 const SDNode *N, CombineLevel Level) const {
6124 assert(((N->getOpcode() == ISD::SHL &&
6125 N->getOperand(0).getOpcode() == ISD::SRL) ||
6126 (N->getOpcode() == ISD::SRL &&
6127 N->getOperand(0).getOpcode() == ISD::SHL)) &&
6128 "Expected shift-shift mask");
6129 // TODO: Should we always create i64 masks? Or only folded immediates?
6130 EVT VT = N->getValueType(0);
6131 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6132 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6133 // Only fold if the shift values are equal - so it folds to AND.
6134 // TODO - we should fold if either is a non-uniform vector but we don't do
6135 // the fold for non-splats yet.
6136 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6137 }
6138 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6139}
6140
6141bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6142 EVT VT = Y.getValueType();
6143
6144 // For vectors, we don't have a preference, but we probably want a mask.
6145 if (VT.isVector())
6146 return false;
6147
6148 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6149 if (VT == MVT::i64 && !Subtarget.is64Bit())
6150 return false;
6151
6152 return true;
6153}
6154
6155TargetLowering::ShiftLegalizationStrategy
6156X86TargetLowering::preferredShiftLegalizationStrategy(
6157 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6158 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6159 !Subtarget.isOSWindows())
6160 return ShiftLegalizationStrategy::LowerToLibcall;
6161 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6162 ExpansionFactor);
6163}
6164
6165bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6166 // Any legal vector type can be splatted more efficiently than
6167 // loading/spilling from memory.
6168 return isTypeLegal(VT);
6169}
6170
6171MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6172 MVT VT = MVT::getIntegerVT(NumBits);
6173 if (isTypeLegal(VT))
6174 return VT;
6175
6176 // PMOVMSKB can handle this.
6177 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6178 return MVT::v16i8;
6179
6180 // VPMOVMSKB can handle this.
6181 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6182 return MVT::v32i8;
6183
6184 // TODO: Allow 64-bit type for 32-bit target.
6185 // TODO: 512-bit types should be allowed, but make sure that those
6186 // cases are handled in combineVectorSizedSetCCEquality().
6187
6188 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6189}
6190
6191/// Val is the undef sentinel value or equal to the specified value.
6192static bool isUndefOrEqual(int Val, int CmpVal) {
6193 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6194}
6195
6196/// Return true if every element in Mask is the undef sentinel value or equal to
6197/// the specified value.
6198static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6199 return llvm::all_of(Mask, [CmpVal](int M) {
6200 return (M == SM_SentinelUndef) || (M == CmpVal);
6201 });
6202}
6203
6204/// Val is either the undef or zero sentinel value.
6205static bool isUndefOrZero(int Val) {
6206 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6207}
6208
6209/// Return true if every element in Mask, beginning from position Pos and ending
6210/// in Pos+Size is the undef sentinel value.
6211static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6212 return llvm::all_of(Mask.slice(Pos, Size),
6213 [](int M) { return M == SM_SentinelUndef; });
6214}
6215
6216/// Return true if the mask creates a vector whose lower half is undefined.
6217static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6218 unsigned NumElts = Mask.size();
6219 return isUndefInRange(Mask, 0, NumElts / 2);
6220}
6221
6222/// Return true if the mask creates a vector whose upper half is undefined.
6223static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6224 unsigned NumElts = Mask.size();
6225 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6226}
6227
6228/// Return true if Val falls within the specified range [Low, Hi).
6229static bool isInRange(int Val, int Low, int Hi) {
6230 return (Val >= Low && Val < Hi);
6231}
6232
6233/// Return true if the value of any element in Mask falls within the specified
6234/// range [Low, Hi).
6235static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6236 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6237}
6238
6239/// Return true if the value of any element in Mask is the zero sentinel value.
6240static bool isAnyZero(ArrayRef<int> Mask) {
6241 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6242}
6243
6244/// Return true if the value of any element in Mask is the zero or undef
6245/// sentinel values.
6246static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6247 return llvm::any_of(Mask, [](int M) {
6248 return M == SM_SentinelZero || M == SM_SentinelUndef;
6249 });
6250}
6251
6252/// Return true if Val is undef or if its value falls within the
6253/// specified range [Low, Hi).
6254static bool isUndefOrInRange(int Val, int Low, int Hi) {
6255 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6256}
6257
6258/// Return true if every element in Mask is undef or if its value
6259/// falls within the specified range [Low, Hi).
6260static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6261 return llvm::all_of(
6262 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6263}
6264
6265/// Return true if Val is undef, zero or if its value falls within the
6266/// specified range [Low, Hi).
6267static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6268 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6269}
6270
6271/// Return true if every element in Mask is undef, zero or if its value
6272/// falls within the specified range [Low, Hi).
6273static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6274 return llvm::all_of(
6275 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6276}
6277
6278/// Return true if every element in Mask, beginning
6279/// from position Pos and ending in Pos + Size, falls within the specified
6280/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6281static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6282 unsigned Size, int Low, int Step = 1) {
6283 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6284 if (!isUndefOrEqual(Mask[i], Low))
6285 return false;
6286 return true;
6287}
6288
6289/// Return true if every element in Mask, beginning
6290/// from position Pos and ending in Pos+Size, falls within the specified
6291/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
6292static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6293 unsigned Size, int Low,
6294 int Step = 1) {
6295 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6296 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6297 return false;
6298 return true;
6299}
6300
6301/// Return true if every element in Mask, beginning
6302/// from position Pos and ending in Pos+Size is undef or is zero.
6303static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6304 unsigned Size) {
6305 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6306}
6307
6308/// Helper function to test whether a shuffle mask could be
6309/// simplified by widening the elements being shuffled.
6310///
6311/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6312/// leaves it in an unspecified state.
6313///
6314/// NOTE: This must handle normal vector shuffle masks and *target* vector
6315/// shuffle masks. The latter have the special property of a '-2' representing
6316/// a zeroed lane of a vector.
6317static bool canWidenShuffleElements(ArrayRef<int> Mask,
6318 SmallVectorImpl<int> &WidenedMask) {
6319 WidenedMask.assign(Mask.size() / 2, 0);
6320 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6321 int M0 = Mask[i];
6322 int M1 = Mask[i + 1];
6323
6324    // If both elements are undef, it's trivial.
6325 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6326 WidenedMask[i / 2] = SM_SentinelUndef;
6327 continue;
6328 }
6329
6330 // Check for an undef mask and a mask value properly aligned to fit with
6331 // a pair of values. If we find such a case, use the non-undef mask's value.
6332 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6333 WidenedMask[i / 2] = M1 / 2;
6334 continue;
6335 }
6336 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6337 WidenedMask[i / 2] = M0 / 2;
6338 continue;
6339 }
6340
6341 // When zeroing, we need to spread the zeroing across both lanes to widen.
6342 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6343 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6344 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6345 WidenedMask[i / 2] = SM_SentinelZero;
6346 continue;
6347 }
6348 return false;
6349 }
6350
6351 // Finally check if the two mask values are adjacent and aligned with
6352 // a pair.
6353 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6354 WidenedMask[i / 2] = M0 / 2;
6355 continue;
6356 }
6357
6358 // Otherwise we can't safely widen the elements used in this shuffle.
6359 return false;
6360 }
6361  assert(WidenedMask.size() == Mask.size() / 2 &&
6362         "Incorrect size of mask after widening the elements!");
6363
6364 return true;
6365}
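
// Worked illustration of the widening rules above (added for exposition; not
// part of the analyzed file), using the sentinel convention noted in the
// comment (-1 == SM_SentinelUndef, -2 == SM_SentinelZero):
//   { 0, 1, 6, 7, -1, -1, 2, 3 }    widens to { 0, 3, -1, 1 }
//   { -1, 5, 4, -1, -2, -2, 0, 1 }  widens to { 2, 2, -2, 0 }
//   A pair such as { 1, 2 } is not aligned to an even/odd element pair, so
//   canWidenShuffleElements returns false for masks containing it.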
6366
6367static bool canWidenShuffleElements(ArrayRef<int> Mask,
6368 const APInt &Zeroable,
6369 bool V2IsZero,
6370 SmallVectorImpl<int> &WidenedMask) {
6371 // Create an alternative mask with info about zeroable elements.
6372 // Here we do not set undef elements as zeroable.
6373 SmallVector<int, 64> ZeroableMask(Mask);
6374 if (V2IsZero) {
6375    assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6376 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6377 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6378 ZeroableMask[i] = SM_SentinelZero;
6379 }
6380 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6381}
6382
6383static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6384 SmallVector<int, 32> WidenedMask;
6385 return canWidenShuffleElements(Mask, WidenedMask);
6386}
6387
6388// Attempt to narrow/widen shuffle mask until it matches the target number of
6389// elements.
6390static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6391 SmallVectorImpl<int> &ScaledMask) {
6392 unsigned NumSrcElts = Mask.size();
6393  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6394         "Illegal shuffle scale factor");
6395
6396 // Narrowing is guaranteed to work.
6397 if (NumDstElts >= NumSrcElts) {
6398 int Scale = NumDstElts / NumSrcElts;
6399 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6400 return true;
6401 }
6402
6403 // We have to repeat the widening until we reach the target size, but we can
6404 // split out the first widening as it sets up ScaledMask for us.
6405 if (canWidenShuffleElements(Mask, ScaledMask)) {
6406 while (ScaledMask.size() > NumDstElts) {
6407 SmallVector<int, 16> WidenedMask;
6408 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6409 return false;
6410 ScaledMask = std::move(WidenedMask);
6411 }
6412 return true;
6413 }
6414
6415 return false;
6416}
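
// Worked illustration of scaleShuffleElements (added for exposition; not part
// of the analyzed file). For the 4-element mask { 0, 2, 1, 3 }:
//   NumDstElts == 8 narrows each lane by Scale == 2, giving
//     { 0, 1, 4, 5, 2, 3, 6, 7 }.
//   NumDstElts == 2 requires pairwise widening, but (0, 2) is not an adjacent
//     aligned pair, so the call returns false; a mask of { 0, 1, 2, 3 } would
//     widen to { 0, 1 }.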
6417
6418/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6419bool X86::isZeroNode(SDValue Elt) {
6420 return isNullConstant(Elt) || isNullFPConstant(Elt);
6421}
6422
6423// Build a vector of constants.
6424// Use an UNDEF node if MaskElt == -1.
6425// Split 64-bit constants in 32-bit mode.
6426static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6427 const SDLoc &dl, bool IsMask = false) {
6428
6429 SmallVector<SDValue, 32> Ops;
6430 bool Split = false;
6431
6432 MVT ConstVecVT = VT;
6433 unsigned NumElts = VT.getVectorNumElements();
6434 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6435 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6436 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6437 Split = true;
6438 }
6439
6440 MVT EltVT = ConstVecVT.getVectorElementType();
6441 for (unsigned i = 0; i < NumElts; ++i) {
6442 bool IsUndef = Values[i] < 0 && IsMask;
6443 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6444 DAG.getConstant(Values[i], dl, EltVT);
6445 Ops.push_back(OpNode);
6446 if (Split)
6447 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6448 DAG.getConstant(0, dl, EltVT));
6449 }
6450 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6451 if (Split)
6452 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6453 return ConstsNode;
6454}
6455
6456static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6457 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6458  assert(Bits.size() == Undefs.getBitWidth() &&
6459         "Unequal constant and undef arrays");
6460 SmallVector<SDValue, 32> Ops;
6461 bool Split = false;
6462
6463 MVT ConstVecVT = VT;
6464 unsigned NumElts = VT.getVectorNumElements();
6465 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6466 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6467 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6468 Split = true;
6469 }
6470
6471 MVT EltVT = ConstVecVT.getVectorElementType();
6472 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6473 if (Undefs[i]) {
6474 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6475 continue;
6476 }
6477 const APInt &V = Bits[i];
6478    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6479 if (Split) {
6480 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6481 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6482 } else if (EltVT == MVT::f32) {
6483 APFloat FV(APFloat::IEEEsingle(), V);
6484 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6485 } else if (EltVT == MVT::f64) {
6486 APFloat FV(APFloat::IEEEdouble(), V);
6487 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6488 } else {
6489 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6490 }
6491 }
6492
6493 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6494 return DAG.getBitcast(VT, ConstsNode);
6495}
6496
6497/// Returns a vector of specified type with all zero elements.
6498static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6499 SelectionDAG &DAG, const SDLoc &dl) {
6500  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6501          VT.getVectorElementType() == MVT::i1) &&
6502         "Unexpected vector type");
6503
6504 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6505 // type. This ensures they get CSE'd. But if the integer type is not
6506 // available, use a floating-point +0.0 instead.
6507 SDValue Vec;
6508 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6509 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6510 } else if (VT.isFloatingPoint()) {
6511 Vec = DAG.getConstantFP(+0.0, dl, VT);
6512 } else if (VT.getVectorElementType() == MVT::i1) {
6513    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6514           "Unexpected vector type");
6515 Vec = DAG.getConstant(0, dl, VT);
6516 } else {
6517 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6518 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6519 }
6520 return DAG.getBitcast(VT, Vec);
6521}
6522
6523// Helper to determine whether the ops are all extracted subvectors that come from
6524// a single source. If commuting is allowed, they don't have to be in order (Lo/Hi).
6525static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6526 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6527 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6528 LHS.getValueType() != RHS.getValueType() ||
6529 LHS.getOperand(0) != RHS.getOperand(0))
6530 return SDValue();
6531
6532 SDValue Src = LHS.getOperand(0);
6533 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6534 return SDValue();
6535
6536 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6537 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6538 RHS.getConstantOperandAPInt(1) == NumElts) ||
6539 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6540 LHS.getConstantOperandAPInt(1) == NumElts))
6541 return Src;
6542
6543 return SDValue();
6544}
6545
6546static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6547 const SDLoc &dl, unsigned vectorWidth) {
6548 EVT VT = Vec.getValueType();
6549 EVT ElVT = VT.getVectorElementType();
6550 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6551 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6552 VT.getVectorNumElements() / Factor);
6553
6554 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6555 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6556  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6557
6558  // This is the index of the first element of the vectorWidth-bit chunk we
6559  // want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6560 IdxVal &= ~(ElemsPerChunk - 1);
6561
6562 // If the input is a buildvector just emit a smaller one.
6563 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6564 return DAG.getBuildVector(ResultVT, dl,
6565 Vec->ops().slice(IdxVal, ElemsPerChunk));
6566
6567 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6568 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6569}
6570
6571/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6572/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6573/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6574/// instructions or a simple subregister reference. Idx is an index in the
6575/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6576/// lowering EXTRACT_VECTOR_ELT operations easier.
6577static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6578 SelectionDAG &DAG, const SDLoc &dl) {
6579  assert((Vec.getValueType().is256BitVector() ||
6580          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6581 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6582}
6583
6584/// Generate a DAG to grab 256-bits from a 512-bit vector.
6585static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6586 SelectionDAG &DAG, const SDLoc &dl) {
6587  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6588 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6589}
6590
6591static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6592 SelectionDAG &DAG, const SDLoc &dl,
6593 unsigned vectorWidth) {
6594  assert((vectorWidth == 128 || vectorWidth == 256) &&
6595         "Unsupported vector width");
6596  // Inserting an UNDEF subvector leaves Result unchanged.
6597 if (Vec.isUndef())
6598 return Result;
6599 EVT VT = Vec.getValueType();
6600 EVT ElVT = VT.getVectorElementType();
6601 EVT ResultVT = Result.getValueType();
6602
6603 // Insert the relevant vectorWidth bits.
6604 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6605  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6606
6607  // This is the index of the first element of the vectorWidth-bit chunk we
6608  // want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
6609 IdxVal &= ~(ElemsPerChunk - 1);
6610
6611 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6612 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6613}
6614
6615/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6616/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6617/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6618/// simple superregister reference. Idx is an index in the 128 bits
6619/// we want. It need not be aligned to a 128-bit boundary. That makes
6620/// lowering INSERT_VECTOR_ELT operations easier.
6621static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6622 SelectionDAG &DAG, const SDLoc &dl) {
6623  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6624 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6625}
6626
6627/// Widen a vector to a larger size with the same scalar type, with the new
6628/// elements either zero or undef.
6629static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6630 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6631 const SDLoc &dl) {
6632  assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6633         Vec.getValueType().getScalarType() == VT.getScalarType() &&
6634         "Unsupported vector widening type");
6635 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6636 : DAG.getUNDEF(VT);
6637 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6638 DAG.getIntPtrConstant(0, dl));
6639}
6640
6641/// Widen a vector to a larger size with the same scalar type, with the new
6642/// elements either zero or undef.
6643static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6644 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6645 const SDLoc &dl, unsigned WideSizeInBits) {
6646  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6647         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6648         "Unsupported vector widening type");
6649 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6650 MVT SVT = Vec.getSimpleValueType().getScalarType();
6651 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6652 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6653}
6654
6655// Helper function to collect subvector ops that are concatenated together,
6656// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6657// The subvectors in Ops are guaranteed to be the same type.
6658static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6659 SelectionDAG &DAG) {
6660  assert(Ops.empty() && "Expected an empty ops vector");
6661
6662 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6663 Ops.append(N->op_begin(), N->op_end());
6664 return true;
6665 }
6666
6667 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6668 SDValue Src = N->getOperand(0);
6669 SDValue Sub = N->getOperand(1);
6670 const APInt &Idx = N->getConstantOperandAPInt(2);
6671 EVT VT = Src.getValueType();
6672 EVT SubVT = Sub.getValueType();
6673
6674 // TODO - Handle more general insert_subvector chains.
6675 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6676 // insert_subvector(undef, x, lo)
6677 if (Idx == 0 && Src.isUndef()) {
6678 Ops.push_back(Sub);
6679 Ops.push_back(DAG.getUNDEF(SubVT));
6680 return true;
6681 }
6682 if (Idx == (VT.getVectorNumElements() / 2)) {
6683 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6684 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6685 Src.getOperand(1).getValueType() == SubVT &&
6686 isNullConstant(Src.getOperand(2))) {
6687 Ops.push_back(Src.getOperand(1));
6688 Ops.push_back(Sub);
6689 return true;
6690 }
6691 // insert_subvector(x, extract_subvector(x, lo), hi)
6692 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6693 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6694 Ops.append(2, Sub);
6695 return true;
6696 }
6697 // insert_subvector(undef, x, hi)
6698 if (Src.isUndef()) {
6699 Ops.push_back(DAG.getUNDEF(SubVT));
6700 Ops.push_back(Sub);
6701 return true;
6702 }
6703 }
6704 }
6705 }
6706
6707 return false;
6708}
6709
6710static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6711 const SDLoc &dl) {
6712 EVT VT = Op.getValueType();
6713 unsigned NumElems = VT.getVectorNumElements();
6714 unsigned SizeInBits = VT.getSizeInBits();
6715  assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6716         "Can't split odd sized vector");
6717
6718  // If this is a splat value (with no undefs), then use the lower subvector,
6719 // which should be a free extraction.
6720 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6721 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6722 return std::make_pair(Lo, Lo);
6723
6724 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6725 return std::make_pair(Lo, Hi);
6726}
6727
6728/// Break an operation into 2 half sized ops and then concatenate the results.
6729static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6730 unsigned NumOps = Op.getNumOperands();
6731 EVT VT = Op.getValueType();
6732 SDLoc dl(Op);
6733
6734  // Split each operand into Lo/Hi halves (scalar operands are passed through unsplit).
6735 SmallVector<SDValue> LoOps(NumOps, SDValue());
6736 SmallVector<SDValue> HiOps(NumOps, SDValue());
6737 for (unsigned I = 0; I != NumOps; ++I) {
6738 SDValue SrcOp = Op.getOperand(I);
6739 if (!SrcOp.getValueType().isVector()) {
6740 LoOps[I] = HiOps[I] = SrcOp;
6741 continue;
6742 }
6743 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6744 }
6745
6746 EVT LoVT, HiVT;
6747 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6748 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6749 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6750 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6751}
6752
6753/// Break a unary integer operation into 2 half sized ops and then
6754/// concatenate the result back.
6755static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6756 // Make sure we only try to split 256/512-bit types to avoid creating
6757 // narrow vectors.
6758 EVT VT = Op.getValueType();
6759 (void)VT;
6760  assert((Op.getOperand(0).getValueType().is256BitVector() ||
6761          Op.getOperand(0).getValueType().is512BitVector()) &&
6762         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6763  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6764             VT.getVectorNumElements() &&
6765         "Unexpected VTs!");
6766 return splitVectorOp(Op, DAG);
6767}
6768
6769/// Break a binary integer operation into 2 half sized ops and then
6770/// concatenate the result back.
6771static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6772 // Assert that all the types match.
6773 EVT VT = Op.getValueType();
6774 (void)VT;
6775  assert(Op.getOperand(0).getValueType() == VT &&
6776         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6777  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6778 return splitVectorOp(Op, DAG);
6779}
6780
6781// Helper for splitting the operands of an operation to the legal target size
6782// and applying a function to each part.
6783// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6784// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6785// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6786// The argument Builder is a function that will be applied on each split part:
6787// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6788template <typename F>
6789SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6790 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6791 F Builder, bool CheckBWI = true) {
6792  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6793 unsigned NumSubs = 1;
6794 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6795 (!CheckBWI && Subtarget.useAVX512Regs())) {
6796 if (VT.getSizeInBits() > 512) {
6797 NumSubs = VT.getSizeInBits() / 512;
6798      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6799 }
6800 } else if (Subtarget.hasAVX2()) {
6801 if (VT.getSizeInBits() > 256) {
6802 NumSubs = VT.getSizeInBits() / 256;
6803      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6804 }
6805 } else {
6806 if (VT.getSizeInBits() > 128) {
6807 NumSubs = VT.getSizeInBits() / 128;
6808      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6809 }
6810 }
6811
6812 if (NumSubs == 1)
6813 return Builder(DAG, DL, Ops);
6814
6815 SmallVector<SDValue, 4> Subs;
6816 for (unsigned i = 0; i != NumSubs; ++i) {
6817 SmallVector<SDValue, 2> SubOps;
6818 for (SDValue Op : Ops) {
6819 EVT OpVT = Op.getValueType();
6820 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6821 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6822 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6823 }
6824 Subs.push_back(Builder(DAG, DL, SubOps));
6825 }
6826 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6827}
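
// Hypothetical usage sketch of SplitOpsAndApply (added for exposition; the
// builder lambda and the operand names are illustrative, not taken from this
// file):
//   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                            ArrayRef<SDValue> Ops) {
//     MVT OpVT = Ops[0].getSimpleValueType();
//     MVT ResVT = MVT::getVectorVT(MVT::i32, OpVT.getVectorNumElements() / 2);
//     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, ResultVT, {LHS, RHS},
//                                  PMADDWDBuilder);
// Each builder call sees operands no wider than the widest legal register size
// for the subtarget, and the partial results are concatenated back together.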
6828
6829// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6830// targets.
6831static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6832 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6833 const X86Subtarget &Subtarget) {
6834  assert(Subtarget.hasAVX512() && "AVX512 target expected");
6835 MVT SVT = VT.getScalarType();
6836
6837 // If we have a 32/64 splatted constant, splat it to DstTy to
6838 // encourage a foldable broadcast'd operand.
6839 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6840 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6841 // AVX512 broadcasts 32/64-bit operands.
6842 // TODO: Support float once getAVX512Node is used by fp-ops.
6843 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6844 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6845 return SDValue();
6846 // If we're not widening, don't bother if we're not bitcasting.
6847 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6848 return SDValue();
6849 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6850 APInt SplatValue, SplatUndef;
6851 unsigned SplatBitSize;
6852 bool HasAnyUndefs;
6853 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6854 HasAnyUndefs, OpEltSizeInBits) &&
6855 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6856 return DAG.getConstant(SplatValue, DL, DstVT);
6857 }
6858 return SDValue();
6859 };
6860
6861 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6862
6863 MVT DstVT = VT;
6864 if (Widen)
6865 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6866
6867 // Canonicalize src operands.
6868 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6869 for (SDValue &Op : SrcOps) {
6870 MVT OpVT = Op.getSimpleValueType();
6871 // Just pass through scalar operands.
6872 if (!OpVT.isVector())
6873 continue;
6874    assert(OpVT == VT && "Vector type mismatch");
6875
6876 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6877 Op = BroadcastOp;
6878 continue;
6879 }
6880
6881 // Just widen the subvector by inserting into an undef wide vector.
6882 if (Widen)
6883 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6884 }
6885
6886 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6887
6888 // Perform the 512-bit op then extract the bottom subvector.
6889 if (Widen)
6890 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6891 return Res;
6892}
6893
6894/// Insert i1-subvector to i1-vector.
6895static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6896 const X86Subtarget &Subtarget) {
6897
6898 SDLoc dl(Op);
6899 SDValue Vec = Op.getOperand(0);
6900 SDValue SubVec = Op.getOperand(1);
6901 SDValue Idx = Op.getOperand(2);
6902 unsigned IdxVal = Op.getConstantOperandVal(2);
6903
6904 // Inserting undef is a nop. We can just return the original vector.
6905 if (SubVec.isUndef())
6906 return Vec;
6907
6908 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6909 return Op;
6910
6911 MVT OpVT = Op.getSimpleValueType();
6912 unsigned NumElems = OpVT.getVectorNumElements();
6913 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6914
6915 // Extend to natively supported kshift.
6916 MVT WideOpVT = OpVT;
6917 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6918 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6919
6920 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6921 // if necessary.
6922 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6923 // May need to promote to a legal type.
6924 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6925 DAG.getConstant(0, dl, WideOpVT),
6926 SubVec, Idx);
6927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6928 }
6929
6930 MVT SubVecVT = SubVec.getSimpleValueType();
6931 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6932  assert(IdxVal + SubVecNumElems <= NumElems &&
6933         IdxVal % SubVecVT.getSizeInBits() == 0 &&
6934         "Unexpected index value in INSERT_SUBVECTOR");
6935
6936 SDValue Undef = DAG.getUNDEF(WideOpVT);
6937
6938 if (IdxVal == 0) {
6939 // Zero lower bits of the Vec
6940 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6941 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6942 ZeroIdx);
6943 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6944 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6945    // Merge them together; SubVec should be zero extended.
6946 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6947 DAG.getConstant(0, dl, WideOpVT),
6948 SubVec, ZeroIdx);
6949 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6951 }
6952
6953 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6954 Undef, SubVec, ZeroIdx);
6955
6956 if (Vec.isUndef()) {
6957    assert(IdxVal != 0 && "Unexpected index");
6958 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6959 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6960 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6961 }
6962
6963 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6964    assert(IdxVal != 0 && "Unexpected index");
6965 // If upper elements of Vec are known undef, then just shift into place.
6966 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6967 [](SDValue V) { return V.isUndef(); })) {
6968 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6969 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6970 } else {
6971 NumElems = WideOpVT.getVectorNumElements();
6972 unsigned ShiftLeft = NumElems - SubVecNumElems;
6973 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6974 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6975 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6976 if (ShiftRight != 0)
6977 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6978 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6979 }
6980 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6981 }
6982
6983  // Simple case when we put the subvector in the upper part.
6984 if (IdxVal + SubVecNumElems == NumElems) {
6985 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6986 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6987 if (SubVecNumElems * 2 == NumElems) {
6988      // Special case: use a legal zero-extending insert_subvector. This allows
6989 // isel to optimize when bits are known zero.
6990 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6991 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6992 DAG.getConstant(0, dl, WideOpVT),
6993 Vec, ZeroIdx);
6994 } else {
6995 // Otherwise use explicit shifts to zero the bits.
6996 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6997 Undef, Vec, ZeroIdx);
6998 NumElems = WideOpVT.getVectorNumElements();
6999 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7000 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7001 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7002 }
7003 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7004 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7005 }
7006
7007 // Inserting into the middle is more complicated.
7008
7009 NumElems = WideOpVT.getVectorNumElements();
7010
7011 // Widen the vector if needed.
7012 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7013
7014 unsigned ShiftLeft = NumElems - SubVecNumElems;
7015 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7016
7017  // Do an optimization for the most frequently used types.
7018 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7019 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7020 Mask0.flipAllBits();
7021 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7022 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7023 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7024 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7025 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7026 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7027 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7028 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7029
7030 // Reduce to original width if needed.
7031 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7032 }
7033
7034 // Clear the upper bits of the subvector and move it to its insert position.
7035 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7036 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7037 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7038 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7039
7040 // Isolate the bits below the insertion point.
7041 unsigned LowShift = NumElems - IdxVal;
7042 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7043 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7044 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7045 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7046
7047 // Isolate the bits after the last inserted bit.
7048 unsigned HighShift = IdxVal + SubVecNumElems;
7049 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7050 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7051 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7052 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7053
7054 // Now OR all 3 pieces together.
7055 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7056 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7057
7058 // Reduce to original width if needed.
7059 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7060}
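
// Worked illustration of the final code path above (added for exposition): for
// WideOpVT == v64i1 on a 32-bit target, inserting an 8-element subvector at
// IdxVal == 16 gives ShiftLeft == 56 and ShiftRight == 40, placing SubVec in
// bit positions [16, 24). LowShift == 48 keeps Vec bits [0, 16) and
// HighShift == 24 keeps Vec bits [24, 64); ORing the three pieces produces the
// result that is then truncated back to OpVT.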
7061
7062static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7063 const SDLoc &dl) {
7064  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7065 EVT SubVT = V1.getValueType();
7066 EVT SubSVT = SubVT.getScalarType();
7067 unsigned SubNumElts = SubVT.getVectorNumElements();
7068 unsigned SubVectorWidth = SubVT.getSizeInBits();
7069 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7070 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7071 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7072}
7073
7074/// Returns a vector of specified type with all bits set.
7075/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7076/// Then bitcast to their original type, ensuring they get CSE'd.
7077static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7078  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7079         "Expected a 128/256/512-bit vector type");
7080
7081 APInt Ones = APInt::getAllOnes(32);
7082 unsigned NumElts = VT.getSizeInBits() / 32;
7083 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7084 return DAG.getBitcast(VT, Vec);
7085}
7086
7087static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7088 SDValue In, SelectionDAG &DAG) {
7089 EVT InVT = In.getValueType();
7090 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7091 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7092         ISD::ZERO_EXTEND == Opcode) &&
7093        "Unknown extension opcode");
7094
7095 // For 256-bit vectors, we only need the lower (128-bit) input half.
7096 // For 512-bit vectors, we only need the lower input half or quarter.
7097 if (InVT.getSizeInBits() > 128) {
7098 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7099        "Expected VTs to be the same size!");
7100 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7101 In = extractSubVector(In, 0, DAG, DL,
7102 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7103 InVT = In.getValueType();
7104 }
7105
7106 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7107 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7108
7109 return DAG.getNode(Opcode, DL, VT, In);
7110}
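// Example: Opcode == ISD::ZERO_EXTEND, VT == v8i32, In == v32i8: the input
// is first narrowed to its low 128 bits (v16i8), and because the element
// counts still differ (8 vs 16) the node is emitted as a
// ZERO_EXTEND_VECTOR_INREG of the v16i8 half.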
7111
7112// Match (xor X, -1) -> X.
7113// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7114// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7115static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7116 V = peekThroughBitcasts(V);
7117 if (V.getOpcode() == ISD::XOR &&
7118 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7119 isAllOnesConstant(V.getOperand(1))))
7120 return V.getOperand(0);
7121 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7122 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7123 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7124 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7125 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7126 Not, V.getOperand(1));
7127 }
7128 }
7129 SmallVector<SDValue, 2> CatOps;
7130 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7131 for (SDValue &CatOp : CatOps) {
7132 SDValue NotCat = IsNOT(CatOp, DAG);
7133 if (!NotCat) return SDValue();
7134 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7135 }
7136 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7137 }
7138 return SDValue();
7139}
7140
7141void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7142 bool Lo, bool Unary) {
7143 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7144        "Illegal vector type to unpack");
7145 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7146 int NumElts = VT.getVectorNumElements();
7147 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7148 for (int i = 0; i < NumElts; ++i) {
7149 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7150 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7151 Pos += (Unary ? 0 : NumElts * (i % 2));
7152 Pos += (Lo ? 0 : NumEltsInLane / 2);
7153 Mask.push_back(Pos);
7154 }
7155}
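// Example: VT == v8i32 with Lo == true and Unary == false produces
// <0, 8, 1, 9, 4, 12, 5, 13>, the per-128-bit-lane interleave of both
// inputs performed by VPUNPCKLDQ.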
7156
7157/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7158/// imposed by AVX and specific to the unary pattern. Example:
7159/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7160/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7161void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7162 bool Lo) {
7163 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7164 int NumElts = VT.getVectorNumElements();
7165 for (int i = 0; i < NumElts; ++i) {
7166 int Pos = i / 2;
7167 Pos += (Lo ? 0 : NumElts / 2);
7168 Mask.push_back(Pos);
7169 }
7170}
7171
7172// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7173static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7174 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7175 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7176 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7177 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7178 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7179 int M = Mask[I];
7180 if (M < 0)
7181 continue;
7182 SDValue V = (M < NumElts) ? V1 : V2;
7183 if (V.isUndef())
7184 continue;
7185 Ops[I] = V.getOperand(M % NumElts);
7186 }
7187 return DAG.getBuildVector(VT, dl, Ops);
7188 }
7189
7190 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7191}
7192
7193/// Returns a vector_shuffle node for an unpackl operation.
7194static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7195 SDValue V1, SDValue V2) {
7196 SmallVector<int, 8> Mask;
7197 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7198 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7199}
7200
7201/// Returns a vector_shuffle node for an unpackh operation.
7202static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7203 SDValue V1, SDValue V2) {
7204 SmallVector<int, 8> Mask;
7205 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7206 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7207}
7208
7209/// Returns a node that packs the LHS + RHS nodes together at half width.
7210/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7211/// TODO: Add subvector splitting if/when we have a need for it.
7212static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7213 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7214 bool PackHiHalf = false) {
7215 MVT OpVT = LHS.getSimpleValueType();
7216 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7217 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7218 assert(OpVT == RHS.getSimpleValueType() &&
7219        VT.getSizeInBits() == OpVT.getSizeInBits() &&
7220        (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7221        "Unexpected PACK operand types");
7222 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7223        "Unexpected PACK result type");
7224
7225 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7226 if (EltSizeInBits == 32) {
7227 SmallVector<int> PackMask;
7228 int Offset = PackHiHalf ? 1 : 0;
7229 int NumElts = VT.getVectorNumElements();
7230 for (int I = 0; I != NumElts; I += 4) {
7231 PackMask.push_back(I + Offset);
7232 PackMask.push_back(I + Offset + 2);
7233 PackMask.push_back(I + Offset + NumElts);
7234 PackMask.push_back(I + Offset + NumElts + 2);
7235 }
7236 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7237 DAG.getBitcast(VT, RHS), PackMask);
7238 }
7239
7240 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7241 if (!PackHiHalf) {
7242 if (UsePackUS &&
7243 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7244 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7245 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7246
7247 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7248 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7249 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7250 }
7251
7252 // Fallback to sign/zero extending the requested half and pack.
7253 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7254 if (UsePackUS) {
7255 if (PackHiHalf) {
7256 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7257 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7258 } else {
7259 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7260 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7261 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7262 }
7263 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7264 }
7265
7266 if (!PackHiHalf) {
7267 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7268 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7269 }
7270 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7271 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7272 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7273}
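// Example: VT == v8i32 (so OpVT == v4i64) with PackHiHalf == false takes the
// vXi32 shuffle path above with mask <0, 2, 8, 10, 4, 6, 12, 14>, picking
// the low i32 of each i64 from LHS and then RHS within each 128-bit lane.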
7274
7275 /// Return a vector_shuffle of the specified vector combined with a zero or undef vector.
7276/// This produces a shuffle where the low element of V2 is swizzled into the
7277/// zero/undef vector, landing at element Idx.
7278/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7279static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7280 bool IsZero,
7281 const X86Subtarget &Subtarget,
7282 SelectionDAG &DAG) {
7283 MVT VT = V2.getSimpleValueType();
7284 SDValue V1 = IsZero
7285 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7286 int NumElems = VT.getVectorNumElements();
7287 SmallVector<int, 16> MaskVec(NumElems);
7288 for (int i = 0; i != NumElems; ++i)
7289 // If this is the insertion idx, put the low elt of V2 here.
7290 MaskVec[i] = (i == Idx) ? NumElems : i;
7291 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7292}
7293
7294static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7295 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7296 Ptr.getOpcode() == X86ISD::WrapperRIP)
7297 Ptr = Ptr.getOperand(0);
7298
7299 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7300 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7301 return nullptr;
7302
7303 return CNode->getConstVal();
7304}
7305
7306static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7307 if (!Load || !ISD::isNormalLoad(Load))
7308 return nullptr;
7309 return getTargetConstantFromBasePtr(Load->getBasePtr());
7310}
7311
7312static const Constant *getTargetConstantFromNode(SDValue Op) {
7313 Op = peekThroughBitcasts(Op);
7314 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7315}
7316
7317const Constant *
7318X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7319 assert(LD && "Unexpected null LoadSDNode");
7320 return getTargetConstantFromNode(LD);
7321}
7322
7323// Extract raw constant bits from constant pools.
7324static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7325 APInt &UndefElts,
7326 SmallVectorImpl<APInt> &EltBits,
7327 bool AllowWholeUndefs = true,
7328 bool AllowPartialUndefs = true) {
7329 assert(EltBits.empty() && "Expected an empty EltBits vector");
7330
7331 Op = peekThroughBitcasts(Op);
7332
7333 EVT VT = Op.getValueType();
7334 unsigned SizeInBits = VT.getSizeInBits();
7335 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7336 unsigned NumElts = SizeInBits / EltSizeInBits;
7337
7338 // Bitcast a source array of element bits to the target size.
7339 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7340 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7341 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7342 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7343        "Constant bit sizes don't match");
7344
7345 // Don't split if we don't allow undef bits.
7346 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7347 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7348 return false;
7349
7350 // If we're already the right size, don't bother bitcasting.
7351 if (NumSrcElts == NumElts) {
7352 UndefElts = UndefSrcElts;
7353 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7354 return true;
7355 }
7356
7357 // Extract all the undef/constant element data and pack into single bitsets.
7358 APInt UndefBits(SizeInBits, 0);
7359 APInt MaskBits(SizeInBits, 0);
7360
7361 for (unsigned i = 0; i != NumSrcElts; ++i) {
7362 unsigned BitOffset = i * SrcEltSizeInBits;
7363 if (UndefSrcElts[i])
7364 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7365 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7366 }
7367
7368 // Split the undef/constant single bitset data into the target elements.
7369 UndefElts = APInt(NumElts, 0);
7370 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7371
7372 for (unsigned i = 0; i != NumElts; ++i) {
7373 unsigned BitOffset = i * EltSizeInBits;
7374 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7375
7376 // Only treat an element as UNDEF if all bits are UNDEF.
7377 if (UndefEltBits.isAllOnes()) {
7378 if (!AllowWholeUndefs)
7379 return false;
7380 UndefElts.setBit(i);
7381 continue;
7382 }
7383
7384 // If only some bits are UNDEF then treat them as zero (or bail if not
7385 // supported).
7386 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7387 return false;
7388
7389 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7390 }
7391 return true;
7392 };
7393
7394 // Collect constant bits and insert into mask/undef bit masks.
7395 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7396 unsigned UndefBitIndex) {
7397 if (!Cst)
7398 return false;
7399 if (isa<UndefValue>(Cst)) {
7400 Undefs.setBit(UndefBitIndex);
7401 return true;
7402 }
7403 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7404 Mask = CInt->getValue();
7405 return true;
7406 }
7407 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7408 Mask = CFP->getValueAPF().bitcastToAPInt();
7409 return true;
7410 }
7411 return false;
7412 };
7413
7414 // Handle UNDEFs.
7415 if (Op.isUndef()) {
7416 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7417 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7418 return CastBitData(UndefSrcElts, SrcEltBits);
7419 }
7420
7421 // Extract scalar constant bits.
7422 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7423 APInt UndefSrcElts = APInt::getZero(1);
7424 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7425 return CastBitData(UndefSrcElts, SrcEltBits);
7426 }
7427 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7428 APInt UndefSrcElts = APInt::getZero(1);
7429 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7430 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7431 return CastBitData(UndefSrcElts, SrcEltBits);
7432 }
7433
7434 // Extract constant bits from build vector.
7435 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7436 BitVector Undefs;
7437 SmallVector<APInt> SrcEltBits;
7438 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7439 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7440 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7441 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7442 if (Undefs[I])
7443 UndefSrcElts.setBit(I);
7444 return CastBitData(UndefSrcElts, SrcEltBits);
7445 }
7446 }
7447
7448 // Extract constant bits from constant pool vector.
7449 if (auto *Cst = getTargetConstantFromNode(Op)) {
7450 Type *CstTy = Cst->getType();
7451 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7452 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7453 return false;
7454
7455 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7456 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7457
7458 APInt UndefSrcElts(NumSrcElts, 0);
7459 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7460 for (unsigned i = 0; i != NumSrcElts; ++i)
7461 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7462 UndefSrcElts, i))
7463 return false;
7464
7465 return CastBitData(UndefSrcElts, SrcEltBits);
7466 }
7467
7468 // Extract constant bits from a broadcasted constant pool scalar.
7469 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7470 EltSizeInBits <= VT.getScalarSizeInBits()) {
7471 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7472 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7473 return false;
7474
7475 SDValue Ptr = MemIntr->getBasePtr();
7476 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7477 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7478 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7479
7480 APInt UndefSrcElts(NumSrcElts, 0);
7481 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7482 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7483 if (UndefSrcElts[0])
7484 UndefSrcElts.setBits(0, NumSrcElts);
7485 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7486 return CastBitData(UndefSrcElts, SrcEltBits);
7487 }
7488 }
7489 }
7490
7491 // Extract constant bits from a subvector broadcast.
7492 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7493 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7494 SDValue Ptr = MemIntr->getBasePtr();
7495 // The source constant may be larger than the subvector broadcast, so
7496 // ensure we extract the correct subvector constants.
7497 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7498 Type *CstTy = Cst->getType();
7499 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7500 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7501 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7502 (SizeInBits % SubVecSizeInBits) != 0)
7503 return false;
7504 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7505 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7506 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7507 APInt UndefSubElts(NumSubElts, 0);
7508 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7509 APInt(CstEltSizeInBits, 0));
7510 for (unsigned i = 0; i != NumSubElts; ++i) {
7511 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7512 UndefSubElts, i))
7513 return false;
7514 for (unsigned j = 1; j != NumSubVecs; ++j)
7515 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7516 }
7517 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7518 UndefSubElts);
7519 return CastBitData(UndefSubElts, SubEltBits);
7520 }
7521 }
7522
7523 // Extract a rematerialized scalar constant insertion.
7524 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7525 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7526 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7527 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7528 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7529
7530 APInt UndefSrcElts(NumSrcElts, 0);
7531 SmallVector<APInt, 64> SrcEltBits;
7532 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7533 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7534 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7535 return CastBitData(UndefSrcElts, SrcEltBits);
7536 }
7537
7538 // Insert constant bits from a base and sub vector sources.
7539 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7540 // If we bitcast to larger elements we might lose track of undefs, so to
7541 // be safe don't allow any.
7542 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7543 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7544
7545 APInt UndefSrcElts, UndefSubElts;
7546 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7547 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7548 UndefSubElts, EltSubBits,
7549 AllowWholeUndefs && AllowUndefs,
7550 AllowPartialUndefs && AllowUndefs) &&
7551 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7552 UndefSrcElts, EltSrcBits,
7553 AllowWholeUndefs && AllowUndefs,
7554 AllowPartialUndefs && AllowUndefs)) {
7555 unsigned BaseIdx = Op.getConstantOperandVal(2);
7556 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7557 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7558 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7559 return CastBitData(UndefSrcElts, EltSrcBits);
7560 }
7561 }
7562
7563 // Extract constant bits from a subvector's source.
7564 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7565 // TODO - support extract_subvector through bitcasts.
7566 if (EltSizeInBits != VT.getScalarSizeInBits())
7567 return false;
7568
7569 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7570 UndefElts, EltBits, AllowWholeUndefs,
7571 AllowPartialUndefs)) {
7572 EVT SrcVT = Op.getOperand(0).getValueType();
7573 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7574 unsigned NumSubElts = VT.getVectorNumElements();
7575 unsigned BaseIdx = Op.getConstantOperandVal(1);
7576 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7577 if ((BaseIdx + NumSubElts) != NumSrcElts)
7578 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7579 if (BaseIdx != 0)
7580 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7581 return true;
7582 }
7583 }
7584
7585 // Extract constant bits from shuffle node sources.
7586 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7587 // TODO - support shuffle through bitcasts.
7588 if (EltSizeInBits != VT.getScalarSizeInBits())
7589 return false;
7590
7591 ArrayRef<int> Mask = SVN->getMask();
7592 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7593 llvm::any_of(Mask, [](int M) { return M < 0; }))
7594 return false;
7595
7596 APInt UndefElts0, UndefElts1;
7597 SmallVector<APInt, 32> EltBits0, EltBits1;
7598 if (isAnyInRange(Mask, 0, NumElts) &&
7599 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7600 UndefElts0, EltBits0, AllowWholeUndefs,
7601 AllowPartialUndefs))
7602 return false;
7603 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7604 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7605 UndefElts1, EltBits1, AllowWholeUndefs,
7606 AllowPartialUndefs))
7607 return false;
7608
7609 UndefElts = APInt::getZero(NumElts);
7610 for (int i = 0; i != (int)NumElts; ++i) {
7611 int M = Mask[i];
7612 if (M < 0) {
7613 UndefElts.setBit(i);
7614 EltBits.push_back(APInt::getZero(EltSizeInBits));
7615 } else if (M < (int)NumElts) {
7616 if (UndefElts0[M])
7617 UndefElts.setBit(i);
7618 EltBits.push_back(EltBits0[M]);
7619 } else {
7620 if (UndefElts1[M - NumElts])
7621 UndefElts.setBit(i);
7622 EltBits.push_back(EltBits1[M - NumElts]);
7623 }
7624 }
7625 return true;
7626 }
7627
7628 return false;
7629}
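// Example: reading a v4i32 build vector <0, 1, undef, 3> with
// EltSizeInBits == 64 yields two 64-bit elements; the undef lane covers only
// half of the second element, so with AllowPartialUndefs those bits are
// treated as zero rather than marking the element undef, and without it the
// whole query fails.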
7630
7631namespace llvm {
7632namespace X86 {
7633bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7634 APInt UndefElts;
7635 SmallVector<APInt, 16> EltBits;
7636 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7637 UndefElts, EltBits, true,
7638 AllowPartialUndefs)) {
7639 int SplatIndex = -1;
7640 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7641 if (UndefElts[i])
7642 continue;
7643 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7644 SplatIndex = -1;
7645 break;
7646 }
7647 SplatIndex = i;
7648 }
7649 if (0 <= SplatIndex) {
7650 SplatVal = EltBits[SplatIndex];
7651 return true;
7652 }
7653 }
7654
7655 return false;
7656}
7657} // namespace X86
7658} // namespace llvm
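// Example: a v4i32 constant <3, undef, 3, 3> reports SplatVal == 3, since
// whole-undef lanes are skipped when comparing elements, while <3, 1, 3, 3>
// fails because two defined lanes differ.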
7659
7660static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7661 unsigned MaskEltSizeInBits,
7662 SmallVectorImpl<uint64_t> &RawMask,
7663 APInt &UndefElts) {
7664 // Extract the raw target constant bits.
7665 SmallVector<APInt, 64> EltBits;
7666 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7667 EltBits, /* AllowWholeUndefs */ true,
7668 /* AllowPartialUndefs */ false))
7669 return false;
7670
7671 // Insert the extracted elements into the mask.
7672 for (const APInt &Elt : EltBits)
7673 RawMask.push_back(Elt.getZExtValue());
7674
7675 return true;
7676}
7677
7678/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7679/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7680/// Note: This ignores saturation, so inputs must be checked first.
7681static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7682 bool Unary, unsigned NumStages = 1) {
7683 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7684 unsigned NumElts = VT.getVectorNumElements();
7685 unsigned NumLanes = VT.getSizeInBits() / 128;
7686 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7687 unsigned Offset = Unary ? 0 : NumElts;
7688 unsigned Repetitions = 1u << (NumStages - 1);
7689 unsigned Increment = 1u << NumStages;
7690 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7691
7692 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7693 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7694 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7695 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7696 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7697 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7698 }
7699 }
7700}
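// Example: VT == v16i8 with Unary == false and NumStages == 1 gives
// <0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>, i.e. the even
// bytes of LHS followed by the even bytes of RHS, matching the element
// selection of a single PACKUSWB/PACKSSWB with saturation ignored.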
7701
7702// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7703static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7704 APInt &DemandedLHS, APInt &DemandedRHS) {
7705 int NumLanes = VT.getSizeInBits() / 128;
7706 int NumElts = DemandedElts.getBitWidth();
7707 int NumInnerElts = NumElts / 2;
7708 int NumEltsPerLane = NumElts / NumLanes;
7709 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7710
7711 DemandedLHS = APInt::getZero(NumInnerElts);
7712 DemandedRHS = APInt::getZero(NumInnerElts);
7713
7714 // Map DemandedElts to the packed operands.
7715 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7716 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7717 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7718 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7719 if (DemandedElts[OuterIdx])
7720 DemandedLHS.setBit(InnerIdx);
7721 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7722 DemandedRHS.setBit(InnerIdx);
7723 }
7724 }
7725}
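// Example: for a v32i8 PACK result (two 128-bit lanes), demanded output
// byte 20 sits in lane 1 at offset 4 of the LHS half, so it sets bit 12 of
// DemandedLHS; demanded output byte 28 sets bit 12 of DemandedRHS.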
7726
7727// Split the demanded elts of a HADD/HSUB node between its operands.
7728static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7729 APInt &DemandedLHS, APInt &DemandedRHS) {
7730 int NumLanes = VT.getSizeInBits() / 128;
7731 int NumElts = DemandedElts.getBitWidth();
7732 int NumEltsPerLane = NumElts / NumLanes;
7733 int HalfEltsPerLane = NumEltsPerLane / 2;
7734
7735 DemandedLHS = APInt::getZero(NumElts);
7736 DemandedRHS = APInt::getZero(NumElts);
7737
7738 // Map DemandedElts to the horizontal operands.
7739 for (int Idx = 0; Idx != NumElts; ++Idx) {
7740 if (!DemandedElts[Idx])
7741 continue;
7742 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7743 int LocalIdx = Idx % NumEltsPerLane;
7744 if (LocalIdx < HalfEltsPerLane) {
7745 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7746 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7747 } else {
7748 LocalIdx -= HalfEltsPerLane;
7749 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7750 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7751 }
7752 }
7753}
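// Example: for a v8i32 HADD, demanded output element 5 is in lane 1 with
// local index 1 (below HalfEltsPerLane), so it demands LHS elements 6 and 7,
// matching result[5] == LHS[6] + LHS[7].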
7754
7755/// Calculates the shuffle mask corresponding to the target-specific opcode.
7756/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7757/// operands in \p Ops, and returns true.
7758/// Sets \p IsUnary to true if only one source is used. Note that this will set
7759/// IsUnary for shuffles which use a single input multiple times, and in those
7760/// cases it will adjust the mask to only have indices within that single input.
7761/// It is an error to call this with non-empty Mask/Ops vectors.
7762static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7763 SmallVectorImpl<SDValue> &Ops,
7764 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7765 unsigned NumElems = VT.getVectorNumElements();
7766 unsigned MaskEltSize = VT.getScalarSizeInBits();
7767 SmallVector<uint64_t, 32> RawMask;
7768 APInt RawUndefs;
7769 uint64_t ImmN;
7770
7771 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7772 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7773
7774 IsUnary = false;
7775 bool IsFakeUnary = false;
7776 switch (N->getOpcode()) {
7777 case X86ISD::BLENDI:
7778 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7779 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7780 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7781 DecodeBLENDMask(NumElems, ImmN, Mask);
7782 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7783 break;
7784 case X86ISD::SHUFP:
7785 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7786 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7787 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7788 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7789 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7790 break;
7791 case X86ISD::INSERTPS:
7792 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7793 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7794 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7795 DecodeINSERTPSMask(ImmN, Mask);
7796 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7797 break;
7798 case X86ISD::EXTRQI:
7799 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7801 isa<ConstantSDNode>(N->getOperand(2))) {
7802 int BitLen = N->getConstantOperandVal(1);
7803 int BitIdx = N->getConstantOperandVal(2);
7804 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7805 IsUnary = true;
7806 }
7807 break;
7808 case X86ISD::INSERTQI:
7809 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7810 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7811 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7812 isa<ConstantSDNode>(N->getOperand(3))) {
7813 int BitLen = N->getConstantOperandVal(2);
7814 int BitIdx = N->getConstantOperandVal(3);
7815 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7816 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7817 }
7818 break;
7819 case X86ISD::UNPCKH:
7820 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7821 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7822 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7823 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7824 break;
7825 case X86ISD::UNPCKL:
7826 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7827 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7828 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7829 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7830 break;
7831 case X86ISD::MOVHLPS:
7832 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7833 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7834 DecodeMOVHLPSMask(NumElems, Mask);
7835 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7836 break;
7837 case X86ISD::MOVLHPS:
7838 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7839 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7840 DecodeMOVLHPSMask(NumElems, Mask);
7841 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7842 break;
7843 case X86ISD::VALIGN:
7844 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7845        "Only 32-bit and 64-bit elements are supported!");
7846 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7847 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7848 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7849 DecodeVALIGNMask(NumElems, ImmN, Mask);
7850 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7851 Ops.push_back(N->getOperand(1));
7852 Ops.push_back(N->getOperand(0));
7853 break;
7854 case X86ISD::PALIGNR:
7855 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7856 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7857 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7858 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7859 DecodePALIGNRMask(NumElems, ImmN, Mask);
7860 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7861 Ops.push_back(N->getOperand(1));
7862 Ops.push_back(N->getOperand(0));
7863 break;
7864 case X86ISD::VSHLDQ:
7865 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7866 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7867 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7868 DecodePSLLDQMask(NumElems, ImmN, Mask);
7869 IsUnary = true;
7870 break;
7871 case X86ISD::VSRLDQ:
7872 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7873 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7874 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7875 DecodePSRLDQMask(NumElems, ImmN, Mask);
7876 IsUnary = true;
7877 break;
7878 case X86ISD::PSHUFD:
7879 case X86ISD::VPERMILPI:
7880 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7881 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7882 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7883 IsUnary = true;
7884 break;
7885 case X86ISD::PSHUFHW:
7886 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7887 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7888 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7889 IsUnary = true;
7890 break;
7891 case X86ISD::PSHUFLW:
7892 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7893 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7894 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7895 IsUnary = true;
7896 break;
7897 case X86ISD::VZEXT_MOVL:
7898 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7899 DecodeZeroMoveLowMask(NumElems, Mask);
7900 IsUnary = true;
7901 break;
7902 case X86ISD::VBROADCAST:
7903 // We only decode broadcasts of same-sized vectors; peeking through to
7904 // extracted subvectors is likely to cause hasOneUse issues with
7905 // SimplifyDemandedBits etc.
7906 if (N->getOperand(0).getValueType() == VT) {
7907 DecodeVectorBroadcast(NumElems, Mask);
7908 IsUnary = true;
7909 break;
7910 }
7911 return false;
7912 case X86ISD::VPERMILPV: {
7913 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7914 IsUnary = true;
7915 SDValue MaskNode = N->getOperand(1);
7916 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7917 RawUndefs)) {
7918 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7919 break;
7920 }
7921 return false;
7922 }
7923 case X86ISD::PSHUFB: {
7924 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7925 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7926 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7927 IsUnary = true;
7928 SDValue MaskNode = N->getOperand(1);
7929 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7930 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7931 break;
7932 }
7933 return false;
7934 }
7935 case X86ISD::VPERMI:
7936 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7937 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7938 DecodeVPERMMask(NumElems, ImmN, Mask);
7939 IsUnary = true;
7940 break;
7941 case X86ISD::MOVSS:
7942 case X86ISD::MOVSD:
7943 case X86ISD::MOVSH:
7944 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7945 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7946 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7947 break;
7948 case X86ISD::VPERM2X128:
7949 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7950 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7951 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7952 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7953 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7954 break;
7955 case X86ISD::SHUF128:
7956 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7957 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7958 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7959 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7960 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7961 break;
7962 case X86ISD::MOVSLDUP:
7963 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7964 DecodeMOVSLDUPMask(NumElems, Mask);
7965 IsUnary = true;
7966 break;
7967 case X86ISD::MOVSHDUP:
7968 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7969 DecodeMOVSHDUPMask(NumElems, Mask);
7970 IsUnary = true;
7971 break;
7972 case X86ISD::MOVDDUP:
7973 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7974 DecodeMOVDDUPMask(NumElems, Mask);
7975 IsUnary = true;
7976 break;
7977 case X86ISD::VPERMIL2: {
7978 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7979 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7980 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7981 SDValue MaskNode = N->getOperand(2);
7982 SDValue CtrlNode = N->getOperand(3);
7983 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7984 unsigned CtrlImm = CtrlOp->getZExtValue();
7985 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7986 RawUndefs)) {
7987 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7988 Mask);
7989 break;
7990 }
7991 }
7992 return false;
7993 }
7994 case X86ISD::VPPERM: {
7995 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7996 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7997 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7998 SDValue MaskNode = N->getOperand(2);
7999 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8000 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8001 break;
8002 }
8003 return false;
8004 }
8005 case X86ISD::VPERMV: {
8006 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8007 IsUnary = true;
8008 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8009 Ops.push_back(N->getOperand(1));
8010 SDValue MaskNode = N->getOperand(0);
8011 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8012 RawUndefs)) {
8013 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8014 break;
8015 }
8016 return false;
8017 }
8018 case X86ISD::VPERMV3: {
8019 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8020 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8021 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8022 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8023 Ops.push_back(N->getOperand(0));
8024 Ops.push_back(N->getOperand(2));
8025 SDValue MaskNode = N->getOperand(1);
8026 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8027 RawUndefs)) {
8028 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8029 break;
8030 }
8031 return false;
8032 }
8033 default: llvm_unreachable("unknown target shuffle node");
8034 }
8035
8036 // Empty mask indicates the decode failed.
8037 if (Mask.empty())
8038 return false;
8039
8040 // Check if we're getting a shuffle mask with zero'd elements.
8041 if (!AllowSentinelZero && isAnyZero(Mask))
8042 return false;
8043
8044 // If we have a fake unary shuffle, the shuffle mask is spread across two
8045 // inputs that are actually the same node. Re-map the mask to always point
8046 // into the first input.
8047 if (IsFakeUnary)
8048 for (int &M : Mask)
8049 if (M >= (int)Mask.size())
8050 M -= Mask.size();
8051
8052 // If we didn't already add operands in the opcode-specific code, default to
8053 // adding 1 or 2 operands starting at 0.
8054 if (Ops.empty()) {
8055 Ops.push_back(N->getOperand(0));
8056 if (!IsUnary || IsFakeUnary)
8057 Ops.push_back(N->getOperand(1));
8058 }
8059
8060 return true;
8061}
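// Worked example (illustrative, assuming the standard VPERM2F128/VPERM2I128
// immediate encoding): for X86ISD::VPERM2X128 on v8i32, bits [1:0] and [5:4]
// of the immediate each select one 128-bit half of the concatenated inputs,
// and bits 3/7 force the corresponding destination half to zero. With
// ImmN = 0x31, DecodeVPERM2X128Mask produces Mask = <4,5,6,7, 12,13,14,15>
// (the upper halves of both inputs); if bit 3 were also set, the low half of
// the mask would instead be four SM_SentinelZero entries.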
8062
8063 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
8064static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8065 SmallVectorImpl<SDValue> &Ops,
8066 SmallVectorImpl<int> &Mask) {
8067 bool IsUnary;
8068 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8069}
8070
8071/// Compute whether each element of a shuffle is zeroable.
8072///
8073/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8074/// Either it is an undef element in the shuffle mask, the element of the input
8075/// referenced is undef, or the element of the input referenced is known to be
8076/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8077/// as many lanes with this technique as possible to simplify the remaining
8078/// shuffle.
8079static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8080 SDValue V1, SDValue V2,
8081 APInt &KnownUndef, APInt &KnownZero) {
8082 int Size = Mask.size();
8083 KnownUndef = KnownZero = APInt::getZero(Size);
8084
8085 V1 = peekThroughBitcasts(V1);
8086 V2 = peekThroughBitcasts(V2);
8087
8088 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8089 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8090
8091 int VectorSizeInBits = V1.getValueSizeInBits();
8092 int ScalarSizeInBits = VectorSizeInBits / Size;
8093 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8094
8095 for (int i = 0; i < Size; ++i) {
8096 int M = Mask[i];
8097 // Handle the easy cases.
8098 if (M < 0) {
8099 KnownUndef.setBit(i);
8100 continue;
8101 }
8102 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8103 KnownZero.setBit(i);
8104 continue;
8105 }
8106
8107 // Determine shuffle input and normalize the mask.
8108 SDValue V = M < Size ? V1 : V2;
8109 M %= Size;
8110
8111 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8112 if (V.getOpcode() != ISD::BUILD_VECTOR)
8113 continue;
8114
8115 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
8116 // portion of the (larger) source element must be UNDEF/ZERO.
8117 if ((Size % V.getNumOperands()) == 0) {
8118 int Scale = Size / V->getNumOperands();
8119 SDValue Op = V.getOperand(M / Scale);
8120 if (Op.isUndef())
8121 KnownUndef.setBit(i);
8122 if (X86::isZeroNode(Op))
8123 KnownZero.setBit(i);
8124 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8125 APInt Val = Cst->getAPIntValue();
8126 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8127 if (Val == 0)
8128 KnownZero.setBit(i);
8129 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8130 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8131 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8132 if (Val == 0)
8133 KnownZero.setBit(i);
8134 }
8135 continue;
8136 }
8137
8138 // If the BUILD_VECTOR has more elements than the mask, then all of the
8139 // (smaller) source elements must be UNDEF or ZERO.
8140 if ((V.getNumOperands() % Size) == 0) {
8141 int Scale = V->getNumOperands() / Size;
8142 bool AllUndef = true;
8143 bool AllZero = true;
8144 for (int j = 0; j < Scale; ++j) {
8145 SDValue Op = V.getOperand((M * Scale) + j);
8146 AllUndef &= Op.isUndef();
8147 AllZero &= X86::isZeroNode(Op);
8148 }
8149 if (AllUndef)
8150 KnownUndef.setBit(i);
8151 if (AllZero)
8152 KnownZero.setBit(i);
8153 continue;
8154 }
8155 }
8156}
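// Worked example (illustrative): for Mask = <0,4,2,7> with
// V1 = build_vector <i32 1, i32 0, i32 undef, i32 3> and V2 = all-zeros,
// elements 1 and 3 reference the all-zeros V2 (KnownZero bits 1 and 3),
// element 2 references the undef operand of V1 (KnownUndef bit 2), and
// element 0 references the non-zero constant 1 so neither bit is set. The
// result is KnownUndef = 0b0100 and KnownZero = 0b1010.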
8157
8158/// Decode a target shuffle mask and inputs and see if any values are
8159/// known to be undef or zero from their inputs.
8160/// Returns true if the target shuffle mask was decoded.
8161/// FIXME: Merge this with computeZeroableShuffleElements?
8162static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8163 SmallVectorImpl<SDValue> &Ops,
8164 APInt &KnownUndef, APInt &KnownZero) {
8165 bool IsUnary;
8166 if (!isTargetShuffle(N.getOpcode()))
8167 return false;
8168
8169 MVT VT = N.getSimpleValueType();
8170 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8171 return false;
8172
8173 int Size = Mask.size();
8174 SDValue V1 = Ops[0];
8175 SDValue V2 = IsUnary ? V1 : Ops[1];
8176 KnownUndef = KnownZero = APInt::getZero(Size);
8177
8178 V1 = peekThroughBitcasts(V1);
8179 V2 = peekThroughBitcasts(V2);
8180
8181 assert((VT.getSizeInBits() % Size) == 0 &&
8182 "Illegal split of shuffle value type");
8183 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8184
8185 // Extract known constant input data.
8186 APInt UndefSrcElts[2];
8187 SmallVector<APInt, 32> SrcEltBits[2];
8188 bool IsSrcConstant[2] = {
8189 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8190 SrcEltBits[0], true, false),
8191 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8192 SrcEltBits[1], true, false)};
8193
8194 for (int i = 0; i < Size; ++i) {
8195 int M = Mask[i];
8196
8197 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8198 if (M < 0) {
8199 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8200 if (SM_SentinelUndef == M)
8201 KnownUndef.setBit(i);
8202 if (SM_SentinelZero == M)
8203 KnownZero.setBit(i);
8204 continue;
8205 }
8206
8207 // Determine shuffle input and normalize the mask.
8208 unsigned SrcIdx = M / Size;
8209 SDValue V = M < Size ? V1 : V2;
8210 M %= Size;
8211
8212 // We are referencing an UNDEF input.
8213 if (V.isUndef()) {
8214 KnownUndef.setBit(i);
8215 continue;
8216 }
8217
8218 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8219 // TODO: We currently only set UNDEF for integer types - floats use the same
8220 // registers as vectors and many of the scalar folded loads rely on the
8221 // SCALAR_TO_VECTOR pattern.
8222 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8223 (Size % V.getValueType().getVectorNumElements()) == 0) {
8224 int Scale = Size / V.getValueType().getVectorNumElements();
8225 int Idx = M / Scale;
8226 if (Idx != 0 && !VT.isFloatingPoint())
8227 KnownUndef.setBit(i);
8228 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8229 KnownZero.setBit(i);
8230 continue;
8231 }
8232
8233 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8234 // base vectors.
8235 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8236 SDValue Vec = V.getOperand(0);
8237 int NumVecElts = Vec.getValueType().getVectorNumElements();
8238 if (Vec.isUndef() && Size == NumVecElts) {
8239 int Idx = V.getConstantOperandVal(2);
8240 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8241 if (M < Idx || (Idx + NumSubElts) <= M)
8242 KnownUndef.setBit(i);
8243 }
8244 continue;
8245 }
8246
8247 // Attempt to extract from the source's constant bits.
8248 if (IsSrcConstant[SrcIdx]) {
8249 if (UndefSrcElts[SrcIdx][M])
8250 KnownUndef.setBit(i);
8251 else if (SrcEltBits[SrcIdx][M] == 0)
8252 KnownZero.setBit(i);
8253 }
8254 }
8255
8256 assert(VT.getVectorNumElements() == (unsigned)Size &&
8257 "Different mask size from vector size!");
8258 return true;
8259}
8260
8261// Replace target shuffle mask elements with known undef/zero sentinels.
8262static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8263 const APInt &KnownUndef,
8264 const APInt &KnownZero,
8265 bool ResolveKnownZeros = true) {
8266 unsigned NumElts = Mask.size();
8267 assert(KnownUndef.getBitWidth() == NumElts &&
8268 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8269
8270 for (unsigned i = 0; i != NumElts; ++i) {
8271 if (KnownUndef[i])
8272 Mask[i] = SM_SentinelUndef;
8273 else if (ResolveKnownZeros && KnownZero[i])
8274 Mask[i] = SM_SentinelZero;
8275 }
8276}
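// e.g. (illustrative) Mask = <0,1,2,3> with KnownUndef = 0b0010 and
// KnownZero = 0b1000 becomes <0, SM_SentinelUndef, 2, SM_SentinelZero>;
// resolveZeroablesFromTargetShuffle below performs the inverse mapping.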
8277
8278// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8279static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8280 APInt &KnownUndef,
8281 APInt &KnownZero) {
8282 unsigned NumElts = Mask.size();
8283 KnownUndef = KnownZero = APInt::getZero(NumElts);
8284
8285 for (unsigned i = 0; i != NumElts; ++i) {
8286 int M = Mask[i];
8287 if (SM_SentinelUndef == M)
8288 KnownUndef.setBit(i);
8289 if (SM_SentinelZero == M)
8290 KnownZero.setBit(i);
8291 }
8292}
8293
8294// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8295static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8296 SDValue Cond, bool IsBLENDV = false) {
8297 EVT CondVT = Cond.getValueType();
8298 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8299 unsigned NumElts = CondVT.getVectorNumElements();
8300
8301 APInt UndefElts;
8302 SmallVector<APInt, 32> EltBits;
8303 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8304 true, false))
8305 return false;
8306
8307 Mask.resize(NumElts, SM_SentinelUndef);
8308
8309 for (int i = 0; i != (int)NumElts; ++i) {
8310 Mask[i] = i;
8311 // Arbitrarily choose from the 2nd operand if the select condition element
8312 // is undef.
8313 // TODO: Can we do better by matching patterns such as even/odd?
8314 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8315 (IsBLENDV && EltBits[i].isNonNegative()))
8316 Mask[i] += NumElts;
8317 }
8318
8319 return true;
8320}
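// Worked example (illustrative): for a VSELECT whose v4i32 condition is the
// constant <-1, 0, -1, 0>, the mask becomes <0, 5, 2, 7>; lanes with a true
// (non-zero) condition take elements from the first value operand and the
// rest take the corresponding element of the second. For BLENDV only the
// sign bit of each condition element is tested, so a non-negative element
// likewise selects from the second operand.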
8321
8322// Forward declaration (for getFauxShuffleMask recursive check).
8323static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8324 SmallVectorImpl<SDValue> &Inputs,
8325 SmallVectorImpl<int> &Mask,
8326 const SelectionDAG &DAG, unsigned Depth,
8327 bool ResolveKnownElts);
8328
8329// Attempt to decode ops that could be represented as a shuffle mask.
8330// The decoded shuffle mask may contain a different number of elements to the
8331// destination value type.
8332// TODO: Merge into getTargetShuffleInputs()
8333static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8334 SmallVectorImpl<int> &Mask,
8335 SmallVectorImpl<SDValue> &Ops,
8336 const SelectionDAG &DAG, unsigned Depth,
8337 bool ResolveKnownElts) {
8338 Mask.clear();
8339 Ops.clear();
8340
8341 MVT VT = N.getSimpleValueType();
8342 unsigned NumElts = VT.getVectorNumElements();
8343 unsigned NumSizeInBits = VT.getSizeInBits();
8344 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8345 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8346 return false;
8347 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8348 unsigned NumSizeInBytes = NumSizeInBits / 8;
8349 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8350
8351 unsigned Opcode = N.getOpcode();
8352 switch (Opcode) {
8353 case ISD::VECTOR_SHUFFLE: {
8354 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8355 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8356 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8357 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8358 Ops.push_back(N.getOperand(0));
8359 Ops.push_back(N.getOperand(1));
8360 return true;
8361 }
8362 return false;
8363 }
8364 case ISD::AND:
8365 case X86ISD::ANDNP: {
8366 // Attempt to decode as a per-byte mask.
8367 APInt UndefElts;
8368 SmallVector<APInt, 32> EltBits;
8369 SDValue N0 = N.getOperand(0);
8370 SDValue N1 = N.getOperand(1);
8371 bool IsAndN = (X86ISD::ANDNP == Opcode);
8372 uint64_t ZeroMask = IsAndN ? 255 : 0;
8373 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8374 return false;
8375 // We can't assume an undef src element gives an undef dst - the other src
8376 // might be zero.
8377 if (!UndefElts.isZero())
8378 return false;
8379 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8380 const APInt &ByteBits = EltBits[i];
8381 if (ByteBits != 0 && ByteBits != 255)
8382 return false;
8383 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8384 }
8385 Ops.push_back(IsAndN ? N1 : N0);
8386 return true;
8387 }
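// e.g. (illustrative) AND of a v16i8 value with the constant
// <255,0,255,0,...> decodes to Mask = <0, SM_SentinelZero, 2, SM_SentinelZero,
// ...>: bytes masked with 0xFF pass through as an identity shuffle and bytes
// masked with 0x00 become known zero. For ANDNP the roles are inverted,
// since the constant operand examined here is complemented before the AND.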
8388 case ISD::OR: {
8389 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8390 // is a valid shuffle index.
8391 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8392 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8393 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8394 return false;
8395
8396 SmallVector<int, 64> SrcMask0, SrcMask1;
8397 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8398 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8399 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8400 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8401 Depth + 1, true) ||
8402 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8403 Depth + 1, true))
8404 return false;
8405
8406 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8407 SmallVector<int, 64> Mask0, Mask1;
8408 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8409 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8410 for (int i = 0; i != (int)MaskSize; ++i) {
8411 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8412 // loops converting between OR and BLEND shuffles due to
8413 // canWidenShuffleElements merging away undef elements, meaning we
8414 // fail to recognise the OR as the undef element isn't known zero.
8415 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8416 Mask.push_back(SM_SentinelZero);
8417 else if (Mask1[i] == SM_SentinelZero)
8418 Mask.push_back(i);
8419 else if (Mask0[i] == SM_SentinelZero)
8420 Mask.push_back(i + MaskSize);
8421 else
8422 return false;
8423 }
8424 Ops.push_back(N0);
8425 Ops.push_back(N1);
8426 return true;
8427 }
8428 case ISD::INSERT_SUBVECTOR: {
8429 SDValue Src = N.getOperand(0);
8430 SDValue Sub = N.getOperand(1);
8431 EVT SubVT = Sub.getValueType();
8432 unsigned NumSubElts = SubVT.getVectorNumElements();
8433 if (!N->isOnlyUserOf(Sub.getNode()))
8434 return false;
8435 uint64_t InsertIdx = N.getConstantOperandVal(2);
8436 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8437 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8438 Sub.getOperand(0).getValueType() == VT) {
8439 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8440 for (int i = 0; i != (int)NumElts; ++i)
8441 Mask.push_back(i);
8442 for (int i = 0; i != (int)NumSubElts; ++i)
8443 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8444 Ops.push_back(Src);
8445 Ops.push_back(Sub.getOperand(0));
8446 return true;
8447 }
8448 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8449 SmallVector<int, 64> SubMask;
8450 SmallVector<SDValue, 2> SubInputs;
8451 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8452 EVT SubSrcVT = SubSrc.getValueType();
8453 if (!SubSrcVT.isVector())
8454 return false;
8455
8456 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8457 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8458 Depth + 1, ResolveKnownElts))
8459 return false;
8460
8461 // Subvector shuffle inputs must not be larger than the subvector.
8462 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8463 return SubVT.getFixedSizeInBits() <
8464 SubInput.getValueSizeInBits().getFixedValue();
8465 }))
8466 return false;
8467
8468 if (SubMask.size() != NumSubElts) {
8469 assert(((SubMask.size() % NumSubElts) == 0 ||
8470 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8471 if ((NumSubElts % SubMask.size()) == 0) {
8472 int Scale = NumSubElts / SubMask.size();
8473 SmallVector<int, 64> ScaledSubMask;
8474 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8475 SubMask = ScaledSubMask;
8476 } else {
8477 int Scale = SubMask.size() / NumSubElts;
8478 NumSubElts = SubMask.size();
8479 NumElts *= Scale;
8480 InsertIdx *= Scale;
8481 }
8482 }
8483 Ops.push_back(Src);
8484 Ops.append(SubInputs.begin(), SubInputs.end());
8485 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8486 Mask.append(NumElts, SM_SentinelZero);
8487 else
8488 for (int i = 0; i != (int)NumElts; ++i)
8489 Mask.push_back(i);
8490 for (int i = 0; i != (int)NumSubElts; ++i) {
8491 int M = SubMask[i];
8492 if (0 <= M) {
8493 int InputIdx = M / NumSubElts;
8494 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8495 }
8496 Mask[i + InsertIdx] = M;
8497 }
8498 return true;
8499 }
8500 case X86ISD::PINSRB:
8501 case X86ISD::PINSRW:
8502 case ISD::SCALAR_TO_VECTOR:
8503 case ISD::INSERT_VECTOR_ELT: {
8504 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8505 // vector, for matching src/dst vector types.
8506 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8507
8508 unsigned DstIdx = 0;
8509 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8510 // Check we have an in-range constant insertion index.
8511 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8512 N.getConstantOperandAPInt(2).uge(NumElts))
8513 return false;
8514 DstIdx = N.getConstantOperandVal(2);
8515
8516 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8517 if (X86::isZeroNode(Scl)) {
8518 Ops.push_back(N.getOperand(0));
8519 for (unsigned i = 0; i != NumElts; ++i)
8520 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8521 return true;
8522 }
8523 }
8524
8525 // Peek through trunc/aext/zext.
8526 // TODO: aext shouldn't require SM_SentinelZero padding.
8527 // TODO: handle shift of scalars.
8528 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8529 while (Scl.getOpcode() == ISD::TRUNCATE ||
8530 Scl.getOpcode() == ISD::ANY_EXTEND ||
8531 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8532 Scl = Scl.getOperand(0);
8533 MinBitsPerElt =
8534 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8535 }
8536 if ((MinBitsPerElt % 8) != 0)
8537 return false;
8538
8539 // Attempt to find the source vector the scalar was extracted from.
8540 SDValue SrcExtract;
8541 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8542 Scl.getOpcode() == X86ISD::PEXTRW ||
8543 Scl.getOpcode() == X86ISD::PEXTRB) &&
8544 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8545 SrcExtract = Scl;
8546 }
8547 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8548 return false;
8549
8550 SDValue SrcVec = SrcExtract.getOperand(0);
8551 EVT SrcVT = SrcVec.getValueType();
8552 if (!SrcVT.getScalarType().isByteSized())
8553 return false;
8554 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8555 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8556 unsigned DstByte = DstIdx * NumBytesPerElt;
8557 MinBitsPerElt =
8558 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8559
8560 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8561 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8562 Ops.push_back(SrcVec);
8563 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8564 } else {
8565 Ops.push_back(SrcVec);
8566 Ops.push_back(N.getOperand(0));
8567 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8568 Mask.push_back(NumSizeInBytes + i);
8569 }
8570
8571 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8572 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8573 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8574 Mask[DstByte + i] = SrcByte + i;
8575 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8576 Mask[DstByte + i] = SM_SentinelZero;
8577 return true;
8578 }
8579 case X86ISD::PACKSS:
8580 case X86ISD::PACKUS: {
8581 SDValue N0 = N.getOperand(0);
8582 SDValue N1 = N.getOperand(1);
8583 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8584 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8585 "Unexpected input value type");
8586
8587 APInt EltsLHS, EltsRHS;
8588 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8589
8590 // If we know input saturation won't happen (or we don't care about
8591 // particular lanes), we can treat this as a truncation shuffle.
8592 bool Offset0 = false, Offset1 = false;
8593 if (Opcode == X86ISD::PACKSS) {
8594 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8595 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8596 (!(N1.isUndef() || EltsRHS.isZero()) &&
8597 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8598 return false;
8599 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8600 // PACKSS then it was likely being used for sign-extension for a
8601 // truncation, so just peek through and adjust the mask accordingly.
8602 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8603 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8604 Offset0 = true;
8605 N0 = N0.getOperand(0);
8606 }
8607 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8608 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8609 Offset1 = true;
8610 N1 = N1.getOperand(0);
8611 }
8612 } else {
8613 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8614 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8615 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8616 (!(N1.isUndef() || EltsRHS.isZero()) &&
8617 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8618 return false;
8619 }
8620
8621 bool IsUnary = (N0 == N1);
8622
8623 Ops.push_back(N0);
8624 if (!IsUnary)
8625 Ops.push_back(N1);
8626
8627 createPackShuffleMask(VT, Mask, IsUnary);
8628
8629 if (Offset0 || Offset1) {
8630 for (int &M : Mask)
8631 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8632 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8633 ++M;
8634 }
8635 return true;
8636 }
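// Worked example (illustrative): PACKUSWB of two v8i16 inputs whose 16-bit
// lanes are known to have all-zero high bytes decodes (via
// createPackShuffleMask) to the byte-level mask <0,2,4,...,14, 16,18,...,30>,
// i.e. it simply selects the low byte of every 16-bit element, which is a
// truncation. When an input was a VSRAI by the destination element width
// feeding a PACKSS, the Offset0/Offset1 fixup above bumps those indices by
// one so the mask reads the high byte of the pre-shift value instead.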
8637 case ISD::VSELECT:
8638 case X86ISD::BLENDV: {
8639 SDValue Cond = N.getOperand(0);
8640 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8641 Ops.push_back(N.getOperand(1));
8642 Ops.push_back(N.getOperand(2));
8643 return true;
8644 }
8645 return false;
8646 }
8647 case X86ISD::VTRUNC: {
8648 SDValue Src = N.getOperand(0);
8649 EVT SrcVT = Src.getValueType();
8650 // Truncated source must be a simple vector.
8651 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8652 (SrcVT.getScalarSizeInBits() % 8) != 0)
8653 return false;
8654 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8655 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8656 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8657 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8658 for (unsigned i = 0; i != NumSrcElts; ++i)
8659 Mask.push_back(i * Scale);
8660 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8661 Ops.push_back(Src);
8662 return true;
8663 }
8664 case X86ISD::VSHLI:
8665 case X86ISD::VSRLI: {
8666 uint64_t ShiftVal = N.getConstantOperandVal(1);
8667 // Out of range bit shifts are guaranteed to be zero.
8668 if (NumBitsPerElt <= ShiftVal) {
8669 Mask.append(NumElts, SM_SentinelZero);
8670 return true;
8671 }
8672
8673 // We can only decode 'whole byte' bit shifts as shuffles.
8674 if ((ShiftVal % 8) != 0)
8675 break;
8676
8677 uint64_t ByteShift = ShiftVal / 8;
8678 Ops.push_back(N.getOperand(0));
8679
8680 // Clear mask to all zeros and insert the shifted byte indices.
8681 Mask.append(NumSizeInBytes, SM_SentinelZero);
8682
8683 if (X86ISD::VSHLI == Opcode) {
8684 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8685 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8686 Mask[i + j] = i + j - ByteShift;
8687 } else {
8688 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8689 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8690 Mask[i + j - ByteShift] = i + j;
8691 }
8692 return true;
8693 }
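// e.g. (illustrative) a v2i64 VSRLI by 16 bits has ByteShift = 2 and decodes,
// per 8-byte element, to <2,3,4,5,6,7, SM_SentinelZero, SM_SentinelZero>:
// each result byte comes from two bytes higher in the (little-endian) lane
// and the top two bytes become zero. VSHLI mirrors this with the zeros at
// the bottom of each lane.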
8694 case X86ISD::VROTLI:
8695 case X86ISD::VROTRI: {
8696 // We can only decode 'whole byte' bit rotates as shuffles.
8697 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8698 if ((RotateVal % 8) != 0)
8699 return false;
8700 Ops.push_back(N.getOperand(0));
8701 int Offset = RotateVal / 8;
8702 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8703 for (int i = 0; i != (int)NumElts; ++i) {
8704 int BaseIdx = i * NumBytesPerElt;
8705 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8706 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8707 }
8708 }
8709 return true;
8710 }
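// e.g. (illustrative) a v4i32 VROTLI by 8 bits gives Offset = 3 and a
// per-lane byte mask of <3,0,1,2>: rotating a little-endian 32-bit lane left
// by one byte moves its top byte to the bottom and shifts the rest up, which
// is exactly that byte permutation repeated for every element.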
8711 case X86ISD::VBROADCAST: {
8712 SDValue Src = N.getOperand(0);
8713 if (!Src.getSimpleValueType().isVector()) {
8714 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8715 !isNullConstant(Src.getOperand(1)) ||
8716 Src.getOperand(0).getValueType().getScalarType() !=
8717 VT.getScalarType())
8718 return false;
8719 Src = Src.getOperand(0);
8720 }
8721 Ops.push_back(Src);
8722 Mask.append(NumElts, 0);
8723 return true;
8724 }
8725 case ISD::ZERO_EXTEND:
8726 case ISD::ANY_EXTEND:
8727 case ISD::ZERO_EXTEND_VECTOR_INREG:
8728 case ISD::ANY_EXTEND_VECTOR_INREG: {
8729 SDValue Src = N.getOperand(0);
8730 EVT SrcVT = Src.getValueType();
8731
8732 // Extended source must be a simple vector.
8733 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8734 (SrcVT.getScalarSizeInBits() % 8) != 0)
8735 return false;
8736
8737 bool IsAnyExtend =
8738 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8739 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8740 IsAnyExtend, Mask);
8741 Ops.push_back(Src);
8742 return true;
8743 }
8744 }
8745
8746 return false;
8747}
8748
8749/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8750static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8751 SmallVectorImpl<int> &Mask) {
8752 int MaskWidth = Mask.size();
8753 SmallVector<SDValue, 16> UsedInputs;
8754 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8755 int lo = UsedInputs.size() * MaskWidth;
8756 int hi = lo + MaskWidth;
8757
8758 // Strip UNDEF input usage.
8759 if (Inputs[i].isUndef())
8760 for (int &M : Mask)
8761 if ((lo <= M) && (M < hi))
8762 M = SM_SentinelUndef;
8763
8764 // Check for unused inputs.
8765 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8766 for (int &M : Mask)
8767 if (lo <= M)
8768 M -= MaskWidth;
8769 continue;
8770 }
8771
8772 // Check for repeated inputs.
8773 bool IsRepeat = false;
8774 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8775 if (UsedInputs[j] != Inputs[i])
8776 continue;
8777 for (int &M : Mask)
8778 if (lo <= M)
8779 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8780 IsRepeat = true;
8781 break;
8782 }
8783 if (IsRepeat)
8784 continue;
8785
8786 UsedInputs.push_back(Inputs[i]);
8787 }
8788 Inputs = UsedInputs;
8789}
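// Worked example (illustrative): with Inputs = {A, A} and Mask = <0,4,1,5>
// (MaskWidth = 4), the second input is recognised as a repeat of the first,
// so the mask is remapped to <0,0,1,1> and Inputs collapses to {A}. An input
// whose mask range is never referenced is dropped the same way, with all
// higher indices shifted down by MaskWidth.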
8790
8791/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8792/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8793/// Returns true if the target shuffle mask was decoded.
8794static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8795 SmallVectorImpl<SDValue> &Inputs,
8796 SmallVectorImpl<int> &Mask,
8797 APInt &KnownUndef, APInt &KnownZero,
8798 const SelectionDAG &DAG, unsigned Depth,
8799 bool ResolveKnownElts) {
8800 if (Depth >= SelectionDAG::MaxRecursionDepth)
8801 return false; // Limit search depth.
8802
8803 EVT VT = Op.getValueType();
8804 if (!VT.isSimple() || !VT.isVector())
8805 return false;
8806
8807 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8808 if (ResolveKnownElts)
8809 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8810 return true;
8811 }
8812 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8813 ResolveKnownElts)) {
8814 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8815 return true;
8816 }
8817 return false;
8818}
8819
8820static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8821 SmallVectorImpl<SDValue> &Inputs,
8822 SmallVectorImpl<int> &Mask,
8823 const SelectionDAG &DAG, unsigned Depth,
8824 bool ResolveKnownElts) {
8825 APInt KnownUndef, KnownZero;
8826 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8827 KnownZero, DAG, Depth, ResolveKnownElts);
8828}
8829
8830static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8831 SmallVectorImpl<int> &Mask,
8832 const SelectionDAG &DAG, unsigned Depth = 0,
8833 bool ResolveKnownElts = true) {
8834 EVT VT = Op.getValueType();
8835 if (!VT.isSimple() || !VT.isVector())
8836 return false;
8837
8838 unsigned NumElts = Op.getValueType().getVectorNumElements();
8839 APInt DemandedElts = APInt::getAllOnes(NumElts);
8840 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8841 ResolveKnownElts);
8842}
8843
8844// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8845static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8846 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8847 SelectionDAG &DAG) {
8848 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8849 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8850 "Unknown broadcast load type");
8851
8852 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8853 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8854 return SDValue();
8855
8856 SDValue Ptr =
8857 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8858 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8859 SDValue Ops[] = {Mem->getChain(), Ptr};
8860 SDValue BcstLd = DAG.getMemIntrinsicNode(
8861 Opcode, DL, Tys, Ops, MemVT,
8862 DAG.getMachineFunction().getMachineMemOperand(
8863 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8864 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8865 return BcstLd;
8866}
8867
8868/// Returns the scalar element that will make up the i'th
8869/// element of the result of the vector shuffle.
8870static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8871 SelectionDAG &DAG, unsigned Depth) {
8872 if (Depth >= SelectionDAG::MaxRecursionDepth)
8873 return SDValue(); // Limit search depth.
8874
8875 EVT VT = Op.getValueType();
8876 unsigned Opcode = Op.getOpcode();
8877 unsigned NumElems = VT.getVectorNumElements();
8878
8879 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8880 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8881 int Elt = SV->getMaskElt(Index);
8882
8883 if (Elt < 0)
8884 return DAG.getUNDEF(VT.getVectorElementType());
8885
8886 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8887 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8888 }
8889
8890 // Recurse into target specific vector shuffles to find scalars.
8891 if (isTargetShuffle(Opcode)) {
8892 MVT ShufVT = VT.getSimpleVT();
8893 MVT ShufSVT = ShufVT.getVectorElementType();
8894 int NumElems = (int)ShufVT.getVectorNumElements();
8895 SmallVector<int, 16> ShuffleMask;
8896 SmallVector<SDValue, 16> ShuffleOps;
8897 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8898 ShuffleMask))
8899 return SDValue();
8900
8901 int Elt = ShuffleMask[Index];
8902 if (Elt == SM_SentinelZero)
8903 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8904 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8905 if (Elt == SM_SentinelUndef)
8906 return DAG.getUNDEF(ShufSVT);
8907
8908 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8909 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8910 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8911 }
8912
8913 // Recurse into insert_subvector base/sub vector to find scalars.
8914 if (Opcode == ISD::INSERT_SUBVECTOR) {
8915 SDValue Vec = Op.getOperand(0);
8916 SDValue Sub = Op.getOperand(1);
8917 uint64_t SubIdx = Op.getConstantOperandVal(2);
8918 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8919
8920 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8921 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8922 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8923 }
8924
8925 // Recurse into concat_vectors sub vector to find scalars.
8926 if (Opcode == ISD::CONCAT_VECTORS) {
8927 EVT SubVT = Op.getOperand(0).getValueType();
8928 unsigned NumSubElts = SubVT.getVectorNumElements();
8929 uint64_t SubIdx = Index / NumSubElts;
8930 uint64_t SubElt = Index % NumSubElts;
8931 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8932 }
8933
8934 // Recurse into extract_subvector src vector to find scalars.
8935 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8936 SDValue Src = Op.getOperand(0);
8937 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8938 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8939 }
8940
8941 // We only peek through bitcasts of the same vector width.
8942 if (Opcode == ISD::BITCAST) {
8943 SDValue Src = Op.getOperand(0);
8944 EVT SrcVT = Src.getValueType();
8945 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8946 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8947 return SDValue();
8948 }
8949
8950 // Actual nodes that may contain scalar elements
8951
8952 // For insert_vector_elt - either return the index matching scalar or recurse
8953 // into the base vector.
8954 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8955 isa<ConstantSDNode>(Op.getOperand(2))) {
8956 if (Op.getConstantOperandAPInt(2) == Index)
8957 return Op.getOperand(1);
8958 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8959 }
8960
8961 if (Opcode == ISD::SCALAR_TO_VECTOR)
8962 return (Index == 0) ? Op.getOperand(0)
8963 : DAG.getUNDEF(VT.getVectorElementType());
8964
8965 if (Opcode == ISD::BUILD_VECTOR)
8966 return Op.getOperand(Index);
8967
8968 return SDValue();
8969}
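// e.g. (illustrative) for Op = vector_shuffle<2,6,3,7>(A, B) and Index = 1,
// the mask element 6 selects the second operand, so the walk recurses into B
// at index 6 % 4 = 2 and keeps peeling shuffles, inserts and bitcasts until
// it reaches a BUILD_VECTOR or SCALAR_TO_VECTOR scalar, or gives up with an
// empty SDValue.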
8970
8971// Use PINSRB/PINSRW/PINSRD to create a build vector.
8972static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8973 unsigned NumNonZero, unsigned NumZero,
8974 SelectionDAG &DAG,
8975 const X86Subtarget &Subtarget) {
8976 MVT VT = Op.getSimpleValueType();
8977 unsigned NumElts = VT.getVectorNumElements();
8978 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8979 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8980 "Illegal vector insertion");
8981
8982 SDLoc dl(Op);
8983 SDValue V;
8984 bool First = true;
8985
8986 for (unsigned i = 0; i < NumElts; ++i) {
8987 bool IsNonZero = NonZeroMask[i];
8988 if (!IsNonZero)
8989 continue;
8990
8991 // If the build vector contains zeros or our first insertion is not the
8992 // first index, then insert into a zero vector to break any register
8993 // dependency; otherwise use SCALAR_TO_VECTOR.
8994 if (First) {
8995 First = false;
8996 if (NumZero || 0 != i)
8997 V = getZeroVector(VT, Subtarget, DAG, dl);
8998 else {
8999 assert(0 == i && "Expected insertion into zero-index");
9000 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9001 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9002 V = DAG.getBitcast(VT, V);
9003 continue;
9004 }
9005 }
9006 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9007 DAG.getIntPtrConstant(i, dl));
9008 }
9009
9010 return V;
9011}
9012
9013/// Custom lower build_vector of v16i8.
9014static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9015 unsigned NumNonZero, unsigned NumZero,
9016 SelectionDAG &DAG,
9017 const X86Subtarget &Subtarget) {
9018 if (NumNonZero > 8 && !Subtarget.hasSSE41())
9019 return SDValue();
9020
9021 // SSE4.1 - use PINSRB to insert each byte directly.
9022 if (Subtarget.hasSSE41())
9023 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9024 Subtarget);
9025
9026 SDLoc dl(Op);
9027 SDValue V;
9028
9029 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9030 for (unsigned i = 0; i < 16; i += 2) {
9031 bool ThisIsNonZero = NonZeroMask[i];
9032 bool NextIsNonZero = NonZeroMask[i + 1];
9033 if (!ThisIsNonZero && !NextIsNonZero)
9034 continue;
9035
9036 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
9037 SDValue Elt;
9038 if (ThisIsNonZero) {
9039 if (NumZero || NextIsNonZero)
9040 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9041 else
9042 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9043 }
9044
9045 if (NextIsNonZero) {
9046 SDValue NextElt = Op.getOperand(i + 1);
9047 if (i == 0 && NumZero)
9048 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9049 else
9050 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9051 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9052 DAG.getConstant(8, dl, MVT::i8));
9053 if (ThisIsNonZero)
9054 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9055 else
9056 Elt = NextElt;
9057 }
9058
9059 // If our first insertion is not the first index or zeros are needed, then
9060 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9061 // elements undefined).
9062 if (!V) {
9063 if (i != 0 || NumZero)
9064 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9065 else {
9066 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9067 V = DAG.getBitcast(MVT::v8i16, V);
9068 continue;
9069 }
9070 }
9071 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9072 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9073 DAG.getIntPtrConstant(i / 2, dl));
9074 }
9075
9076 return DAG.getBitcast(MVT::v16i8, V);
9077}
9078
9079/// Custom lower build_vector of v8i16.
9080static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9081 unsigned NumNonZero, unsigned NumZero,
9082 SelectionDAG &DAG,
9083 const X86Subtarget &Subtarget) {
9084 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9085 return SDValue();
9086
9087 // Use PINSRW to insert each byte directly.
9088 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9089 Subtarget);
9090}
9091
9092/// Custom lower build_vector of v4i32 or v4f32.
9093static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9094 const X86Subtarget &Subtarget) {
9095 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9096 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9097 // Because we're creating a less complicated build vector here, we may enable
9098 // further folding of the MOVDDUP via shuffle transforms.
9099 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9100 Op.getOperand(0) == Op.getOperand(2) &&
9101 Op.getOperand(1) == Op.getOperand(3) &&
9102 Op.getOperand(0) != Op.getOperand(1)) {
9103 SDLoc DL(Op);
9104 MVT VT = Op.getSimpleValueType();
9105 MVT EltVT = VT.getVectorElementType();
9106 // Create a new build vector with the first 2 elements followed by undef
9107 // padding, bitcast to v2f64, duplicate, and bitcast back.
9108 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9109 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9110 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9111 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9112 return DAG.getBitcast(VT, Dup);
9113 }
9114
9115 // Find all zeroable elements.
9116 std::bitset<4> Zeroable, Undefs;
9117 for (int i = 0; i < 4; ++i) {
9118 SDValue Elt = Op.getOperand(i);
9119 Undefs[i] = Elt.isUndef();
9120 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9121 }
9122 assert(Zeroable.size() - Zeroable.count() > 1 &&
9123 "We expect at least two non-zero elements!");
9124
9125 // We only know how to deal with build_vector nodes where elements are either
9126 // zeroable or extract_vector_elt with constant index.
9127 SDValue FirstNonZero;
9128 unsigned FirstNonZeroIdx;
9129 for (unsigned i = 0; i < 4; ++i) {
9130 if (Zeroable[i])
9131 continue;
9132 SDValue Elt = Op.getOperand(i);
9133 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9134 !isa<ConstantSDNode>(Elt.getOperand(1)))
9135 return SDValue();
9136 // Make sure that this node is extracting from a 128-bit vector.
9137 MVT VT = Elt.getOperand(0).getSimpleValueType();
9138 if (!VT.is128BitVector())
9139 return SDValue();
9140 if (!FirstNonZero.getNode()) {
9141 FirstNonZero = Elt;
9142 FirstNonZeroIdx = i;
9143 }
9144 }
9145
9146 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9147 SDValue V1 = FirstNonZero.getOperand(0);
9148 MVT VT = V1.getSimpleValueType();
9149
9150 // See if this build_vector can be lowered as a blend with zero.
9151 SDValue Elt;
9152 unsigned EltMaskIdx, EltIdx;
9153 int Mask[4];
9154 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9155 if (Zeroable[EltIdx]) {
9156 // The zero vector will be on the right hand side.
9157 Mask[EltIdx] = EltIdx+4;
9158 continue;
9159 }
9160
9161 Elt = Op->getOperand(EltIdx);
9162 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
9163 EltMaskIdx = Elt.getConstantOperandVal(1);
9164 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9165 break;
9166 Mask[EltIdx] = EltIdx;
9167 }
9168
9169 if (EltIdx == 4) {
9170 // Let the shuffle legalizer deal with blend operations.
9171 SDValue VZeroOrUndef = (Zeroable == Undefs)
9172 ? DAG.getUNDEF(VT)
9173 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9174 if (V1.getSimpleValueType() != VT)
9175 V1 = DAG.getBitcast(VT, V1);
9176 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9177 }
9178
9179 // See if we can lower this build_vector to an INSERTPS.
9180 if (!Subtarget.hasSSE41())
9181 return SDValue();
9182
9183 SDValue V2 = Elt.getOperand(0);
9184 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9185 V1 = SDValue();
9186
9187 bool CanFold = true;
9188 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9189 if (Zeroable[i])
9190 continue;
9191
9192 SDValue Current = Op->getOperand(i);
9193 SDValue SrcVector = Current->getOperand(0);
9194 if (!V1.getNode())
9195 V1 = SrcVector;
9196 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9197 }
9198
9199 if (!CanFold)
9200 return SDValue();
9201
9202 assert(V1.getNode() && "Expected at least two non-zero elements!");
9203 if (V1.getSimpleValueType() != MVT::v4f32)
9204 V1 = DAG.getBitcast(MVT::v4f32, V1);
9205 if (V2.getSimpleValueType() != MVT::v4f32)
9206 V2 = DAG.getBitcast(MVT::v4f32, V2);
9207
9208 // Ok, we can emit an INSERTPS instruction.
9209 unsigned ZMask = Zeroable.to_ulong();
9210
9211 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9212 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9213 SDLoc DL(Op);
9214 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9215 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9216 return DAG.getBitcast(VT, Result);
9217}
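
The InsertPSMask assembled above packs three fields into one 8-bit immediate: the source element index in bits [7:6], the destination lane in bits [5:4], and the zeroed-lane mask in bits [3:0]. A minimal standalone sketch of that packing (hypothetical helper name, not LLVM code):

#include <cassert>
#include <cstdint>

// Bits [7:6] = source element, bits [5:4] = destination lane,
// bits [3:0] = mask of destination lanes forced to zero.
static uint8_t buildInsertPSImm(unsigned SrcElt, unsigned DstLane,
                                unsigned ZeroLanes) {
  assert(SrcElt < 4 && DstLane < 4 && ZeroLanes < 16 && "field out of range");
  return static_cast<uint8_t>((SrcElt << 6) | (DstLane << 4) | ZeroLanes);
}

int main() {
  // Insert element 2 of the source into lane 1, zeroing lane 3: 0b10'01'1000.
  assert(buildInsertPSImm(2, 1, 0b1000) == 0x98);
  return 0;
}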
9218
9219/// Return a vector logical shift node.
9220static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9221 SelectionDAG &DAG, const TargetLowering &TLI,
9222 const SDLoc &dl) {
9223 assert(VT.is128BitVector() && "Unknown type for VShift");
9224 MVT ShVT = MVT::v16i8;
9225 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9226 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9227 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9228 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9229 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9230}
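
getVShift models the whole-register byte shifts PSLLDQ/PSRLDQ: the bit count is converted to a byte count (NumBits / 8) and applied to the v16i8 view of the source. A standalone sketch of the left-shift case on a plain 16-byte array (illustrative only, not the DAG lowering; lane movement shown in the little-endian view):

#include <array>
#include <cstdint>
#include <cstring>

// Shift all 16 bytes toward higher-numbered lanes, filling vacated bytes with
// zero - the effect of X86ISD::VSHLDQ with ShiftVal = NumBits / 8.
static std::array<uint8_t, 16> shiftBytesLeft(const std::array<uint8_t, 16> &V,
                                              unsigned NumBits) {
  unsigned NumBytes = NumBits / 8; // getVShift asserts NumBits % 8 == 0
  std::array<uint8_t, 16> R{};
  if (NumBytes < 16)
    std::memcpy(R.data() + NumBytes, V.data(), 16 - NumBytes);
  return R;
}

int main() {
  std::array<uint8_t, 16> V{};
  V[0] = 0xAB;
  return shiftBytesLeft(V, 32)[4] == 0xAB ? 0 : 1; // byte 0 moves to lane 4
}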
9231
9232static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9233 SelectionDAG &DAG) {
9234
9235 // Check if the scalar load can be widened into a vector load. And if
9236 // the address is "base + cst" see if the cst can be "absorbed" into
9237 // the shuffle mask.
9238 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9239 SDValue Ptr = LD->getBasePtr();
9240 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9241 return SDValue();
9242 EVT PVT = LD->getValueType(0);
9243 if (PVT != MVT::i32 && PVT != MVT::f32)
9244 return SDValue();
9245
9246 int FI = -1;
9247 int64_t Offset = 0;
9248 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9249 FI = FINode->getIndex();
9250 Offset = 0;
9251 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9252 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9253 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9254 Offset = Ptr.getConstantOperandVal(1);
9255 Ptr = Ptr.getOperand(0);
9256 } else {
9257 return SDValue();
9258 }
9259
9260 // FIXME: 256-bit vector instructions don't require strict alignment;
9261 // improve this code to support them better.
9262 Align RequiredAlign(VT.getSizeInBits() / 8);
9263 SDValue Chain = LD->getChain();
9264 // Make sure the stack object alignment is at least 16 or 32.
9265 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9266 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9267 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9268 if (MFI.isFixedObjectIndex(FI)) {
9269 // Can't change the alignment. FIXME: It's possible to compute
9270 // the exact stack offset and reference FI + adjusted offset instead.
9271 // If someone *really* cares about this, that's the way to implement it.
9272 return SDValue();
9273 } else {
9274 MFI.setObjectAlignment(FI, RequiredAlign);
9275 }
9276 }
9277
9278 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9279 // Ptr + (Offset & ~15).
9280 if (Offset < 0)
9281 return SDValue();
9282 if ((Offset % RequiredAlign.value()) & 3)
9283 return SDValue();
9284 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9285 if (StartOffset) {
9286 SDLoc DL(Ptr);
9287 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9288 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9289 }
9290
9291 int EltNo = (Offset - StartOffset) >> 2;
9292 unsigned NumElems = VT.getVectorNumElements();
9293
9294 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9295 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9296 LD->getPointerInfo().getWithOffset(StartOffset));
9297
9298 SmallVector<int, 8> Mask(NumElems, EltNo);
9299
9300 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9301 }
9302
9303 return SDValue();
9304}
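
The offset handling above reduces to integer arithmetic: the constant offset must be non-negative and land on a 4-byte slot within the required alignment, the widened load starts at Offset & ~(RequiredAlign - 1), and the original scalar becomes element (Offset - StartOffset) >> 2 of that load. A minimal sketch of just that arithmetic (hypothetical names, assuming 4-byte scalars as in the i32/f32 case above):

#include <cassert>
#include <cstdint>
#include <optional>

struct SplatLoadPlan {
  int64_t StartOffset; // offset of the widened vector load
  int EltNo;           // element of that load holding the original scalar
};

static std::optional<SplatLoadPlan> planSplatLoad(int64_t Offset,
                                                  uint64_t RequiredAlign) {
  if (Offset < 0 || ((Offset % int64_t(RequiredAlign)) & 3) != 0)
    return std::nullopt; // must be a 4-byte aligned slot inside the vector
  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
  return SplatLoadPlan{StartOffset, int((Offset - StartOffset) >> 2)};
}

int main() {
  auto P = planSplatLoad(/*Offset=*/20, /*RequiredAlign=*/16);
  assert(P && P->StartOffset == 16 && P->EltNo == 1);
  return 0;
}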
9305
9306 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
9307static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9308 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9309 auto *BaseLd = cast<LoadSDNode>(Elt);
9310 if (!BaseLd->isSimple())
9311 return false;
9312 Ld = BaseLd;
9313 ByteOffset = 0;
9314 return true;
9315 }
9316
9317 switch (Elt.getOpcode()) {
9318 case ISD::BITCAST:
9319 case ISD::TRUNCATE:
9320 case ISD::SCALAR_TO_VECTOR:
9321 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9322 case ISD::SRL:
9323 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9324 uint64_t Amt = AmtC->getZExtValue();
9325 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9326 ByteOffset += Amt / 8;
9327 return true;
9328 }
9329 }
9330 break;
9331 case ISD::EXTRACT_VECTOR_ELT:
9332 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9333 SDValue Src = Elt.getOperand(0);
9334 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9335 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9336 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9337 findEltLoadSrc(Src, Ld, ByteOffset)) {
9338 uint64_t Idx = IdxC->getZExtValue();
9339 ByteOffset += Idx * (SrcSizeInBits / 8);
9340 return true;
9341 }
9342 }
9343 break;
9344 }
9345
9346 return false;
9347}
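
Two of the cases above contribute to the accumulated byte offset: a right shift by a whole number of bytes, and an extract of a byte-sized vector element. A small standalone sketch of that bookkeeping (hypothetical helpers, not the SelectionDAG walk itself):

#include <cassert>
#include <cstdint>
#include <optional>

// (srl X, Amt): the value now starts Amt / 8 bytes into the load, provided
// the shift amount is byte aligned.
static std::optional<uint64_t> srlByteOffset(uint64_t AmtBits) {
  if (AmtBits % 8 != 0)
    return std::nullopt;
  return AmtBits / 8;
}

// (extract_vector_elt V, Idx): element Idx starts Idx * (EltBits / 8) bytes
// into the loaded vector, for byte-sized element types.
static std::optional<uint64_t> extractEltByteOffset(uint64_t Idx,
                                                    unsigned EltBits) {
  if (EltBits % 8 != 0)
    return std::nullopt;
  return Idx * (EltBits / 8);
}

int main() {
  assert(*srlByteOffset(32) == 4);           // (srl X, 32) -> +4 bytes
  assert(*extractEltByteOffset(3, 16) == 6); // lane 3 of v8i16 -> +6 bytes
  return 0;
}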
9348
9349/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9350/// elements can be replaced by a single large load which has the same value as
9351/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9352///
9353/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9354static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9355 const SDLoc &DL, SelectionDAG &DAG,
9356 const X86Subtarget &Subtarget,
9357 bool IsAfterLegalize) {
9358 if ((VT.getScalarSizeInBits() % 8) != 0)
9359 return SDValue();
9360
9361 unsigned NumElems = Elts.size();
9362
9363 int LastLoadedElt = -1;
9364 APInt LoadMask = APInt::getZero(NumElems);
9365 APInt ZeroMask = APInt::getZero(NumElems);
9366 APInt UndefMask = APInt::getZero(NumElems);
9367
9368 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9369 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9370
9371 // For each element in the initializer, see if we've found a load, zero or an
9372 // undef.
9373 for (unsigned i = 0; i < NumElems; ++i) {
9374 SDValue Elt = peekThroughBitcasts(Elts[i]);
9375 if (!Elt.getNode())
9376 return SDValue();
9377 if (Elt.isUndef()) {
9378 UndefMask.setBit(i);
9379 continue;
9380 }
9381 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9382 ZeroMask.setBit(i);
9383 continue;
9384 }
9385
9386 // Each loaded element must be the correct fractional portion of the
9387 // requested vector load.
9388 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9389 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9390 return SDValue();
9391
9392 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9393 return SDValue();
9394 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9395 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9396 return SDValue();
9397
9398 LoadMask.setBit(i);
9399 LastLoadedElt = i;
9400 }
9401 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9402            NumElems &&
9403        "Incomplete element masks");
9404
9405 // Handle Special Cases - all undef or undef/zero.
9406 if (UndefMask.popcount() == NumElems)
9407 return DAG.getUNDEF(VT);
9408 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9409 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9410 : DAG.getConstantFP(0.0, DL, VT);
9411
9412 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9413 int FirstLoadedElt = LoadMask.countr_zero();
9414 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9415 EVT EltBaseVT = EltBase.getValueType();
9416 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9417        "Register/Memory size mismatch");
9418 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9419 assert(LDBase && "Did not find base load for merging consecutive loads");
9420 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9421 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9422 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9423 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9424 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9425
9426 // TODO: Support offsetting the base load.
9427 if (ByteOffsets[FirstLoadedElt] != 0)
9428 return SDValue();
9429
9430 // Check to see if the element's load is consecutive to the base load
9431 // or offset from a previous (already checked) load.
9432 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9433 LoadSDNode *Ld = Loads[EltIdx];
9434 int64_t ByteOffset = ByteOffsets[EltIdx];
9435 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9436 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9437 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9438 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9439 }
9440 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9441 EltIdx - FirstLoadedElt);
9442 };
9443
9444 // Consecutive loads can contain UNDEFs but not ZERO elements.
9445 // Consecutive loads with UNDEF and ZERO elements require an
9446 // additional shuffle stage to clear the ZERO elements.
9447 bool IsConsecutiveLoad = true;
9448 bool IsConsecutiveLoadWithZeros = true;
9449 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9450 if (LoadMask[i]) {
9451 if (!CheckConsecutiveLoad(LDBase, i)) {
9452 IsConsecutiveLoad = false;
9453 IsConsecutiveLoadWithZeros = false;
9454 break;
9455 }
9456 } else if (ZeroMask[i]) {
9457 IsConsecutiveLoad = false;
9458 }
9459 }
9460
9461 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9462 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9463 assert(LDBase->isSimple() &&
9464        "Cannot merge volatile or atomic loads.");
9465 SDValue NewLd =
9466 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9467 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9468 MMOFlags);
9469 for (auto *LD : Loads)
9470 if (LD)
9471 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9472 return NewLd;
9473 };
9474
9475 // Check if the base load is entirely dereferenceable.
9476 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9477 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9478
9479 // LOAD - all consecutive load/undefs (must start/end with a load or be
9480 // entirely dereferenceable). If we have found an entire vector of loads and
9481 // undefs, then return a large load of the entire vector width starting at the
9482 // base pointer. If the vector contains zeros, then attempt to shuffle those
9483 // elements.
9484 if (FirstLoadedElt == 0 &&
9485 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9486 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9487 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9488 return SDValue();
9489
9490 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9491 // will lower to regular temporal loads and use the cache.
9492 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9493 VT.is256BitVector() && !Subtarget.hasInt256())
9494 return SDValue();
9495
9496 if (NumElems == 1)
9497 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9498
9499 if (!ZeroMask)
9500 return CreateLoad(VT, LDBase);
9501
9502 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9503 // vector and a zero vector to clear out the zero elements.
9504 if (!IsAfterLegalize && VT.isVector()) {
9505 unsigned NumMaskElts = VT.getVectorNumElements();
9506 if ((NumMaskElts % NumElems) == 0) {
9507 unsigned Scale = NumMaskElts / NumElems;
9508 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9509 for (unsigned i = 0; i < NumElems; ++i) {
9510 if (UndefMask[i])
9511 continue;
9512 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9513 for (unsigned j = 0; j != Scale; ++j)
9514 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9515 }
9516 SDValue V = CreateLoad(VT, LDBase);
9517 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9518 : DAG.getConstantFP(0.0, DL, VT);
9519 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9520 }
9521 }
9522 }
9523
9524 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9525 if (VT.is256BitVector() || VT.is512BitVector()) {
9526 unsigned HalfNumElems = NumElems / 2;
9527 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9528 EVT HalfVT =
9529 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9530 SDValue HalfLD =
9531 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9532 DAG, Subtarget, IsAfterLegalize);
9533 if (HalfLD)
9534 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9535 HalfLD, DAG.getIntPtrConstant(0, DL));
9536 }
9537 }
9538
9539 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9540 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9541 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9542 LoadSizeInBits == 64) &&
9543 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9544 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9545 : MVT::getIntegerVT(LoadSizeInBits);
9546 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9547 // Allow v4f32 on SSE1 only targets.
9548 // FIXME: Add more isel patterns so we can just use VT directly.
9549 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9550 VecVT = MVT::v4f32;
9551 if (TLI.isTypeLegal(VecVT)) {
9552 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9553 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9554 SDValue ResNode = DAG.getMemIntrinsicNode(
9555 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9556 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9557 for (auto *LD : Loads)
9558 if (LD)
9559 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9560 return DAG.getBitcast(VT, ResNode);
9561 }
9562 }
9563
9564 // BROADCAST - match the smallest possible repetition pattern, load that
9565 // scalar/subvector element and then broadcast to the entire vector.
9566 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9567 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9568 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9569 unsigned RepeatSize = SubElems * BaseSizeInBits;
9570 unsigned ScalarSize = std::min(RepeatSize, 64u);
9571 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9572 continue;
9573
9574 // Don't attempt a 1:N subvector broadcast - it should be caught by
9575 // combineConcatVectorOps, else it will cause infinite loops.
9576 if (RepeatSize > ScalarSize && SubElems == 1)
9577 continue;
9578
9579 bool Match = true;
9580 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9581 for (unsigned i = 0; i != NumElems && Match; ++i) {
9582 if (!LoadMask[i])
9583 continue;
9584 SDValue Elt = peekThroughBitcasts(Elts[i]);
9585 if (RepeatedLoads[i % SubElems].isUndef())
9586 RepeatedLoads[i % SubElems] = Elt;
9587 else
9588 Match &= (RepeatedLoads[i % SubElems] == Elt);
9589 }
9590
9591 // We must have loads at both ends of the repetition.
9592 Match &= !RepeatedLoads.front().isUndef();
9593 Match &= !RepeatedLoads.back().isUndef();
9594 if (!Match)
9595 continue;
9596
9597 EVT RepeatVT =
9598 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9599 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9600 : EVT::getFloatingPointVT(ScalarSize);
9601 if (RepeatSize > ScalarSize)
9602 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9603 RepeatSize / ScalarSize);
9604 EVT BroadcastVT =
9605 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9606 VT.getSizeInBits() / ScalarSize);
9607 if (TLI.isTypeLegal(BroadcastVT)) {
9608 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9609 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9610 SDValue Broadcast = RepeatLoad;
9611 if (RepeatSize > ScalarSize) {
9612 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9613 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9614 } else {
9615 if (!Subtarget.hasAVX2() &&
9616 !X86::mayFoldLoadIntoBroadcastFromMem(
9617 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9618 Subtarget,
9619 /*AssumeSingleUse=*/true))
9620 return SDValue();
9621 Broadcast =
9622 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9623 }
9624 return DAG.getBitcast(VT, Broadcast);
9625 }
9626 }
9627 }
9628 }
9629
9630 return SDValue();
9631}
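
The CheckConsecutiveLoad lambda above accepts an element either because it is consecutive with the base load or because it re-reads bytes of a load already validated at an earlier index. A simplified standalone model of the second case (illustrative types, not the LLVM API):

#include <cassert>
#include <cstdint>
#include <vector>

struct EltLoad {
  int LoadId = -1;        // which underlying load feeds this element (-1 = none)
  int64_t ByteOffset = 0; // byte offset into that load
};

static bool reusesEarlierLoad(const std::vector<EltLoad> &Elts, int EltIdx,
                              unsigned BaseSizeInBytes) {
  const EltLoad &E = Elts[EltIdx];
  if (E.ByteOffset == 0 || (E.ByteOffset % BaseSizeInBytes) != 0)
    return false;
  int64_t BaseIdx = EltIdx - (E.ByteOffset / BaseSizeInBytes);
  return BaseIdx >= 0 && BaseIdx < (int64_t)Elts.size() &&
         Elts[BaseIdx].LoadId == E.LoadId && Elts[BaseIdx].ByteOffset == 0;
}

int main() {
  // Element 1 reads bytes 4..7 of the same load that element 0 reads at offset 0.
  std::vector<EltLoad> Elts = {{0, 0}, {0, 4}};
  assert(reusesEarlierLoad(Elts, 1, /*BaseSizeInBytes=*/4));
  return 0;
}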
9632
9633 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
9634// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9635// are consecutive, non-overlapping, and in the right order.
9636static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9637 SelectionDAG &DAG,
9638 const X86Subtarget &Subtarget,
9639 bool IsAfterLegalize) {
9640 SmallVector<SDValue, 64> Elts;
9641 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9642 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9643 Elts.push_back(Elt);
9644 continue;
9645 }
9646 return SDValue();
9647 }
9648 assert(Elts.size() == VT.getVectorNumElements());
9649 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9650 IsAfterLegalize);
9651}
9652
9653static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9654 unsigned SplatBitSize, LLVMContext &C) {
9655 unsigned ScalarSize = VT.getScalarSizeInBits();
9656 unsigned NumElm = SplatBitSize / ScalarSize;
9657
9658 SmallVector<Constant *, 32> ConstantVec;
9659 for (unsigned i = 0; i < NumElm; i++) {
9660 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9661 Constant *Const;
9662 if (VT.isFloatingPoint()) {
9663 if (ScalarSize == 16) {
9664 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9665 } else if (ScalarSize == 32) {
9666 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9667 } else {
9668 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9669 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9670 }
9671 } else
9672 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9673 ConstantVec.push_back(Const);
9674 }
9675 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9676}
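
getConstantVector simply slices a SplatBitSize-wide repeated constant into SplatBitSize / ScalarSize pieces and rebuilds each piece as a scalar Constant. A trivial standalone sketch of the slicing step, fixed to 64-bit splats of 32-bit scalars for brevity (hypothetical helper, not LLVM's APInt code):

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t> sliceSplat64(uint64_t Splat, unsigned ScalarBits) {
  assert(ScalarBits == 32 && "sketch only handles 32-bit scalars");
  return {static_cast<uint32_t>(Splat), static_cast<uint32_t>(Splat >> 32)};
}

int main() {
  auto V = sliceSplat64(0x1122334455667788ULL, 32);
  assert(V[0] == 0x55667788u && V[1] == 0x11223344u); // low slice first
  return 0;
}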
9677
9678static bool isFoldableUseOfShuffle(SDNode *N) {
9679 for (auto *U : N->uses()) {
9680 unsigned Opc = U->getOpcode();
9681 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9682 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9683 return false;
9684 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9685 return false;
9686 if (isTargetShuffle(Opc))
9687 return true;
9688 if (Opc == ISD::BITCAST) // Ignore bitcasts
9689 return isFoldableUseOfShuffle(U);
9690 if (N->hasOneUse()) {
9691 // TODO, there may be some general way to know if a SDNode can
9692 // be folded. We now only know whether an MI is foldable.
9693 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9694 return false;
9695 return true;
9696 }
9697 }
9698 return false;
9699}
9700
9701/// Attempt to use the vbroadcast instruction to generate a splat value
9702/// from a splat BUILD_VECTOR which uses:
9703/// a. A single scalar load, or a constant.
9704/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9705///
9706/// The VBROADCAST node is returned when a pattern is found,
9707/// or SDValue() otherwise.
9708static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9709 const X86Subtarget &Subtarget,
9710 SelectionDAG &DAG) {
9711 // VBROADCAST requires AVX.
9712 // TODO: Splats could be generated for non-AVX CPUs using SSE
9713 // instructions, but there's less potential gain for only 128-bit vectors.
9714 if (!Subtarget.hasAVX())
9715 return SDValue();
9716
9717 MVT VT = BVOp->getSimpleValueType(0);
9718 unsigned NumElts = VT.getVectorNumElements();
9719 SDLoc dl(BVOp);
9720
9721 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9722        "Unsupported vector type for broadcast.");
9723
9724 // See if the build vector is a repeating sequence of scalars (inc. splat).
9725 SDValue Ld;
9726 BitVector UndefElements;
9727 SmallVector<SDValue, 16> Sequence;
9728 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9729 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9730 if (Sequence.size() == 1)
9731 Ld = Sequence[0];
9732 }
9733
9734 // Attempt to use VBROADCASTM
9735 // From this pattern:
9736 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9737 // b. t1 = (build_vector t0 t0)
9738 //
9739 // Create (VBROADCASTM v2i1 X)
9740 if (!Sequence.empty() && Subtarget.hasCDI()) {
9741 // If not a splat, are the upper sequence values zeroable?
9742 unsigned SeqLen = Sequence.size();
9743 bool UpperZeroOrUndef =
9744 SeqLen == 1 ||
9745 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9746 return !V || V.isUndef() || isNullConstant(V);
9747 });
9748 SDValue Op0 = Sequence[0];
9749 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9750 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9751 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9752 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9753 ? Op0.getOperand(0)
9754 : Op0.getOperand(0).getOperand(0);
9755 MVT MaskVT = BOperand.getSimpleValueType();
9756 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9757 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9758 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9759 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9760 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9761 unsigned Scale = 512 / VT.getSizeInBits();
9762 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9763 }
9764 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9765 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9766 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9767 return DAG.getBitcast(VT, Bcst);
9768 }
9769 }
9770 }
9771
9772 unsigned NumUndefElts = UndefElements.count();
9773 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9774 APInt SplatValue, Undef;
9775 unsigned SplatBitSize;
9776 bool HasUndef;
9777 // Check if this is a repeated constant pattern suitable for broadcasting.
9778 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9779 SplatBitSize > VT.getScalarSizeInBits() &&
9780 SplatBitSize < VT.getSizeInBits()) {
9781 // Avoid replacing with broadcast when it's a use of a shuffle
9782 // instruction to preserve the present custom lowering of shuffles.
9783 if (isFoldableUseOfShuffle(BVOp))
9784 return SDValue();
9785 // replace BUILD_VECTOR with broadcast of the repeated constants.
9786 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9787 LLVMContext *Ctx = DAG.getContext();
9788 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9789 if (Subtarget.hasAVX()) {
9790 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9791 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9792 // Splatted value can fit in one INTEGER constant in constant pool.
9793 // Load the constant and broadcast it.
9794 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9795 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9796 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9797 SDValue CP = DAG.getConstantPool(C, PVT);
9798 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9799
9800 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9801 SDVTList Tys =
9802 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9803 SDValue Ops[] = {DAG.getEntryNode(), CP};
9804 MachinePointerInfo MPI =
9805 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9806 SDValue Brdcst = DAG.getMemIntrinsicNode(
9807 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9808 MachineMemOperand::MOLoad);
9809 return DAG.getBitcast(VT, Brdcst);
9810 }
9811 if (SplatBitSize > 64) {
9812 // Load the vector of constants and broadcast it.
9813 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9814 *Ctx);
9815 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9816 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9817 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9818 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9819 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9820 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9821 MachinePointerInfo MPI =
9822 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9823 return DAG.getMemIntrinsicNode(
9824 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9825 MachineMemOperand::MOLoad);
9826 }
9827 }
9828 }
9829
9830 // If we are moving a scalar into a vector (Ld must be set and all elements
9831 // but 1 are undef) and that operation is not obviously supported by
9832 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9833 // That's better than general shuffling and may eliminate a load to GPR and
9834 // move from scalar to vector register.
9835 if (!Ld || NumElts - NumUndefElts != 1)
9836 return SDValue();
9837 unsigned ScalarSize = Ld.getValueSizeInBits();
9838 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9839 return SDValue();
9840 }
9841
9842 bool ConstSplatVal =
9843 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9844 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9845
9846 // TODO: Handle broadcasts of non-constant sequences.
9847
9848 // Make sure that all of the users of a non-constant load are from the
9849 // BUILD_VECTOR node.
9850 // FIXME: Is the use count needed for non-constant, non-load case?
9851 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9852 return SDValue();
9853
9854 unsigned ScalarSize = Ld.getValueSizeInBits();
9855 bool IsGE256 = (VT.getSizeInBits() >= 256);
9856
9857 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9858 // instruction to save 8 or more bytes of constant pool data.
9859 // TODO: If multiple splats are generated to load the same constant,
9860 // it may be detrimental to overall size. There needs to be a way to detect
9861 // that condition to know if this is truly a size win.
9862 bool OptForSize = DAG.shouldOptForSize();
9863
9864 // Handle broadcasting a single constant scalar from the constant pool
9865 // into a vector.
9866 // On Sandybridge (no AVX2), it is still better to load a constant vector
9867 // from the constant pool and not to broadcast it from a scalar.
9868 // But override that restriction when optimizing for size.
9869 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9870 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9871 EVT CVT = Ld.getValueType();
9872 assert(!CVT.isVector() && "Must not broadcast a vector type");
9873
9874 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9875 // For size optimization, also splat v2f64 and v2i64, and for size opt
9876 // with AVX2, also splat i8 and i16.
9877 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9878 if (ScalarSize == 32 ||
9879 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9880 CVT == MVT::f16 ||
9881 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9882 const Constant *C = nullptr;
9883 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9884 C = CI->getConstantIntValue();
9885 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9886 C = CF->getConstantFPValue();
9887
9888 assert(C && "Invalid constant type");
9889
9890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9891 SDValue CP =
9892 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9893 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9894
9895 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9896 SDValue Ops[] = {DAG.getEntryNode(), CP};
9897 MachinePointerInfo MPI =
9898 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9899 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9900 MPI, Alignment, MachineMemOperand::MOLoad);
9901 }
9902 }
9903
9904 // Handle AVX2 in-register broadcasts.
9905 if (!IsLoad && Subtarget.hasInt256() &&
9906 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9907 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9908
9909 // The scalar source must be a normal load.
9910 if (!IsLoad)
9911 return SDValue();
9912
9913 // Make sure the non-chain result is only used by this build vector.
9914 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9915 return SDValue();
9916
9917 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9918 (Subtarget.hasVLX() && ScalarSize == 64)) {
9919 auto *LN = cast<LoadSDNode>(Ld);
9920 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9921 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9922 SDValue BCast =
9923 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9924 LN->getMemoryVT(), LN->getMemOperand());
9925 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9926 return BCast;
9927 }
9928
9929 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9930 // match double, since there is no vbroadcastsd xmm instruction.
9931 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9932 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9933 auto *LN = cast<LoadSDNode>(Ld);
9934 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9935 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9936 SDValue BCast =
9937 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9938 LN->getMemoryVT(), LN->getMemOperand());
9939 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9940 return BCast;
9941 }
9942
9943 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9944 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9945
9946 // Unsupported broadcast.
9947 return SDValue();
9948}
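
The broadcast lowering above is driven by getRepeatedSequence: a build_vector is broadcastable when its operands repeat with some period SeqLen that divides the element count. A minimal model of that check on plain integers (undef handling omitted for brevity; not the LLVM API):

#include <cassert>
#include <vector>

static bool isRepeatedSequence(const std::vector<int> &Elts, unsigned SeqLen) {
  if (SeqLen == 0 || Elts.size() % SeqLen != 0)
    return false;
  for (size_t i = SeqLen; i < Elts.size(); ++i)
    if (Elts[i] != Elts[i - SeqLen]) // every element must match the one
      return false;                  // SeqLen positions earlier
  return true;
}

int main() {
  assert(isRepeatedSequence({0, 1, 0, 1, 0, 1, 0, 1}, 2)); // <0,1,...> repeats
  assert(!isRepeatedSequence({0, 1, 2, 1}, 2));            // no period of 2
  return 0;
}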
9949
9950/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9951/// underlying vector and index.
9952///
9953/// Modifies \p ExtractedFromVec to the real vector and returns the real
9954/// index.
9955static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9956 SDValue ExtIdx) {
9957 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9958 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9959 return Idx;
9960
9961 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9962 // lowered this:
9963 // (extract_vector_elt (v8f32 %1), Constant<6>)
9964 // to:
9965 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9966 // (extract_subvector (v8f32 %0), Constant<4>),
9967 // undef)
9968 // Constant<0>)
9969 // In this case the vector is the extract_subvector expression and the index
9970 // is 2, as specified by the shuffle.
9971 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9972 SDValue ShuffleVec = SVOp->getOperand(0);
9973 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9974 assert(ShuffleVecVT.getVectorElementType() ==
9975        ExtractedFromVec.getSimpleValueType().getVectorElementType());
9976
9977 int ShuffleIdx = SVOp->getMaskElt(Idx);
9978 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9979 ExtractedFromVec = ShuffleVec;
9980 return ShuffleIdx;
9981 }
9982 return Idx;
9983}
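
The remapping above replaces "extract element Idx of a shuffle" with "extract element Mask[Idx] of the shuffle's first operand" whenever the mask entry stays within the first operand. A simplified sketch of the index step only (the real code also forwards undef mask entries and otherwise keeps the original index; hypothetical helper, not LLVM code):

#include <cassert>
#include <vector>

// Returns the index to extract from the shuffle's first operand, or the
// original index if the mask entry points outside it.
static int remapExtractIndex(const std::vector<int> &Mask, int Idx) {
  int M = Mask[Idx];
  if (M >= 0 && M < (int)Mask.size())
    return M; // read straight from the shuffle's first input
  return Idx;
}

int main() {
  // shuffle<2,u,u,u>: extracting lane 0 really reads lane 2 of the input.
  assert(remapExtractIndex({2, -1, -1, -1}, 0) == 2);
  return 0;
}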
9984
9985static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9986 MVT VT = Op.getSimpleValueType();
9987
9988 // Skip if insert_vec_elt is not supported.
9989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9990 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9991 return SDValue();
9992
9993 SDLoc DL(Op);
9994 unsigned NumElems = Op.getNumOperands();
9995
9996 SDValue VecIn1;
9997 SDValue VecIn2;
9998 SmallVector<unsigned, 4> InsertIndices;
9999 SmallVector<int, 8> Mask(NumElems, -1);
10000
10001 for (unsigned i = 0; i != NumElems; ++i) {
10002 unsigned Opc = Op.getOperand(i).getOpcode();
10003
10004 if (Opc == ISD::UNDEF)
10005 continue;
10006
10007 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10008 // Quit if more than 1 elements need inserting.
10009 if (InsertIndices.size() > 1)
10010 return SDValue();
10011
10012 InsertIndices.push_back(i);
10013 continue;
10014 }
10015
10016 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10017 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10018
10019 // Quit if non-constant index.
10020 if (!isa<ConstantSDNode>(ExtIdx))
10021 return SDValue();
10022 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10023
10024 // Quit if extracted from vector of different type.
10025 if (ExtractedFromVec.getValueType() != VT)
10026 return SDValue();
10027
10028 if (!VecIn1.getNode())
10029 VecIn1 = ExtractedFromVec;
10030 else if (VecIn1 != ExtractedFromVec) {
10031 if (!VecIn2.getNode())
10032 VecIn2 = ExtractedFromVec;
10033 else if (VecIn2 != ExtractedFromVec)
10034 // Quit if more than 2 vectors to shuffle
10035 return SDValue();
10036 }
10037
10038 if (ExtractedFromVec == VecIn1)
10039 Mask[i] = Idx;
10040 else if (ExtractedFromVec == VecIn2)
10041 Mask[i] = Idx + NumElems;
10042 }
10043
10044 if (!VecIn1.getNode())
10045 return SDValue();
10046
10047 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10048 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10049
10050 for (unsigned Idx : InsertIndices)
10051 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10052 DAG.getIntPtrConstant(Idx, DL));
10053
10054 return NV;
10055}
10056
10057// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
10058static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10059 const X86Subtarget &Subtarget) {
10060 MVT VT = Op.getSimpleValueType();
10061 MVT IVT = VT.changeVectorElementTypeToInteger();
10062 SmallVector<SDValue, 16> NewOps;
10063 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10064 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10065 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10066 return DAG.getBitcast(VT, Res);
10067}
10068
10069// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10070static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10071 const X86Subtarget &Subtarget) {
10072
10073 MVT VT = Op.getSimpleValueType();
10074 assert((VT.getVectorElementType() == MVT::i1) &&
10075        "Unexpected type in LowerBUILD_VECTORvXi1!");
10076
10077 SDLoc dl(Op);
10078 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10079 ISD::isBuildVectorAllOnes(Op.getNode()))
10080 return Op;
10081
10082 uint64_t Immediate = 0;
10083 SmallVector<unsigned, 16> NonConstIdx;
10084 bool IsSplat = true;
10085 bool HasConstElts = false;
10086 int SplatIdx = -1;
10087 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10088 SDValue In = Op.getOperand(idx);
10089 if (In.isUndef())
10090 continue;
10091 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10092 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10093 HasConstElts = true;
10094 } else {
10095 NonConstIdx.push_back(idx);
10096 }
10097 if (SplatIdx < 0)
10098 SplatIdx = idx;
10099 else if (In != Op.getOperand(SplatIdx))
10100 IsSplat = false;
10101 }
10102
10103 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10104 if (IsSplat) {
10105 // The build_vector allows the scalar element to be larger than the vector
10106 // element type. We need to mask it to use as a condition unless we know
10107 // the upper bits are zero.
10108 // FIXME: Use computeKnownBits instead of checking specific opcode?
10109 SDValue Cond = Op.getOperand(SplatIdx);
10110 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10111 if (Cond.getOpcode() != ISD::SETCC)
10112 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10113 DAG.getConstant(1, dl, MVT::i8));
10114
10115 // Perform the select in the scalar domain so we can use cmov.
10116 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10117 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10118 DAG.getAllOnesConstant(dl, MVT::i32),
10119 DAG.getConstant(0, dl, MVT::i32));
10120 Select = DAG.getBitcast(MVT::v32i1, Select);
10121 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10122 } else {
10123 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10124 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10125 DAG.getAllOnesConstant(dl, ImmVT),
10126 DAG.getConstant(0, dl, ImmVT));
10127 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10128 Select = DAG.getBitcast(VecVT, Select);
10129 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10130 DAG.getIntPtrConstant(0, dl));
10131 }
10132 }
10133
10134 // insert elements one by one
10135 SDValue DstVec;
10136 if (HasConstElts) {
10137 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10138 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10139 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10140 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10141 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10142 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10143 } else {
10144 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10145 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10146 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10147 DstVec = DAG.getBitcast(VecVT, Imm);
10148 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10149 DAG.getIntPtrConstant(0, dl));
10150 }
10151 } else
10152 DstVec = DAG.getUNDEF(VT);
10153
10154 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10155 unsigned InsertIdx = NonConstIdx[i];
10156 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10157 Op.getOperand(InsertIdx),
10158 DAG.getIntPtrConstant(InsertIdx, dl));
10159 }
10160 return DstVec;
10161}
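
The constant-element path above packs every known i1 element into an integer immediate, one bit per lane index, and only the remaining non-constant lanes are inserted individually. A standalone illustration of the packing step (hypothetical helper; -1 models an undef lane):

#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t packMaskImmediate(const std::vector<int> &Elts) {
  uint64_t Immediate = 0;
  for (unsigned Idx = 0; Idx < Elts.size(); ++Idx)
    if (Elts[Idx] >= 0)                        // skip undef lanes
      Immediate |= uint64_t(Elts[Idx] & 1) << Idx;
  return Immediate;
}

int main() {
  // <1, 0, undef, 1> packs to 0b1001.
  assert(packMaskImmediate({1, 0, -1, 1}) == 0x9);
  return 0;
}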
10162
10163 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10164 switch (Opcode) {
10165 case X86ISD::PACKSS:
10166 case X86ISD::PACKUS:
10167 case X86ISD::FHADD:
10168 case X86ISD::FHSUB:
10169 case X86ISD::HADD:
10170 case X86ISD::HSUB:
10171 return true;
10172 }
10173 return false;
10174}
10175
10176/// This is a helper function of LowerToHorizontalOp().
10177/// This function checks that the build_vector \p N in input implements a
10178/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10179/// may not match the layout of an x86 256-bit horizontal instruction.
10180/// In other words, if this returns true, then some extraction/insertion will
10181/// be required to produce a valid horizontal instruction.
10182///
10183/// Parameter \p Opcode defines the kind of horizontal operation to match.
10184/// For example, if \p Opcode is equal to ISD::ADD, then this function
10185/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10186/// is equal to ISD::SUB, then this function checks if this is a horizontal
10187/// arithmetic sub.
10188///
10189/// This function only analyzes elements of \p N whose indices are
10190/// in range [BaseIdx, LastIdx).
10191///
10192/// TODO: This function was originally used to match both real and fake partial
10193/// horizontal operations, but the index-matching logic is incorrect for that.
10194/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10195/// code because it is only used for partial h-op matching now?
10196static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10197 SelectionDAG &DAG,
10198 unsigned BaseIdx, unsigned LastIdx,
10199 SDValue &V0, SDValue &V1) {
10200 EVT VT = N->getValueType(0);
10201 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10202 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10203 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10204        "Invalid Vector in input!");
10205
10206 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10207 bool CanFold = true;
10208 unsigned ExpectedVExtractIdx = BaseIdx;
10209 unsigned NumElts = LastIdx - BaseIdx;
10210 V0 = DAG.getUNDEF(VT);
10211 V1 = DAG.getUNDEF(VT);
10212
10213 // Check if N implements a horizontal binop.
10214 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10215 SDValue Op = N->getOperand(i + BaseIdx);
10216
10217 // Skip UNDEFs.
10218 if (Op->isUndef()) {
10219 // Update the expected vector extract index.
10220 if (i * 2 == NumElts)
10221 ExpectedVExtractIdx = BaseIdx;
10222 ExpectedVExtractIdx += 2;
10223 continue;
10224 }
10225
10226 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10227
10228 if (!CanFold)
10229 break;
10230
10231 SDValue Op0 = Op.getOperand(0);
10232 SDValue Op1 = Op.getOperand(1);
10233
10234 // Try to match the following pattern:
10235 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10236 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10237 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10238 Op0.getOperand(0) == Op1.getOperand(0) &&
10239 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10240 isa<ConstantSDNode>(Op1.getOperand(1)));
10241 if (!CanFold)
10242 break;
10243
10244 unsigned I0 = Op0.getConstantOperandVal(1);
10245 unsigned I1 = Op1.getConstantOperandVal(1);
10246
10247 if (i * 2 < NumElts) {
10248 if (V0.isUndef()) {
10249 V0 = Op0.getOperand(0);
10250 if (V0.getValueType() != VT)
10251 return false;
10252 }
10253 } else {
10254 if (V1.isUndef()) {
10255 V1 = Op0.getOperand(0);
10256 if (V1.getValueType() != VT)
10257 return false;
10258 }
10259 if (i * 2 == NumElts)
10260 ExpectedVExtractIdx = BaseIdx;
10261 }
10262
10263 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10264 if (I0 == ExpectedVExtractIdx)
10265 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10266 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10267 // Try to match the following dag sequence:
10268 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10269 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10270 } else
10271 CanFold = false;
10272
10273 ExpectedVExtractIdx += 2;
10274 }
10275
10276 return CanFold;
10277}
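
The core pattern tested in the loop above is that result element i is built from source lanes (2*i, 2*i + 1), with commutable opcodes allowed to present the pair swapped. A minimal model of that per-element check (hypothetical helper, indices only):

#include <cassert>

// I0/I1 are the two extract indices feeding one binop; Expected is the
// extract index the next horizontal pair must start at.
static bool matchesHorizontalPair(unsigned I0, unsigned I1, unsigned Expected,
                                  bool IsCommutable) {
  if (I0 == Expected)
    return I1 == I0 + 1;           // (A[i], A[i+1])
  if (IsCommutable && I1 == Expected)
    return I0 == I1 + 1;           // (A[i+1], A[i]) for ADD/FADD
  return false;
}

int main() {
  assert(matchesHorizontalPair(0, 1, 0, /*IsCommutable=*/false));
  assert(matchesHorizontalPair(3, 2, 2, /*IsCommutable=*/true));
  assert(!matchesHorizontalPair(1, 0, 0, /*IsCommutable=*/false));
  return 0;
}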
10278
10279/// Emit a sequence of two 128-bit horizontal add/sub followed by
10280/// a concat_vector.
10281///
10282/// This is a helper function of LowerToHorizontalOp().
10283/// This function expects two 256-bit vectors called V0 and V1.
10284/// At first, each vector is split into two separate 128-bit vectors.
10285/// Then, the resulting 128-bit vectors are used to implement two
10286/// horizontal binary operations.
10287///
10288/// The kind of horizontal binary operation is defined by \p X86Opcode.
10289///
10290 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed to
10291 /// the two new horizontal binops.
10292/// When Mode is set, the first horizontal binop dag node would take as input
10293/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10294/// horizontal binop dag node would take as input the lower 128-bit of V1
10295/// and the upper 128-bit of V1.
10296/// Example:
10297/// HADD V0_LO, V0_HI
10298/// HADD V1_LO, V1_HI
10299///
10300/// Otherwise, the first horizontal binop dag node takes as input the lower
10301/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10302/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10303/// Example:
10304/// HADD V0_LO, V1_LO
10305/// HADD V0_HI, V1_HI
10306///
10307/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10308/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10309/// the upper 128-bits of the result.
10310static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10311 const SDLoc &DL, SelectionDAG &DAG,
10312 unsigned X86Opcode, bool Mode,
10313 bool isUndefLO, bool isUndefHI) {
10314 MVT VT = V0.getSimpleValueType();
10315 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10316        "Invalid nodes in input!");
10317
10318 unsigned NumElts = VT.getVectorNumElements();
10319 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10320 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10321 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10322 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10323 MVT NewVT = V0_LO.getSimpleValueType();
10324
10325 SDValue LO = DAG.getUNDEF(NewVT);
10326 SDValue HI = DAG.getUNDEF(NewVT);
10327
10328 if (Mode) {
10329 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10330 if (!isUndefLO && !V0->isUndef())
10331 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10332 if (!isUndefHI && !V1->isUndef())
10333 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10334 } else {
10335 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10336 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10337 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10338
10339 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10340 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10341 }
10342
10343 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10344}
10345
10346/// Returns true iff \p BV builds a vector with the result equivalent to
10347/// the result of ADDSUB/SUBADD operation.
10348/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10349/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10350/// \p Opnd0 and \p Opnd1.
10351static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10352 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10353 SDValue &Opnd0, SDValue &Opnd1,
10354 unsigned &NumExtracts,
10355 bool &IsSubAdd) {
10356
10357 MVT VT = BV->getSimpleValueType(0);
10358 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10359 return false;
10360
10361 unsigned NumElts = VT.getVectorNumElements();
10362 SDValue InVec0 = DAG.getUNDEF(VT);
10363 SDValue InVec1 = DAG.getUNDEF(VT);
10364
10365 NumExtracts = 0;
10366
10367 // Odd-numbered elements in the input build vector are obtained from
10368 // adding/subtracting two integer/float elements.
10369 // Even-numbered elements in the input build vector are obtained from
10370 // subtracting/adding two integer/float elements.
10371 unsigned Opc[2] = {0, 0};
10372 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10373 SDValue Op = BV->getOperand(i);
10374
10375 // Skip 'undef' values.
10376 unsigned Opcode = Op.getOpcode();
10377 if (Opcode == ISD::UNDEF)
10378 continue;
10379
10380 // Early exit if we found an unexpected opcode.
10381 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10382 return false;
10383
10384 SDValue Op0 = Op.getOperand(0);
10385 SDValue Op1 = Op.getOperand(1);
10386
10387 // Try to match the following pattern:
10388 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10389 // Early exit if we cannot match that sequence.
10390 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10391 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10392 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10393 Op0.getOperand(1) != Op1.getOperand(1))
10394 return false;
10395
10396 unsigned I0 = Op0.getConstantOperandVal(1);
10397 if (I0 != i)
10398 return false;
10399
10400 // We found a valid add/sub node; make sure it's the same opcode as previous
10401 // elements of this parity.
10402 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10403 return false;
10404 Opc[i % 2] = Opcode;
10405
10406 // Update InVec0 and InVec1.
10407 if (InVec0.isUndef()) {
10408 InVec0 = Op0.getOperand(0);
10409 if (InVec0.getSimpleValueType() != VT)
10410 return false;
10411 }
10412 if (InVec1.isUndef()) {
10413 InVec1 = Op1.getOperand(0);
10414 if (InVec1.getSimpleValueType() != VT)
10415 return false;
10416 }
10417
10418 // Make sure that the operands of each add/sub node always
10419 // come from the same pair of vectors.
10420 if (InVec0 != Op0.getOperand(0)) {
10421 if (Opcode == ISD::FSUB)
10422 return false;
10423
10424 // FADD is commutable. Try to commute the operands
10425 // and then test again.
10426 std::swap(Op0, Op1);
10427 if (InVec0 != Op0.getOperand(0))
10428 return false;
10429 }
10430
10431 if (InVec1 != Op1.getOperand(0))
10432 return false;
10433
10434 // Increment the number of extractions done.
10435 ++NumExtracts;
10436 }
10437
10438 // Ensure we have found an opcode for both parities and that they are
10439 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10440 // inputs are undef.
10441 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10442 InVec0.isUndef() || InVec1.isUndef())
10443 return false;
10444
10445 IsSubAdd = Opc[0] == ISD::FADD;
10446
10447 Opnd0 = InVec0;
10448 Opnd1 = InVec1;
10449 return true;
10450}
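
To make the parity bookkeeping above concrete, here is a minimal standalone sketch (plain C++, independent of the SelectionDAG API; the names LaneOp and matchAddSubParity are invented for illustration): even lanes must all share one of FADD/FSUB, odd lanes the other, and an FADD in the even lanes means the pattern is SUBADD rather than ADDSUB.

#include <array>
#include <cstdio>
#include <optional>
#include <vector>

enum class LaneOp { Undef, FAdd, FSub };

// Returns true and sets IsSubAdd if the per-lane opcodes match the
// ADDSUB/SUBADD shape: all even lanes share one opcode, all odd lanes the
// other, and undef lanes are skipped (mirroring the matcher above).
static bool matchAddSubParity(const std::vector<LaneOp> &Lanes, bool &IsSubAdd) {
  std::array<std::optional<LaneOp>, 2> Opc; // one slot per parity
  for (size_t i = 0; i != Lanes.size(); ++i) {
    if (Lanes[i] == LaneOp::Undef)
      continue;
    if (Opc[i % 2] && *Opc[i % 2] != Lanes[i])
      return false; // opcode mismatch within this parity
    Opc[i % 2] = Lanes[i];
  }
  if (!Opc[0] || !Opc[1] || *Opc[0] == *Opc[1])
    return false; // need both parities, and they must differ
  IsSubAdd = (*Opc[0] == LaneOp::FAdd); // FADD in even lanes means SUBADD
  return true;
}

int main() {
  bool IsSubAdd = false;
  std::vector<LaneOp> V = {LaneOp::FSub, LaneOp::FAdd, LaneOp::FSub, LaneOp::FAdd};
  std::printf("matched=%d IsSubAdd=%d\n", matchAddSubParity(V, IsSubAdd), IsSubAdd);
  return 0;
}
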
10451
10452/// Returns true if it is possible to fold MUL and an idiom that has already been
10453/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10454/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10455/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10456///
10457/// Prior to calling this function it should be known that there is some
10458/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10459/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10460/// before replacement of such SDNode with ADDSUB operation. Thus the number
10461/// of \p Opnd0 uses is expected to be equal to 2.
10462/// For example, this function may be called for the following IR:
10463/// %AB = fmul fast <2 x double> %A, %B
10464/// %Sub = fsub fast <2 x double> %AB, %C
10465/// %Add = fadd fast <2 x double> %AB, %C
10466/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10467/// <2 x i32> <i32 0, i32 3>
10468/// There is a def for %Addsub here, which potentially can be replaced by
10469/// X86ISD::ADDSUB operation:
10470/// %Addsub = X86ISD::ADDSUB %AB, %C
10471/// and such ADDSUB can further be replaced with FMADDSUB:
10472/// %Addsub = FMADDSUB %A, %B, %C.
10473///
10474/// The main reason why this method is called before the replacement of the
10475/// recognized ADDSUB idiom with an ADDSUB operation is that such a replacement
10476/// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
10477/// FMADDSUB is.
10478static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10479 SelectionDAG &DAG,
10480 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10481 unsigned ExpectedUses) {
10482 if (Opnd0.getOpcode() != ISD::FMUL ||
10483 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10484 return false;
10485
10486 // FIXME: These checks must match the similar ones in
10487 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10488 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10489 // or MUL + ADDSUB to FMADDSUB.
10490 const TargetOptions &Options = DAG.getTarget().Options;
10491 bool AllowFusion =
10492 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10493 if (!AllowFusion)
10494 return false;
10495
10496 Opnd2 = Opnd1;
10497 Opnd1 = Opnd0.getOperand(1);
10498 Opnd0 = Opnd0.getOperand(0);
10499
10500 return true;
10501}
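
A hedged sketch of the operand rewrite this helper performs once the multiply feeding the idiom is accepted: ADDSUB(FMUL(A, B), C) is re-expressed as FMADDSUB(A, B, C). The toy Expr type and toFMAddSubOperands below are invented for illustration and stand in for SDValue operands.

#include <cassert>
#include <memory>
#include <string>

// Toy expression node standing in for an SDValue; illustration only.
struct Expr {
  std::string Op; // "fmul" or "leaf"
  std::shared_ptr<Expr> L, R;
};

// Mirrors the rewrite above: given Opnd0 == fmul(A, B) and Opnd1 == C,
// rewrite the triple to (A, B, C) so the caller can emit FMADDSUB/FMSUBADD.
static bool toFMAddSubOperands(std::shared_ptr<Expr> &Opnd0,
                               std::shared_ptr<Expr> &Opnd1,
                               std::shared_ptr<Expr> &Opnd2) {
  if (!Opnd0 || Opnd0->Op != "fmul")
    return false;
  Opnd2 = Opnd1;    // C becomes the addend
  Opnd1 = Opnd0->R; // B
  Opnd0 = Opnd0->L; // A
  return true;
}

int main() {
  auto A = std::make_shared<Expr>(Expr{"leaf", nullptr, nullptr});
  auto B = std::make_shared<Expr>(Expr{"leaf", nullptr, nullptr});
  auto C = std::make_shared<Expr>(Expr{"leaf", nullptr, nullptr});
  std::shared_ptr<Expr> O0 = std::make_shared<Expr>(Expr{"fmul", A, B});
  std::shared_ptr<Expr> O1 = C, O2;
  bool OK = toFMAddSubOperands(O0, O1, O2);
  assert(OK && O0 == A && O1 == B && O2 == C);
  (void)OK;
  return 0;
}
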
10502
10503/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10504/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10505/// X86ISD::FMSUBADD node.
10506static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10507 const X86Subtarget &Subtarget,
10508 SelectionDAG &DAG) {
10509 SDValue Opnd0, Opnd1;
10510 unsigned NumExtracts;
10511 bool IsSubAdd;
10512 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10513 IsSubAdd))
10514 return SDValue();
10515
10516 MVT VT = BV->getSimpleValueType(0);
10517 SDLoc DL(BV);
10518
10519 // Try to generate X86ISD::FMADDSUB node here.
10520 SDValue Opnd2;
10521 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10522 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10523 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10524 }
10525
10526 // We only support ADDSUB.
10527 if (IsSubAdd)
10528 return SDValue();
10529
10530 // There are no known X86 targets with 512-bit ADDSUB instructions!
10531 // Convert to blend(fsub,fadd).
10532 if (VT.is512BitVector()) {
10533 SmallVector<int> Mask;
10534 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10535 Mask.push_back(I);
10536 Mask.push_back(I + E + 1);
10537 }
10538 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10539 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10540 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10541 }
10542
10543 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10544}
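
As a worked example of the 512-bit blend(fsub,fadd) fallback above (a standalone sketch, not LLVM code; addSubBlendMask is an invented name): for v8f64 the shuffle mask takes even lanes from the FSUB result and odd lanes from the FADD result.

#include <cstdio>
#include <vector>

// Reproduces the mask construction above: even output lanes come from the
// first input (the FSUB node), odd output lanes from the second (the FADD).
static std::vector<int> addSubBlendMask(int NumElts) {
  std::vector<int> Mask;
  for (int I = 0; I != NumElts; I += 2) {
    Mask.push_back(I);               // lane I of the FSUB result
    Mask.push_back(I + NumElts + 1); // lane I+1 of the FADD result
  }
  return Mask;
}

int main() {
  // For v8f64 this prints: 0 9 2 11 4 13 6 15
  for (int M : addSubBlendMask(8))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
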
10545
10546static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10547 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10548 // Initialize outputs to known values.
10549 MVT VT = BV->getSimpleValueType(0);
10550 HOpcode = ISD::DELETED_NODE;
10551 V0 = DAG.getUNDEF(VT);
10552 V1 = DAG.getUNDEF(VT);
10553
10554 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10555 // half of the result is calculated independently from the 128-bit halves of
10556 // the inputs, so that makes the index-checking logic below more complicated.
10557 unsigned NumElts = VT.getVectorNumElements();
10558 unsigned GenericOpcode = ISD::DELETED_NODE;
10559 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10560 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10561 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10562 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10563 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10564 // Ignore undef elements.
10565 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10566 if (Op.isUndef())
10567 continue;
10568
10569 // If there's an opcode mismatch, we're done.
10570 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10571 return false;
10572
10573 // Initialize horizontal opcode.
10574 if (HOpcode == ISD::DELETED_NODE) {
10575 GenericOpcode = Op.getOpcode();
10576 switch (GenericOpcode) {
10577 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10578 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10579 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10580 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10581 default: return false;
10582 }
10583 }
10584
10585 SDValue Op0 = Op.getOperand(0);
10586 SDValue Op1 = Op.getOperand(1);
10587 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10588 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10589 Op0.getOperand(0) != Op1.getOperand(0) ||
10590 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10591 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10592 return false;
10593
10594 // The source vector is chosen based on which 64-bit half of the
10595 // destination vector is being calculated.
10596 if (j < NumEltsIn64Bits) {
10597 if (V0.isUndef())
10598 V0 = Op0.getOperand(0);
10599 } else {
10600 if (V1.isUndef())
10601 V1 = Op0.getOperand(0);
10602 }
10603
10604 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10605 if (SourceVec != Op0.getOperand(0))
10606 return false;
10607
10608 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10609 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10610 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10611 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10612 (j % NumEltsIn64Bits) * 2;
10613 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10614 continue;
10615
10616 // If this is not a commutative op, this does not match.
10617 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10618 return false;
10619
10620 // Addition is commutative, so try swapping the extract indexes.
10621 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10622 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10623 continue;
10624
10625 // Extract indexes do not match horizontal requirement.
10626 return false;
10627 }
10628 }
10629 // We matched. Opcode and operands are returned by reference as arguments.
10630 return true;
10631}
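
The ExpectedIndex check above is easiest to see with numbers. The following standalone sketch (invented helper, not part of the file) prints, for a 256-bit v8f32, which pair of adjacent extract indices each output element must use and whether it reads from V0 or V1.

#include <cstdio>

// Prints, for every output element (i, j), the pair of extract indices the
// matcher above requires: ExpectedIndex and ExpectedIndex + 1.
int main() {
  const unsigned NumElts = 8;            // e.g. v8f32
  const unsigned Num128BitChunks = 2;    // 256-bit vector
  const unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; // 4
  const unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;       // 2
  for (unsigned i = 0; i != Num128BitChunks; ++i)
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      const char *Src = (j < NumEltsIn64Bits) ? "V0" : "V1";
      std::printf("out[%u] = %s[%u] op %s[%u]\n",
                  i * NumEltsIn128Bits + j, Src, ExpectedIndex, Src,
                  ExpectedIndex + 1);
    }
  return 0;
}
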
10632
10633static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10634 SelectionDAG &DAG, unsigned HOpcode,
10635 SDValue V0, SDValue V1) {
10636 // If either input vector is not the same size as the build vector,
10637 // extract/insert the low bits to the correct size.
10638 // This is free (examples: zmm --> xmm, xmm --> ymm).
10639 MVT VT = BV->getSimpleValueType(0);
10640 unsigned Width = VT.getSizeInBits();
10641 if (V0.getValueSizeInBits() > Width)
10642 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10643 else if (V0.getValueSizeInBits() < Width)
10644 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10645
10646 if (V1.getValueSizeInBits() > Width)
10647 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10648 else if (V1.getValueSizeInBits() < Width)
10649 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10650
10651 unsigned NumElts = VT.getVectorNumElements();
10652 APInt DemandedElts = APInt::getAllOnes(NumElts);
10653 for (unsigned i = 0; i != NumElts; ++i)
10654 if (BV->getOperand(i).isUndef())
10655 DemandedElts.clearBit(i);
10656
10657 // If we don't need the upper xmm, then perform as a xmm hop.
10658 unsigned HalfNumElts = NumElts / 2;
10659 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10660 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10661 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10662 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10663 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10664 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10665 }
10666
10667 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10668}
10669
10670/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10671static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10672 const X86Subtarget &Subtarget,
10673 SelectionDAG &DAG) {
10674 // We need at least 2 non-undef elements to make this worthwhile by default.
10675 unsigned NumNonUndefs =
10676 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10677 if (NumNonUndefs < 2)
10678 return SDValue();
10679
10680 // There are 4 sets of horizontal math operations distinguished by type:
10681 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10682 // subtarget feature. Try to match those "native" patterns first.
10683 MVT VT = BV->getSimpleValueType(0);
10684 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10685 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10686 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10687 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10688 unsigned HOpcode;
10689 SDValue V0, V1;
10690 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10691 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10692 }
10693
10694 // Try harder to match 256-bit ops by using extract/concat.
10695 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10696 return SDValue();
10697
10698  // Count the number of UNDEF operands in the input build_vector.
10699 unsigned NumElts = VT.getVectorNumElements();
10700 unsigned Half = NumElts / 2;
10701 unsigned NumUndefsLO = 0;
10702 unsigned NumUndefsHI = 0;
10703 for (unsigned i = 0, e = Half; i != e; ++i)
10704 if (BV->getOperand(i)->isUndef())
10705 NumUndefsLO++;
10706
10707 for (unsigned i = Half, e = NumElts; i != e; ++i)
10708 if (BV->getOperand(i)->isUndef())
10709 NumUndefsHI++;
10710
10711 SDLoc DL(BV);
10712 SDValue InVec0, InVec1;
10713 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10714 SDValue InVec2, InVec3;
10715 unsigned X86Opcode;
10716 bool CanFold = true;
10717
10718 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10719 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10720 InVec3) &&
10721 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10722 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10723 X86Opcode = X86ISD::HADD;
10724 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10725 InVec1) &&
10726 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10727 InVec3) &&
10728 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10729 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10730 X86Opcode = X86ISD::HSUB;
10731 else
10732 CanFold = false;
10733
10734 if (CanFold) {
10735 // Do not try to expand this build_vector into a pair of horizontal
10736 // add/sub if we can emit a pair of scalar add/sub.
10737 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10738 return SDValue();
10739
10740 // Convert this build_vector into a pair of horizontal binops followed by
10741 // a concat vector. We must adjust the outputs from the partial horizontal
10742 // matching calls above to account for undefined vector halves.
10743 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10744 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10745      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10746 bool isUndefLO = NumUndefsLO == Half;
10747 bool isUndefHI = NumUndefsHI == Half;
10748 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10749 isUndefHI);
10750 }
10751 }
10752
10753 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10754 VT == MVT::v16i16) {
10755 unsigned X86Opcode;
10756 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10757 X86Opcode = X86ISD::HADD;
10758 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10759 InVec1))
10760 X86Opcode = X86ISD::HSUB;
10761 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10762 InVec1))
10763 X86Opcode = X86ISD::FHADD;
10764 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10765 InVec1))
10766 X86Opcode = X86ISD::FHSUB;
10767 else
10768 return SDValue();
10769
10770 // Don't try to expand this build_vector into a pair of horizontal add/sub
10771 // if we can simply emit a pair of scalar add/sub.
10772 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10773 return SDValue();
10774
10775 // Convert this build_vector into two horizontal add/sub followed by
10776 // a concat vector.
10777 bool isUndefLO = NumUndefsLO == Half;
10778 bool isUndefHI = NumUndefsHI == Half;
10779 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10780 isUndefLO, isUndefHI);
10781 }
10782
10783 return SDValue();
10784}
10785
10786static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10787 SelectionDAG &DAG);
10788
10789/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10790/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10791/// just apply the bit operation to the vectors.
10792/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10793/// from this, but enough scalar bit operations are created from the later
10794/// legalization + scalarization stages to need basic support.
10795static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10796 const X86Subtarget &Subtarget,
10797 SelectionDAG &DAG) {
10798 SDLoc DL(Op);
10799 MVT VT = Op->getSimpleValueType(0);
10800 unsigned NumElems = VT.getVectorNumElements();
10801 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10802
10803 // Check that all elements have the same opcode.
10804 // TODO: Should we allow UNDEFS and if so how many?
10805 unsigned Opcode = Op->getOperand(0).getOpcode();
10806 for (unsigned i = 1; i < NumElems; ++i)
10807 if (Opcode != Op->getOperand(i).getOpcode())
10808 return SDValue();
10809
10810 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10811 bool IsShift = false;
10812 switch (Opcode) {
10813 default:
10814 return SDValue();
10815 case ISD::SHL:
10816 case ISD::SRL:
10817 case ISD::SRA:
10818 IsShift = true;
10819 break;
10820 case ISD::AND:
10821 case ISD::XOR:
10822 case ISD::OR:
10823 // Don't do this if the buildvector is a splat - we'd replace one
10824 // constant with an entire vector.
10825 if (Op->getSplatValue())
10826 return SDValue();
10827 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10828 return SDValue();
10829 break;
10830 }
10831
10832 SmallVector<SDValue, 4> LHSElts, RHSElts;
10833 for (SDValue Elt : Op->ops()) {
10834 SDValue LHS = Elt.getOperand(0);
10835 SDValue RHS = Elt.getOperand(1);
10836
10837 // We expect the canonicalized RHS operand to be the constant.
10838 if (!isa<ConstantSDNode>(RHS))
10839 return SDValue();
10840
10841 // Extend shift amounts.
10842 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10843 if (!IsShift)
10844 return SDValue();
10845 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10846 }
10847
10848 LHSElts.push_back(LHS);
10849 RHSElts.push_back(RHS);
10850 }
10851
10852 // Limit to shifts by uniform immediates.
10853 // TODO: Only accept vXi8/vXi64 special cases?
10854 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10855 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10856 return SDValue();
10857
10858 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10859 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10860 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10861
10862 if (!IsShift)
10863 return Res;
10864
10865 // Immediately lower the shift to ensure the constant build vector doesn't
10866 // get converted to a constant pool before the shift is lowered.
10867 return LowerShift(Res, Subtarget, DAG);
10868}
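
A minimal scalar model of the rewrite above, assuming the bit operation is AND (buildVectorAnd is an invented name; std::vector stands in for the build_vector operands): the per-lane (x_i & c_i) values are recomputed as a single vector AND of two build_vectors.

#include <cstdio>
#include <vector>

// Element-wise model of lowering build_vector(x0&c0, x1&c1, ...) into
// (build_vector x0..) & (build_vector c0..): one vector AND replaces N
// scalar ANDs.
static std::vector<unsigned> buildVectorAnd(const std::vector<unsigned> &LHS,
                                            const std::vector<unsigned> &RHS) {
  std::vector<unsigned> Res(LHS.size());
  for (size_t i = 0; i != LHS.size(); ++i)
    Res[i] = LHS[i] & RHS[i];
  return Res;
}

int main() {
  std::vector<unsigned> X = {0xF0, 0x0F, 0xFF, 0x00};
  std::vector<unsigned> C = {0x10, 0x0C, 0x81, 0x7F};
  for (unsigned V : buildVectorAnd(X, C))
    std::printf("0x%02X ", V); // prints: 0x10 0x0C 0x81 0x00
  std::printf("\n");
  return 0;
}
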
10869
10870/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10871/// functionality to do this, so it's all zeros, all ones, or some derivation
10872/// that is cheap to calculate.
10873static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10874 const X86Subtarget &Subtarget) {
10875 SDLoc DL(Op);
10876 MVT VT = Op.getSimpleValueType();
10877
10878 // Vectors containing all zeros can be matched by pxor and xorps.
10879 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10880 return Op;
10881
10882 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10883 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10884 // vpcmpeqd on 256-bit vectors.
10885 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10886 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10887 return Op;
10888
10889 return getOnesVector(VT, DAG, DL);
10890 }
10891
10892 return SDValue();
10893}
10894
10895/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10896/// from a vector of source values and a vector of extraction indices.
10897/// The vectors might be manipulated to match the type of the permute op.
10898static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10899 SDLoc &DL, SelectionDAG &DAG,
10900 const X86Subtarget &Subtarget) {
10901 MVT ShuffleVT = VT;
10902 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10903 unsigned NumElts = VT.getVectorNumElements();
10904 unsigned SizeInBits = VT.getSizeInBits();
10905
10906 // Adjust IndicesVec to match VT size.
10907  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10908         "Illegal variable permute mask size");
10909 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10910 // Narrow/widen the indices vector to the correct size.
10911 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10912 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10913 NumElts * VT.getScalarSizeInBits());
10914 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10915 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10916 SDLoc(IndicesVec), SizeInBits);
10917 // Zero-extend the index elements within the vector.
10918 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10919 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10920 IndicesVT, IndicesVec);
10921 }
10922 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10923
10924  // Handle a SrcVec whose size doesn't match the VT size.
10925 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10926 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10927 // Handle larger SrcVec by treating it as a larger permute.
10928 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10929 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10930 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10931 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10932 Subtarget, DAG, SDLoc(IndicesVec));
10933 SDValue NewSrcVec =
10934 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10935 if (NewSrcVec)
10936 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10937 return SDValue();
10938 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10939 // Widen smaller SrcVec to match VT.
10940 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10941 } else
10942 return SDValue();
10943 }
10944
10945 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10946    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10947 EVT SrcVT = Idx.getValueType();
10948 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10949 uint64_t IndexScale = 0;
10950 uint64_t IndexOffset = 0;
10951
10952 // If we're scaling a smaller permute op, then we need to repeat the
10953 // indices, scaling and offsetting them as well.
10954 // e.g. v4i32 -> v16i8 (Scale = 4)
10955 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10956 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10957 for (uint64_t i = 0; i != Scale; ++i) {
10958 IndexScale |= Scale << (i * NumDstBits);
10959 IndexOffset |= i << (i * NumDstBits);
10960 }
10961
10962 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10963 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10964 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10965 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10966 return Idx;
10967 };
10968
10969 unsigned Opcode = 0;
10970 switch (VT.SimpleTy) {
10971 default:
10972 break;
10973 case MVT::v16i8:
10974 if (Subtarget.hasSSSE3())
10975 Opcode = X86ISD::PSHUFB;
10976 break;
10977 case MVT::v8i16:
10978 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10979 Opcode = X86ISD::VPERMV;
10980 else if (Subtarget.hasSSSE3()) {
10981 Opcode = X86ISD::PSHUFB;
10982 ShuffleVT = MVT::v16i8;
10983 }
10984 break;
10985 case MVT::v4f32:
10986 case MVT::v4i32:
10987 if (Subtarget.hasAVX()) {
10988 Opcode = X86ISD::VPERMILPV;
10989 ShuffleVT = MVT::v4f32;
10990 } else if (Subtarget.hasSSSE3()) {
10991 Opcode = X86ISD::PSHUFB;
10992 ShuffleVT = MVT::v16i8;
10993 }
10994 break;
10995 case MVT::v2f64:
10996 case MVT::v2i64:
10997 if (Subtarget.hasAVX()) {
10998 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10999 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11000 Opcode = X86ISD::VPERMILPV;
11001 ShuffleVT = MVT::v2f64;
11002 } else if (Subtarget.hasSSE41()) {
11003 // SSE41 can compare v2i64 - select between indices 0 and 1.
11004 return DAG.getSelectCC(
11005 DL, IndicesVec,
11006 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11007 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11008 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11009 ISD::CondCode::SETEQ);
11010 }
11011 break;
11012 case MVT::v32i8:
11013 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11014 Opcode = X86ISD::VPERMV;
11015 else if (Subtarget.hasXOP()) {
11016 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11017 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11018 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11019 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11020 return DAG.getNode(
11021 ISD::CONCAT_VECTORS, DL, VT,
11022 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11023 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11024 } else if (Subtarget.hasAVX()) {
11025 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11026 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11027 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11028 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11029 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11030 ArrayRef<SDValue> Ops) {
11031 // Permute Lo and Hi and then select based on index range.
11032        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
11033        // care about bit[7] as it's just an index vector.
11034 SDValue Idx = Ops[2];
11035 EVT VT = Idx.getValueType();
11036 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11037 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11038 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11039 ISD::CondCode::SETGT);
11040 };
11041 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11042 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11043 PSHUFBBuilder);
11044 }
11045 break;
11046 case MVT::v16i16:
11047 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11048 Opcode = X86ISD::VPERMV;
11049 else if (Subtarget.hasAVX()) {
11050 // Scale to v32i8 and perform as v32i8.
11051 IndicesVec = ScaleIndices(IndicesVec, 2);
11052 return DAG.getBitcast(
11053 VT, createVariablePermute(
11054 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11055 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11056 }
11057 break;
11058 case MVT::v8f32:
11059 case MVT::v8i32:
11060 if (Subtarget.hasAVX2())
11061 Opcode = X86ISD::VPERMV;
11062 else if (Subtarget.hasAVX()) {
11063 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11064 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11065 {0, 1, 2, 3, 0, 1, 2, 3});
11066 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11067 {4, 5, 6, 7, 4, 5, 6, 7});
11068 if (Subtarget.hasXOP())
11069 return DAG.getBitcast(
11070 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11071 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11072 // Permute Lo and Hi and then select based on index range.
11073 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11074 SDValue Res = DAG.getSelectCC(
11075 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11076 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11077 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11078 ISD::CondCode::SETGT);
11079 return DAG.getBitcast(VT, Res);
11080 }
11081 break;
11082 case MVT::v4i64:
11083 case MVT::v4f64:
11084 if (Subtarget.hasAVX512()) {
11085 if (!Subtarget.hasVLX()) {
11086 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11087 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11088 SDLoc(SrcVec));
11089 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11090 DAG, SDLoc(IndicesVec));
11091 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11092 DAG, Subtarget);
11093 return extract256BitVector(Res, 0, DAG, DL);
11094 }
11095 Opcode = X86ISD::VPERMV;
11096 } else if (Subtarget.hasAVX()) {
11097 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11098 SDValue LoLo =
11099 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11100 SDValue HiHi =
11101 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11102 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11103 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11104 if (Subtarget.hasXOP())
11105 return DAG.getBitcast(
11106 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11107 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11108 // Permute Lo and Hi and then select based on index range.
11109 // This works as VPERMILPD only uses index bit[1] to permute elements.
11110 SDValue Res = DAG.getSelectCC(
11111 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11112 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11113 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11114 ISD::CondCode::SETGT);
11115 return DAG.getBitcast(VT, Res);
11116 }
11117 break;
11118 case MVT::v64i8:
11119 if (Subtarget.hasVBMI())
11120 Opcode = X86ISD::VPERMV;
11121 break;
11122 case MVT::v32i16:
11123 if (Subtarget.hasBWI())
11124 Opcode = X86ISD::VPERMV;
11125 break;
11126 case MVT::v16f32:
11127 case MVT::v16i32:
11128 case MVT::v8f64:
11129 case MVT::v8i64:
11130 if (Subtarget.hasAVX512())
11131 Opcode = X86ISD::VPERMV;
11132 break;
11133 }
11134 if (!Opcode)
11135 return SDValue();
11136
11137  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11138         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11139         "Illegal variable permute shuffle type");
11140
11141 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11142 if (Scale > 1)
11143 IndicesVec = ScaleIndices(IndicesVec, Scale);
11144
11145 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11146 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11147
11148 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11149 SDValue Res = Opcode == X86ISD::VPERMV
11150 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11151 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11152 return DAG.getBitcast(VT, Res);
11153}
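
A worked standalone example of the IndexScale/IndexOffset arithmetic in the ScaleIndices lambda above, for the v4i32 -> v16i8 case (Scale = 4, 8-bit sub-lanes); the program below is illustration only and just reproduces the splat constants and the per-lane multiply/add.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Scale = 4;       // v4i32 permute performed as v16i8
  const unsigned NumDstBits = 8;  // each 32-bit index becomes four byte indices
  uint64_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= Scale << (i * NumDstBits);  // 0x04040404
    IndexOffset |= i << (i * NumDstBits);     // 0x03020100
  }
  std::printf("IndexScale  = 0x%08llx\n", (unsigned long long)IndexScale);
  std::printf("IndexOffset = 0x%08llx\n", (unsigned long long)IndexOffset);

  // Applying Idx * IndexScale + IndexOffset to a 32-bit lane holding index 2
  // yields the byte indices 8, 9, 10, 11 packed into that lane.
  uint32_t Idx = 2;
  uint32_t Packed = Idx * (uint32_t)IndexScale + (uint32_t)IndexOffset;
  std::printf("packed byte indices = 0x%08x\n", Packed); // 0x0b0a0908
  return 0;
}
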
11154
11155// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11156// reasoned to be a permutation of a vector by indices in a non-constant vector.
11157// (build_vector (extract_elt V, (extract_elt I, 0)),
11158// (extract_elt V, (extract_elt I, 1)),
11159// ...
11160// ->
11161// (vpermv I, V)
11162//
11163// TODO: Handle undefs
11164// TODO: Utilize pshufb and zero mask blending to support more efficient
11165// construction of vectors with constant-0 elements.
11166static SDValue
11167LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11168 const X86Subtarget &Subtarget) {
11169 SDValue SrcVec, IndicesVec;
11170 // Check for a match of the permute source vector and permute index elements.
11171 // This is done by checking that the i-th build_vector operand is of the form:
11172 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11173 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11174 SDValue Op = V.getOperand(Idx);
11175 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11176 return SDValue();
11177
11178 // If this is the first extract encountered in V, set the source vector,
11179 // otherwise verify the extract is from the previously defined source
11180 // vector.
11181 if (!SrcVec)
11182 SrcVec = Op.getOperand(0);
11183 else if (SrcVec != Op.getOperand(0))
11184 return SDValue();
11185 SDValue ExtractedIndex = Op->getOperand(1);
11186 // Peek through extends.
11187 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11188 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11189 ExtractedIndex = ExtractedIndex.getOperand(0);
11190 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11191 return SDValue();
11192
11193 // If this is the first extract from the index vector candidate, set the
11194 // indices vector, otherwise verify the extract is from the previously
11195 // defined indices vector.
11196 if (!IndicesVec)
11197 IndicesVec = ExtractedIndex.getOperand(0);
11198 else if (IndicesVec != ExtractedIndex.getOperand(0))
11199 return SDValue();
11200
11201 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11202 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11203 return SDValue();
11204 }
11205
11206 SDLoc DL(V);
11207 MVT VT = V.getSimpleValueType();
11208 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11209}
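
Semantically, the build_vector matched above computes Result[i] = SrcVec[IndicesVec[i]]. A standalone scalar model (variablePermute is an invented name; no attempt is made to model the per-ISA index wrapping):

#include <cassert>
#include <cstdio>
#include <vector>

// Scalar model of a variable permute: each output lane selects an element of
// Src using the corresponding lane of Indices, which is exactly the
// (build_vector (extract_elt Src, (extract_elt Indices, i)), ...) pattern.
static std::vector<int> variablePermute(const std::vector<int> &Src,
                                        const std::vector<unsigned> &Indices) {
  std::vector<int> Res(Indices.size());
  for (size_t i = 0; i != Indices.size(); ++i) {
    assert(Indices[i] < Src.size() && "index out of range");
    Res[i] = Src[Indices[i]];
  }
  return Res;
}

int main() {
  std::vector<int> Src = {10, 11, 12, 13};
  std::vector<unsigned> Idx = {3, 0, 2, 2};
  for (int V : variablePermute(Src, Idx))
    std::printf("%d ", V); // prints: 13 10 12 12
  std::printf("\n");
  return 0;
}
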
11210
11211SDValue
11212X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11213 SDLoc dl(Op);
11214
11215 MVT VT = Op.getSimpleValueType();
11216 MVT EltVT = VT.getVectorElementType();
11217 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11218 unsigned NumElems = Op.getNumOperands();
11219
11220 // Generate vectors for predicate vectors.
11221 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11222 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11223
11224 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11225 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11226
11227 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11228 return VectorConstant;
11229
11230 unsigned EVTBits = EltVT.getSizeInBits();
11231 APInt UndefMask = APInt::getZero(NumElems);
11232 APInt FrozenUndefMask = APInt::getZero(NumElems);
11233 APInt ZeroMask = APInt::getZero(NumElems);
11234 APInt NonZeroMask = APInt::getZero(NumElems);
11235 bool IsAllConstants = true;
11236 SmallSet<SDValue, 8> Values;
11237 unsigned NumConstants = NumElems;
11238 for (unsigned i = 0; i < NumElems; ++i) {
11239 SDValue Elt = Op.getOperand(i);
11240 if (Elt.isUndef()) {
11241 UndefMask.setBit(i);
11242 continue;
11243 }
11244 if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
11245 FrozenUndefMask.setBit(i);
11246 continue;
11247 }
11248 Values.insert(Elt);
11249 if (!isIntOrFPConstant(Elt)) {
11250 IsAllConstants = false;
11251 NumConstants--;
11252 }
11253 if (X86::isZeroNode(Elt)) {
11254 ZeroMask.setBit(i);
11255 } else {
11256 NonZeroMask.setBit(i);
11257 }
11258 }
11259
11260 // All undef vector. Return an UNDEF.
11261 if (UndefMask.isAllOnes())
11262 return DAG.getUNDEF(VT);
11263
11264 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11265 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11266 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11267 // and blend the FREEZE-UNDEF operands back in.
11268 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11269 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11270 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11271 SmallVector<int, 16> BlendMask(NumElems, -1);
11272 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11273 for (unsigned i = 0; i < NumElems; ++i) {
11274 if (UndefMask[i]) {
11275 BlendMask[i] = -1;
11276 continue;
11277 }
11278 BlendMask[i] = i;
11279 if (!FrozenUndefMask[i])
11280 Elts[i] = Op.getOperand(i);
11281 else
11282 BlendMask[i] += NumElems;
11283 }
11284 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11285 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11286 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11287 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11288 }
11289
11290 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11291
11292 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
11293 // lowering to a smaller build vector and padding with undef/zero.
11294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11295 !isFoldableUseOfShuffle(BV)) {
11296 unsigned UpperElems = NumElems / 2;
11297 APInt UndefOrZeroMask = UndefMask | ZeroMask;
11298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11299 if (NumUpperUndefsOrZeros >= UpperElems) {
11300 if (VT.is512BitVector() &&
11301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11302 UpperElems = NumElems - (NumElems / 4);
11303 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11304 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11305 SDValue NewBV =
11306 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11307 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11308 }
11309 }
11310
11311 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11312 return AddSub;
11313 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11314 return HorizontalOp;
11315 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11316 return Broadcast;
11317 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11318 return BitOp;
11319
11320 unsigned NumZero = ZeroMask.popcount();
11321 unsigned NumNonZero = NonZeroMask.popcount();
11322
11323 // If we are inserting one variable into a vector of non-zero constants, try
11324 // to avoid loading each constant element as a scalar. Load the constants as a
11325 // vector and then insert the variable scalar element. If insertion is not
11326 // supported, fall back to a shuffle to get the scalar blended with the
11327 // constants. Insertion into a zero vector is handled as a special-case
11328 // somewhere below here.
11329 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11330 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11331 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11332 // Create an all-constant vector. The variable element in the old
11333 // build vector is replaced by undef in the constant vector. Save the
11334 // variable scalar element and its index for use in the insertelement.
11335 LLVMContext &Context = *DAG.getContext();
11336 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11337 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11338 SDValue VarElt;
11339 SDValue InsIndex;
11340 for (unsigned i = 0; i != NumElems; ++i) {
11341 SDValue Elt = Op.getOperand(i);
11342 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11343 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11344 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11345 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11346 else if (!Elt.isUndef()) {
11347        assert(!VarElt.getNode() && !InsIndex.getNode() &&
11348               "Expected one variable element in this vector");
11349 VarElt = Elt;
11350 InsIndex = DAG.getVectorIdxConstant(i, dl);
11351 }
11352 }
11353 Constant *CV = ConstantVector::get(ConstVecOps);
11354 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11355
11356 // The constants we just created may not be legal (eg, floating point). We
11357 // must lower the vector right here because we can not guarantee that we'll
11358 // legalize it before loading it. This is also why we could not just create
11359 // a new build vector here. If the build vector contains illegal constants,
11360 // it could get split back up into a series of insert elements.
11361 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11362 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11363 MachineFunction &MF = DAG.getMachineFunction();
11364 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11365 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11366 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11367 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11368 if (InsertC < NumEltsInLow128Bits)
11369 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11370
11371 // There's no good way to insert into the high elements of a >128-bit
11372 // vector, so use shuffles to avoid an extract/insert sequence.
11373    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11374    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11375 SmallVector<int, 8> ShuffleMask;
11376 unsigned NumElts = VT.getVectorNumElements();
11377 for (unsigned i = 0; i != NumElts; ++i)
11378 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11379 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11380 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11381 }
11382
11383 // Special case for single non-zero, non-undef, element.
11384 if (NumNonZero == 1) {
11385 unsigned Idx = NonZeroMask.countr_zero();
11386 SDValue Item = Op.getOperand(Idx);
11387
11388 // If we have a constant or non-constant insertion into the low element of
11389 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11390 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11391 // depending on what the source datatype is.
11392 if (Idx == 0) {
11393 if (NumZero == 0)
11394 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11395
11396 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11397 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11398 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11399        assert((VT.is128BitVector() || VT.is256BitVector() ||
11400                VT.is512BitVector()) &&
11401               "Expected an SSE value type!");
11402 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11403 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11404 // zero vector.
11405 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11406 }
11407
11408 // We can't directly insert an i8 or i16 into a vector, so zero extend
11409 // it to i32 first.
11410 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11411 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11412 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11413 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11414 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11415 return DAG.getBitcast(VT, Item);
11416 }
11417 }
11418
11419 // Is it a vector logical left shift?
11420 if (NumElems == 2 && Idx == 1 &&
11421 X86::isZeroNode(Op.getOperand(0)) &&
11422 !X86::isZeroNode(Op.getOperand(1))) {
11423 unsigned NumBits = VT.getSizeInBits();
11424 return getVShift(true, VT,
11425 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11426 VT, Op.getOperand(1)),
11427 NumBits/2, DAG, *this, dl);
11428 }
11429
11430 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11431 return SDValue();
11432
11433 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11434 // is a non-constant being inserted into an element other than the low one,
11435 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11436 // movd/movss) to move this into the low element, then shuffle it into
11437 // place.
11438 if (EVTBits == 32) {
11439 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11440 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11441 }
11442 }
11443
11444 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11445 if (Values.size() == 1) {
11446 if (EVTBits == 32) {
11447 // Instead of a shuffle like this:
11448 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11449 // Check if it's possible to issue this instead.
11450 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
11451 unsigned Idx = NonZeroMask.countr_zero();
11452 SDValue Item = Op.getOperand(Idx);
11453 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11454 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11455 }
11456 return SDValue();
11457 }
11458
11459 // A vector full of immediates; various special cases are already
11460 // handled, so this is best done with a single constant-pool load.
11461 if (IsAllConstants)
11462 return SDValue();
11463
11464 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11465 return V;
11466
11467 // See if we can use a vector load to get all of the elements.
11468 {
11469 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11470 if (SDValue LD =
11471 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11472 return LD;
11473 }
11474
11475 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11476 // build_vector and broadcast it.
11477 // TODO: We could probably generalize this more.
11478 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11479 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11480 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11481 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11482 // Make sure all the even/odd operands match.
11483 for (unsigned i = 2; i != NumElems; ++i)
11484 if (Ops[i % 2] != Op.getOperand(i))
11485 return false;
11486 return true;
11487 };
11488 if (CanSplat(Op, NumElems, Ops)) {
11489 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11490 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11491 // Create a new build vector and cast to v2i64/v2f64.
11492 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11493 DAG.getBuildVector(NarrowVT, dl, Ops));
11494 // Broadcast from v2i64/v2f64 and cast to final VT.
11495 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11496 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11497 NewBV));
11498 }
11499 }
11500
11501 // For AVX-length vectors, build the individual 128-bit pieces and use
11502 // shuffles to put them in place.
11503 if (VT.getSizeInBits() > 128) {
11504 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11505
11506 // Build both the lower and upper subvector.
11507 SDValue Lower =
11508 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11509 SDValue Upper = DAG.getBuildVector(
11510 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11511
11512 // Recreate the wider vector with the lower and upper part.
11513 return concatSubVectors(Lower, Upper, DAG, dl);
11514 }
11515
11516 // Let legalizer expand 2-wide build_vectors.
11517 if (EVTBits == 64) {
11518 if (NumNonZero == 1) {
11519 // One half is zero or undef.
11520 unsigned Idx = NonZeroMask.countr_zero();
11521 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11522 Op.getOperand(Idx));
11523 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11524 }
11525 return SDValue();
11526 }
11527
11528 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11529 if (EVTBits == 8 && NumElems == 16)
11530 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11531 DAG, Subtarget))
11532 return V;
11533
11534 if (EltVT == MVT::i16 && NumElems == 8)
11535 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11536 DAG, Subtarget))
11537 return V;
11538
11539 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11540 if (EVTBits == 32 && NumElems == 4)
11541 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11542 return V;
11543
11544 // If element VT is == 32 bits, turn it into a number of shuffles.
11545 if (NumElems == 4 && NumZero > 0) {
11546 SmallVector<SDValue, 8> Ops(NumElems);
11547 for (unsigned i = 0; i < 4; ++i) {
11548 bool isZero = !NonZeroMask[i];
11549 if (isZero)
11550 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11551 else
11552 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11553 }
11554
11555 for (unsigned i = 0; i < 2; ++i) {
11556 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11557      default: llvm_unreachable("Unexpected NonZero count");
11558 case 0:
11559 Ops[i] = Ops[i*2]; // Must be a zero vector.
11560 break;
11561 case 1:
11562 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11563 break;
11564 case 2:
11565 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11566 break;
11567 case 3:
11568 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11569 break;
11570 }
11571 }
11572
11573 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11574 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11575 int MaskVec[] = {
11576 Reverse1 ? 1 : 0,
11577 Reverse1 ? 0 : 1,
11578 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11579 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11580 };
11581 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11582 }
11583
11584  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11585
11586 // Check for a build vector from mostly shuffle plus few inserting.
11587 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11588 return Sh;
11589
11590 // For SSE 4.1, use insertps to put the high elements into the low element.
11591 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11592 SDValue Result;
11593 if (!Op.getOperand(0).isUndef())
11594 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11595 else
11596 Result = DAG.getUNDEF(VT);
11597
11598 for (unsigned i = 1; i < NumElems; ++i) {
11599 if (Op.getOperand(i).isUndef()) continue;
11600 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11601 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11602 }
11603 return Result;
11604 }
11605
11606 // Otherwise, expand into a number of unpckl*, start by extending each of
11607 // our (non-undef) elements to the full vector width with the element in the
11608 // bottom slot of the vector (which generates no code for SSE).
11609 SmallVector<SDValue, 8> Ops(NumElems);
11610 for (unsigned i = 0; i < NumElems; ++i) {
11611 if (!Op.getOperand(i).isUndef())
11612 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11613 else
11614 Ops[i] = DAG.getUNDEF(VT);
11615 }
11616
11617 // Next, we iteratively mix elements, e.g. for v4f32:
11618 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11619 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11620 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11621 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11622 // Generate scaled UNPCKL shuffle mask.
11623 SmallVector<int, 16> Mask;
11624 for(unsigned i = 0; i != Scale; ++i)
11625 Mask.push_back(i);
11626 for (unsigned i = 0; i != Scale; ++i)
11627 Mask.push_back(NumElems+i);
11628 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11629
11630 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11631 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11632 }
11633 return Ops[0];
11634}
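
A standalone sketch of the scaled UNPCKL mask generation in the final expansion above (scaledUnpacklMask is an invented name). For v4f32, Scale = 1 yields <0, 4, u, u> and Scale = 2 yields <0, 1, 4, 5>, matching the unpcklps/unpcklpd steps described in the comment.

#include <cstdio>
#include <vector>

// Mirrors the mask-building loop above: take Scale elements from the first
// operand, then Scale elements from the second, and pad with -1 (undef).
static std::vector<int> scaledUnpacklMask(unsigned NumElems, unsigned Scale) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back(i);
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back(NumElems + i);
  Mask.resize(NumElems, -1); // remaining lanes are undef
  return Mask;
}

int main() {
  for (unsigned Scale = 1; Scale < 4; Scale *= 2) {
    std::printf("Scale %u:", Scale);
    for (int M : scaledUnpacklMask(4, Scale))
      std::printf(" %d", M);
    std::printf("\n");
  }
  return 0;
}
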
11635
11636// 256-bit AVX can use the vinsertf128 instruction
11637// to create 256-bit vectors from two other 128-bit ones.
11638// TODO: Detect subvector broadcast here instead of DAG combine?
11639static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11640 const X86Subtarget &Subtarget) {
11641 SDLoc dl(Op);
11642 MVT ResVT = Op.getSimpleValueType();
11643
11644  assert((ResVT.is256BitVector() ||
11645          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11646
11647 unsigned NumOperands = Op.getNumOperands();
11648 unsigned NumFreezeUndef = 0;
11649 unsigned NumZero = 0;
11650 unsigned NumNonZero = 0;
11651 unsigned NonZeros = 0;
11652 for (unsigned i = 0; i != NumOperands; ++i) {
11653 SDValue SubVec = Op.getOperand(i);
11654 if (SubVec.isUndef())
11655 continue;
11656 if (ISD::isFreezeUndef(SubVec.getNode()) && SubVec.hasOneUse())
11657 ++NumFreezeUndef;
11658 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11659 ++NumZero;
11660 else {
11661      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11662 NonZeros |= 1 << i;
11663 ++NumNonZero;
11664 }
11665 }
11666
11667 // If we have more than 2 non-zeros, build each half separately.
11668 if (NumNonZero > 2) {
11669 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11670 ArrayRef<SDUse> Ops = Op->ops();
11671 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11672 Ops.slice(0, NumOperands/2));
11673 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11674 Ops.slice(NumOperands/2));
11675 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11676 }
11677
11678 // Otherwise, build it up through insert_subvectors.
11679 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11680 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11681 : DAG.getUNDEF(ResVT));
11682
11683 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11684 unsigned NumSubElems = SubVT.getVectorNumElements();
11685 for (unsigned i = 0; i != NumOperands; ++i) {
11686 if ((NonZeros & (1 << i)) == 0)
11687 continue;
11688
11689 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11690 Op.getOperand(i),
11691 DAG.getIntPtrConstant(i * NumSubElems, dl));
11692 }
11693
11694 return Vec;
11695}
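// Example of the insert_subvector path above (sketch): lowering
//   concat_vectors(v8i32 X, v8i32 zeroinitializer) -> v16i32
// gives NumNonZero == 1 and NumZero == 1, so Vec starts out as a zero v16i32
// and only X is inserted (at element index 0), i.e. a single INSERT_SUBVECTOR
// rather than one per operand.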
11696
11697// Returns true if the given node is a type promotion (by concatenating i1
11698// zeros) of the result of a node that already zeros all upper bits of
11699// k-register.
11700// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11701static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11702 const X86Subtarget &Subtarget,
11703 SelectionDAG & DAG) {
11704 SDLoc dl(Op);
11705 MVT ResVT = Op.getSimpleValueType();
11706 unsigned NumOperands = Op.getNumOperands();
11707
11708 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11709 "Unexpected number of operands in CONCAT_VECTORS");
11710
11711 uint64_t Zeros = 0;
11712 uint64_t NonZeros = 0;
11713 for (unsigned i = 0; i != NumOperands; ++i) {
11714 SDValue SubVec = Op.getOperand(i);
11715 if (SubVec.isUndef())
11716 continue;
11717 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11718 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11719 Zeros |= (uint64_t)1 << i;
11720 else
11721 NonZeros |= (uint64_t)1 << i;
11722 }
11723
11724 unsigned NumElems = ResVT.getVectorNumElements();
11725
11726 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11727 // undefs in the MSBs, we need to emit a KSHIFTL. The generic lowering via
11728 // insert_subvector would give us two kshifts.
11729 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11730 Log2_64(NonZeros) != NumOperands - 1) {
11731 MVT ShiftVT = ResVT;
11732 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11733 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11734 unsigned Idx = Log2_64(NonZeros);
11735 SDValue SubVec = Op.getOperand(Idx);
11736 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11737 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11738 DAG.getUNDEF(ShiftVT), SubVec,
11739 DAG.getIntPtrConstant(0, dl));
11740 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11741 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11742 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11743 DAG.getIntPtrConstant(0, dl));
11744 }
11745
11746 // If there are zero or one non-zeros we can handle this very simply.
11747 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11748 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11749 if (!NonZeros)
11750 return Vec;
11751 unsigned Idx = Log2_64(NonZeros);
11752 SDValue SubVec = Op.getOperand(Idx);
11753 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11754 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11755 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11756 }
11757
11758 if (NumOperands > 2) {
11759 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11760 ArrayRef<SDUse> Ops = Op->ops();
11761 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11762 Ops.slice(0, NumOperands/2));
11763 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11764 Ops.slice(NumOperands/2));
11765 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11766 }
11767
11768 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11769
11770 if (ResVT.getVectorNumElements() >= 16)
11771 return Op; // The operation is legal with KUNPCK
11772
11773 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11774 DAG.getUNDEF(ResVT), Op.getOperand(0),
11775 DAG.getIntPtrConstant(0, dl));
11776 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11777 DAG.getIntPtrConstant(NumElems/2, dl));
11778}
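// Example of the KSHIFTL special case above (sketch): for
//   concat_vectors(v2i1 zero, v2i1 zero, v2i1 X, v2i1 undef) -> v8i1
// Zeros == 0b0011 and NonZeros == 0b0100, so X is inserted into a legal
// k-register type, shifted left by Idx * SubVecNumElts == 4 with KSHIFTL, and
// the result is extracted back to v8i1, i.e. one kshift instead of two.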
11779
11780static SDValue LowerCONCAT_VECTORS(SDValue Op,
11781 const X86Subtarget &Subtarget,
11782 SelectionDAG &DAG) {
11783 MVT VT = Op.getSimpleValueType();
11784 if (VT.getVectorElementType() == MVT::i1)
11785 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11786
11787 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11788 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11789 Op.getNumOperands() == 4)));
11790
11791 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11792 // from two other 128-bit ones.
11793
11794 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11795 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11796}
11797
11798//===----------------------------------------------------------------------===//
11799// Vector shuffle lowering
11800//
11801// This is an experimental code path for lowering vector shuffles on x86. It is
11802// designed to handle arbitrary vector shuffles and blends, gracefully
11803// degrading performance as necessary. It works hard to recognize idiomatic
11804// shuffles and lower them to optimal instruction patterns without leaving
11805// a framework that allows reasonably efficient handling of all vector shuffle
11806// patterns.
11807//===----------------------------------------------------------------------===//
11808
11809/// Tiny helper function to identify a no-op mask.
11810///
11811/// This is a somewhat boring predicate function. It checks whether the mask
11812/// array input, which is assumed to be a single-input shuffle mask of the kind
11813/// used by the X86 shuffle instructions (not a fully general
11814/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11815/// in-place shuffle are 'no-op's.
11816static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11817 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11818 assert(Mask[i] >= -1 && "Out of bound mask element!");
11819 if (Mask[i] >= 0 && Mask[i] != i)
11820 return false;
11821 }
11822 return true;
11823}
11824
11825/// Test whether there are elements crossing LaneSizeInBits lanes in this
11826/// shuffle mask.
11827///
11828/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11829/// and we routinely test for these.
11830static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11831 unsigned ScalarSizeInBits,
11832 ArrayRef<int> Mask) {
11833 assert(LaneSizeInBits && ScalarSizeInBits &&
11834 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11835 "Illegal shuffle lane size");
11836 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11837 int Size = Mask.size();
11838 for (int i = 0; i < Size; ++i)
11839 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11840 return true;
11841 return false;
11842}
11843
11844/// Test whether there are elements crossing 128-bit lanes in this
11845/// shuffle mask.
11846static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11847 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11848}
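// Illustration (sketch): for v8f32 the 128-bit lanes cover elements 0-3 and
// 4-7, so the mask <4, 5, 6, 7, 0, 1, 2, 3> is lane-crossing while
// <1, 0, 3, 2, 5, 4, 7, 6> is not, since every element stays within its lane.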
11849
11850/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11851/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11852/// better support 'repeated mask + lane permute' style shuffles.
11853static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11854 unsigned ScalarSizeInBits,
11855 ArrayRef<int> Mask) {
11856 assert(LaneSizeInBits && ScalarSizeInBits &&
11857 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11858 "Illegal shuffle lane size");
11859 int NumElts = Mask.size();
11860 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11861 int NumLanes = NumElts / NumEltsPerLane;
11862 if (NumLanes > 1) {
11863 for (int i = 0; i != NumLanes; ++i) {
11864 int SrcLane = -1;
11865 for (int j = 0; j != NumEltsPerLane; ++j) {
11866 int M = Mask[(i * NumEltsPerLane) + j];
11867 if (M < 0)
11868 continue;
11869 int Lane = (M % NumElts) / NumEltsPerLane;
11870 if (SrcLane >= 0 && SrcLane != Lane)
11871 return true;
11872 SrcLane = Lane;
11873 }
11874 }
11875 }
11876 return false;
11877}
11878
11879/// Test whether a shuffle mask is equivalent within each sub-lane.
11880///
11881/// This checks a shuffle mask to see if it is performing the same
11882/// lane-relative shuffle in each sub-lane. This trivially implies
11883/// that it is also not lane-crossing. It may however involve a blend from the
11884/// same lane of a second vector.
11885///
11886/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11887/// non-trivial to compute in the face of undef lanes. The representation is
11888/// suitable for use with existing 128-bit shuffles as entries from the second
11889/// vector have been remapped to [LaneSize, 2*LaneSize).
11890static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11891 ArrayRef<int> Mask,
11892 SmallVectorImpl<int> &RepeatedMask) {
11893 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11894 RepeatedMask.assign(LaneSize, -1);
11895 int Size = Mask.size();
11896 for (int i = 0; i < Size; ++i) {
11897 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11898 if (Mask[i] < 0)
11899 continue;
11900 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11901 // This entry crosses lanes, so there is no way to model this shuffle.
11902 return false;
11903
11904 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11905 // Adjust second vector indices to start at LaneSize instead of Size.
11906 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11907 : Mask[i] % LaneSize + LaneSize;
11908 if (RepeatedMask[i % LaneSize] < 0)
11909 // This is the first non-undef entry in this slot of a 128-bit lane.
11910 RepeatedMask[i % LaneSize] = LocalM;
11911 else if (RepeatedMask[i % LaneSize] != LocalM)
11912 // Found a mismatch with the repeated mask.
11913 return false;
11914 }
11915 return true;
11916}
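// Worked example (sketch): for a two-input v8f32 shuffle the mask
//   <0, 8, 2, 10, 4, 12, 6, 14>
// repeats the same lane-relative pattern in both 128-bit lanes, so this
// returns true with RepeatedMask == <0, 4, 2, 6>, the second-input elements
// having been remapped into [LaneSize, 2*LaneSize) as documented above.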
11917
11918/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11919static bool
11920is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11921 SmallVectorImpl<int> &RepeatedMask) {
11922 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11923}
11924
11925static bool
11926is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11927 SmallVector<int, 32> RepeatedMask;
11928 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11929}
11930
11931/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11932static bool
11933is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11934 SmallVectorImpl<int> &RepeatedMask) {
11935 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11936}
11937
11938/// Test whether a target shuffle mask is equivalent within each sub-lane.
11939/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11940static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11941 unsigned EltSizeInBits,
11942 ArrayRef<int> Mask,
11943 SmallVectorImpl<int> &RepeatedMask) {
11944 int LaneSize = LaneSizeInBits / EltSizeInBits;
11945 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11946 int Size = Mask.size();
11947 for (int i = 0; i < Size; ++i) {
11948 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11949 if (Mask[i] == SM_SentinelUndef)
11950 continue;
11951 if (Mask[i] == SM_SentinelZero) {
11952 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11953 return false;
11954 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11955 continue;
11956 }
11957 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11958 // This entry crosses lanes, so there is no way to model this shuffle.
11959 return false;
11960
11961 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11962 // later vector indices to start at multiples of LaneSize instead of Size.
11963 int LaneM = Mask[i] / Size;
11964 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11965 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11966 // This is the first non-undef entry in this slot of a 128-bit lane.
11967 RepeatedMask[i % LaneSize] = LocalM;
11968 else if (RepeatedMask[i % LaneSize] != LocalM)
11969 // Found a mismatch with the repeated mask.
11970 return false;
11971 }
11972 return true;
11973}
11974
11975/// Test whether a target shuffle mask is equivalent within each sub-lane.
11976/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11977static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11978 ArrayRef<int> Mask,
11979 SmallVectorImpl<int> &RepeatedMask) {
11980 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11981 Mask, RepeatedMask);
11982}
11983
11984/// Checks whether the vector elements referenced by two shuffle masks are
11985/// equivalent.
11986static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11987 int Idx, int ExpectedIdx) {
11988 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11989 ExpectedIdx < MaskSize && "Out of range element index");
11990 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11991 return false;
11992
11993 switch (Op.getOpcode()) {
11994 case ISD::BUILD_VECTOR:
11995 // If the values are build vectors, we can look through them to find
11996 // equivalent inputs that make the shuffles equivalent.
11997 // TODO: Handle MaskSize != Op.getNumOperands()?
11998 if (MaskSize == (int)Op.getNumOperands() &&
11999 MaskSize == (int)ExpectedOp.getNumOperands())
12000 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12001 break;
12002 case X86ISD::VBROADCAST:
12003 case X86ISD::VBROADCAST_LOAD:
12004 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12005 return (Op == ExpectedOp &&
12006 (int)Op.getValueType().getVectorNumElements() == MaskSize);
12007 case X86ISD::HADD:
12008 case X86ISD::HSUB:
12009 case X86ISD::FHADD:
12010 case X86ISD::FHSUB:
12011 case X86ISD::PACKSS:
12012 case X86ISD::PACKUS:
12013 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12014 // TODO: Handle MaskSize != NumElts?
12015 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12016 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12017 MVT VT = Op.getSimpleValueType();
12018 int NumElts = VT.getVectorNumElements();
12019 if (MaskSize == NumElts) {
12020 int NumLanes = VT.getSizeInBits() / 128;
12021 int NumEltsPerLane = NumElts / NumLanes;
12022 int NumHalfEltsPerLane = NumEltsPerLane / 2;
12023 bool SameLane =
12024 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12025 bool SameElt =
12026 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12027 return SameLane && SameElt;
12028 }
12029 }
12030 break;
12031 }
12032
12033 return false;
12034}
12035
12036/// Checks whether a shuffle mask is equivalent to an explicit list of
12037/// arguments.
12038///
12039/// This is a fast way to test a shuffle mask against a fixed pattern:
12040///
12041/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
12042///
12043/// It returns true if the mask is exactly as wide as the argument list, and
12044/// each element of the mask is either -1 (signifying undef) or the value given
12045/// in the argument.
12046static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12047 SDValue V1 = SDValue(),
12048 SDValue V2 = SDValue()) {
12049 int Size = Mask.size();
12050 if (Size != (int)ExpectedMask.size())
12051 return false;
12052
12053 for (int i = 0; i < Size; ++i) {
12054 assert(Mask[i] >= -1 && "Out of bound mask element!");
12055 int MaskIdx = Mask[i];
12056 int ExpectedIdx = ExpectedMask[i];
12057 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12058 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12059 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12060 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12061 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12062 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12063 return false;
12064 }
12065 }
12066 return true;
12067}
12068
12069/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12070///
12071/// The masks must be exactly the same width.
12072///
12073/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12074/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12075///
12076/// SM_SentinelZero is accepted as a valid negative index but must match in
12077/// both, or via a known bits test.
12078static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12079 ArrayRef<int> ExpectedMask,
12080 const SelectionDAG &DAG,
12081 SDValue V1 = SDValue(),
12082 SDValue V2 = SDValue()) {
12083 int Size = Mask.size();
12084 if (Size != (int)ExpectedMask.size())
12085 return false;
12086 assert(llvm::all_of(ExpectedMask,
12087 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12088 "Illegal target shuffle mask");
12089
12090 // Check for out-of-range target shuffle mask indices.
12091 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12092 return false;
12093
12094 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12095 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12096 V1 = SDValue();
12097 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12098 V2 = SDValue();
12099
12100 APInt ZeroV1 = APInt::getZero(Size);
12101 APInt ZeroV2 = APInt::getZero(Size);
12102
12103 for (int i = 0; i < Size; ++i) {
12104 int MaskIdx = Mask[i];
12105 int ExpectedIdx = ExpectedMask[i];
12106 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12107 continue;
12108 if (MaskIdx == SM_SentinelZero) {
12109 // If we need this expected index to be a zero element, then update the
12110 // relevant zero mask and perform the known bits at the end to minimize
12111 // repeated computes.
12112 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12113 if (ExpectedV &&
12114 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12115 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12116 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12117 ZeroMask.setBit(BitIdx);
12118 continue;
12119 }
12120 }
12121 if (MaskIdx >= 0) {
12122 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12123 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12124 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12125 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12126 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12127 continue;
12128 }
12129 return false;
12130 }
12131 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12132 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12133}
12134
12135// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12136// instructions.
12137static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12138 const SelectionDAG &DAG) {
12139 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12140 return false;
12141
12142 SmallVector<int, 8> Unpcklwd;
12143 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12144 /* Unary = */ false);
12145 SmallVector<int, 8> Unpckhwd;
12146 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12147 /* Unary = */ false);
12148 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12149 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12150 return IsUnpackwdMask;
12151}
12152
12153static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12154 const SelectionDAG &DAG) {
12155 // Create 128-bit vector type based on mask size.
12156 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12157 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12158
12159 // We can't assume a canonical shuffle mask, so try the commuted version too.
12160 SmallVector<int, 4> CommutedMask(Mask);
12161 ShuffleVectorSDNode::commuteMask(CommutedMask);
12162
12163 // Match any of unary/binary or low/high.
12164 for (unsigned i = 0; i != 4; ++i) {
12165 SmallVector<int, 16> UnpackMask;
12166 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12167 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12168 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12169 return true;
12170 }
12171 return false;
12172}
12173
12174/// Return true if a shuffle mask chooses elements identically in its top and
12175/// bottom halves. For example, any splat mask has the same top and bottom
12176/// halves. If an element is undefined in only one half of the mask, the halves
12177/// are not considered identical.
12178static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12179 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12180 unsigned HalfSize = Mask.size() / 2;
12181 for (unsigned i = 0; i != HalfSize; ++i) {
12182 if (Mask[i] != Mask[i + HalfSize])
12183 return false;
12184 }
12185 return true;
12186}
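// Illustration (sketch): <0, 1, 0, 1> has identical halves, whereas
// <0, -1, 0, 1> does not, because the undef appears in only one half.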
12187
12188/// Get a 4-lane 8-bit shuffle immediate for a mask.
12189///
12190/// This helper function produces an 8-bit shuffle immediate corresponding to
12191/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12192/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12193/// example.
12194///
12195/// NB: We rely heavily on "undef" masks preserving the input lane.
12196static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12197 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12198 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12199 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12200 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12201 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12202
12203 // If the mask only uses one non-undef element, then fully 'splat' it to
12204 // improve later broadcast matching.
12205 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12206 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12207
12208 int FirstElt = Mask[FirstIndex];
12209 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12210 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12211
12212 unsigned Imm = 0;
12213 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12214 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12215 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12216 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12217 return Imm;
12218}
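// Worked example (sketch): for Mask == {1, 0, 3, 2} no element is undef and
// the elements are not all identical, so the immediate computed above is
// (1 << 0) | (0 << 2) | (3 << 4) | (2 << 6) == 0xB1, the familiar
// "swap adjacent elements" PSHUFD/SHUFPS encoding; a splat such as
// {2, 2, 2, 2} takes the early return instead and yields 0xAA. A minimal
// standalone restatement of the plain encoding path (hypothetical helper,
// deliberately omitting the splat special case above):
static unsigned encodeV4ShuffleImmSketch(const int M[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(M[i] < 0 ? i : M[i]) << (2 * i); // undef keeps its own lane
  return Imm; // e.g. {1, 0, 3, 2} -> 0xB1
}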
12219
12220static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12221 SelectionDAG &DAG) {
12222 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12223}
12224
12225 // The shuffle result looks like:
12226 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
12227 // Each element of Zeroable corresponds to a particular element of Mask,
12228 // as described in the computeZeroableShuffleElements function.
12229 //
12230 // The function looks for a sub-mask whose non-zero elements are in
12231 // increasing order. If such a sub-mask exists, the function returns true.
12232static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12233 ArrayRef<int> Mask, const EVT &VectorType,
12234 bool &IsZeroSideLeft) {
12235 int NextElement = -1;
12236 // Check if the Mask's nonzero elements are in increasing order.
12237 for (int i = 0, e = Mask.size(); i < e; i++) {
12238 // Checks if the mask's zeros elements are built from only zeros.
12239 assert(Mask[i] >= -1 && "Out of bound mask element!");
12240 if (Mask[i] < 0)
12241 return false;
12242 if (Zeroable[i])
12243 continue;
12244 // Find the lowest non zero element
12245 if (NextElement < 0) {
12246 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12247 IsZeroSideLeft = NextElement != 0;
12248 }
12249 // Exit if the mask's non zero elements are not in increasing order.
12250 if (NextElement != Mask[i])
12251 return false;
12252 NextElement++;
12253 }
12254 return true;
12255}
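// Example (sketch, assuming non-undef mask values in the zeroable slots): for
// v4f32 sources, a shuffle whose first two output elements are known zero and
// whose last two mask entries are <4, 5> (the low elements of the second
// source, in order) is accepted with IsZeroSideLeft == true, and
// lowerShuffleToEXPAND below then emits a VEXPAND of that source against a
// zero vector.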
12256
12257/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12258static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12259 ArrayRef<int> Mask, SDValue V1,
12260 SDValue V2, const APInt &Zeroable,
12261 const X86Subtarget &Subtarget,
12262 SelectionDAG &DAG) {
12263 int Size = Mask.size();
12264 int LaneSize = 128 / VT.getScalarSizeInBits();
12265 const int NumBytes = VT.getSizeInBits() / 8;
12266 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12267
12268 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12269 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12270 (Subtarget.hasBWI() && VT.is512BitVector()));
12271
12272 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12273 // Sign bit set in i8 mask means zero element.
12274 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12275
12276 SDValue V;
12277 for (int i = 0; i < NumBytes; ++i) {
12278 int M = Mask[i / NumEltBytes];
12279 if (M < 0) {
12280 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12281 continue;
12282 }
12283 if (Zeroable[i / NumEltBytes]) {
12284 PSHUFBMask[i] = ZeroMask;
12285 continue;
12286 }
12287
12288 // We can only use a single input of V1 or V2.
12289 SDValue SrcV = (M >= Size ? V2 : V1);
12290 if (V && V != SrcV)
12291 return SDValue();
12292 V = SrcV;
12293 M %= Size;
12294
12295 // PSHUFB can't cross lanes, ensure this doesn't happen.
12296 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12297 return SDValue();
12298
12299 M = M % LaneSize;
12300 M = M * NumEltBytes + (i % NumEltBytes);
12301 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12302 }
12303 assert(V && "Failed to find a source input")(static_cast <bool> (V && "Failed to find a source input"
) ? void (0) : __assert_fail ("V && \"Failed to find a source input\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 12303, __extension__
__PRETTY_FUNCTION__))
;
12304
12305 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12306 return DAG.getBitcast(
12307 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12308 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12309}
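// Example of the byte-mask construction above (sketch): for a v8i16 shuffle,
// NumEltBytes == 2, so a mask element that selects word W within a 128-bit
// lane produces the byte indices 2*W and 2*W + 1, while a zeroable word
// produces two 0x80 bytes (sign bit set, which zeroes the destination byte).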
12310
12311static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12312 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12313 const SDLoc &dl);
12314
12315 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
12316static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12317 const APInt &Zeroable,
12318 ArrayRef<int> Mask, SDValue &V1,
12319 SDValue &V2, SelectionDAG &DAG,
12320 const X86Subtarget &Subtarget) {
12321 bool IsLeftZeroSide = true;
12322 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12323 IsLeftZeroSide))
12324 return SDValue();
12325 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12326 MVT IntegerType =
12327 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12328 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12329 unsigned NumElts = VT.getVectorNumElements();
12330 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12331 "Unexpected number of vector elements");
12332 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12333 Subtarget, DAG, DL);
12334 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12335 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12336 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12337}
12338
12339static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12340 unsigned &UnpackOpcode, bool IsUnary,
12341 ArrayRef<int> TargetMask, const SDLoc &DL,
12342 SelectionDAG &DAG,
12343 const X86Subtarget &Subtarget) {
12344 int NumElts = VT.getVectorNumElements();
12345
12346 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12347 for (int i = 0; i != NumElts; i += 2) {
12348 int M1 = TargetMask[i + 0];
12349 int M2 = TargetMask[i + 1];
12350 Undef1 &= (SM_SentinelUndef == M1);
12351 Undef2 &= (SM_SentinelUndef == M2);
12352 Zero1 &= isUndefOrZero(M1);
12353 Zero2 &= isUndefOrZero(M2);
12354 }
12355 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12356 "Zeroable shuffle detected");
12357
12358 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12359 SmallVector<int, 64> Unpckl, Unpckh;
12360 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12361 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12362 (IsUnary ? V1 : V2))) {
12363 UnpackOpcode = X86ISD::UNPCKL;
12364 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12365 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12366 return true;
12367 }
12368
12369 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12370 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12371 (IsUnary ? V1 : V2))) {
12372 UnpackOpcode = X86ISD::UNPCKH;
12373 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12374 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12375 return true;
12376 }
12377
12378 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
12379 if (IsUnary && (Zero1 || Zero2)) {
12380 // Don't bother if we can blend instead.
12381 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12382 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12383 return false;
12384
12385 bool MatchLo = true, MatchHi = true;
12386 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12387 int M = TargetMask[i];
12388
12389 // Ignore if the input is known to be zero or the index is undef.
12390 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12391 (M == SM_SentinelUndef))
12392 continue;
12393
12394 MatchLo &= (M == Unpckl[i]);
12395 MatchHi &= (M == Unpckh[i]);
12396 }
12397
12398 if (MatchLo || MatchHi) {
12399 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12400 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12401 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12402 return true;
12403 }
12404 }
12405
12406 // If a binary shuffle, commute and try again.
12407 if (!IsUnary) {
12408 ShuffleVectorSDNode::commuteMask(Unpckl);
12409 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12410 UnpackOpcode = X86ISD::UNPCKL;
12411 std::swap(V1, V2);
12412 return true;
12413 }
12414
12415 ShuffleVectorSDNode::commuteMask(Unpckh);
12416 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12417 UnpackOpcode = X86ISD::UNPCKH;
12418 std::swap(V1, V2);
12419 return true;
12420 }
12421 }
12422
12423 return false;
12424}
12425
12426// X86 has dedicated unpack instructions that can handle specific blend
12427// operations: UNPCKH and UNPCKL.
12428static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12429 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12430 SelectionDAG &DAG) {
12431 SmallVector<int, 8> Unpckl;
12432 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12433 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12434 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12435
12436 SmallVector<int, 8> Unpckh;
12437 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12438 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12439 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12440
12441 // Commute and try again.
12442 ShuffleVectorSDNode::commuteMask(Unpckl);
12443 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12444 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12445
12446 ShuffleVectorSDNode::commuteMask(Unpckh);
12447 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12448 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12449
12450 return SDValue();
12451}
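// For reference (sketch): on v4i32 the masks built above are
//   Unpckl == <0, 4, 1, 5> and Unpckh == <2, 6, 3, 7>,
// and the commuted variants (<4, 0, 5, 1> etc.) are matched by swapping the
// V1/V2 operands of the emitted UNPCKL/UNPCKH node.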
12452
12453/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12454/// followed by unpack 256-bit.
12455static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12456 ArrayRef<int> Mask, SDValue V1,
12457 SDValue V2, SelectionDAG &DAG) {
12458 SmallVector<int, 32> Unpckl, Unpckh;
12459 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12460 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12461
12462 unsigned UnpackOpcode;
12463 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12464 UnpackOpcode = X86ISD::UNPCKL;
12465 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12466 UnpackOpcode = X86ISD::UNPCKH;
12467 else
12468 return SDValue();
12469
12470 // This is a "natural" unpack operation (rather than the 128-bit sectored
12471 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12472 // input in order to use the x86 instruction.
12473 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12474 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12475 V1 = DAG.getBitcast(VT, V1);
12476 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12477}
12478
12479// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12480// source into the lower elements and zeroing the upper elements.
12481static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12482 ArrayRef<int> Mask, const APInt &Zeroable,
12483 const X86Subtarget &Subtarget) {
12484 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12485 return false;
12486
12487 unsigned NumElts = Mask.size();
12488 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12489 unsigned MaxScale = 64 / EltSizeInBits;
12490
12491 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12492 unsigned SrcEltBits = EltSizeInBits * Scale;
12493 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12494 continue;
12495 unsigned NumSrcElts = NumElts / Scale;
12496 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12497 continue;
12498 unsigned UpperElts = NumElts - NumSrcElts;
12499 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12500 continue;
12501 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12502 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12503 DstVT = MVT::getIntegerVT(EltSizeInBits);
12504 if ((NumSrcElts * EltSizeInBits) >= 128) {
12505 // ISD::TRUNCATE
12506 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12507 } else {
12508 // X86ISD::VTRUNC
12509 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12510 }
12511 return true;
12512 }
12513
12514 return false;
12515}
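// Example (sketch, assuming an AVX512VL+BW target so the 16-bit source case
// is allowed): a v16i8 mask whose low eight elements are
// <0, 2, 4, 6, 8, 10, 12, 14> and whose upper eight elements are zeroable
// matches at Scale == 2, giving SrcVT == v8i16 and, since 8 x 8 bits < 128,
// DstVT == v16i8 for the X86ISD::VTRUNC form.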
12516
12517// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12518// element padding to the final DstVT.
12519static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12520 const X86Subtarget &Subtarget,
12521 SelectionDAG &DAG, bool ZeroUppers) {
12522 MVT SrcVT = Src.getSimpleValueType();
12523 MVT DstSVT = DstVT.getScalarType();
12524 unsigned NumDstElts = DstVT.getVectorNumElements();
12525 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12526 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12527
12528 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12529 return SDValue();
12530
12531 // Perform a direct ISD::TRUNCATE if possible.
12532 if (NumSrcElts == NumDstElts)
12533 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12534
12535 if (NumSrcElts > NumDstElts) {
12536 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12537 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12538 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12539 }
12540
12541 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12542 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12543 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12544 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12545 DstVT.getSizeInBits());
12546 }
12547
12548 // Non-VLX targets must truncate from a 512-bit type, so we need to
12549 // widen, truncate and then possibly extract the original subvector.
12550 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12551 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12552 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12553 }
12554
12555 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12556 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12557 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12558 if (DstVT != TruncVT)
12559 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12560 DstVT.getSizeInBits());
12561 return Trunc;
12562}
12563
12564// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12565//
12566// An example is the following:
12567//
12568// t0: ch = EntryToken
12569// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12570// t25: v4i32 = truncate t2
12571// t41: v8i16 = bitcast t25
12572// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12573// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12574// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12575// t18: v2i64 = bitcast t51
12576//
12577 // One can just use a single vpmovdw instruction; without avx512vl we need to
12578 // use the zmm variant and extract the lower subvector, padding with zeroes.
12579// TODO: Merge with lowerShuffleAsVTRUNC.
12580static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12581 SDValue V2, ArrayRef<int> Mask,
12582 const APInt &Zeroable,
12583 const X86Subtarget &Subtarget,
12584 SelectionDAG &DAG) {
12585 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12586 if (!Subtarget.hasAVX512())
12587 return SDValue();
12588
12589 unsigned NumElts = VT.getVectorNumElements();
12590 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12591 unsigned MaxScale = 64 / EltSizeInBits;
12592 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12593 unsigned SrcEltBits = EltSizeInBits * Scale;
12594 unsigned NumSrcElts = NumElts / Scale;
12595 unsigned UpperElts = NumElts - NumSrcElts;
12596 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12597 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12598 continue;
12599
12600 // Attempt to find a matching source truncation, but as a fall back VLX
12601 // cases can use the VPMOV directly.
12602 SDValue Src = peekThroughBitcasts(V1);
12603 if (Src.getOpcode() == ISD::TRUNCATE &&
12604 Src.getScalarValueSizeInBits() == SrcEltBits) {
12605 Src = Src.getOperand(0);
12606 } else if (Subtarget.hasVLX()) {
12607 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12608 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12609 Src = DAG.getBitcast(SrcVT, Src);
12610 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12611 if (Scale == 2 &&
12612 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12613 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12614 return SDValue();
12615 } else
12616 return SDValue();
12617
12618 // VPMOVWB is only available with avx512bw.
12619 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12620 return SDValue();
12621
12622 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12623 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12624 }
12625
12626 return SDValue();
12627}
12628
12629// Attempt to match binary shuffle patterns as a truncate.
12630static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12631 SDValue V2, ArrayRef<int> Mask,
12632 const APInt &Zeroable,
12633 const X86Subtarget &Subtarget,
12634 SelectionDAG &DAG) {
12635 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12636 "Unexpected VTRUNC type");
12637 if (!Subtarget.hasAVX512())
12638 return SDValue();
12639
12640 unsigned NumElts = VT.getVectorNumElements();
12641 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12642 unsigned MaxScale = 64 / EltSizeInBits;
12643 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12644 // TODO: Support non-BWI VPMOVWB truncations?
12645 unsigned SrcEltBits = EltSizeInBits * Scale;
12646 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12647 continue;
12648
12649 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12650 // Bail if the V2 elements are undef.
12651 unsigned NumHalfSrcElts = NumElts / Scale;
12652 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12653 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12654 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12655 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12656 continue;
12657
12658 // The elements beyond the truncation must be undef/zero.
12659 unsigned UpperElts = NumElts - NumSrcElts;
12660 if (UpperElts > 0 &&
12661 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12662 continue;
12663 bool UndefUppers =
12664 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12665
12666 // For offset truncations, ensure that the concat is cheap.
12667 if (Offset) {
12668 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12669 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12670 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12671 return Lo.getOperand(0) == Hi.getOperand(0);
12672 if (ISD::isNormalLoad(Lo.getNode()) &&
12673 ISD::isNormalLoad(Hi.getNode())) {
12674 auto *LDLo = cast<LoadSDNode>(Lo);
12675 auto *LDHi = cast<LoadSDNode>(Hi);
12676 return DAG.areNonVolatileConsecutiveLoads(
12677 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12678 }
12679 return false;
12680 };
12681 if (!IsCheapConcat(V1, V2))
12682 continue;
12683 }
12684
12685 // As we're using both sources, we need to concat them together
12686 // and truncate from the double-sized source.
12687 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12688 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12689
12690 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12691 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12692 Src = DAG.getBitcast(SrcVT, Src);
12693
12694 // Shift the offset'd elements into place for the truncation.
12695 // TODO: Use getTargetVShiftByConstNode.
12696 if (Offset)
12697 Src = DAG.getNode(
12698 X86ISD::VSRLI, DL, SrcVT, Src,
12699 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12700
12701 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12702 }
12703 }
12704
12705 return SDValue();
12706}
12707
12708/// Check whether a compaction lowering can be done by dropping even/odd
12709/// elements and compute how many times even/odd elements must be dropped.
12710///
12711/// This handles shuffles which take every Nth element where N is a power of
12712/// two. Example shuffle masks:
12713///
12714/// (even)
12715/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12716/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12717/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12718/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12719/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12720/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12721///
12722/// (odd)
12723/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12724/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12725///
12726/// Any of these lanes can of course be undef.
12727///
12728/// This routine only supports N <= 3.
12729/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12730/// for larger N.
12731///
12732/// \returns N above, or the number of times even/odd elements must be dropped
12733/// if there is such a number. Otherwise returns zero.
12734static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12735 bool IsSingleInput) {
12736 // The modulus for the shuffle vector entries is based on whether this is
12737 // a single input or not.
12738 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12739 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12740 "We should only be called with masks with a power-of-2 size!");
12741
12742 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12743 int Offset = MatchEven ? 0 : 1;
12744
12745 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12746 // and 2^3 simultaneously. This is because we may have ambiguity with
12747 // partially undef inputs.
12748 bool ViableForN[3] = {true, true, true};
12749
12750 for (int i = 0, e = Mask.size(); i < e; ++i) {
12751 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12752 // want.
12753 if (Mask[i] < 0)
12754 continue;
12755
12756 bool IsAnyViable = false;
12757 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12758 if (ViableForN[j]) {
12759 uint64_t N = j + 1;
12760
12761 // The shuffle mask must be equal to (i * 2^N) % M.
12762 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12763 IsAnyViable = true;
12764 else
12765 ViableForN[j] = false;
12766 }
12767 // Early exit if we exhaust the possible powers of two.
12768 if (!IsAnyViable)
12769 break;
12770 }
12771
12772 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12773 if (ViableForN[j])
12774 return j + 1;
12775
12776 // Return 0 as there is no viable power of two.
12777 return 0;
12778}
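To make the stride test above concrete, here is a standalone illustrative sketch (not part of X86ISelLowering.cpp; the helper and driver names are invented for the example) that applies the same (Mask[i] - Offset) == ((i << N) & ModMask) check to a sample even-compaction mask and reports the smallest viable N:

// Illustrative sketch (not from X86ISelLowering.cpp): stride test used by
// canLowerByDroppingElements, on plain std:: containers.
#include <cstdint>
#include <cstdio>
#include <vector>

// Returns the smallest N in [1,3] for which every defined mask element equals
// (i * 2^N) % M (plus Offset for the odd variant), or 0 if none fits.
static int smallestViableDropCount(const std::vector<int> &Mask, bool MatchEven,
                                   bool IsSingleInput) {
  uint64_t ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  uint64_t ModMask = ShuffleModulus - 1; // power-of-2 modulus assumed
  int Offset = MatchEven ? 0 : 1;
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (size_t i = 0; i < Mask.size(); ++i) {
      if (Mask[i] < 0)
        continue; // undef lanes match anything
      if ((uint64_t)(Mask[i] - Offset) != ((i << N) & ModMask)) {
        Viable = false;
        break;
      }
    }
    if (Viable)
      return N;
  }
  return 0;
}

int main() {
  // Single-input even compaction taking every 2nd element: N should be 1.
  std::vector<int> Mask = {0, 2, 4, 6, 8, 10, 12, 14,
                           0, 2, 4, 6, 8, 10, 12, 14};
  std::printf("N = %d\n", smallestViableDropCount(Mask, /*MatchEven=*/true,
                                                  /*IsSingleInput=*/true));
}

For this mask the sketch prints N = 1, matching the first even example in the comment above.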
12779
12780// X86 has dedicated pack instructions that can handle specific truncation
12781// operations: PACKSS and PACKUS.
12782// Checks for compaction shuffle masks if MaxStages > 1.
12783// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12784static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12785 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12786 const SelectionDAG &DAG,
12787 const X86Subtarget &Subtarget,
12788 unsigned MaxStages = 1) {
12789 unsigned NumElts = VT.getVectorNumElements();
12790 unsigned BitSize = VT.getScalarSizeInBits();
12791 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12792 "Illegal maximum compaction");
12793
12794 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12795 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12796 unsigned NumPackedBits = NumSrcBits - BitSize;
12797 N1 = peekThroughBitcasts(N1);
12798 N2 = peekThroughBitcasts(N2);
12799 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12800 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12801 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12802 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12803 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12804 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12805 return false;
12806 if (Subtarget.hasSSE41() || BitSize == 8) {
12807 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12808 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12809 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12810 V1 = N1;
12811 V2 = N2;
12812 SrcVT = PackVT;
12813 PackOpcode = X86ISD::PACKUS;
12814 return true;
12815 }
12816 }
12817 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12818 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12819 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12820 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12821 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12822 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12823 V1 = N1;
12824 V2 = N2;
12825 SrcVT = PackVT;
12826 PackOpcode = X86ISD::PACKSS;
12827 return true;
12828 }
12829 return false;
12830 };
12831
12832 // Attempt to match against wider and wider compaction patterns.
12833 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12834 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12835 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12836
12837 // Try binary shuffle.
12838 SmallVector<int, 32> BinaryMask;
12839 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12840 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12841 if (MatchPACK(V1, V2, PackVT))
12842 return true;
12843
12844 // Try unary shuffle.
12845 SmallVector<int, 32> UnaryMask;
12846 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12847 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12848 if (MatchPACK(V1, V1, PackVT))
12849 return true;
12850 }
12851
12852 return false;
12853}
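The legality conditions checked by MatchPACK can be pictured with a scalar model of the pack saturation. The sketch below is standalone and illustrative only (the two lane helpers are invented names): it shows why PACKUS needs the upper NumSrcBits - BitSize bits known zero, while PACKSS only needs enough sign bits.

// Illustrative sketch (not from X86ISelLowering.cpp): scalar model of the
// PACKUS/PACKSS per-lane saturation that motivates MatchPACK's checks.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// PACKUSWB-style lane: unsigned-saturate a 16-bit lane down to 8 bits.
static uint8_t packusLane(int16_t V) {
  return (uint8_t)std::clamp<int>(V, 0, 255);
}

// PACKSSWB-style lane: signed-saturate a 16-bit lane down to 8 bits.
static int8_t packssLane(int16_t V) {
  return (int8_t)std::clamp<int>(V, -128, 127);
}

int main() {
  // If the upper 8 bits are already zero, PACKUS is a plain truncation...
  std::printf("packus(0x007f) = 0x%02x\n", packusLane(0x007f)); // 0x7f
  // ...otherwise it saturates, which is why MatchPACK demands the upper
  // NumSrcBits - BitSize bits be known zero (MaskedValueIsZero).
  std::printf("packus(0x017f) = 0x%02x\n", packusLane(0x017f)); // 0xff
  // PACKSS only needs enough sign bits: -5 fits in i8, so it truncates.
  std::printf("packss(-5)     = %d\n", packssLane(-5));         // -5
  std::printf("packss(300)    = %d\n", packssLane(300));        // 127
}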
12854
12855static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12856 SDValue V1, SDValue V2, SelectionDAG &DAG,
12857 const X86Subtarget &Subtarget) {
12858 MVT PackVT;
12859 unsigned PackOpcode;
12860 unsigned SizeBits = VT.getSizeInBits();
12861 unsigned EltBits = VT.getScalarSizeInBits();
12862 unsigned MaxStages = Log2_32(64 / EltBits);
12863 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12864 Subtarget, MaxStages))
12865 return SDValue();
12866
12867 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12868 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12869
12870 // Don't lower multi-stage packs on AVX512, truncation is better.
12871 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12872 return SDValue();
12873
12874 // Pack to the largest type possible:
12875 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12876 unsigned MaxPackBits = 16;
12877 if (CurrentEltBits > 16 &&
12878 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12879 MaxPackBits = 32;
12880
12881 // Repeatedly pack down to the target size.
12882 SDValue Res;
12883 for (unsigned i = 0; i != NumStages; ++i) {
12884 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12885 unsigned NumSrcElts = SizeBits / SrcEltBits;
12886 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12887 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12888 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12889 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12890 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12891 DAG.getBitcast(SrcVT, V2));
12892 V1 = V2 = Res;
12893 CurrentEltBits /= 2;
12894 }
12895 assert(Res && Res.getValueType() == VT &&
12896 "Failed to lower compaction shuffle");
12897 return Res;
12898}
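The stage count above is just the log2 ratio of the matched source element width to the destination width. A minimal standalone sketch of that arithmetic follows (illustrative only; invented names, no LLVM types, and the MaxPackBits clamping is not modelled):

// Illustrative sketch (not from X86ISelLowering.cpp): how many PACK stages the
// loop above runs for a given source/destination element width.
#include <cstdio>

static unsigned log2u(unsigned V) {
  unsigned R = 0;
  while (V > 1) { V >>= 1; ++R; }
  return R;
}

int main() {
  unsigned EltBits = 8;          // desired vXi8 result
  unsigned PackEltBits = 32;     // matchShuffleWithPACK matched a vXi32 source
  unsigned NumStages = log2u(PackEltBits / EltBits);
  std::printf("stages = %u\n", NumStages); // 2: i32 -> i16 -> i8
  for (unsigned CurBits = PackEltBits; CurBits > EltBits; CurBits /= 2)
    std::printf("  pack i%u -> i%u\n", CurBits, CurBits / 2);
}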
12899
12900/// Try to emit a bitmask instruction for a shuffle.
12901///
12902/// This handles cases where we can model a blend exactly as a bitmask due to
12903/// one of the inputs being zeroable.
12904static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12905 SDValue V2, ArrayRef<int> Mask,
12906 const APInt &Zeroable,
12907 const X86Subtarget &Subtarget,
12908 SelectionDAG &DAG) {
12909 MVT MaskVT = VT;
12910 MVT EltVT = VT.getVectorElementType();
12911 SDValue Zero, AllOnes;
12912 // Use f64 if i64 isn't legal.
12913 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12914 EltVT = MVT::f64;
12915 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12916 }
12917
12918 MVT LogicVT = VT;
12919 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12920 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12921 APFloat AllOnesValue =
12922 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12923 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12924 LogicVT =
12925 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12926 } else {
12927 Zero = DAG.getConstant(0, DL, EltVT);
12928 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12929 }
12930
12931 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12932 SDValue V;
12933 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12934 if (Zeroable[i])
12935 continue;
12936 if (Mask[i] % Size != i)
12937 return SDValue(); // Not a blend.
12938 if (!V)
12939 V = Mask[i] < Size ? V1 : V2;
12940 else if (V != (Mask[i] < Size ? V1 : V2))
12941 return SDValue(); // Can only let one input through the mask.
12942
12943 VMaskOps[i] = AllOnes;
12944 }
12945 if (!V)
12946 return SDValue(); // No non-zeroable elements!
12947
12948 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12949 VMask = DAG.getBitcast(LogicVT, VMask);
12950 V = DAG.getBitcast(LogicVT, V);
12951 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12952 return DAG.getBitcast(VT, And);
12953}
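A scalar model of this bitmask lowering is shown below as a standalone sketch (illustrative only, invented names): zeroable lanes get a 0 mask element, every other lane gets all-ones, and a single AND produces the blend.

// Illustrative sketch (not from X86ISelLowering.cpp): scalar model of
// lowerShuffleAsBitMask - zero the zeroable lanes with one AND.
#include <cstdint>
#include <cstdio>

int main() {
  const int NumElts = 4;
  uint32_t V[NumElts] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
  bool Zeroable[NumElts] = {false, true, false, true}; // lanes 1 and 3 -> zero
  uint32_t Mask[NumElts], Res[NumElts];
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = Zeroable[i] ? 0u : 0xFFFFFFFFu; // build-vector of 0 / all-ones
  for (int i = 0; i != NumElts; ++i)
    Res[i] = V[i] & Mask[i];                  // the single AND node
  for (int i = 0; i != NumElts; ++i)
    std::printf("Res[%d] = 0x%08x\n", i, Res[i]);
}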
12954
12955/// Try to emit a blend instruction for a shuffle using bit math.
12956///
12957/// This is used as a fallback approach when first class blend instructions are
12958/// unavailable. Currently it is only suitable for integer vectors, but could
12959/// be generalized for floating point vectors if desirable.
12960static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12961 SDValue V2, ArrayRef<int> Mask,
12962 SelectionDAG &DAG) {
12963 assert(VT.isInteger() && "Only supports integer vector types!");
12964 MVT EltVT = VT.getVectorElementType();
12965 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12966 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12967 SmallVector<SDValue, 16> MaskOps;
12968 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12969 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12970 return SDValue(); // Shuffled input!
12971 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12972 }
12973
12974 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12975 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12976 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12977 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12978}
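The same idea in scalar form, as a standalone sketch (illustrative only, invented names): each element selects V1 or V2 via AND/ANDN/OR, mirroring the three nodes built above.

// Illustrative sketch (not from X86ISelLowering.cpp): scalar model of
// lowerShuffleAsBitBlend - per-element select via AND / ANDN / OR.
#include <cstdint>
#include <cstdio>

int main() {
  const int NumElts = 4;
  uint32_t V1[NumElts] = {1, 2, 3, 4};
  uint32_t V2[NumElts] = {10, 20, 30, 40};
  // Mask element i selects V1 when Mask[i] == i, V2 when Mask[i] == i + NumElts.
  int ShuffleMask[NumElts] = {0, 5, 2, 7};
  for (int i = 0; i != NumElts; ++i) {
    uint32_t M = ShuffleMask[i] < NumElts ? 0xFFFFFFFFu : 0u; // all-ones -> V1
    uint32_t Res = (V1[i] & M) | (~M & V2[i]);                // AND, ANDNP, OR
    std::printf("Res[%d] = %u\n", i, Res);                    // 1, 20, 3, 40
  }
}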
12979
12980static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12981 SDValue PreservedSrc,
12982 const X86Subtarget &Subtarget,
12983 SelectionDAG &DAG);
12984
12985static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
12986 MutableArrayRef<int> Mask,
12987 const APInt &Zeroable, bool &ForceV1Zero,
12988 bool &ForceV2Zero, uint64_t &BlendMask) {
12989 bool V1IsZeroOrUndef =
12990 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12991 bool V2IsZeroOrUndef =
12992 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12993
12994 BlendMask = 0;
12995 ForceV1Zero = false, ForceV2Zero = false;
12996 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12997
12998 int NumElts = Mask.size();
12999 int NumLanes = VT.getSizeInBits() / 128;
13000 int NumEltsPerLane = NumElts / NumLanes;
13001 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13002
13003 // For 32/64-bit elements, if we only reference one input (plus any undefs),
13004 // then ensure the blend mask part for that lane just references that input.
13005 bool ForceWholeLaneMasks =
13006 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13007
13008 // Attempt to generate the binary blend mask. If an input is zero then
13009 // we can use any lane.
13010 for (int Lane = 0; Lane != NumLanes; ++Lane) {
13011 // Keep track of the inputs used per lane.
13012 bool LaneV1InUse = false;
13013 bool LaneV2InUse = false;
13014 uint64_t LaneBlendMask = 0;
13015 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13016 int Elt = (Lane * NumEltsPerLane) + LaneElt;
13017 int M = Mask[Elt];
13018 if (M == SM_SentinelUndef)
13019 continue;
13020 if (M == Elt || (0 <= M && M < NumElts &&
13021 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13022 Mask[Elt] = Elt;
13023 LaneV1InUse = true;
13024 continue;
13025 }
13026 if (M == (Elt + NumElts) ||
13027 (NumElts <= M &&
13028 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13029 LaneBlendMask |= 1ull << LaneElt;
13030 Mask[Elt] = Elt + NumElts;
13031 LaneV2InUse = true;
13032 continue;
13033 }
13034 if (Zeroable[Elt]) {
13035 if (V1IsZeroOrUndef) {
13036 ForceV1Zero = true;
13037 Mask[Elt] = Elt;
13038 LaneV1InUse = true;
13039 continue;
13040 }
13041 if (V2IsZeroOrUndef) {
13042 ForceV2Zero = true;
13043 LaneBlendMask |= 1ull << LaneElt;
13044 Mask[Elt] = Elt + NumElts;
13045 LaneV2InUse = true;
13046 continue;
13047 }
13048 }
13049 return false;
13050 }
13051
13052 // If we only used V2 then splat the lane blend mask to avoid any demanded
13053 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13054 // blend mask bit).
13055 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13056 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13057
13058 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13059 }
13060 return true;
13061}
13062
13063static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13064 int Scale) {
13065 uint64_t ScaledMask = 0;
13066 for (int i = 0; i != Size; ++i)
13067 if (BlendMask & (1ull << i))
13068 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13069 return ScaledMask;
13070}
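A small standalone sketch of this scaling (illustrative only; the driver is invented): a v4 blend immediate 0b1010 rewritten for half-sized elements becomes 0b11001100.

// Illustrative sketch (not from X86ISelLowering.cpp): what
// scaleVectorShuffleBlendMask does when each element is widened by Scale.
#include <cstdint>
#include <cstdio>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

int main() {
  // A v4 blend taking elements 1 and 3 from V2 (imm 0b1010), rewritten for a
  // type with twice as many, half-sized elements: each bit becomes two bits.
  std::printf("0x%llx\n",
              (unsigned long long)scaleBlendMask(0b1010, /*Size=*/4, /*Scale=*/2));
  // Prints 0xcc (0b11001100).
}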
13071
13072/// Try to emit a blend instruction for a shuffle.
13073///
13074/// This doesn't do any checks for the availability of instructions for blending
13075/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13076/// be matched in the backend with the type given. What it does check for is
13077/// that the shuffle mask is a blend, or convertible into a blend with zero.
13078static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13079 SDValue V2, ArrayRef<int> Original,
13080 const APInt &Zeroable,
13081 const X86Subtarget &Subtarget,
13082 SelectionDAG &DAG) {
13083 uint64_t BlendMask = 0;
13084 bool ForceV1Zero = false, ForceV2Zero = false;
13085 SmallVector<int, 64> Mask(Original);
13086 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13087 BlendMask))
13088 return SDValue();
13089
13090 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13091 if (ForceV1Zero)
13092 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13093 if (ForceV2Zero)
13094 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13095
13096 unsigned NumElts = VT.getVectorNumElements();
13097
13098 switch (VT.SimpleTy) {
13099 case MVT::v4i64:
13100 case MVT::v8i32:
13101 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13102 [[fallthrough]];
13103 case MVT::v4f64:
13104 case MVT::v8f32:
13105 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13106 [[fallthrough]];
13107 case MVT::v2f64:
13108 case MVT::v2i64:
13109 case MVT::v4f32:
13110 case MVT::v4i32:
13111 case MVT::v8i16:
13112 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13113 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13114 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13115 case MVT::v16i16: {
13116 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13117 SmallVector<int, 8> RepeatedMask;
13118 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13119 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13120 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13121 BlendMask = 0;
13122 for (int i = 0; i < 8; ++i)
13123 if (RepeatedMask[i] >= 8)
13124 BlendMask |= 1ull << i;
13125 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13126 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13127 }
13128 // Use PBLENDW for lower/upper lanes and then blend lanes.
13129 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13130 // merge to VSELECT where useful.
13131 uint64_t LoMask = BlendMask & 0xFF;
13132 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13133 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13134 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13135 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13136 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13137 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13138 return DAG.getVectorShuffle(
13139 MVT::v16i16, DL, Lo, Hi,
13140 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13141 }
13142 [[fallthrough]];
13143 }
13144 case MVT::v32i8:
13145 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13146 [[fallthrough]];
13147 case MVT::v16i8: {
13148 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13149
13150 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13151 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13152 Subtarget, DAG))
13153 return Masked;
13154
13155 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13156 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13157 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13158 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13159 }
13160
13161 // If we have VPTERNLOG, we can use that as a bit blend.
13162 if (Subtarget.hasVLX())
13163 if (SDValue BitBlend =
13164 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13165 return BitBlend;
13166
13167 // Scale the blend by the number of bytes per element.
13168 int Scale = VT.getScalarSizeInBits() / 8;
13169
13170 // This form of blend is always done on bytes. Compute the byte vector
13171 // type.
13172 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13173
13174 // x86 allows load folding with blendvb from the 2nd source operand. But
13175 // we are still using LLVM select here (see comment below), so that's V1.
13176 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13177 // allow that load-folding possibility.
13178 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13179 ShuffleVectorSDNode::commuteMask(Mask);
13180 std::swap(V1, V2);
13181 }
13182
13183 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13184 // mix of LLVM's code generator and the x86 backend. We tell the code
13185 // generator that boolean values in the elements of an x86 vector register
13186 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13187 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13188 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13189 // of the element (the remaining are ignored) and 0 in that high bit would
13190 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13191 // the LLVM model for boolean values in vector elements gets the relevant
13192 // bit set, it is set backwards and over constrained relative to x86's
13193 // actual model.
13194 SmallVector<SDValue, 32> VSELECTMask;
13195 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13196 for (int j = 0; j < Scale; ++j)
13197 VSELECTMask.push_back(
13198 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13199 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13200 MVT::i8));
13201
13202 V1 = DAG.getBitcast(BlendVT, V1);
13203 V2 = DAG.getBitcast(BlendVT, V2);
13204 return DAG.getBitcast(
13205 VT,
13206 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13207 V1, V2));
13208 }
13209 case MVT::v16f32:
13210 case MVT::v8f64:
13211 case MVT::v8i64:
13212 case MVT::v16i32:
13213 case MVT::v32i16:
13214 case MVT::v64i8: {
13215 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13216 bool OptForSize = DAG.shouldOptForSize();
13217 if (!OptForSize) {
13218 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13219 Subtarget, DAG))
13220 return Masked;
13221 }
13222
13223 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13224 // masked move.
13225 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13226 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13227 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13228 }
13229 default:
13230 llvm_unreachable("Not a supported integer vector type!");
13231 }
13232}
13233
13234/// Try to lower as a blend of elements from two inputs followed by
13235/// a single-input permutation.
13236///
13237/// This matches the pattern where we can blend elements from two inputs and
13238/// then reduce the shuffle to a single-input permutation.
13239static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13240 SDValue V1, SDValue V2,
13241 ArrayRef<int> Mask,
13242 SelectionDAG &DAG,
13243 bool ImmBlends = false) {
13244 // We build up the blend mask while checking whether a blend is a viable way
13245 // to reduce the shuffle.
13246 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13247 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13248
13249 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13250 if (Mask[i] < 0)
13251 continue;
13252
13253 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13254
13255 if (BlendMask[Mask[i] % Size] < 0)
13256 BlendMask[Mask[i] % Size] = Mask[i];
13257 else if (BlendMask[Mask[i] % Size] != Mask[i])
13258 return SDValue(); // Can't blend in the needed input!
13259
13260 PermuteMask[i] = Mask[i] % Size;
13261 }
13262
13263 // If only immediate blends, then bail if the blend mask can't be widened to
13264 // i16.
13265 unsigned EltSize = VT.getScalarSizeInBits();
13266 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13267 return SDValue();
13268
13269 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13270 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13271}
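A concrete v4 example of the BlendMask/PermuteMask decomposition above, as a standalone sketch (illustrative only, invented names):

// Illustrative sketch (not from X86ISelLowering.cpp): split a 2-input mask
// into a per-element blend followed by a single-input permute.
#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;
  std::vector<int> Mask = {2, 7, 0, 5};          // mixes V1 (0..3) and V2 (4..7)
  std::vector<int> BlendMask(Size, -1), PermuteMask(Size, -1);
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (BlendMask[M % Size] >= 0 && BlendMask[M % Size] != M) {
      std::puts("not decomposable");             // both inputs want this slot
      return 0;
    }
    BlendMask[M % Size] = M;
    PermuteMask[i] = M % Size;
  }
  // BlendMask   = {0, 5, 2, 7}  -> a pure per-element blend of V1/V2.
  // PermuteMask = {2, 3, 0, 1}  -> a single-input permute of the blend result.
  for (int i = 0; i != Size; ++i)
    std::printf("Blend[%d]=%d Permute[%d]=%d\n", i, BlendMask[i], i, PermuteMask[i]);
}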
13272
13273/// Try to lower as an unpack of elements from two inputs followed by
13274/// a single-input permutation.
13275///
13276/// This matches the pattern where we can unpack elements from two inputs and
13277/// then reduce the shuffle to a single-input (wider) permutation.
13278static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13279 SDValue V1, SDValue V2,
13280 ArrayRef<int> Mask,
13281 SelectionDAG &DAG) {
13282 int NumElts = Mask.size();
13283 int NumLanes = VT.getSizeInBits() / 128;
13284 int NumLaneElts = NumElts / NumLanes;
13285 int NumHalfLaneElts = NumLaneElts / 2;
13286
13287 bool MatchLo = true, MatchHi = true;
13288 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13289
13290 // Determine UNPCKL/UNPCKH type and operand order.
13291 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
30
Assuming 'Lane' is not equal to 'NumElts'
31
Loop condition is true. Entering loop body
43
Assuming 'Lane' is equal to 'NumElts'
13292 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
32
Assuming 'Elt' is not equal to 'NumLaneElts'
33
Loop condition is true. Entering loop body
41
Assuming 'Elt' is equal to 'NumLaneElts'
42
Loop condition is false. Execution continues on line 13291
13293 int M = Mask[Lane + Elt];
13294 if (M < 0)
34
Assuming 'M' is >= 0
35
Taking false branch
13295 continue;
13296
13297 SDValue &Op = Ops[Elt & 1];
13298 if (M < NumElts && (Op.isUndef() || Op == V1))
36
Assuming 'M' is >= 'NumElts'
13299 Op = V1;
13300 else if (NumElts <= M && (Op.isUndef() || Op == V2))
37
Assuming 'NumElts' is <= 'M'
13301 Op = V2;
13302 else
13303 return SDValue();
13304
13305 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13306 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
38
Assuming the condition is false
13307 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
13308 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
39
Assuming the condition is false
13309 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
13310 if (!MatchLo && !MatchHi)
40
Assuming 'MatchLo' is true
13311 return SDValue();
13312 }
13313 }
13314 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
44
Loop condition is false. Execution continues on line 13314
45
Assuming the condition is true
46
'?' condition is true
13315
13316 // Now check that each pair of elts come from the same unpack pair
13317 // and set the permute mask based on each pair.
13318 // TODO - Investigate cases where we permute individual elements.
13319 SmallVector<int, 32> PermuteMask(NumElts, -1);
13320 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
47
Loop condition is true. Entering loop body
13321 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
48
Loop condition is true. Entering loop body
13322 int M0 = Mask[Lane + Elt + 0];
13323 int M1 = Mask[Lane + Elt + 1];
13324 if (0 <= M0 && 0 <= M1 &&
49
Assuming 'M0' is < 0
13325 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
13326 return SDValue();
13327 if (0 <= M0)
49.1
'M0' is < 0
50
Taking false branch
13328 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
13329 if (0 <= M1)
51
Assuming 'M1' is >= 0
52
Taking true branch
13330 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
53
The result of the '%' expression is undefined
13331 }
13332 }
13333
13334 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13335 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13336 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13337}
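On the reported warning at line 13330: in C++ the result of '%' is undefined when the right-hand operand is zero, and along the path the analyzer assumes nothing it can see constrains NumHalfLaneElts (NumLaneElts / 2) to be non-zero, which is presumably why it flags M1 % NumHalfLaneElts. The standalone sketch below is illustrative only, not a proposed LLVM patch, and its names are invented; it shows the same index computation with an explicit guard.

// Illustrative sketch (not from X86ISelLowering.cpp): the permute-index math
// from line 13330, guarded against a zero divisor.
#include <cassert>
#include <cstdio>

static int permuteIndex(int Lane, int M, int NumHalfLaneElts) {
  assert(NumHalfLaneElts > 0 && "divisor must be non-zero for operator%");
  return Lane + (2 * (M % NumHalfLaneElts)) + 1;
}

int main() {
  // v8i16 with one 128-bit lane: NumLaneElts = 8, NumHalfLaneElts = 4.
  std::printf("%d\n", permuteIndex(/*Lane=*/0, /*M=*/6, /*NumHalfLaneElts=*/4));
  // Prints 5, i.e. element 6 lands in odd slot 2*(6%4)+1.
}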
13338
13339/// Try to lower a shuffle as a permute of the inputs followed by an
13340/// UNPCK instruction.
13341///
13342/// This specifically targets cases where we end up with alternating between
13343/// the two inputs, and so can permute them into something that feeds a single
13344/// UNPCK instruction. Note that this routine only targets integer vectors
13345/// because for floating point vectors we have a generalized SHUFPS lowering
13346/// strategy that handles everything that doesn't *exactly* match an unpack,
13347/// making this clever lowering unnecessary.
13348static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13349 SDValue V1, SDValue V2,
13350 ArrayRef<int> Mask,
13351 const X86Subtarget &Subtarget,
13352 SelectionDAG &DAG) {
13353 int Size = Mask.size();
13354 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13355
13356 // This routine only supports 128-bit integer dual input vectors.
13357 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13358 return SDValue();
13359
13360 int NumLoInputs =
13361 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13362 int NumHiInputs =
13363 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13364
13365 bool UnpackLo = NumLoInputs >= NumHiInputs;
13366
13367 auto TryUnpack = [&](int ScalarSize, int Scale) {
13368 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13369 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13370
13371 for (int i = 0; i < Size; ++i) {
13372 if (Mask[i] < 0)
13373 continue;
13374
13375 // Each element of the unpack contains Scale elements from this mask.
13376 int UnpackIdx = i / Scale;
13377
13378 // We only handle the case where V1 feeds the first slots of the unpack.
13379 // We rely on canonicalization to ensure this is the case.
13380 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13381 return SDValue();
13382
13383 // Setup the mask for this input. The indexing is tricky as we have to
13384 // handle the unpack stride.
13385 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13386 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13387 Mask[i] % Size;
13388 }
13389
13390 // If we will have to shuffle both inputs to use the unpack, check whether
13391 // we can just unpack first and shuffle the result. If so, skip this unpack.
13392 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13393 !isNoopShuffleMask(V2Mask))
13394 return SDValue();
13395
13396 // Shuffle the inputs into place.
13397 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13398 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13399
13400 // Cast the inputs to the type we will use to unpack them.
13401 MVT UnpackVT =
13402 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13403 V1 = DAG.getBitcast(UnpackVT, V1);
13404 V2 = DAG.getBitcast(UnpackVT, V2);
13405
13406 // Unpack the inputs and cast the result back to the desired type.
13407 return DAG.getBitcast(
13408 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13409 UnpackVT, V1, V2));
13410 };
13411
13412 // We try each unpack from the largest to the smallest to try and find one
13413 // that fits this mask.
13414 int OrigScalarSize = VT.getScalarSizeInBits();
13415 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13416 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13417 return Unpack;
13418
13419 // If we're shuffling with a zero vector then we're better off not doing
13420 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13421 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13422 ISD::isBuildVectorAllZeros(V2.getNode()))
13423 return SDValue();
13424
13425 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13426 // initial unpack.
13427 if (NumLoInputs == 0 || NumHiInputs == 0) {
13428 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13429 "We have to have *some* inputs!");
13430 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13431
13432 // FIXME: We could consider the total complexity of the permute of each
13433 // possible unpacking. Or at the least we should consider how many
13434 // half-crossings are created.
13435 // FIXME: We could consider commuting the unpacks.
13436
13437 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13438 for (int i = 0; i < Size; ++i) {
13439 if (Mask[i] < 0)
13440 continue;
13441
13442 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13443
13444 PermMask[i] =
13445 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13446 }
13447 return DAG.getVectorShuffle(
13448 VT, DL,
13449 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13450 V1, V2),
13451 DAG.getUNDEF(VT), PermMask);
13452 }
13453
13454 return SDValue();
13455}
13456
13457/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13458/// permuting the elements of the result in place.
13459static SDValue lowerShuffleAsByteRotateAndPermute(
13460 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13461 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13462 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13463 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13464 (VT.is512BitVector() && !Subtarget.hasBWI()))
13465 return SDValue();
13466
13467 // We don't currently support lane crossing permutes.
13468 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13469 return SDValue();
13470
13471 int Scale = VT.getScalarSizeInBits() / 8;
13472 int NumLanes = VT.getSizeInBits() / 128;
13473 int NumElts = VT.getVectorNumElements();
13474 int NumEltsPerLane = NumElts / NumLanes;
13475
13476 // Determine range of mask elts.
13477 bool Blend1 = true;
13478 bool Blend2 = true;
13479 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13480 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13481 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13482 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13483 int M = Mask[Lane + Elt];
13484 if (M < 0)
13485 continue;
13486 if (M < NumElts) {
13487 Blend1 &= (M == (Lane + Elt));
13488 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13489 M = M % NumEltsPerLane;
13490 Range1.first = std::min(Range1.first, M);
13491 Range1.second = std::max(Range1.second, M);
13492 } else {
13493 M -= NumElts;
13494 Blend2 &= (M == (Lane + Elt));
13495 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13496 M = M % NumEltsPerLane;
13497 Range2.first = std::min(Range2.first, M);
13498 Range2.second = std::max(Range2.second, M);
13499 }
13500 }
13501 }
13502
13503 // Bail if we don't need both elements.
13504 // TODO - it might be worth doing this for unary shuffles if the permute
13505 // can be widened.
13506 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13507 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13508 return SDValue();
13509
13510 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13511 return SDValue();
13512
13513 // Rotate the 2 ops so we can access both ranges, then permute the result.
13514 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13515 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13516 SDValue Rotate = DAG.getBitcast(
13517 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13518 DAG.getBitcast(ByteVT, Lo),
13519 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13520 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13521 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13522 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13523 int M = Mask[Lane + Elt];
13524 if (M < 0)
13525 continue;
13526 if (M < NumElts)
13527 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13528 else
13529 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13530 }
13531 }
13532 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13533 };
13534
13535 // Check if the ranges are small enough to rotate from either direction.
13536 if (Range2.second < Range1.first)
13537 return RotateAndPermute(V1, V2, Range1.first, 0);
13538 if (Range1.second < Range2.first)
13539 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13540 return SDValue();
13541}
13542
13543static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13544 return isUndefOrEqual(Mask, 0);
13545}
13546
13547static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13548 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13549}
13550
13551/// Generic routine to decompose a shuffle and blend into independent
13552/// blends and permutes.
13553///
13554/// This matches the extremely common pattern for handling combined
13555/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13556/// operations. It will try to pick the best arrangement of shuffles and
13557/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13558static SDValue lowerShuffleAsDecomposedShuffleMerge(
13559 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13560 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13561 int NumElts = Mask.size();
13562 int NumLanes = VT.getSizeInBits() / 128;
13563 int NumEltsPerLane = NumElts / NumLanes;
13564
13565 // Shuffle the input elements into the desired positions in V1 and V2 and
13566 // unpack/blend them together.
13567 bool IsAlternating = true;
13568 SmallVector<int, 32> V1Mask(NumElts, -1);
13569 SmallVector<int, 32> V2Mask(NumElts, -1);
13570 SmallVector<int, 32> FinalMask(NumElts, -1);
13571 for (int i = 0; i < NumElts; ++i) {
23
Assuming 'i' is >= 'NumElts'
24
Loop condition is false. Execution continues on line 13588
13572 int M = Mask[i];
13573 if (M >= 0 && M < NumElts) {
13574 V1Mask[i] = M;
13575 FinalMask[i] = i;
13576 IsAlternating &= (i & 1) == 0;
13577 } else if (M >= NumElts) {
13578 V2Mask[i] = M - NumElts;
13579 FinalMask[i] = i + NumElts;
13580 IsAlternating &= (i & 1) == 1;
13581 }
13582 }
13583
13584 // If we effectively only demand the 0'th element of \p Input, and not only
13585 // as 0'th element, then broadcast said input,
13586 // and change \p InputMask to be a no-op (identity) mask.
13587 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13588 &DAG](SDValue &Input,
13589 MutableArrayRef<int> InputMask) {
13590 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13591 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13592 !X86::mayFoldLoad(Input, Subtarget)))
13593 return;
13594 if (isNoopShuffleMask(InputMask))
13595 return;
13596 assert(isBroadcastShuffleMask(InputMask) &&
13597 "Expected to demand only the 0'th element.");
13598 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13599 for (auto I : enumerate(InputMask)) {
13600 int &InputMaskElt = I.value();
13601 if (InputMaskElt >= 0)
13602 InputMaskElt = I.index();
13603 }
13604 };
13605
13606 // Currently, we may need to produce one shuffle per input, and blend results.
13607 // It is possible that the shuffle for one of the inputs is already a no-op.
13608 // See if we can simplify non-no-op shuffles into broadcasts,
13609 // which we consider to be strictly better than an arbitrary shuffle.
13610 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13611 isNoopOrBroadcastShuffleMask(V2Mask)) {
13612 canonicalizeBroadcastableInput(V1, V1Mask);
13613 canonicalizeBroadcastableInput(V2, V2Mask);
13614 }
13615
13616 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13617 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13618 // the shuffle may be able to fold with a load or other benefit. However, when
13619 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13620 // pre-shuffle first is a better strategy.
13621 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
25
Assuming the condition is true
26
Assuming the condition is true
27
Taking true branch
13622 // Only prefer immediate blends to unpack/rotate.
13623 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
28
Taking false branch
13624 DAG, true))
13625 return BlendPerm;
13626 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
29
Calling 'lowerShuffleAsUNPCKAndPermute'
13627 DAG))
13628 return UnpackPerm;
13629 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13630 DL, VT, V1, V2, Mask, Subtarget, DAG))
13631 return RotatePerm;
13632 // Unpack/rotate failed - try again with variable blends.
13633 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13634 DAG))
13635 return BlendPerm;
13636 if (VT.getScalarSizeInBits() >= 32)
13637 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13638 DL, VT, V1, V2, Mask, Subtarget, DAG))
13639 return PermUnpack;
13640 }
13641
13642 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13643 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13644 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13645 // than half the elements coming from each source.
13646 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13647 V1Mask.assign(NumElts, -1);
13648 V2Mask.assign(NumElts, -1);
13649 FinalMask.assign(NumElts, -1);
13650 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13651 for (int j = 0; j != NumEltsPerLane; ++j) {
13652 int M = Mask[i + j];
13653 if (M >= 0 && M < NumElts) {
13654 V1Mask[i + (j / 2)] = M;
13655 FinalMask[i + j] = i + (j / 2);
13656 } else if (M >= NumElts) {
13657 V2Mask[i + (j / 2)] = M - NumElts;
13658 FinalMask[i + j] = i + (j / 2) + NumElts;
13659 }
13660 }
13661 }
13662
13663 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13664 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13665 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13666}
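The V1Mask/V2Mask/FinalMask split computed at the top of this routine can be seen on a concrete v4 mask with the standalone sketch below (illustrative only, invented names); in this particular example both per-input masks are already no-ops where defined, so only the final blend remains.

// Illustrative sketch (not from X86ISelLowering.cpp): decompose a 2-input
// shuffle mask into two single-input masks plus a blend mask.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> Mask = {0, 5, 2, 7};
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1), FinalMask(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {        // element comes from V1
      V1Mask[i] = M;
      FinalMask[i] = i;
    } else if (M >= NumElts) {          // element comes from V2
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
    }
  }
  // V1Mask    = { 0, -1,  2, -1}
  // V2Mask    = {-1,  1, -1,  3}
  // FinalMask = { 0,  5,  2,  7}  (a plain blend of the two shuffled inputs)
  for (int i = 0; i != NumElts; ++i)
    std::printf("%2d %2d %2d\n", V1Mask[i], V2Mask[i], FinalMask[i]);
}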
13667
13668/// Try to lower a vector shuffle as a bit rotation.
13669///
13670/// Look for a repeated rotation pattern in each sub group.
13671/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13672static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13673 int NumElts = Mask.size();
13674 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13675
13676 int RotateAmt = -1;
13677 for (int i = 0; i != NumElts; i += NumSubElts) {
13678 for (int j = 0; j != NumSubElts; ++j) {
13679 int M = Mask[i + j];
13680 if (M < 0)
13681 continue;
13682 if (!isInRange(M, i, i + NumSubElts))
13683 return -1;
13684 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13685 if (0 <= RotateAmt && Offset != RotateAmt)
13686 return -1;
13687 RotateAmt = Offset;
13688 }
13689 }
13690 return RotateAmt;
13691}
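A standalone sketch of the same per-subgroup rotation test (illustrative only, invented names), run on a v4 subgroup:

// Illustrative sketch (not from X86ISelLowering.cpp): detect a repeated
// ISD::ROTL element rotation amount within each subgroup of the mask.
#include <cstdio>
#include <vector>

static int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (size_t i = 0; i < Mask.size(); i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < (int)i || M >= (int)i + NumSubElts)
        return -1;                       // crosses the subgroup boundary
      int Offset = (NumSubElts - (M - ((int)i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1;                       // inconsistent rotation amount
      RotateAmt = Offset;
    }
  return RotateAmt;
}

int main() {
  std::printf("%d\n", matchBitRotate({1, 2, 3, 0}, 4)); // 3: rotate left by 3
  std::printf("%d\n", matchBitRotate({1, 2, 0, 3}, 4)); // -1: not a rotation
}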
13692
13693static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13694 const X86Subtarget &Subtarget,
13695 ArrayRef<int> Mask) {
13696 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13697 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13698
13699 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13700 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13701 int MaxSubElts = 64 / EltSizeInBits;
13702 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13703 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13704 if (RotateAmt < 0)
13705 continue;
13706
13707 int NumElts = Mask.size();
13708 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13709 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13710 return RotateAmt * EltSizeInBits;
13711 }
13712
13713 return -1;
13714}
13715
13716/// Lower shuffle using X86ISD::VROTLI rotations.
13717static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13718 ArrayRef<int> Mask,
13719 const X86Subtarget &Subtarget,
13720 SelectionDAG &DAG) {
13721 // Only XOP + AVX512 targets have bit rotation instructions.
13722 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13723 bool IsLegal =
13724 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13725 if (!IsLegal && Subtarget.hasSSE3())
13726 return SDValue();
13727
13728 MVT RotateVT;
13729 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13730 Subtarget, Mask);
13731 if (RotateAmt < 0)
13732 return SDValue();
13733
13734 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13735 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13736 // widen to vXi16 or more, then the existing lowering will be better.
13737 if (!IsLegal) {
13738 if ((RotateAmt % 16) == 0)
13739 return SDValue();
13740 // TODO: Use getTargetVShiftByConstNode.
13741 unsigned ShlAmt = RotateAmt;
13742 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13743 V1 = DAG.getBitcast(RotateVT, V1);
13744 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13745 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13746 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13747 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13748 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13749 return DAG.getBitcast(VT, Rot);
13750 }
13751
13752 SDValue Rot =
13753 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13754 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13755 return DAG.getBitcast(VT, Rot);
13756}
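The pre-AVX512/XOP fallback above builds the rotate from a shift pair. A scalar sketch of that identity for a 16-bit element follows (illustrative only, invented name):

// Illustrative sketch (not from X86ISelLowering.cpp): ROTL expressed as
// SHL | SRL, as the VSHLI/VSRLI/OR nodes above do per element.
#include <cstdint>
#include <cstdio>

static uint16_t rotl16(uint16_t V, unsigned RotateAmt) {
  unsigned ShlAmt = RotateAmt;
  unsigned SrlAmt = 16 - RotateAmt;                 // RotateVT scalar size
  return (uint16_t)((V << ShlAmt) | (V >> SrlAmt)); // X86ISD::VSHLI | VSRLI
}

int main() {
  std::printf("0x%04x\n", rotl16(0x12ab, 8)); // 0xab12: byte swap within i16
}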
13757
13758/// Try to match a vector shuffle as an element rotation.
13759///
13760 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13761static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13762 ArrayRef<int> Mask) {
13763 int NumElts = Mask.size();
13764
13765 // We need to detect various ways of spelling a rotation:
13766 // [11, 12, 13, 14, 15, 0, 1, 2]
13767 // [-1, 12, 13, 14, -1, -1, 1, -1]
13768 // [-1, -1, -1, -1, -1, -1, 1, 2]
13769 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13770 // [-1, 4, 5, 6, -1, -1, 9, -1]
13771 // [-1, 4, 5, 6, -1, -1, -1, -1]
13772 int Rotation = 0;
13773 SDValue Lo, Hi;
13774 for (int i = 0; i < NumElts; ++i) {
13775 int M = Mask[i];
13776 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13777 "Unexpected mask index.");
13778 if (M < 0)
13779 continue;
13780
13781 // Determine where a rotated vector would have started.
13782 int StartIdx = i - (M % NumElts);
13783 if (StartIdx == 0)
13784 // The identity rotation isn't interesting, stop.
13785 return -1;
13786
13787 // If we found the tail of a vector the rotation must be the missing
13788 // front. If we found the head of a vector, it must be how much of the
13789 // head.
13790 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13791
13792 if (Rotation == 0)
13793 Rotation = CandidateRotation;
13794 else if (Rotation != CandidateRotation)
13795 // The rotations don't match, so we can't match this mask.
13796 return -1;
13797
13798 // Compute which value this mask is pointing at.
13799 SDValue MaskV = M < NumElts ? V1 : V2;
13800
13801 // Compute which of the two target values this index should be assigned
13802 // to. This reflects whether the high elements are remaining or the low
13803 // elements are remaining.
13804 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13805
13806 // Either set up this value if we've not encountered it before, or check
13807 // that it remains consistent.
13808 if (!TargetV)
13809 TargetV = MaskV;
13810 else if (TargetV != MaskV)
13811 // This may be a rotation, but it pulls from the inputs in some
13812 // unsupported interleaving.
13813 return -1;
13814 }
13815
13816 // Check that we successfully analyzed the mask, and normalize the results.
13817 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13818 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13819 if (!Lo)
13820 Lo = Hi;
13821 else if (!Hi)
13822 Hi = Lo;
13823
13824 V1 = Lo;
13825 V2 = Hi;
13826
13827 return Rotation;
13828}
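The StartIdx / CandidateRotation bookkeeping can be exercised on the first example mask from the comment with this standalone sketch (mask values only, no SDValue tracking; illustrative and invented names):

// Illustrative sketch (not from X86ISelLowering.cpp): recover the element
// rotation amount from a shuffle mask.
#include <cstdio>
#include <vector>

static int matchElementRotate(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // identity rotation is not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // mixed rotation amounts
  }
  return Rotation;
}

int main() {
  // [11,12,13,14,15,0,1,2] for v8: tail of V2 followed by head of V1 -> 3.
  std::printf("%d\n", matchElementRotate({11, 12, 13, 14, 15, 0, 1, 2}));
}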
13829
13830/// Try to lower a vector shuffle as a byte rotation.
13831///
13832/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13833/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13834/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13835 /// try to generically lower a vector shuffle through such a pattern. It
13836/// does not check for the profitability of lowering either as PALIGNR or
13837/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13838/// This matches shuffle vectors that look like:
13839///
13840/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13841///
13842/// Essentially it concatenates V1 and V2, shifts right by some number of
13843/// elements, and takes the low elements as the result. Note that while this is
13844/// specified as a *right shift* because x86 is little-endian, it is a *left
13845/// rotate* of the vector lanes.
13846static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13847 ArrayRef<int> Mask) {
13848 // Don't accept any shuffles with zero elements.
13849 if (isAnyZero(Mask))
13850 return -1;
13851
13852 // PALIGNR works on 128-bit lanes.
13853 SmallVector<int, 16> RepeatedMask;
13854 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13855 return -1;
13856
13857 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13858 if (Rotation <= 0)
13859 return -1;
13860
13861 // PALIGNR rotates bytes, so we need to scale the
13862 // rotation based on how many bytes are in the vector lane.
13863 int NumElts = RepeatedMask.size();
13864 int Scale = 16 / NumElts;
13865 return Rotation * Scale;
13866}
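The final byte scaling is simple arithmetic: an element rotation of 3 on a repeated v8i16 lane becomes a PALIGNR immediate of 3 * (16 / 8) = 6 bytes. A minimal standalone sketch (illustrative only):

// Illustrative sketch (not from X86ISelLowering.cpp): scale an element
// rotation to the byte immediate PALIGNR expects.
#include <cstdio>

int main() {
  int NumElts = 8;                 // repeated 128-bit lane mask size (v8i16)
  int ElementRotation = 3;         // from matchShuffleAsElementRotate
  int Scale = 16 / NumElts;        // bytes per element in a 128-bit lane
  std::printf("PALIGNR imm = %d\n", ElementRotation * Scale); // 6
}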
13867
13868static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13869 SDValue V2, ArrayRef<int> Mask,
13870 const X86Subtarget &Subtarget,
13871 SelectionDAG &DAG) {
13872 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13873
13874 SDValue Lo = V1, Hi = V2;
13875 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13876 if (ByteRotation <= 0)
13877 return SDValue();
13878
13879 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13880 // PSLLDQ/PSRLDQ.
13881 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13882 Lo = DAG.getBitcast(ByteVT, Lo);
13883 Hi = DAG.getBitcast(ByteVT, Hi);
13884
13885 // SSSE3 targets can use the palignr instruction.
13886 if (Subtarget.hasSSSE3()) {
13887 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13888        "512-bit PALIGNR requires BWI instructions");
13889 return DAG.getBitcast(
13890 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13891 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13892 }
13893
13894 assert(VT.is128BitVector() &&
13895        "Rotate-based lowering only supports 128-bit lowering!");
13896 assert(Mask.size() <= 16 &&
13897        "Can shuffle at most 16 bytes in a 128-bit vector!");
13898 assert(ByteVT == MVT::v16i8 &&
13899        "SSE2 rotate lowering only needed for v16i8!");
13900
13901 // Default SSE2 implementation
13902 int LoByteShift = 16 - ByteRotation;
13903 int HiByteShift = ByteRotation;
13904
13905 SDValue LoShift =
13906 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13907 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13908 SDValue HiShift =
13909 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13910 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13911 return DAG.getBitcast(VT,
13912 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13913}
13914
13915/// Try to lower a vector shuffle as a dword/qword rotation.
13916///
13917 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13918 /// rotation of the concatenation of two vectors; this routine will
13919 /// try to generically lower a vector shuffle through such a pattern.
13920///
13921/// Essentially it concatenates V1 and V2, shifts right by some number of
13922/// elements, and takes the low elements as the result. Note that while this is
13923/// specified as a *right shift* because x86 is little-endian, it is a *left
13924/// rotate* of the vector lanes.
13925static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13926 SDValue V2, ArrayRef<int> Mask,
13927 const X86Subtarget &Subtarget,
13928 SelectionDAG &DAG) {
13929 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13930        "Only 32-bit and 64-bit elements are supported!");
13931
13932 // 128/256-bit vectors are only supported with VLX.
13933 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13934        && "VLX required for 128/256-bit vectors");
13935
13936 SDValue Lo = V1, Hi = V2;
13937 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13938 if (Rotation <= 0)
13939 return SDValue();
13940
13941 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13942 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13943}
13944
13945/// Try to lower a vector shuffle as a byte shift sequence.
13946static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13947 SDValue V2, ArrayRef<int> Mask,
13948 const APInt &Zeroable,
13949 const X86Subtarget &Subtarget,
13950 SelectionDAG &DAG) {
13951 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13952 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13953
13954 // We need a shuffle that has zeros at one/both ends and a sequential
13955 // shuffle from one source within.
13956 unsigned ZeroLo = Zeroable.countr_one();
13957 unsigned ZeroHi = Zeroable.countl_one();
13958 if (!ZeroLo && !ZeroHi)
13959 return SDValue();
13960
13961 unsigned NumElts = Mask.size();
13962 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13963 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13964 return SDValue();
13965
13966 unsigned Scale = VT.getScalarSizeInBits() / 8;
13967 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13968 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13969 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13970 return SDValue();
13971
13972 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13973 Res = DAG.getBitcast(MVT::v16i8, Res);
13974
13975 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13976 // inner sequential set of elements, possibly offset:
13977 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13978 // 01234567 --> 4567zzzz --> zzzzz456
13979 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13980 if (ZeroLo == 0) {
13981 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13982 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13983 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13984 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13985 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13986 } else if (ZeroHi == 0) {
13987 unsigned Shift = Mask[ZeroLo] % NumElts;
13988 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13989 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13990 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13991 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13992 } else if (!Subtarget.hasSSSE3()) {
13993 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13994 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13995 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13996 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13997 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13998 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13999 Shift += Mask[ZeroLo] % NumElts;
14000 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14001 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14002 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14003 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14004 } else
14005 return SDValue();
14006
14007 return DAG.getBitcast(VT, Res);
14008}
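The commented shift chains in the routine above are easy to model in isolation. This standalone sketch assumes the usual little-endian layout (byte 0 is least significant), so PSLLDQ/VSHLDQ moves bytes toward higher indices and PSRLDQ/VSRLDQ toward lower ones; the helpers are illustrative stand-ins for the DAG nodes.

#include <array>
#include <cstdio>

using V16 = std::array<unsigned char, 16>;

static V16 pslldq(const V16 &V, int Imm) { // models X86ISD::VSHLDQ
  V16 R{};
  for (int i = Imm; i < 16; ++i)
    R[i] = V[i - Imm];
  return R;
}

static V16 psrldq(const V16 &V, int Imm) { // models X86ISD::VSRLDQ
  V16 R{};
  for (int i = 0; i + Imm < 16; ++i)
    R[i] = V[i + Imm];
  return R;
}

int main() {
  V16 V;
  for (int i = 0; i < 16; ++i)
    V[i] = (unsigned char)i;

  // Keep a run of source bytes and zero both ends: bytes 3..15 land at
  // positions 2..14, with zeros shifted in at positions 0, 1 and 15.
  V16 R = pslldq(psrldq(V, 3), 2);
  for (int i = 0; i < 16; ++i)
    std::printf("%d ", R[i]);
  std::printf("\n"); // 0 0 3 4 5 6 7 8 9 10 11 12 13 14 15 0
}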
14009
14010/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14011///
14012/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14013/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14014/// matches elements from one of the input vectors shuffled to the left or
14015/// right with zeroable elements 'shifted in'. It handles both the strictly
14016/// bit-wise element shifts and the byte shift across an entire 128-bit double
14017/// quad word lane.
14018///
14019/// PSHL : (little-endian) left bit shift.
14020/// [ zz, 0, zz, 2 ]
14021/// [ -1, 4, zz, -1 ]
14022/// PSRL : (little-endian) right bit shift.
14023/// [ 1, zz, 3, zz]
14024/// [ -1, -1, 7, zz]
14025/// PSLLDQ : (little-endian) left byte shift
14026/// [ zz, 0, 1, 2, 3, 4, 5, 6]
14027/// [ zz, zz, -1, -1, 2, 3, 4, -1]
14028/// [ zz, zz, zz, zz, zz, zz, -1, 1]
14029/// PSRLDQ : (little-endian) right byte shift
14030/// [ 5, 6, 7, zz, zz, zz, zz, zz]
14031/// [ -1, 5, 6, 7, zz, zz, zz, zz]
14032/// [ 1, 2, -1, -1, -1, -1, zz, zz]
14033static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14034 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14035 int MaskOffset, const APInt &Zeroable,
14036 const X86Subtarget &Subtarget) {
14037 int Size = Mask.size();
14038 unsigned SizeInBits = Size * ScalarSizeInBits;
14039
14040 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14041 for (int i = 0; i < Size; i += Scale)
14042 for (int j = 0; j < Shift; ++j)
14043 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14044 return false;
14045
14046 return true;
14047 };
14048
14049 auto MatchShift = [&](int Shift, int Scale, bool Left) {
14050 for (int i = 0; i != Size; i += Scale) {
14051 unsigned Pos = Left ? i + Shift : i;
14052 unsigned Low = Left ? i : i + Shift;
14053 unsigned Len = Scale - Shift;
14054 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14055 return -1;
14056 }
14057
14058 int ShiftEltBits = ScalarSizeInBits * Scale;
14059 bool ByteShift = ShiftEltBits > 64;
14060 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14061 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14062 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14063
14064 // Normalize the scale for byte shifts to still produce an i64 element
14065 // type.
14066 Scale = ByteShift ? Scale / 2 : Scale;
14067
14068 // We need to round trip through the appropriate type for the shift.
14069 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14070 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14071 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14072 return (int)ShiftAmt;
14073 };
14074
14075 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14076 // keep doubling the size of the integer elements up to that. We can
14077 // then shift the elements of the integer vector by whole multiples of
14078 // their width within the elements of the larger integer vector. Test each
14079 // multiple to see if we can find a match with the moved element indices
14080 // and that the shifted in elements are all zeroable.
14081 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14082 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14083 for (int Shift = 1; Shift != Scale; ++Shift)
14084 for (bool Left : {true, false})
14085 if (CheckZeros(Shift, Scale, Left)) {
14086 int ShiftAmt = MatchShift(Shift, Scale, Left);
14087 if (0 < ShiftAmt)
14088 return ShiftAmt;
14089 }
14090
14091 // no match
14092 return -1;
14093}
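For intuition about the matching above, a standalone sketch of the documented PSHL example [ zz, 0, zz, 2 ] on v4i32: doubling the element size to i64 (Scale == 2) and shifting each 64-bit element left by one 32-bit slot (Shift == 1, Left == true) reproduces the mask. The values and the little-endian packing are illustrative.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Src[4] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};

  // View the v4i32 as v2i64 (element 0 is the low half of the first i64) and
  // shift each 64-bit element left by 32 bits.
  uint64_t Q[2] = {
      (uint64_t)Src[0] | ((uint64_t)Src[1] << 32),
      (uint64_t)Src[2] | ((uint64_t)Src[3] << 32),
  };
  Q[0] <<= 32;
  Q[1] <<= 32;

  // Unpack back to v4i32: { 0, Src[0], 0, Src[2] }, i.e. the mask [ zz, 0, zz, 2 ].
  uint32_t Res[4] = {(uint32_t)Q[0], (uint32_t)(Q[0] >> 32),
                     (uint32_t)Q[1], (uint32_t)(Q[1] >> 32)};
  for (uint32_t R : Res)
    std::printf("0x%08x ", (unsigned)R);
  std::printf("\n"); // 0x00000000 0x11111111 0x00000000 0x33333333
}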
14094
14095static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14096 SDValue V2, ArrayRef<int> Mask,
14097 const APInt &Zeroable,
14098 const X86Subtarget &Subtarget,
14099 SelectionDAG &DAG, bool BitwiseOnly) {
14100 int Size = Mask.size();
14101 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14102
14103 MVT ShiftVT;
14104 SDValue V = V1;
14105 unsigned Opcode;
14106
14107 // Try to match shuffle against V1 shift.
14108 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14109 Mask, 0, Zeroable, Subtarget);
14110
14111 // If V1 failed, try to match shuffle against V2 shift.
14112 if (ShiftAmt < 0) {
14113 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14114 Mask, Size, Zeroable, Subtarget);
14115 V = V2;
14116 }
14117
14118 if (ShiftAmt < 0)
14119 return SDValue();
14120
14121 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
14122 return SDValue();
14123
14124 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14125        "Illegal integer vector type");
14126 V = DAG.getBitcast(ShiftVT, V);
14127 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14128 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14129 return DAG.getBitcast(VT, V);
14130}
14131
14132// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14133// Remainder of lower half result is zero and upper half is all undef.
14134static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14135 ArrayRef<int> Mask, uint64_t &BitLen,
14136 uint64_t &BitIdx, const APInt &Zeroable) {
14137 int Size = Mask.size();
14138 int HalfSize = Size / 2;
14139 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14140 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14141
14142 // Upper half must be undefined.
14143 if (!isUndefUpperHalf(Mask))
14144 return false;
14145
14146 // Determine the extraction length from the part of the
14147 // lower half that isn't zeroable.
14148 int Len = HalfSize;
14149 for (; Len > 0; --Len)
14150 if (!Zeroable[Len - 1])
14151 break;
14152 assert(Len > 0 && "Zeroable shuffle mask");
14153
14154 // Attempt to match first Len sequential elements from the lower half.
14155 SDValue Src;
14156 int Idx = -1;
14157 for (int i = 0; i != Len; ++i) {
14158 int M = Mask[i];
14159 if (M == SM_SentinelUndef)
14160 continue;
14161 SDValue &V = (M < Size ? V1 : V2);
14162 M = M % Size;
14163
14164 // The extracted elements must start at a valid index and all mask
14165 // elements must be in the lower half.
14166 if (i > M || M >= HalfSize)
14167 return false;
14168
14169 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14170 Src = V;
14171 Idx = M - i;
14172 continue;
14173 }
14174 return false;
14175 }
14176
14177 if (!Src || Idx < 0)
14178 return false;
14179
14180 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14181 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14182 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14183 V1 = Src;
14184 return true;
14185}
14186
14187// INSERTQ: Extract lowest Len elements from lower half of second source and
14188// insert over first source, starting at Idx.
14189// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14190static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14191 ArrayRef<int> Mask, uint64_t &BitLen,
14192 uint64_t &BitIdx) {
14193 int Size = Mask.size();
14194 int HalfSize = Size / 2;
14195 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14196
14197 // Upper half must be undefined.
14198 if (!isUndefUpperHalf(Mask))
14199 return false;
14200
14201 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14202 SDValue Base;
14203
14204 // Attempt to match first source from mask before insertion point.
14205 if (isUndefInRange(Mask, 0, Idx)) {
14206 /* EMPTY */
14207 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14208 Base = V1;
14209 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14210 Base = V2;
14211 } else {
14212 continue;
14213 }
14214
14215 // Extend the extraction length looking to match both the insertion of
14216 // the second source and the remaining elements of the first.
14217 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14218 SDValue Insert;
14219 int Len = Hi - Idx;
14220
14221 // Match insertion.
14222 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14223 Insert = V1;
14224 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14225 Insert = V2;
14226 } else {
14227 continue;
14228 }
14229
14230 // Match the remaining elements of the lower half.
14231 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14232 /* EMPTY */
14233 } else if ((!Base || (Base == V1)) &&
14234 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14235 Base = V1;
14236 } else if ((!Base || (Base == V2)) &&
14237 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14238 Size + Hi)) {
14239 Base = V2;
14240 } else {
14241 continue;
14242 }
14243
14244 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14245 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14246 V1 = Base;
14247 V2 = Insert;
14248 return true;
14249 }
14250 }
14251
14252 return false;
14253}
14254
14255/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14256static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14257 SDValue V2, ArrayRef<int> Mask,
14258 const APInt &Zeroable, SelectionDAG &DAG) {
14259 uint64_t BitLen, BitIdx;
14260 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14261 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14262 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14263 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14264
14265 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14266 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14267 V2 ? V2 : DAG.getUNDEF(VT),
14268 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14269 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14270
14271 return SDValue();
14272}
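A small sketch of the immediate encoding shared by the two matchers above, assuming v8i16 (16-bit scalars) and masks chosen purely for illustration: an EXTRQ of elements { 2, 3 } and an INSERTQ of two V2 elements at position 1. Both fields are bit counts masked to 6 bits, as in the code above.

#include <cstdio>

int main() {
  const unsigned ScalarBits = 16;

  // EXTRQ: lower half of the result is { 2, 3, zz, zz }, upper half undef.
  unsigned Len = 2, Idx = 2;
  std::printf("EXTRQ   BitLen=%u BitIdx=%u\n",
              (Len * ScalarBits) & 0x3f,  // 32
              (Idx * ScalarBits) & 0x3f); // 32

  // INSERTQ: lower half is { 0, 8, 9, 3 }, i.e. A[0], B[0], B[1], A[3].
  Len = 2; Idx = 1;
  std::printf("INSERTQ BitLen=%u BitIdx=%u\n",
              (Len * ScalarBits) & 0x3f,  // 32
              (Idx * ScalarBits) & 0x3f); // 16
}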
14273
14274/// Lower a vector shuffle as a zero or any extension.
14275///
14276/// Given a specific number of elements, element bit width, and extension
14277/// stride, produce either a zero or any extension based on the available
14278 /// features of the subtarget. The extended elements are consecutive and
14279 /// can start from an offset element index in the input; to avoid excess
14280 /// shuffling, the offset must either be in the bottom lane or at the start
14281 /// of a higher lane. All extended elements must be from
14282 /// the same lane.
14283static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14284 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14285 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14286 assert(Scale > 1 && "Need a scale to extend.");
14287 int EltBits = VT.getScalarSizeInBits();
14288 int NumElements = VT.getVectorNumElements();
14289 int NumEltsPerLane = 128 / EltBits;
14290 int OffsetLane = Offset / NumEltsPerLane;
14291 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14292        "Only 8, 16, and 32 bit elements can be extended.");
14293 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14294 assert(0 <= Offset && "Extension offset must be positive.");
14295 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14296        "Extension offset must be in the first lane or start an upper lane.");
14297
14298 // Check that an index is in same lane as the base offset.
14299 auto SafeOffset = [&](int Idx) {
14300 return OffsetLane == (Idx / NumEltsPerLane);
14301 };
14302
14303 // Shift along an input so that the offset base moves to the first element.
14304 auto ShuffleOffset = [&](SDValue V) {
14305 if (!Offset)
14306 return V;
14307
14308 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14309 for (int i = 0; i * Scale < NumElements; ++i) {
14310 int SrcIdx = i + Offset;
14311 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14312 }
14313 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14314 };
14315
14316 // Found a valid a/zext mask! Try various lowering strategies based on the
14317 // input type and available ISA extensions.
14318 if (Subtarget.hasSSE41()) {
14319 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14320 // PUNPCK will catch this in a later shuffle match.
14321 if (Offset && Scale == 2 && VT.is128BitVector())
14322 return SDValue();
14323 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14324 NumElements / Scale);
14325 InputV = DAG.getBitcast(VT, InputV);
14326 InputV = ShuffleOffset(InputV);
14327 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14328 DL, ExtVT, InputV, DAG);
14329 return DAG.getBitcast(VT, InputV);
14330 }
14331
14332 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14333 InputV = DAG.getBitcast(VT, InputV);
14334
14335 // For any extends we can cheat for larger element sizes and use shuffle
14336 // instructions that can fold with a load and/or copy.
14337 if (AnyExt && EltBits == 32) {
14338 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14339 -1};
14340 return DAG.getBitcast(
14341 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14342 DAG.getBitcast(MVT::v4i32, InputV),
14343 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14344 }
14345 if (AnyExt && EltBits == 16 && Scale > 2) {
14346 int PSHUFDMask[4] = {Offset / 2, -1,
14347 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14348 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14349 DAG.getBitcast(MVT::v4i32, InputV),
14350 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14351 int PSHUFWMask[4] = {1, -1, -1, -1};
14352 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14353 return DAG.getBitcast(
14354 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14355 DAG.getBitcast(MVT::v8i16, InputV),
14356 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14357 }
14358
14359 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14360 // to 64-bits.
14361 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14362 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14363 assert(VT.is128BitVector() && "Unexpected vector width!");
14364
14365 int LoIdx = Offset * EltBits;
14366 SDValue Lo = DAG.getBitcast(
14367 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14368 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14369 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14370
14371 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14372 return DAG.getBitcast(VT, Lo);
14373
14374 int HiIdx = (Offset + 1) * EltBits;
14375 SDValue Hi = DAG.getBitcast(
14376 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14377 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14378 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14379 return DAG.getBitcast(VT,
14380 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14381 }
14382
14383 // If this would require more than 2 unpack instructions to expand, use
14384 // pshufb when available. We can only use more than 2 unpack instructions
14385 // when zero extending i8 elements which also makes it easier to use pshufb.
14386 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14387 assert(NumElements == 16 && "Unexpected byte vector width!");
14388 SDValue PSHUFBMask[16];
14389 for (int i = 0; i < 16; ++i) {
14390 int Idx = Offset + (i / Scale);
14391 if ((i % Scale == 0 && SafeOffset(Idx))) {
14392 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14393 continue;
14394 }
14395 PSHUFBMask[i] =
14396 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14397 }
14398 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14399 return DAG.getBitcast(
14400 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14401 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14402 }
14403
14404 // If we are extending from an offset, ensure we start on a boundary that
14405 // we can unpack from.
14406 int AlignToUnpack = Offset % (NumElements / Scale);
14407 if (AlignToUnpack) {
14408 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14409 for (int i = AlignToUnpack; i < NumElements; ++i)
14410 ShMask[i - AlignToUnpack] = i;
14411 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14412 Offset -= AlignToUnpack;
14413 }
14414
14415 // Otherwise emit a sequence of unpacks.
14416 do {
14417 unsigned UnpackLoHi = X86ISD::UNPCKL;
14418 if (Offset >= (NumElements / 2)) {
14419 UnpackLoHi = X86ISD::UNPCKH;
14420 Offset -= (NumElements / 2);
14421 }
14422
14423 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14424 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14425 : getZeroVector(InputVT, Subtarget, DAG, DL);
14426 InputV = DAG.getBitcast(InputVT, InputV);
14427 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14428 Scale /= 2;
14429 EltBits *= 2;
14430 NumElements /= 2;
14431 } while (Scale > 1);
14432 return DAG.getBitcast(VT, InputV);
14433}
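As a standalone illustration of the unpack fallback at the end of the routine above: zero-extending the low eight bytes of a v16i8 to v8i16 is a single UNPCKL with a zero vector, modeled here with plain arrays under the usual little-endian layout (the data values are illustrative).

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t In[16];
  for (int i = 0; i < 16; ++i)
    In[i] = (uint8_t)(i + 1);

  // punpcklbw In, zero: interleave the low 8 bytes of In with zero bytes.
  uint8_t Inter[16];
  for (int i = 0; i < 8; ++i) {
    Inter[2 * i] = In[i];
    Inter[2 * i + 1] = 0;
  }

  // Reading the result as v8i16 gives the zero-extended values 1..8.
  for (int i = 0; i < 8; ++i) {
    unsigned W = (unsigned)Inter[2 * i] | ((unsigned)Inter[2 * i + 1] << 8);
    std::printf("%u ", W);
  }
  std::printf("\n"); // 1 2 3 4 5 6 7 8
}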
14434
14435/// Try to lower a vector shuffle as a zero extension on any microarch.
14436///
14437/// This routine will try to do everything in its power to cleverly lower
14438/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14439/// check for the profitability of this lowering, it tries to aggressively
14440/// match this pattern. It will use all of the micro-architectural details it
14441/// can to emit an efficient lowering. It handles both blends with all-zero
14442/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14443/// masking out later).
14444///
14445/// The reason we have dedicated lowering for zext-style shuffles is that they
14446/// are both incredibly common and often quite performance sensitive.
14447static SDValue lowerShuffleAsZeroOrAnyExtend(
14448 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14449 const APInt &Zeroable, const X86Subtarget &Subtarget,
14450 SelectionDAG &DAG) {
14451 int Bits = VT.getSizeInBits();
14452 int NumLanes = Bits / 128;
14453 int NumElements = VT.getVectorNumElements();
14454 int NumEltsPerLane = NumElements / NumLanes;
14455 assert(VT.getScalarSizeInBits() <= 32 &&
14456        "Exceeds 32-bit integer zero extension limit");
14457 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14458
14459 // Define a helper function to check a particular ext-scale and lower to it if
14460 // valid.
14461 auto Lower = [&](int Scale) -> SDValue {
14462 SDValue InputV;
14463 bool AnyExt = true;
14464 int Offset = 0;
14465 int Matches = 0;
14466 for (int i = 0; i < NumElements; ++i) {
14467 int M = Mask[i];
14468 if (M < 0)
14469 continue; // Valid anywhere but doesn't tell us anything.
14470 if (i % Scale != 0) {
14471 // Each of the extended elements needs to be zeroable.
14472 if (!Zeroable[i])
14473 return SDValue();
14474
14475 // We no longer are in the anyext case.
14476 AnyExt = false;
14477 continue;
14478 }
14479
14480 // Each of the base elements needs to be consecutive indices into the
14481 // same input vector.
14482 SDValue V = M < NumElements ? V1 : V2;
14483 M = M % NumElements;
14484 if (!InputV) {
14485 InputV = V;
14486 Offset = M - (i / Scale);
14487 } else if (InputV != V)
14488 return SDValue(); // Flip-flopping inputs.
14489
14490 // Offset must start in the lowest 128-bit lane or at the start of an
14491 // upper lane.
14492 // FIXME: Is it ever worth allowing a negative base offset?
14493 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14494 (Offset % NumEltsPerLane) == 0))
14495 return SDValue();
14496
14497 // If we are offsetting, all referenced entries must come from the same
14498 // lane.
14499 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14500 return SDValue();
14501
14502 if ((M % NumElements) != (Offset + (i / Scale)))
14503 return SDValue(); // Non-consecutive strided elements.
14504 Matches++;
14505 }
14506
14507 // If we fail to find an input, we have a zero-shuffle which should always
14508 // have already been handled.
14509 // FIXME: Maybe handle this here in case during blending we end up with one?
14510 if (!InputV)
14511 return SDValue();
14512
14513 // If we are offsetting, don't extend if we only match a single input, we
14514 // can always do better by using a basic PSHUF or PUNPCK.
14515 if (Offset != 0 && Matches < 2)
14516 return SDValue();
14517
14518 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14519 InputV, Mask, Subtarget, DAG);
14520 };
14521
14522 // The widest scale possible for extending is to a 64-bit integer.
14523 assert(Bits % 64 == 0 &&
14524        "The number of bits in a vector must be divisible by 64 on x86!");
14525 int NumExtElements = Bits / 64;
14526
14527 // Each iteration, try extending the elements half as much, but into twice as
14528 // many elements.
14529 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14530 assert(NumElements % NumExtElements == 0 &&
14531        "The input vector size must be divisible by the extended size.");
14532 if (SDValue V = Lower(NumElements / NumExtElements))
14533 return V;
14534 }
14535
14536 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14537 if (Bits != 128)
14538 return SDValue();
14539
14540 // Returns one of the source operands if the shuffle can be reduced to a
14541 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14542 auto CanZExtLowHalf = [&]() {
14543 for (int i = NumElements / 2; i != NumElements; ++i)
14544 if (!Zeroable[i])
14545 return SDValue();
14546 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14547 return V1;
14548 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14549 return V2;
14550 return SDValue();
14551 };
14552
14553 if (SDValue V = CanZExtLowHalf()) {
14554 V = DAG.getBitcast(MVT::v2i64, V);
14555 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14556 return DAG.getBitcast(VT, V);
14557 }
14558
14559 // No viable ext lowering found.
14560 return SDValue();
14561}
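A tiny sketch of the scale search in the loop above, assuming a 128-bit v16i8 shuffle: extension candidates start at two 64-bit elements and double each iteration, so Lower() is tried with Scale = 8, then 4, then 2.

#include <cstdio>

int main() {
  int Bits = 128, NumElements = 16;
  for (int NumExtElements = Bits / 64; NumExtElements < NumElements;
       NumExtElements *= 2)
    std::printf("try Scale = %d\n", NumElements / NumExtElements); // 8, 4, 2
}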
14562
14563/// Try to get a scalar value for a specific element of a vector.
14564///
14565/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14566static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14567 SelectionDAG &DAG) {
14568 MVT VT = V.getSimpleValueType();
14569 MVT EltVT = VT.getVectorElementType();
14570 V = peekThroughBitcasts(V);
14571
14572 // If the bitcasts shift the element size, we can't extract an equivalent
14573 // element from it.
14574 MVT NewVT = V.getSimpleValueType();
14575 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14576 return SDValue();
14577
14578 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14579 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14580 // Ensure the scalar operand is the same size as the destination.
14581 // FIXME: Add support for scalar truncation where possible.
14582 SDValue S = V.getOperand(Idx);
14583 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14584 return DAG.getBitcast(EltVT, S);
14585 }
14586
14587 return SDValue();
14588}
14589
14590/// Helper to test for a load that can be folded with x86 shuffles.
14591///
14592/// This is particularly important because the set of instructions varies
14593/// significantly based on whether the operand is a load or not.
14594static bool isShuffleFoldableLoad(SDValue V) {
14595 return V->hasOneUse() &&
14596 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14597}
14598
14599template<typename T>
14600static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14601 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14602}
14603
14604template<typename T>
14605bool X86TargetLowering::isSoftFP16(T VT) const {
14606 return ::isSoftFP16(VT, Subtarget);
14607}
14608
14609/// Try to lower insertion of a single element into a zero vector.
14610///
14611/// This is a common pattern that we have especially efficient patterns to lower
14612/// across all subtarget feature sets.
14613static SDValue lowerShuffleAsElementInsertion(
14614 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14615 const APInt &Zeroable, const X86Subtarget &Subtarget,
14616 SelectionDAG &DAG) {
14617 MVT ExtVT = VT;
14618 MVT EltVT = VT.getVectorElementType();
14619
14620 if (isSoftFP16(EltVT, Subtarget))
14621 return SDValue();
14622
14623 int V2Index =
14624 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14625 Mask.begin();
14626 bool IsV1Zeroable = true;
14627 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14628 if (i != V2Index && !Zeroable[i]) {
14629 IsV1Zeroable = false;
14630 break;
14631 }
14632
14633 // Check for a single input from a SCALAR_TO_VECTOR node.
14634 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14635 // all the smarts here sunk into that routine. However, the current
14636 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14637 // vector shuffle lowering is dead.
14638 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14639 DAG);
14640 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14641 // We need to zext the scalar if it is smaller than an i32.
14642 V2S = DAG.getBitcast(EltVT, V2S);
14643 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14644 // Using zext to expand a narrow element won't work for non-zero
14645 // insertions.
14646 if (!IsV1Zeroable)
14647 return SDValue();
14648
14649 // Zero-extend directly to i32.
14650 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14651 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14652 }
14653 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14654 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14655 EltVT == MVT::i16) {
14656 // Either not inserting from the low element of the input or the input
14657 // element size is too small to use VZEXT_MOVL to clear the high bits.
14658 return SDValue();
14659 }
14660
14661 if (!IsV1Zeroable) {
14662 // If V1 can't be treated as a zero vector we have fewer options to lower
14663 // this. We can't support integer vectors or non-zero targets cheaply, and
14664 // the V1 elements can't be permuted in any way.
14665 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14666 if (!VT.isFloatingPoint() || V2Index != 0)
14667 return SDValue();
14668 SmallVector<int, 8> V1Mask(Mask);
14669 V1Mask[V2Index] = -1;
14670 if (!isNoopShuffleMask(V1Mask))
14671 return SDValue();
14672 if (!VT.is128BitVector())
14673 return SDValue();
14674
14675 // Otherwise, use MOVSD, MOVSS or MOVSH.
14676 unsigned MovOpc = 0;
14677 if (EltVT == MVT::f16)
14678 MovOpc = X86ISD::MOVSH;
14679 else if (EltVT == MVT::f32)
14680 MovOpc = X86ISD::MOVSS;
14681 else if (EltVT == MVT::f64)
14682 MovOpc = X86ISD::MOVSD;
14683 else
14684 llvm_unreachable("Unsupported floating point element type to handle!");
14685 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14686 }
14687
14688 // This lowering only works for the low element with floating point vectors.
14689 if (VT.isFloatingPoint() && V2Index != 0)
14690 return SDValue();
14691
14692 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14693 if (ExtVT != VT)
14694 V2 = DAG.getBitcast(VT, V2);
14695
14696 if (V2Index != 0) {
14697 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14698 // the desired position. Otherwise it is more efficient to do a vector
14699 // shift left. We know that we can do a vector shift left because all
14700 // the inputs are zero.
14701 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14702 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14703 V2Shuffle[V2Index] = 0;
14704 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14705 } else {
14706 V2 = DAG.getBitcast(MVT::v16i8, V2);
14707 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14708 DAG.getTargetConstant(
14709 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14710 V2 = DAG.getBitcast(VT, V2);
14711 }
14712 }
14713 return V2;
14714}
14715
14716/// Try to lower broadcast of a single - truncated - integer element,
14717/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14718///
14719/// This assumes we have AVX2.
14720static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14721 int BroadcastIdx,
14722 const X86Subtarget &Subtarget,
14723 SelectionDAG &DAG) {
14724 assert(Subtarget.hasAVX2() &&
14725        "We can only lower integer broadcasts with AVX2!");
14726
14727 MVT EltVT = VT.getVectorElementType();
14728 MVT V0VT = V0.getSimpleValueType();
14729
14730 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14731 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14732
14733 MVT V0EltVT = V0VT.getVectorElementType();
14734 if (!V0EltVT.isInteger())
14735 return SDValue();
14736
14737 const unsigned EltSize = EltVT.getSizeInBits();
14738 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14739
14740 // This is only a truncation if the original element type is larger.
14741 if (V0EltSize <= EltSize)
14742 return SDValue();
14743
14744 assert(((V0EltSize % EltSize) == 0) &&
14745        "Scalar type sizes must all be powers of 2 on x86!");
14746
14747 const unsigned V0Opc = V0.getOpcode();
14748 const unsigned Scale = V0EltSize / EltSize;
14749 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14750
14751 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14752 V0Opc != ISD::BUILD_VECTOR)
14753 return SDValue();
14754
14755 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14756
14757 // If we're extracting non-least-significant bits, shift so we can truncate.
14758 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14759 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14760 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14761 if (const int OffsetIdx = BroadcastIdx % Scale)
14762 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14763 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14764
14765 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14766 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14767}
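A standalone sketch of the index arithmetic above, assuming we broadcast i16 element 3 of a vector built from i64 scalars (Scale == 4): the value lives in bits [48, 64) of 64-bit source element 0, so the scalar is shifted right before truncating. The constants are illustrative.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Scalar = 0x4444333322221111ull;  // 64-bit source element 0
  unsigned EltBits = 16, BroadcastIdx = 3, Scale = 64 / EltBits;
  unsigned SrcElt = BroadcastIdx / Scale;     // 0
  unsigned OffsetIdx = BroadcastIdx % Scale;  // 3
  uint16_t Splat = (uint16_t)(Scalar >> (OffsetIdx * EltBits));
  std::printf("source element %u, broadcast value 0x%04x\n", SrcElt,
              (unsigned)Splat); // source element 0, broadcast value 0x4444
}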
14768
14769/// Test whether this can be lowered with a single SHUFPS instruction.
14770///
14771/// This is used to disable more specialized lowerings when the shufps lowering
14772/// will happen to be efficient.
14773static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14774 // This routine only handles 128-bit shufps.
14775 assert(Mask.size() == 4 && "Unsupported mask size!");
14776 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14777 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14778 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14779 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14780
14781 // To lower with a single SHUFPS we need to have the low half and high half
14782 // each requiring a single input.
14783 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14784 return false;
14785 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14786 return false;
14787
14788 return true;
14789}
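The same test, written as a standalone checker over plain arrays (indices 0..3 select V1, 4..7 select V2, -1 is undef); the function name is illustrative.

#include <cstdio>

static bool isSingleShufpsSketch(const int M[4]) {
  // Each half of the mask must draw from a single source.
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}

int main() {
  int A[4] = {0, 3, 4, 7}; // low half from V1, high half from V2: one SHUFPS
  int B[4] = {0, 4, 1, 5}; // both halves mix inputs: needs more than one op
  std::printf("%d %d\n", isSingleShufpsSketch(A), isSingleShufpsSketch(B)); // 1 0
}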
14790
14791/// Test whether the specified input (0 or 1) is in-place blended by the
14792/// given mask.
14793///
14794/// This returns true if the elements from a particular input are already in the
14795/// slot required by the given mask and require no permutation.
14796static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14797 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14798 int Size = Mask.size();
14799 for (int i = 0; i < Size; ++i)
14800 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14801 return false;
14802
14803 return true;
14804}
14805
14806/// If we are extracting two 128-bit halves of a vector and shuffling the
14807/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14808/// multi-shuffle lowering.
14809static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14810 SDValue N1, ArrayRef<int> Mask,
14811 SelectionDAG &DAG) {
14812 MVT VT = N0.getSimpleValueType();
14813 assert((VT.is128BitVector() &&
14814         (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14815        "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14816
14817 // Check that both sources are extracts of the same source vector.
14818 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14819 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14820 N0.getOperand(0) != N1.getOperand(0) ||
14821 !N0.hasOneUse() || !N1.hasOneUse())
14822 return SDValue();
14823
14824 SDValue WideVec = N0.getOperand(0);
14825 MVT WideVT = WideVec.getSimpleValueType();
14826 if (!WideVT.is256BitVector())
14827 return SDValue();
14828
14829 // Match extracts of each half of the wide source vector. Commute the shuffle
14830 // if the extract of the low half is N1.
14831 unsigned NumElts = VT.getVectorNumElements();
14832 SmallVector<int, 4> NewMask(Mask);
14833 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14834 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14835 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14836 ShuffleVectorSDNode::commuteMask(NewMask);
14837 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14838 return SDValue();
14839
14840 // Final bailout: if the mask is simple, we are better off using an extract
14841 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14842 // because that avoids a constant load from memory.
14843 if (NumElts == 4 &&
14844 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14845 return SDValue();
14846
14847 // Extend the shuffle mask with undef elements.
14848 NewMask.append(NumElts, -1);
14849
14850 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14851 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14852 NewMask);
14853 // This is free: ymm -> xmm.
14854 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14855 DAG.getIntPtrConstant(0, DL));
14856}
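// Illustrative sketch (added commentary, not part of the analyzed source):
// for the 4-element callers, given a narrow mask over two 128-bit halves
// extracted from the same 256-bit source, the transform above only needs to
// (a) commute operand references when the low-half extract arrived as N1 and
// (b) pad the mask with undef lanes. The helper name and the use of plain
// ints instead of SDValues are assumptions; -1 denotes an undef lane.
static void buildWideVpermMask(const int NarrowMask[4], bool LowHalfIsSecond,
                               int WideMask[8]) {
  for (int i = 0; i != 4; ++i) {
    int M = NarrowMask[i];
    // Swap which half each defined index refers to if the operands arrived
    // commuted (low-half extract as the second shuffle input).
    if (LowHalfIsSecond && M >= 0)
      M = (M < 4) ? M + 4 : M - 4;
    // Indices 0..7 now address the wide 8-element source directly.
    WideMask[i] = M;
  }
  // The upper half of the wide shuffle result is never read: leave it undef.
  for (int i = 4; i != 8; ++i)
    WideMask[i] = -1;
}
// E.g. {1, 5, 3, 7} with the extracts in canonical order becomes
// {1, 5, 3, 7, -1, -1, -1, -1}, i.e. a single cross-lane shuffle mask.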
14857
14858/// Try to lower broadcast of a single element.
14859///
14860/// For convenience, this code also bundles all of the subtarget feature set
14861/// filtering. While a little annoying to re-dispatch on type here, there isn't
14862/// a convenient way to factor it out.
14863static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14864 SDValue V2, ArrayRef<int> Mask,
14865 const X86Subtarget &Subtarget,
14866 SelectionDAG &DAG) {
14867 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14868 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14869 (Subtarget.hasAVX2() && VT.isInteger())))
14870 return SDValue();
14871
14872 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14873 // we can only broadcast from a register with AVX2.
14874 unsigned NumEltBits = VT.getScalarSizeInBits();
14875 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14876 ? X86ISD::MOVDDUP
14877 : X86ISD::VBROADCAST;
14878 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14879
14880 // Check that the mask is a broadcast.
14881 int BroadcastIdx = getSplatIndex(Mask);
14882 if (BroadcastIdx < 0)
14883 return SDValue();
14884 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14885 "a sorted mask where the broadcast "
14886 "comes from V1.");
14887
14888 // Go up the chain of (vector) values to find a scalar load that we can
14889 // combine with the broadcast.
14890 // TODO: Combine this logic with findEltLoadSrc() used by
14891 // EltsFromConsecutiveLoads().
14892 int BitOffset = BroadcastIdx * NumEltBits;
14893 SDValue V = V1;
14894 for (;;) {
14895 switch (V.getOpcode()) {
14896 case ISD::BITCAST: {
14897 V = V.getOperand(0);
14898 continue;
14899 }
14900 case ISD::CONCAT_VECTORS: {
14901 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14902 int OpIdx = BitOffset / OpBitWidth;
14903 V = V.getOperand(OpIdx);
14904 BitOffset %= OpBitWidth;
14905 continue;
14906 }
14907 case ISD::EXTRACT_SUBVECTOR: {
14908 // The extraction index adds to the existing offset.
14909 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14910 unsigned Idx = V.getConstantOperandVal(1);
14911 unsigned BeginOffset = Idx * EltBitWidth;
14912 BitOffset += BeginOffset;
14913 V = V.getOperand(0);
14914 continue;
14915 }
14916 case ISD::INSERT_SUBVECTOR: {
14917 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14918 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14919 int Idx = (int)V.getConstantOperandVal(2);
14920 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14921 int BeginOffset = Idx * EltBitWidth;
14922 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14923 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14924 BitOffset -= BeginOffset;
14925 V = VInner;
14926 } else {
14927 V = VOuter;
14928 }
14929 continue;
14930 }
14931 }
14932 break;
14933 }
14934 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14935 BroadcastIdx = BitOffset / NumEltBits;
14936
14937 // Do we need to bitcast the source to retrieve the original broadcast index?
14938 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14939
14940 // Check if this is a broadcast of a scalar. We special case lowering
14941 // for scalars so that we can more effectively fold with loads.
14942 // If the original value has a larger element type than the shuffle, the
14943 // broadcast element is in essence truncated. Make that explicit to ease
14944 // folding.
14945 if (BitCastSrc && VT.isInteger())
14946 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14947 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14948 return TruncBroadcast;
14949
14950 // Also check the simpler case, where we can directly reuse the scalar.
14951 if (!BitCastSrc &&
14952 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14953 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14954 V = V.getOperand(BroadcastIdx);
14955
14956 // If we can't broadcast from a register, check that the input is a load.
14957 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14958 return SDValue();
14959 } else if (ISD::isNormalLoad(V.getNode()) &&
14960 cast<LoadSDNode>(V)->isSimple()) {
14961 // We do not check for one-use of the vector load because a broadcast load
14962 // is expected to be a win for code size, register pressure, and possibly
14963 // uops even if the original vector load is not eliminated.
14964
14965 // Reduce the vector load and shuffle to a broadcasted scalar load.
14966 LoadSDNode *Ld = cast<LoadSDNode>(V);
14967 SDValue BaseAddr = Ld->getOperand(1);
14968 MVT SVT = VT.getScalarType();
14969 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14970 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14971 SDValue NewAddr =
14972 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14973
14974 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14975 // than MOVDDUP.
14976 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14977 if (Opcode == X86ISD::VBROADCAST) {
14978 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14979 SDValue Ops[] = {Ld->getChain(), NewAddr};
14980 V = DAG.getMemIntrinsicNode(
14981 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14982 DAG.getMachineFunction().getMachineMemOperand(
14983 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14984 DAG.makeEquivalentMemoryOrdering(Ld, V);
14985 return DAG.getBitcast(VT, V);
14986 }
14987 assert(SVT == MVT::f64 && "Unexpected VT!");
14988 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14989 DAG.getMachineFunction().getMachineMemOperand(
14990 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14991 DAG.makeEquivalentMemoryOrdering(Ld, V);
14992 } else if (!BroadcastFromReg) {
14993 // We can't broadcast from a vector register.
14994 return SDValue();
14995 } else if (BitOffset != 0) {
14996 // We can only broadcast from the zero-element of a vector register,
14997 // but it can be advantageous to broadcast from the zero-element of a
14998 // subvector.
14999 if (!VT.is256BitVector() && !VT.is512BitVector())
15000 return SDValue();
15001
15002 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15003 if (VT == MVT::v4f64 || VT == MVT::v4i64)
15004 return SDValue();
15005
15006 // Only broadcast the zero-element of a 128-bit subvector.
15007 if ((BitOffset % 128) != 0)
15008 return SDValue();
15009
15010 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15011 "Unexpected bit-offset");
15012 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15013 "Unexpected vector size");
15014 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15015 V = extract128BitVector(V, ExtractIdx, DAG, DL);
15016 }
15017
15018 // On AVX we can use VBROADCAST directly for scalar sources.
15019 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15020 V = DAG.getBitcast(MVT::f64, V);
15021 if (Subtarget.hasAVX()) {
15022 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15023 return DAG.getBitcast(VT, V);
15024 }
15025 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15026 }
15027
15028 // If this is a scalar, do the broadcast on this type and bitcast.
15029 if (!V.getValueType().isVector()) {
15030 assert(V.getScalarValueSizeInBits() == NumEltBits &&
15031 "Unexpected scalar size");
15032 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15033 VT.getVectorNumElements());
15034 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15035 }
15036
15037 // We only support broadcasting from 128-bit vectors to minimize the
15038 // number of patterns we need to deal with in isel. So extract down to
15039 // 128-bits, removing as many bitcasts as possible.
15040 if (V.getValueSizeInBits() > 128)
15041 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15042
15043 // Otherwise cast V to a vector with the same element type as VT, but
15044 // possibly narrower than VT. Then perform the broadcast.
15045 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15046 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15047 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15048}
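// Standalone sketch of the bit-offset bookkeeping performed by the loop above
// (the struct and member names are assumptions, not LLVM APIs): walking into
// an extract_subvector adds the subvector's starting element offset, while
// walking into a concat_vectors selects one operand and reduces the offset
// modulo that operand's width.
struct BroadcastOffsetWalk {
  int BitOffset = 0;
  // Start from the broadcasted element index and the shuffle's element width.
  void start(int BroadcastIdx, int EltBits) { BitOffset = BroadcastIdx * EltBits; }
  // extract_subvector starting at element SubIdx of an EltBits-wide element type.
  void throughExtractSubvector(int SubIdx, int EltBits) {
    BitOffset += SubIdx * EltBits;
  }
  // concat_vectors of operands that are each OpBits wide; returns which
  // operand now holds the broadcasted element.
  int throughConcat(int OpBits) {
    int OpIdx = BitOffset / OpBits;
    BitOffset %= OpBits;
    return OpIdx;
  }
};
// E.g. broadcasting element 5 of a v8i32 built as concat(v4i32 A, v4i32 B):
// start(5, 32) gives BitOffset 160, throughConcat(128) selects operand 1 (B)
// and leaves BitOffset 32, i.e. element 1 of B.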
15049
15050// Check for whether we can use INSERTPS to perform the shuffle. We only use
15051// INSERTPS when the V1 elements are already in the correct locations
15052// because otherwise we can just always use two SHUFPS instructions which
15053// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15054// perform INSERTPS if a single V1 element is out of place and all V2
15055// elements are zeroable.
15056static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15057 unsigned &InsertPSMask,
15058 const APInt &Zeroable,
15059 ArrayRef<int> Mask, SelectionDAG &DAG) {
15060 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15061 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15062 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15063
15064 // Attempt to match INSERTPS with one element from VA or VB being
15065 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15066 // are updated.
15067 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15068 ArrayRef<int> CandidateMask) {
15069 unsigned ZMask = 0;
15070 int VADstIndex = -1;
15071 int VBDstIndex = -1;
15072 bool VAUsedInPlace = false;
15073
15074 for (int i = 0; i < 4; ++i) {
15075 // Synthesize a zero mask from the zeroable elements (includes undefs).
15076 if (Zeroable[i]) {
15077 ZMask |= 1 << i;
15078 continue;
15079 }
15080
15081 // Flag if we use any VA inputs in place.
15082 if (i == CandidateMask[i]) {
15083 VAUsedInPlace = true;
15084 continue;
15085 }
15086
15087 // We can only insert a single non-zeroable element.
15088 if (VADstIndex >= 0 || VBDstIndex >= 0)
15089 return false;
15090
15091 if (CandidateMask[i] < 4) {
15092 // VA input out of place for insertion.
15093 VADstIndex = i;
15094 } else {
15095 // VB input for insertion.
15096 VBDstIndex = i;
15097 }
15098 }
15099
15100 // Don't bother if we have no (non-zeroable) element for insertion.
15101 if (VADstIndex < 0 && VBDstIndex < 0)
15102 return false;
15103
15104 // Determine element insertion src/dst indices. The src index is from the
15105 // start of the inserted vector, not the start of the concatenated vector.
15106 unsigned VBSrcIndex = 0;
15107 if (VADstIndex >= 0) {
15108 // If we have a VA input out of place, we use VA as the V2 element
15109 // insertion and don't use the original V2 at all.
15110 VBSrcIndex = CandidateMask[VADstIndex];
15111 VBDstIndex = VADstIndex;
15112 VB = VA;
15113 } else {
15114 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15115 }
15116
15117 // If no V1 inputs are used in place, then the result is created only from
15118 // the zero mask and the V2 insertion - so remove V1 dependency.
15119 if (!VAUsedInPlace)
15120 VA = DAG.getUNDEF(MVT::v4f32);
15121
15122 // Update V1, V2 and InsertPSMask accordingly.
15123 V1 = VA;
15124 V2 = VB;
15125
15126 // Insert the V2 element into the desired position.
15127 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15128 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15129 return true;
15130 };
15131
15132 if (matchAsInsertPS(V1, V2, Mask))
15133 return true;
15134
15135 // Commute and try again.
15136 SmallVector<int, 4> CommutedMask(Mask);
15137 ShuffleVectorSDNode::commuteMask(CommutedMask);
15138 if (matchAsInsertPS(V2, V1, CommutedMask))
15139 return true;
15140
15141 return false;
15142}
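// Minimal sketch of the INSERTPS immediate layout assembled above (the helper
// name is an assumption): bits [7:6] select the source element of the second
// operand, bits [5:4] select the destination lane, and bits [3:0] form the
// zero mask applied to the result.
static unsigned encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                  unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}
// E.g. inserting element 2 of V2 into lane 1 of V1 while zeroing lane 3:
// encodeInsertPSImm(2, 1, 0x8) == 0x98.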
15143
15144static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15145 ArrayRef<int> Mask, const APInt &Zeroable,
15146 SelectionDAG &DAG) {
15147 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15148 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15149
15150 // Attempt to match the insertps pattern.
15151 unsigned InsertPSMask = 0;
15152 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15153 return SDValue();
15154
15155 // Insert the V2 element into the desired position.
15156 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15157 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15158}
15159
15160/// Handle lowering of 2-lane 64-bit floating point shuffles.
15161///
15162/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15163/// support for floating point shuffles but not integer shuffles. These
15164/// instructions will incur a domain crossing penalty on some chips though so
15165/// it is better to avoid lowering through this for integer vectors where
15166/// possible.
15167static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15168 const APInt &Zeroable, SDValue V1, SDValue V2,
15169 const X86Subtarget &Subtarget,
15170 SelectionDAG &DAG) {
15171 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15172 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15173 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15174
15175 if (V2.isUndef()) {
15176 // Check for being able to broadcast a single element.
15177 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15178 Mask, Subtarget, DAG))
15179 return Broadcast;
15180
15181 // Straight shuffle of a single input vector. Simulate this by using the
15182 // single input as both of the "inputs" to this instruction.
15183 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15184
15185 if (Subtarget.hasAVX()) {
15186 // If we have AVX, we can use VPERMILPS which will allow folding a load
15187 // into the shuffle.
15188 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15189 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15190 }
15191
15192 return DAG.getNode(
15193 X86ISD::SHUFP, DL, MVT::v2f64,
15194 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15195 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15196 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15197 }
15198 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15199 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15200 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15201 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15202
15203 if (Subtarget.hasAVX2())
15204 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15205 return Extract;
15206
15207 // When loading a scalar and then shuffling it into a vector we can often do
15208 // the insertion cheaply.
15209 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15210 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15211 return Insertion;
15212 // Try inverting the insertion since for v2 masks it is easy to do and we
15213 // can't reliably sort the mask one way or the other.
15214 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15215 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15216 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15217 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15218 return Insertion;
15219
15220 // Try to use one of the special instruction patterns to handle two common
15221 // blend patterns if a zero-blend above didn't work.
15222 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15223 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15224 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15225 // We can either use a special instruction to load over the low double or
15226 // to move just the low double.
15227 return DAG.getNode(
15228 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15229 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15230
15231 if (Subtarget.hasSSE41())
15232 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15233 Zeroable, Subtarget, DAG))
15234 return Blend;
15235
15236 // Use dedicated unpack instructions for masks that match their pattern.
15237 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15238 return V;
15239
15240 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15241 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15242 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15243}
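// Small standalone sketch (assumed names) of the two-bit SHUFPD immediate
// built above for the two-input case: bit 0 picks lane Mask[0] (0 or 1) of
// V1 for the low result element, bit 1 picks lane Mask[1]-2 of V2 for the
// high result element.
static unsigned encodeShufPDImm(int Mask0, int Mask1) {
  return (unsigned)(Mask0 == 1) | ((unsigned)(Mask1 - 2 == 1) << 1);
}
// E.g. the mask {1, 2} (V1[1] then V2[0]) yields encodeShufPDImm(1, 2) == 1.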
15244
15245/// Handle lowering of 2-lane 64-bit integer shuffles.
15246///
15247/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15248/// the integer unit to minimize domain crossing penalties. However, for blends
15249/// it falls back to the floating point shuffle operation with appropriate bit
15250/// casting.
15251static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15252 const APInt &Zeroable, SDValue V1, SDValue V2,
15253 const X86Subtarget &Subtarget,
15254 SelectionDAG &DAG) {
15255 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15256 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15257 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15258
15259 if (V2.isUndef()) {
15260 // Check for being able to broadcast a single element.
15261 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15262 Mask, Subtarget, DAG))
15263 return Broadcast;
15264
15265 // Straight shuffle of a single input vector. For everything from SSE2
15266 // onward this has a single fast instruction with no scary immediates.
15267 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15268 V1 = DAG.getBitcast(MVT::v4i32, V1);
15269 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15270 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15271 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15272 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15273 return DAG.getBitcast(
15274 MVT::v2i64,
15275 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15276 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15277 }
15278 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15279 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15280 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15281 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15282
15283 if (Subtarget.hasAVX2())
15284 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15285 return Extract;
15286
15287 // Try to use shift instructions.
15288 if (SDValue Shift =
15289 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
15290 DAG, /*BitwiseOnly*/ false))
15291 return Shift;
15292
15293 // When loading a scalar and then shuffling it into a vector we can often do
15294 // the insertion cheaply.
15295 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15296 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15297 return Insertion;
15298 // Try inverting the insertion since for v2 masks it is easy to do and we
15299 // can't reliably sort the mask one way or the other.
15300 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15301 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15302 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15303 return Insertion;
15304
15305 // We have different paths for blend lowering, but they all must use the
15306 // *exact* same predicate.
15307 bool IsBlendSupported = Subtarget.hasSSE41();
15308 if (IsBlendSupported)
15309 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15310 Zeroable, Subtarget, DAG))
15311 return Blend;
15312
15313 // Use dedicated unpack instructions for masks that match their pattern.
15314 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15315 return V;
15316
15317 // Try to use byte rotation instructions.
15318 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15319 if (Subtarget.hasSSSE3()) {
15320 if (Subtarget.hasVLX())
15321 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15322 Subtarget, DAG))
15323 return Rotate;
15324
15325 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15326 Subtarget, DAG))
15327 return Rotate;
15328 }
15329
15330 // If we have direct support for blends, we should lower by decomposing into
15331 // a permute. That will be faster than the domain cross.
15332 if (IsBlendSupported)
15333 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15334 Subtarget, DAG);
15335
15336 // We implement this with SHUFPD which is pretty lame because it will likely
15337 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15338 // However, all the alternatives are still more cycles and newer chips don't
15339 // have this problem. It would be really nice if x86 had better shuffles here.
15340 V1 = DAG.getBitcast(MVT::v2f64, V1);
15341 V2 = DAG.getBitcast(MVT::v2f64, V2);
15342 return DAG.getBitcast(MVT::v2i64,
15343 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15344}
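// Illustrative sketch (assumed standalone helper) of the mask widening used
// for the single-input v2i64 case above: each 64-bit lane index M expands to
// the 32-bit lane pair {2*M, 2*M+1}, with undef (-1) propagated, so the
// shuffle can be issued as a v4i32 PSHUFD.
static void widenV2ToV4Mask(const int Mask[2], int Widened[4]) {
  for (int i = 0; i != 2; ++i) {
    Widened[2 * i + 0] = Mask[i] < 0 ? -1 : Mask[i] * 2;
    Widened[2 * i + 1] = Mask[i] < 0 ? -1 : Mask[i] * 2 + 1;
  }
}
// E.g. the v2i64 mask {1, 0} becomes the v4i32 mask {2, 3, 0, 1}.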
15345
15346/// Lower a vector shuffle using the SHUFPS instruction.
15347///
15348/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15349 /// It makes no assumptions about whether this is the *best* lowering; it simply
15350 /// uses it.
15351static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15352 ArrayRef<int> Mask, SDValue V1,
15353 SDValue V2, SelectionDAG &DAG) {
15354 SDValue LowV = V1, HighV = V2;
15355 SmallVector<int, 4> NewMask(Mask);
15356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15357
15358 if (NumV2Elements == 1) {
15359 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15360
15361 // Compute the index adjacent to V2Index and in the same half by toggling
15362 // the low bit.
15363 int V2AdjIndex = V2Index ^ 1;
15364
15365 if (Mask[V2AdjIndex] < 0) {
15366 // Handles all the cases where we have a single V2 element and an undef.
15367 // This will only ever happen in the high lanes because we commute the
15368 // vector otherwise.
15369 if (V2Index < 2)
15370 std::swap(LowV, HighV);
15371 NewMask[V2Index] -= 4;
15372 } else {
15373 // Handle the case where the V2 element ends up adjacent to a V1 element.
15374 // To make this work, blend them together as the first step.
15375 int V1Index = V2AdjIndex;
15376 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15377 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15378 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15379
15380 // Now proceed to reconstruct the final blend as we have the necessary
15381 // high or low half formed.
15382 if (V2Index < 2) {
15383 LowV = V2;
15384 HighV = V1;
15385 } else {
15386 HighV = V2;
15387 }
15388 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15389 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15390 }
15391 } else if (NumV2Elements == 2) {
15392 if (Mask[0] < 4 && Mask[1] < 4) {
15393 // Handle the easy case where we have V1 in the low lanes and V2 in the
15394 // high lanes.
15395 NewMask[2] -= 4;
15396 NewMask[3] -= 4;
15397 } else if (Mask[2] < 4 && Mask[3] < 4) {
15398 // We also handle the reversed case because this utility may get called
15399 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15400 // arrange things in the right direction.
15401 NewMask[0] -= 4;
15402 NewMask[1] -= 4;
15403 HighV = V1;
15404 LowV = V2;
15405 } else {
15406 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15407 // trying to place elements directly, just blend them and set up the final
15408 // shuffle to place them.
15409
15410 // The first two blend mask elements are for V1, the second two are for
15411 // V2.
15412 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15413 Mask[2] < 4 ? Mask[2] : Mask[3],
15414 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15415 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15416 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15417 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15418
15419 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15420 // a blend.
15421 LowV = HighV = V1;
15422 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15423 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15424 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15425 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15426 }
15427 } else if (NumV2Elements == 3) {
15428 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15429 // we can get here due to other paths (e.g repeated mask matching) that we
15430 // don't want to do another round of lowerVECTOR_SHUFFLE.
15431 ShuffleVectorSDNode::commuteMask(NewMask);
15432 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15433 }
15434 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15435 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15436}
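// Sketch of the 8-bit immediate format targeted by getV4X86ShuffleImm8ForMask
// (this helper is an illustrative stand-in, not the LLVM routine): each
// destination lane i takes a 2-bit source selector at bits [2*i+1 : 2*i].
// Undef lanes are mapped to 0 here, which is one valid choice.
static unsigned encodeV4ShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : (Mask[i] & 3); // 2-bit selector per lane
    Imm |= (unsigned)M << (2 * i);
  }
  return Imm;
}
// E.g. the reversal mask {3, 2, 1, 0} encodes as 0x1B.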
15437
15438/// Lower 4-lane 32-bit floating point shuffles.
15439///
15440/// Uses instructions exclusively from the floating point unit to minimize
15441/// domain crossing penalties, as these are sufficient to implement all v4f32
15442/// shuffles.
15443static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15444 const APInt &Zeroable, SDValue V1, SDValue V2,
15445 const X86Subtarget &Subtarget,
15446 SelectionDAG &DAG) {
15447 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15448 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15449 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15450
15451 if (Subtarget.hasSSE41())
15452 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15453 Zeroable, Subtarget, DAG))
15454 return Blend;
15455
15456 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15457
15458 if (NumV2Elements == 0) {
15459 // Check for being able to broadcast a single element.
15460 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15461 Mask, Subtarget, DAG))
15462 return Broadcast;
15463
15464 // Use even/odd duplicate instructions for masks that match their pattern.
15465 if (Subtarget.hasSSE3()) {
15466 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15467 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15468 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15469 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15470 }
15471
15472 if (Subtarget.hasAVX()) {
15473 // If we have AVX, we can use VPERMILPS which will allow folding a load
15474 // into the shuffle.
15475 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15476 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15477 }
15478
15479 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15480 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15481 if (!Subtarget.hasSSE2()) {
15482 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15483 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15484 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15485 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15486 }
15487
15488 // Otherwise, use a straight shuffle of a single input vector. We pass the
15489 // input vector to both operands to simulate this with a SHUFPS.
15490 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15491 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15492 }
15493
15494 if (Subtarget.hasSSE2())
15495 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
15496 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
15497 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
15498 return ZExt;
15499 }
15500
15501 if (Subtarget.hasAVX2())
15502 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15503 return Extract;
15504
15505 // There are special ways we can lower some single-element blends. However, we
15506 // have custom ways we can lower more complex single-element blends below that
15507 // we defer to if both this and BLENDPS fail to match, so restrict this to
15508 // when the V2 input is targeting element 0 of the mask -- that is the fast
15509 // case here.
15510 if (NumV2Elements == 1 && Mask[0] >= 4)
15511 if (SDValue V = lowerShuffleAsElementInsertion(
15512 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15513 return V;
15514
15515 if (Subtarget.hasSSE41()) {
15516 // Use INSERTPS if we can complete the shuffle efficiently.
15517 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15518 return V;
15519
15520 if (!isSingleSHUFPSMask(Mask))
15521 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15522 V2, Mask, DAG))
15523 return BlendPerm;
15524 }
15525
15526 // Use low/high mov instructions. These are only valid in SSE1 because
15527 // otherwise they are widened to v2f64 and never get here.
15528 if (!Subtarget.hasSSE2()) {
15529 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15530 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15531 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15532 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15533 }
15534
15535 // Use dedicated unpack instructions for masks that match their pattern.
15536 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15537 return V;
15538
15539 // Otherwise fall back to a SHUFPS lowering strategy.
15540 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15541}
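// Minimal sketch (assumed standalone helper) of the shape of check that
// isShuffleEquivalent performs for the MOVSLDUP/MOVSHDUP and MOVLHPS/MOVHLPS
// cases above: every defined mask element must equal the expected pattern,
// while undef (-1) lanes may match anything. The real routine also looks
// through equivalent build_vector operands, which is omitted here.
static bool matchesMaskAllowingUndef(const int Mask[4], const int Expected[4]) {
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i])
      return false;
  return true;
}
// E.g. {0, -1, 2, 2} matches the MOVSLDUP pattern {0, 0, 2, 2}.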
15542
15543/// Lower 4-lane i32 vector shuffles.
15544///
15545/// We try to handle these with integer-domain shuffles where we can, but for
15546/// blends we use the floating point domain blend instructions.
15547static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15548 const APInt &Zeroable, SDValue V1, SDValue V2,
15549 const X86Subtarget &Subtarget,
15550 SelectionDAG &DAG) {
15551 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15552 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15553 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15554
15555 // Whenever we can lower this as a zext, that instruction is strictly faster
15556 // than any alternative. It also allows us to fold memory operands into the
15557 // shuffle in many cases.
15558 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15559 Zeroable, Subtarget, DAG))
15560 return ZExt;
15561
15562 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15563
15564 // Try to use shift instructions if fast.
15565 if (Subtarget.preferLowerShuffleAsShift()) {
15566 if (SDValue Shift =
15567 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
15568 Subtarget, DAG, /*BitwiseOnly*/ true))
15569 return Shift;
15570 if (NumV2Elements == 0)
15571 if (SDValue Rotate =
15572 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
15573 return Rotate;
15574 }
15575
15576 if (NumV2Elements == 0) {
15577 // Try to use broadcast unless the mask only has one non-undef element.
15578 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15579 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15580 Mask, Subtarget, DAG))
15581 return Broadcast;
15582 }
15583
15584 // Straight shuffle of a single input vector. For everything from SSE2
15585 // onward this has a single fast instruction with no scary immediates.
15586 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15587 // but we aren't actually going to use the UNPCK instruction because doing
15588 // so prevents folding a load into this instruction or making a copy.
15589 const int UnpackLoMask[] = {0, 0, 1, 1};
15590 const int UnpackHiMask[] = {2, 2, 3, 3};
15591 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15592 Mask = UnpackLoMask;
15593 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15594 Mask = UnpackHiMask;
15595
15596 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15597 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15598 }
15599
15600 if (Subtarget.hasAVX2())
15601 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15602 return Extract;
15603
15604 // Try to use shift instructions.
15605 if (SDValue Shift =
15606 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
15607 DAG, /*BitwiseOnly*/ false))
15608 return Shift;
15609
15610 // There are special ways we can lower some single-element blends.
15611 if (NumV2Elements == 1)
15612 if (SDValue V = lowerShuffleAsElementInsertion(
15613 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15614 return V;
15615
15616 // We have different paths for blend lowering, but they all must use the
15617 // *exact* same predicate.
15618 bool IsBlendSupported = Subtarget.hasSSE41();
15619 if (IsBlendSupported)
15620 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15621 Zeroable, Subtarget, DAG))
15622 return Blend;
15623
15624 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15625 Zeroable, Subtarget, DAG))
15626 return Masked;
15627
15628 // Use dedicated unpack instructions for masks that match their pattern.
15629 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15630 return V;
15631
15632 // Try to use byte rotation instructions.
15633 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15634 if (Subtarget.hasSSSE3()) {
15635 if (Subtarget.hasVLX())
15636 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15637 Subtarget, DAG))
15638 return Rotate;
15639
15640 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15641 Subtarget, DAG))
15642 return Rotate;
15643 }
15644
15645 // Assume that a single SHUFPS is faster than an alternative sequence of
15646 // multiple instructions (even if the CPU has a domain penalty).
15647 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15648 if (!isSingleSHUFPSMask(Mask)) {
15649 // If we have direct support for blends, we should lower by decomposing into
15650 // a permute. That will be faster than the domain cross.
15651 if (IsBlendSupported)
15652 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15653 Subtarget, DAG);
15654
15655 // Try to lower by permuting the inputs into an unpack instruction.
15656 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15657 Mask, Subtarget, DAG))
15658 return Unpack;
15659 }
15660
15661 // We implement this with SHUFPS because it can blend from two vectors.
15662 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15663 // up the inputs, bypassing domain shift penalties that we would incur if we
15664 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15665 // relevant.
15666 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15667 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15668 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15669 return DAG.getBitcast(MVT::v4i32, ShufPS);
15670}
15671
15672/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15673/// shuffle lowering, and the most complex part.
15674///
15675/// The lowering strategy is to try to form pairs of input lanes which are
15676/// targeted at the same half of the final vector, and then use a dword shuffle
15677/// to place them onto the right half, and finally unpack the paired lanes into
15678/// their final position.
15679///
15680/// The exact breakdown of how to form these dword pairs and align them on the
15681/// correct sides is really tricky. See the comments within the function for
15682/// more of the details.
15683///
15684/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15685/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15686/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15687/// vector, form the analogous 128-bit 8-element Mask.
15688static SDValue lowerV8I16GeneralSingleInputShuffle(
15689 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15690 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15691 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15692 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15693
15694 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15695 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15696 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15697
15698 // Attempt to directly match PSHUFLW or PSHUFHW.
15699 if (isUndefOrInRange(LoMask, 0, 4) &&
15700 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15701 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15702 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15703 }
15704 if (isUndefOrInRange(HiMask, 4, 8) &&
15705 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15706 for (int i = 0; i != 4; ++i)
15707 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15708 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15709 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15710 }
15711
15712 SmallVector<int, 4> LoInputs;
15713 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15714 array_pod_sort(LoInputs.begin(), LoInputs.end());
15715 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15716 SmallVector<int, 4> HiInputs;
15717 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15718 array_pod_sort(HiInputs.begin(), HiInputs.end());
15719 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15720 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15721 int NumHToL = LoInputs.size() - NumLToL;
15722 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15723 int NumHToH = HiInputs.size() - NumLToH;
15724 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15725 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15726 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15727 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15728
15729 // If we are shuffling values from one half - check how many different DWORD
15730 // pairs we need to create. If only 1 or 2 then we can perform this as a
15731 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15732 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15733 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15734 V = DAG.getNode(ShufWOp, DL, VT, V,
15735 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15736 V = DAG.getBitcast(PSHUFDVT, V);
15737 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15738 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15739 return DAG.getBitcast(VT, V);
15740 };
15741
15742 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15743 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15744 SmallVector<std::pair<int, int>, 4> DWordPairs;
15745 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15746
15747 // Collect the different DWORD pairs.
15748 for (int DWord = 0; DWord != 4; ++DWord) {
15749 int M0 = Mask[2 * DWord + 0];
15750 int M1 = Mask[2 * DWord + 1];
15751 M0 = (M0 >= 0 ? M0 % 4 : M0);
15752 M1 = (M1 >= 0 ? M1 % 4 : M1);
15753 if (M0 < 0 && M1 < 0)
15754 continue;
15755
15756 bool Match = false;
15757 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15758 auto &DWordPair = DWordPairs[j];
15759 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15760 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15761 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15762 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15763 PSHUFDMask[DWord] = DOffset + j;
15764 Match = true;
15765 break;
15766 }
15767 }
15768 if (!Match) {
15769 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15770 DWordPairs.push_back(std::make_pair(M0, M1));
15771 }
15772 }
15773
15774 if (DWordPairs.size() <= 2) {
15775 DWordPairs.resize(2, std::make_pair(-1, -1));
15776 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15777 DWordPairs[1].first, DWordPairs[1].second};
15778 if ((NumHToL + NumHToH) == 0)
15779 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15780 if ((NumLToL + NumLToH) == 0)
15781 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15782 }
15783 }
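  // Worked example (added for illustration, not in the original source): for
  // the low-half-only mask {0, 1, 0, 1, 2, 3, 2, 3} the loop above collects
  // two distinct word pairs, (0,1) and (2,3), so PSHUFDMask becomes
  // {0, 0, 1, 1} and PSHUFHalfMask becomes {0, 1, 2, 3}; a (here trivial)
  // PSHUFLW followed by that PSHUFD reproduces the whole shuffle without the
  // longer chain below.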
15784
15785 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15786 // such inputs we can swap two of the dwords across the half mark and end up
15787 // with <=2 inputs to each half in each half. Once there, we can fall through
15788 // to the generic code below. For example:
15789 //
15790 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15791 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15792 //
15793 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15794 // and an existing 2-into-2 on the other half. In this case we may have to
15795 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15796 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15797 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15798 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15799 // half than the one we target for fixing) will be fixed when we re-enter this
15800 // path. We will also combine any resulting sequence of PSHUFD instructions
15801 // into a single instruction. Here is an example of the tricky case:
15802 //
15803 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15804 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15805 //
15806 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15807 //
15808 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15809 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15810 //
15811 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15812 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15813 //
15814 // The result is fine to be handled by the generic logic.
15815 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15816 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15817 int AOffset, int BOffset) {
15818 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15819 "Must call this with A having 3 or 1 inputs from the A half.");
15820 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15821 "Must call this with B having 1 or 3 inputs from the B half.");
15822 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15823 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15824
15825 bool ThreeAInputs = AToAInputs.size() == 3;
15826
15827 // Compute the index of dword with only one word among the three inputs in
15828 // a half by taking the sum of the half with three inputs and subtracting
15829 // the sum of the actual three inputs. The difference is the remaining
15830 // slot.
15831 int ADWord = 0, BDWord = 0;
15832 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15833 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15834 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15835 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15836 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15837 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15838 int TripleNonInputIdx =
15839 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15840 TripleDWord = TripleNonInputIdx / 2;
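// For example, if the half with three inputs is the low half (offset 0) and
// those inputs sit at word indices {0, 2, 3}, the full sum is 0+1+2+3 == 6,
// the input sum is 5, so the remaining slot is word 1 and TripleDWord == 0.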
15841
15842 // We use xor with one to compute the adjacent DWord to whichever one the
15843 // OneInput is in.
15844 OneInputDWord = (OneInput / 2) ^ 1;
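// For example, OneInput == 5 lives in dword 2, so OneInputDWord == 2 ^ 1 == 3,
// i.e. the other dword of the same 64-bit half.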
15845
15846 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15847 // and BToA inputs. If there is also such a problem with the BToB and AToB
15848 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15849 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15850 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15851 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15852 // Compute how many inputs will be flipped by swapping these DWords. We
15853 // need to balance this to ensure we don't form a 3-1 shuffle in the
15854 // other half.
15856 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15857 llvm::count(AToBInputs, 2 * ADWord + 1);
15858 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15859 llvm::count(BToBInputs, 2 * BDWord + 1);
15860 if ((NumFlippedAToBInputs == 1 &&
15861 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15862 (NumFlippedBToBInputs == 1 &&
15863 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15864 // We choose whether to fix the A half or B half based on whether that
15865 // half has zero flipped inputs. At zero, we may not be able to fix it
15866 // with that half. We also bias towards fixing the B half because that
15867 // will more commonly be the high half, and we have to bias one way.
15868 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15869 ArrayRef<int> Inputs) {
15870 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15871 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15872 // Determine whether the free index is in the flipped dword or the
15873 // unflipped dword based on where the pinned index is. We use this bit
15874 // in an xor to conditionally select the adjacent dword.
15875 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
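// For example, with DWord == 1: a pinned index of 3 (inside dword 1) gives
// FixFreeIdx == 2 * (1 ^ 1) == 0, while a pinned index of 5 (outside dword 1)
// gives FixFreeIdx == 2 * (1 ^ 0) == 2.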
15876 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15877 if (IsFixIdxInput == IsFixFreeIdxInput)
15878 FixFreeIdx += 1;
15879 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15880 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15881 "We need to be changing the number of flipped inputs!");
15882 int PSHUFHalfMask[] = {0, 1, 2, 3};
15883 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15884 V = DAG.getNode(
15885 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15886 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15887 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
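// Word indices 0-3 sit in the low 64 bits of each 128-bit lane, which PSHUFLW
// permutes, and indices 4-7 sit in the high 64 bits, which PSHUFHW permutes,
// hence the opcode selection on FixIdx < 4.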
15888
15889 for (int &M : Mask)
15890 if (M >= 0 && M == FixIdx)
15891 M = FixFreeIdx;
15892 else if (M >= 0 && M == FixFreeIdx)
15893 M = FixIdx;
15894 };
15895 if (NumFlippedBToBInputs != 0) {
15896 int BPinnedIdx =
15897 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15898 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15899 } else {
15900 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15901 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15902 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15903 }
15904 }
15905 }
15906
15907 int PSHUFDMask[] = {0, 1, 2, 3};
15908 PSHUFDMask[ADWord] = BDWord;
15909 PSHUFDMask[BDWord] = ADWord;
15910 V = DAG.getBitcast(
15911 VT,
15912 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15913 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15914
15915 // Adjust the mask to match the new locations of A and B.
15916 for (int &M : Mask)
15917 if (M >= 0 && M/2 == ADWord)
15918 M = 2 * BDWord + M % 2;
15919 else if (M >= 0 && M/2 == BDWord)
15920 M = 2 * ADWord + M % 2;
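// For example, if ADWord == 0 and BDWord == 2, a mask element 1 (word 1 of
// dword 0) is rewritten to 2 * 2 + 1 == 5 and a mask element 4 (word 0 of
// dword 2) is rewritten to 2 * 0 + 0 == 0, mirroring the PSHUFD swap above.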
15921
15922 // Recurse back into this routine to re-compute state now that this isn't
15923 // a 3 and 1 problem.
15924 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15925 };
15926 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15927 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15928 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15929 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15930
15931 // At this point there are at most two inputs to the low and high halves from
15932 // each half. That means the inputs can always be grouped into dwords and
15933 // those dwords can then be moved to the correct half with a dword shuffle.
15934 // We use at most one low and one high word shuffle to collect these paired
15935 // inputs into dwords, and finally a dword shuffle to place them.
15936 int PSHUFLMask[4] = {-1, -1, -1, -1};
15937 int PSHUFHMask[4] = {-1, -1, -1, -1};
15938 int PSHUFDMask[4] = {-1, -1, -1, -1};
15939
15940 // First fix the masks for all the inputs that are staying in their
15941 // original halves. This will then dictate the targets of the cross-half
15942 // shuffles.
15943 auto fixInPlaceInputs =
15944 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15945 MutableArrayRef<int> SourceHalfMask,
15946 MutableArrayRef<int> HalfMask, int HalfOffset) {
15947 if (InPlaceInputs.empty())
15948 return;
15949 if (InPlaceInputs.size() == 1) {
15950 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15951 InPlaceInputs[0] - HalfOffset;
15952 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15953 return;
15954 }
15955 if (IncomingInputs.empty()) {
15956 // Just fix all of the in place inputs.
15957 for (int Input : InPlaceInputs) {
15958 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15959 PSHUFDMask[Input / 2] = Input / 2;
15960 }
15961 return;
15962 }
15963
15964 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15965 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15966 InPlaceInputs[0] - HalfOffset;
15967 // Put the second input next to the first so that they are packed into
15968 // a dword. We find the adjacent index by toggling the low bit.
15969 int AdjIndex = InPlaceInputs[0] ^ 1;
15970 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15971 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15972 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
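// For example, with HalfOffset == 0 and InPlaceInputs == {1, 3}: word 1 stays
// in slot 1, AdjIndex == 0, so word 3 is pulled into slot 0, every use of 3 in
// HalfMask is renamed to 0, and dword 0 is marked as staying in place.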
15973 };
15974 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15975 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15976
15977 // Now gather the cross-half inputs and place them into a free dword of
15978 // their target half.
15979 // FIXME: This operation could almost certainly be simplified dramatically to
15980 // look more like the 3-1 fixing operation.
15981 auto moveInputsToRightHalf = [&PSHUFDMask](
15982 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15983 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15984 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15985 int DestOffset) {
15986 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15987 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15988 };
15989 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15990 int Word) {
15991 int LowWord = Word & ~1;
15992 int HighWord = Word | 1;
15993 return isWordClobbered(SourceHalfMask, LowWord) ||
15994 isWordClobbered(SourceHalfMask, HighWord);
15995 };
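// For example, Word == 5 gives LowWord == 4 and HighWord == 5, so the check
// covers both 16-bit halves of dword 2.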
15996
15997 if (IncomingInputs.empty())
15998 return;
15999
16000 if (ExistingInputs.empty()) {
16001 // Map any dwords with inputs from them into the right half.
16002 for (int Input : IncomingInputs) {
16003 // If the source half mask maps over the inputs, turn those into
16004 // swaps and use the swapped lane.
16005 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
16006 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
16007 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
16008 Input - SourceOffset;
16009 // We have to swap the uses in our half mask in one sweep.
16010 for (int &M : HalfMask)
16011 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
16012 M = Input;
16013 else if (M == Input)
16014 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16015 } else {
16016 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
16017 Input - SourceOffset &&
16018 "Previous placement doesn't match!");
16019 }
16020 // Note that this correctly re-maps both when we do a swap and when
16021 // we observe the other side of the swap above. We rely on that to
16022 // avoid swapping the members of the input list directly.
16023 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16024 }
16025
16026 // Map the input's dword into the correct half.
16027 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16028 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16029 else
16030 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16031 Input / 2 &&
16032 "Previous placement doesn't match!");
16033 }
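// For example, with SourceOffset == 4 and DestOffset == 0, an incoming input
// at word 5 sets PSHUFDMask[(5 - 4 + 0) / 2], i.e. PSHUFDMask[0], to
// 5 / 2 == 2: source dword 2 will be moved into destination dword 0.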
16034
16035 // And just directly shift any other-half mask elements to be same-half
16036 // as we will have mirrored the dword containing the element into the
16037 // same position within that half.
16038 for (int &M : HalfMask)
16039 if (M >= SourceOffset && M < SourceOffset + 4) {
16040 M = M - SourceOffset + DestOffset;
16041 assert(M >= 0 && "This should never wrap below zero!");
16042 }
16043 return;
16044 }
16045
16046 // Ensure we have the input in a viable dword of its current half. This
16047 // is particularly tricky because the original position may be clobbered
16048 // by inputs being moved and *staying* in that half.
16049 if (IncomingInputs.size() == 1) {
16050 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16051 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16052 SourceOffset;
16053 SourceHalfMask[InputFixed - SourceOffset] =
16054 IncomingInputs[0] - SourceOffset;
16055 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16056 InputFixed);
16057 IncomingInputs[0] = InputFixed;
16058 }
16059 } else if (IncomingInputs.size() == 2) {
16060 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16061 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16062 // We have two non-adjacent or clobbered inputs we need to extract from
16063 // the source half. To do this, we need to map them into some adjacent
16064 // dword slot in the source mask.
16065 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16066 IncomingInputs[1] - SourceOffset};
16067
16068 // If there is a free slot in the source half mask adjacent to one of
16069 // the inputs, place the other input in it. We use (Index XOR 1) to
16070 // compute an adjacent index.
16071 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16072 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16073 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16074 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16075 InputsFixed[1] = InputsFixed[0] ^ 1;
16076 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16077 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16078 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16079 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16080 InputsFixed[0] = InputsFixed[1] ^ 1;
16081 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16082 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16083 // The two inputs are in the same DWord but it is clobbered and the
16084 // adjacent DWord isn't used at all. Move both inputs to the free
16085 // slot.
16086 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16087 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16088 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16089 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16090 } else {
16091 // The only way we hit this point is if there is no clobbering
16092 // (because there are no off-half inputs to this half) and there is no
16093 // free slot adjacent to one of the inputs. In this case, we have to
16094 // swap an input with a non-input.
16095 for (int i = 0; i < 4; ++i)
16096 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16097 "We can't handle any clobbers here!");
16098 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16099 "Cannot have adjacent inputs here!");
16100
16101 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16102 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16103
16104 // We also have to update the final source mask in this case because
16105 // it may need to undo the above swap.
16106 for (int &M : FinalSourceHalfMask)
16107 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16108 M = InputsFixed[1] + SourceOffset;
16109 else if (M == InputsFixed[1] + SourceOffset)
16110 M = (InputsFixed[0] ^ 1) + SourceOffset;
16111
16112 InputsFixed[1] = InputsFixed[0] ^ 1;
16113 }
16114
16115 // Point everything at the fixed inputs.
16116 for (int &M : HalfMask)
16117 if (M == IncomingInputs[0])
16118 M = InputsFixed[0] + SourceOffset;
16119 else if (M == IncomingInputs[1])
16120 M = InputsFixed[1] + SourceOffset;
16121
16122 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16123 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16124 }
16125 } else {
16126 llvm_unreachable("Unhandled input size!");
16127 }
16128
16129 // Now hoist the DWord down to the right half.
16130 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
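// For example, with DestOffset == 0 this picks dword 0 if PSHUFDMask[0] is
// still free and dword 1 otherwise; with DestOffset == 4 it picks dword 2 or 3.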
16131 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16132 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16133 for (int &M : HalfMask)
16134 for (int Input : IncomingInputs)
16135 if (M == Input)
16136 M = FreeDWord * 2 + Input % 2;
16137 };
16138 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16139 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16140 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16141 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16142
16143 // Now enact all the shuffles we've computed to move the inputs into their
16144 // target half.
16145 if (!isNoopShuffleMask(PSHUFLMask))
16146 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16147 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16148 if (!isNoopShuffleMask(PSHUFHMask))
16149 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16150 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16151 if (!isNoopShuffleMask(PSHUFDMask))
16152 V = DAG.getBitcast(
16153 VT,
16154 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16155 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16156
16157 // At this point, each half should contain all its inputs, and we can then
16158 // just shuffle them into their final position.
16159 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16160 "Failed to lift all the high half inputs to the low mask!");
16161 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16162 "Failed to lift all the low half inputs to the high mask!");
16163
16164 // Do a half shuffle for the low mask.
16165 if (!isNoopShuffleMask(LoMask))
16166 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16167 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16168
16169 // Do a half shuffle with the high mask after shifting its values down.
16170 for (int &M : HiMask)
16171 if (M >= 0)
16172 M -= 4;
16173 if (!isNoopShuffleMask(HiMask))
16174 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16175 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16176
16177 return V;
16178}
16179
16180/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16181/// blend if only one input is used.
16182static SDValue lowerShuffleAsBlendOfPSHUFBs(
16183 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16184 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16185 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16186 "Lane crossing shuffle masks not supported");
16187
16188 int NumBytes = VT.getSizeInBits() / 8;
16189 int Size = Mask.size();
16190 int Scale = NumBytes / Size;
16191
16192 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16193 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16194 V1InUse = false;
16195 V2InUse = false;
16196
16197 for (int i = 0; i < NumBytes; ++i) {
16198 int M = Mask[i / Scale];
16199 if (M < 0)
16200 continue;
16201
16202 const int ZeroMask = 0x80;
16203 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16204 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16205 if (Zeroable[i / Scale])
16206 V1Idx = V2Idx = ZeroMask;
16207
16208 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16209 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16210 V1InUse |= (ZeroMask != V1Idx);
16211 V2InUse |= (ZeroMask != V2Idx);
16212 }
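// For example, for a v8i16 shuffle (Size == 8, NumBytes == 16, Scale == 2), a
// mask element M == 3 expands to byte indices 6 and 7 of V1, while M == 9
// expands to byte indices 2 and 3 of V2 with the matching V1 bytes set to the
// 0x80 zeroing index.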
16213
16214 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16215 if (V1InUse)
16216 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16217 DAG.getBuildVector(ShufVT, DL, V1Mask));
16218 if (V2InUse)
16219 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16220 DAG.getBuildVector(ShufVT, DL, V2Mask));
16221
16222 // If we need shuffled inputs from both, blend the two.
16223 SDValue V;
16224 if (V1InUse && V2InUse)
16225 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16226 else
16227 V = V1InUse ? V1 : V2;
16228
16229 // Cast the result back to the correct type.
16230 return DAG.getBitcast(VT, V);
16231}
16232
16233/// Generic lowering of 8-lane i16 shuffles.
16234///
16235/// This handles both single-input shuffles and combined shuffle/blends with
16236/// two inputs. The single input shuffles are immediately delegated to
16237/// a dedicated lowering routine.
16238///
16239/// The blends are lowered in one of three fundamental ways. If there are few
16240/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16241/// of the input is significantly cheaper when lowered as an interleaving of
16242/// the two inputs, try to interleave them. Otherwise, blend the low and high
16243/// halves of the inputs separately (making them have relatively few inputs)
16244/// and then concatenate them.
16245static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16246 const APInt &Zeroable, SDValue V1, SDValue V2,
16247 const X86Subtarget &Subtarget,
16248 SelectionDAG &DAG) {
16249 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16250 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16251 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16252
16253 // Whenever we can lower this as a zext, that instruction is strictly faster
16254 // than any alternative.
16255 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16256 Zeroable, Subtarget, DAG))
16257 return ZExt;
16258
16259 // Try to lower using a truncation.
16260 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16261 Subtarget, DAG))
16262 return V;
16263
16264 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16265
16266 if (NumV2Inputs == 0) {
16267 // Try to use shift instructions.
16268 if (SDValue Shift =
16269 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
16270 Subtarget, DAG, /*BitwiseOnly*/ false))
16271 return Shift;
16272
16273 // Check for being able to broadcast a single element.
16274 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16275 Mask, Subtarget, DAG))
16276 return Broadcast;
16277
16278 // Try to use bit rotation instructions.
16279 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16280 Subtarget, DAG))
16281 return Rotate;
16282
16283 // Use dedicated unpack instructions for masks that match their pattern.
16284 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16285 return V;
16286
16287 // Use dedicated pack instructions for masks that match their pattern.
16288 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16289 Subtarget))
16290 return V;
16291
16292 // Try to use byte rotation instructions.
16293 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16294 Subtarget, DAG))
16295 return Rotate;
16296
16297 // Make a copy of the mask so it can be modified.
16298 SmallVector<int, 8> MutableMask(Mask);
16299 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16300 Subtarget, DAG);
16301 }
16302
16303 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16304 "All single-input shuffles should be canonicalized to be V1-input "
16305 "shuffles.");
16306
16307 // Try to use shift instructions.
16308 if (SDValue Shift =
16309 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
16310 DAG, /*BitwiseOnly*/ false))
16311 return Shift;
16312
16313 // See if we can use SSE4A Extraction / Insertion.
16314 if (Subtarget.hasSSE4A())
16315 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16316 Zeroable, DAG))
16317 return V;
16318
16319 // There are special ways we can lower some single-element blends.
16320 if (NumV2Inputs == 1)
16321 if (SDValue V = lowerShuffleAsElementInsertion(
16322 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16323 return V;
16324
16325 // We have different paths for blend lowering, but they all must use the
16326 // *exact* same predicate.
16327 bool IsBlendSupported = Subtarget.hasSSE41();
16328 if (IsBlendSupported)
16329 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16330 Zeroable, Subtarget, DAG))
16331 return Blend;
16332
16333 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16334 Zeroable, Subtarget, DAG))
16335 return Masked;
16336
16337 // Use dedicated unpack instructions for masks that match their pattern.
16338 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16339 return V;
16340
16341 // Use dedicated pack instructions for masks that match their pattern.
16342 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16343 Subtarget))
16344 return V;
16345
16346 // Try to lower using a truncation.
16347 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16348 Subtarget, DAG))
16349 return V;
16350
16351 // Try to use byte rotation instructions.
16352 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16353 Subtarget, DAG))
16354 return Rotate;
16355
16356 if (SDValue BitBlend =
16357 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16358 return BitBlend;
16359
16360 // Try to use byte shift instructions to mask.
16361 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16362 Zeroable, Subtarget, DAG))
16363 return V;
16364
16365 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
16366 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16367 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
16368 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16369 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16370 !Subtarget.hasVLX()) {
16371 // Check if this is part of a 256-bit vector truncation.
16372 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16373 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16374 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16375 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16376 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16377 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16378 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16379 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16380 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16381 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16382 } else {
16383 SmallVector<SDValue, 4> DWordClearOps(4,
16384 DAG.getConstant(0, DL, MVT::i32));
16385 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16386 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16387 SDValue DWordClearMask =
16388 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16389 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16390 DWordClearMask);
16391 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16392 DWordClearMask);
16393 }
16394 // Now pack things back together.
16395 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16396 if (NumEvenDrops == 2) {
16397 Result = DAG.getBitcast(MVT::v4i32, Result);
16398 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16399 }
16400 return Result;
16401 }
16402
16403 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
16404 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
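// The shift right by 16 below moves each odd word into the low (even) slot of
// its dword, so the PACK keeps exactly the odd elements; on pre-SSE41 targets
// the arithmetic shift sign-extends the word and PACKSS's signed saturation
// then returns it unchanged.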
16405 if (NumOddDrops == 1) {
16406 bool HasSSE41 = Subtarget.hasSSE41();
16407 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16408 DAG.getBitcast(MVT::v4i32, V1),
16409 DAG.getTargetConstant(16, DL, MVT::i8));
16410 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16411 DAG.getBitcast(MVT::v4i32, V2),
16412 DAG.getTargetConstant(16, DL, MVT::i8));
16413 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16414 MVT::v8i16, V1, V2);
16415 }
16416
16417 // Try to lower by permuting the inputs into an unpack instruction.
16418 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16419 Mask, Subtarget, DAG))
16420 return Unpack;
16421
16422 // If we can't directly blend but can use PSHUFB, that will be better as it
16423 // can both shuffle and set up the inefficient blend.
16424 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16425 bool V1InUse, V2InUse;
16426 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16427 Zeroable, DAG, V1InUse, V2InUse);
16428 }
16429
16430 // We can always bit-blend if we have to, so the fallback strategy is to
16431 // decompose into single-input permutes and blends/unpacks.
16432 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16433 Mask, Subtarget, DAG);
16434}
16435
16436/// Lower 8-lane 16-bit floating point shuffles.
16437static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16438 const APInt &Zeroable, SDValue V1, SDValue V2,
16439 const X86Subtarget &Subtarget,
16440 SelectionDAG &DAG) {
16441 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16442 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16443 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16444 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16445
16446 if (Subtarget.hasFP16()) {
16447 if (NumV2Elements == 0) {
16448 // Check for being able to broadcast a single element.
16449 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16450 Mask, Subtarget, DAG))
16451 return Broadcast;
16452 }
16453 if (NumV2Elements == 1 && Mask[0] >= 8)
16454 if (SDValue V = lowerShuffleAsElementInsertion(
16455 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16456 return V;
16457 }
16458
16459 V1 = DAG.getBitcast(MVT::v8i16, V1);
16460 V2 = DAG.getBitcast(MVT::v8i16, V2);
16461 return DAG.getBitcast(MVT::v8f16,
16462 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16463}
16464
16465 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
16466 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
16467 // the active subvector is extracted.
16468static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16469 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16470 const X86Subtarget &Subtarget,
16471 SelectionDAG &DAG) {
16472 MVT MaskVT = VT.changeTypeToInteger();
16473 SDValue MaskNode;
16474 MVT ShuffleVT = VT;
16475 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16476 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16477 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16478 ShuffleVT = V1.getSimpleValueType();
16479
16480 // Adjust mask to correct indices for the second input.
16481 int NumElts = VT.getVectorNumElements();
16482 unsigned Scale = 512 / VT.getSizeInBits();
16483 SmallVector<int, 32> AdjustedMask(Mask);
16484 for (int &M : AdjustedMask)
16485 if (NumElts <= M)
16486 M += (Scale - 1) * NumElts;
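// For example, widening a v16i8 shuffle to v64i8 gives Scale == 4 and
// NumElts == 16, so a mask element of 17 (element 1 of V2) becomes
// 17 + 3 * 16 == 65, which selects element 1 of the widened V2 operand.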
16487 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16488 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16489 } else {
16490 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16491 }
16492
16493 SDValue Result;
16494 if (V2.isUndef())
16495 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16496 else
16497 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16498
16499 if (VT != ShuffleVT)
16500 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16501
16502 return Result;
16503}
16504
16505/// Generic lowering of v16i8 shuffles.
16506///
16507/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16508/// detect any complexity reducing interleaving. If that doesn't help, it uses
16509/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16510/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16511/// back together.
16512static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16513 const APInt &Zeroable, SDValue V1, SDValue V2,
16514 const X86Subtarget &Subtarget,
16515 SelectionDAG &DAG) {
16516 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16517 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16518 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16519
16520 // Try to use shift instructions.
16521 if (SDValue Shift =
16522 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
16523 DAG, /*BitwiseOnly*/ false))
16524 return Shift;
16525
16526 // Try to use byte rotation instructions.
16527 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16528 Subtarget, DAG))
16529 return Rotate;
16530
16531 // Use dedicated pack instructions for masks that match their pattern.
16532 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16533 Subtarget))
16534 return V;
16535
16536 // Try to use a zext lowering.
16537 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16538 Zeroable, Subtarget, DAG))
16539 return ZExt;
16540
16541 // Try to lower using a truncation.
16542 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16543 Subtarget, DAG))
16544 return V;
16545
16546 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16547 Subtarget, DAG))
16548 return V;
16549
16550 // See if we can use SSE4A Extraction / Insertion.
16551 if (Subtarget.hasSSE4A())
16552 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16553 Zeroable, DAG))
16554 return V;
16555
16556 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16557
16558 // For single-input shuffles, there are some nicer lowering tricks we can use.
16559 if (NumV2Elements == 0) {
16560 // Check for being able to broadcast a single element.
16561 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16562 Mask, Subtarget, DAG))
16563 return Broadcast;
16564
16565 // Try to use bit rotation instructions.
16566 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16567 Subtarget, DAG))
16568 return Rotate;
16569
16570 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16571 return V;
16572
16573 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16574 // Notably, this handles splat and partial-splat shuffles more efficiently.
16575 // However, it only makes sense if the pre-duplication shuffle simplifies
16576 // things significantly. Currently, this means we need to be able to
16577 // express the pre-duplication shuffle as an i16 shuffle.
16578 //
16579 // FIXME: We should check for other patterns which can be widened into an
16580 // i16 shuffle as well.
16581 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16582 for (int i = 0; i < 16; i += 2)
16583 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16584 return false;
16585
16586 return true;
16587 };
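// For example, a pair-duplicating mask such as <0,0, 3,3, 5,5, ...> passes
// this check because each byte pair reads a single source byte, while
// <0,1, ...> fails because the first pair reads two different bytes.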
16588 auto tryToWidenViaDuplication = [&]() -> SDValue {
16589 if (!canWidenViaDuplication(Mask))
16590 return SDValue();
16591 SmallVector<int, 4> LoInputs;
16592 copy_if(Mask, std::back_inserter(LoInputs),
16593 [](int M) { return M >= 0 && M < 8; });
16594 array_pod_sort(LoInputs.begin(), LoInputs.end());
16595 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16596 LoInputs.end());
16597 SmallVector<int, 4> HiInputs;
16598 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16599 array_pod_sort(HiInputs.begin(), HiInputs.end());
16600 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16601 HiInputs.end());
16602
16603 bool TargetLo = LoInputs.size() >= HiInputs.size();
16604 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16605 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16606
16607 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16608 SmallDenseMap<int, int, 8> LaneMap;
16609 for (int I : InPlaceInputs) {
16610 PreDupI16Shuffle[I/2] = I/2;
16611 LaneMap[I] = I;
16612 }
16613 int j = TargetLo ? 0 : 4, je = j + 4;
16614 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16615 // Check if j is already a shuffle of this input. This happens when
16616 // there are two adjacent bytes after we move the low one.
16617 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16618 // If we haven't yet mapped the input, search for a slot into which
16619 // we can map it.
16620 while (j < je && PreDupI16Shuffle[j] >= 0)
16621 ++j;
16622
16623 if (j == je)
16624 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16625 return SDValue();
16626
16627 // Map this input with the i16 shuffle.
16628 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16629 }
16630
16631 // Update the lane map based on the mapping we ended up with.
16632 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16633 }
16634 V1 = DAG.getBitcast(
16635 MVT::v16i8,
16636 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16637 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16638
16639 // Unpack the bytes to form the i16s that will be shuffled into place.
16640 bool EvenInUse = false, OddInUse = false;
16641 for (int i = 0; i < 16; i += 2) {
16642 EvenInUse |= (Mask[i + 0] >= 0);
16643 OddInUse |= (Mask[i + 1] >= 0);
16644 if (EvenInUse && OddInUse)
16645 break;
16646 }
16647 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16648 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16649 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16650
16651 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16652 for (int i = 0; i < 16; ++i)
16653 if (Mask[i] >= 0) {
16654 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16655 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16656 if (PostDupI16Shuffle[i / 2] < 0)
16657 PostDupI16Shuffle[i / 2] = MappedMask;
16658 else
16659 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16660 "Conflicting entries in the original shuffle!");
16661 }
16662 return DAG.getBitcast(
16663 MVT::v16i8,
16664 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16665 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16666 };
16667 if (SDValue V = tryToWidenViaDuplication())
16668 return V;
16669 }
16670
16671 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16672 Zeroable, Subtarget, DAG))
16673 return Masked;
16674
16675 // Use dedicated unpack instructions for masks that match their pattern.
16676 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16677 return V;
16678
16679 // Try to use byte shift instructions to mask.
16680 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16681 Zeroable, Subtarget, DAG))
16682 return V;
16683
16684 // Check for compaction patterns.
16685 bool IsSingleInput = V2.isUndef();
16686 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16687
16688 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16689 // with PSHUFB. It is important to do this before we attempt to generate any
16690 // blends but after all of the single-input lowerings. If the single input
16691 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16692 // want to preserve that and we can DAG combine any longer sequences into
16693 // a PSHUFB in the end. But once we start blending from multiple inputs,
16694 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16695 // and there are *very* few patterns that would actually be faster than the
16696 // PSHUFB approach because of its ability to zero lanes.
16697 //
16698 // If the mask is a binary compaction, we can more efficiently perform this
16699 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16700 //
16701 // FIXME: The only exceptions to the above are blends which are exact
16702 // interleavings with direct instructions supporting them. We currently don't
16703 // handle those well here.
16704 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16705 bool V1InUse = false;
16706 bool V2InUse = false;
16707
16708 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16709 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16710
16711 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16712 // do so. This avoids using them to handle blends-with-zero which is
16713 // important as a single pshufb is significantly faster for that.
16714 if (V1InUse && V2InUse) {
16715 if (Subtarget.hasSSE41())
16716 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16717 Zeroable, Subtarget, DAG))
16718 return Blend;
16719
16720 // We can use an unpack to do the blending rather than an or in some
16721 // cases. Even though the or may be (very minorly) more efficient, we
16722 // prefer this lowering because there are common cases where part of
16723 // the complexity of the shuffles goes away when we do the final blend as
16724 // an unpack.
16725 // FIXME: It might be worth trying to detect if the unpack-feeding
16726 // shuffles will both be pshufb, in which case we shouldn't bother with
16727 // this.
16728 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16729 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16730 return Unpack;
16731
16732 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16733 if (Subtarget.hasVBMI())
16734 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16735 DAG);
16736
16737 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16738 if (Subtarget.hasXOP()) {
16739 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16740 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16741 }
16742
16743 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16744 // PALIGNR will be cheaper than the second PSHUFB+OR.
16745 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16746 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16747 return V;
16748 }
16749
16750 return PSHUFB;
16751 }
16752
16753 // There are special ways we can lower some single-element blends.
16754 if (NumV2Elements == 1)
16755 if (SDValue V = lowerShuffleAsElementInsertion(
16756 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16757 return V;
16758
16759 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16760 return Blend;
16761
16762 // Check whether a compaction lowering can be done. This handles shuffles
16763 // which take every Nth element for some even N. See the helper function for
16764 // details.
16765 //
16766 // We special case these as they can be particularly efficiently handled with
16767 // the PACKUSWB instruction on x86 and they show up in common patterns of
16768 // rearranging bytes to truncate wide elements.
16769 if (NumEvenDrops) {
16770 // NumEvenDrops is the power of two stride of the elements. Another way of
16771 // thinking about it is that we need to drop the even elements this many
16772 // times to get the original input.
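// For example, a two-input byte mask of <0, 2, 4, ..., 30> keeps every second
// byte, so NumEvenDrops == 1 and a single PACKUS of the two masked inputs
// produces the result; each further doubling of the stride costs one more
// PACKUS round below.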
16773
16774 // First we need to zero all the dropped bytes.
16775 assert(NumEvenDrops <= 3 &&
16776 "No support for dropping even elements more than 3 times.");
16777 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16778 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16779 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16780 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16781 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16782 WordClearMask);
16783 if (!IsSingleInput)
16784 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16785 WordClearMask);
16786
16787 // Now pack things back together.
16788 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16789 IsSingleInput ? V1 : V2);
16790 for (int i = 1; i < NumEvenDrops; ++i) {
16791 Result = DAG.getBitcast(MVT::v8i16, Result);
16792 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16793 }
16794 return Result;
16795 }
16796
16797 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16798 if (NumOddDrops == 1) {
16799 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16800 DAG.getBitcast(MVT::v8i16, V1),
16801 DAG.getTargetConstant(8, DL, MVT::i8));
16802 if (!IsSingleInput)
16803 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16804 DAG.getBitcast(MVT::v8i16, V2),
16805 DAG.getTargetConstant(8, DL, MVT::i8));
16806 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16807 IsSingleInput ? V1 : V2);
16808 }
16809
16810 // Handle multi-input cases by blending/unpacking single-input shuffles.
16811 if (NumV2Elements > 0)
16812 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16813 Subtarget, DAG);
16814
16815 // The fallback path for single-input shuffles widens this into two v8i16
16816 // vectors with unpacks, shuffles those, and then pulls them back together
16817 // with a pack.
16818 SDValue V = V1;
16819
16820 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16821 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16822 for (int i = 0; i < 16; ++i)
16823 if (Mask[i] >= 0)
16824 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16825
16826 SDValue VLoHalf, VHiHalf;
16827 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16828 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16829 // i16s.
16830 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16831 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16832 // Use a mask to drop the high bytes.
16833 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16834 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16835 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16836
16837 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16838 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16839
16840 // Squash the masks to point directly into VLoHalf.
16841 for (int &M : LoBlendMask)
16842 if (M >= 0)
16843 M /= 2;
16844 for (int &M : HiBlendMask)
16845 if (M >= 0)
16846 M /= 2;
16847 } else {
16848 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16849 // VHiHalf so that we can blend them as i16s.
16850 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16851
16852 VLoHalf = DAG.getBitcast(
16853 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16854 VHiHalf = DAG.getBitcast(
16855 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16856 }
16857
16858 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16859 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16860
16861 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16862}
16863
16864/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16865///
16866/// This routine breaks down the specific type of 128-bit shuffle and
16867/// dispatches to the lowering routines accordingly.
16868static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16869 MVT VT, SDValue V1, SDValue V2,
16870 const APInt &Zeroable,
16871 const X86Subtarget &Subtarget,
16872 SelectionDAG &DAG) {
16873 switch (VT.SimpleTy) {
16874 case MVT::v2i64:
16875 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16876 case MVT::v2f64:
16877 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16878 case MVT::v4i32:
16879 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16880 case MVT::v4f32:
16881 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16882 case MVT::v8i16:
16883 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16884 case MVT::v8f16:
16885 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16886 case MVT::v16i8:
16887 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16888
16889 default:
16890 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16890)
;
16891 }
16892}
16893
16894/// Generic routine to split vector shuffle into half-sized shuffles.
16895///
16896/// This routine just extracts two subvectors, shuffles them independently, and
16897/// then concatenates them back together. This should work effectively with all
16898/// AVX vector shuffle types.
16899static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16900 SDValue V2, ArrayRef<int> Mask,
16901 SelectionDAG &DAG, bool SimpleOnly) {
16902 assert(VT.getSizeInBits() >= 256 &&
16903        "Only for 256-bit or wider vector shuffles!");
16904 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16905 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16906
16907 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16908 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16909
16910 int NumElements = VT.getVectorNumElements();
16911 int SplitNumElements = NumElements / 2;
16912 MVT ScalarVT = VT.getVectorElementType();
16913 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16914
16915 // Use splitVector/extractSubVector so that split build-vectors just build two
16916 // narrower build vectors. This helps shuffling with splats and zeros.
16917 auto SplitVector = [&](SDValue V) {
16918 SDValue LoV, HiV;
16919 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16920 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16921 DAG.getBitcast(SplitVT, HiV));
16922 };
16923
16924 SDValue LoV1, HiV1, LoV2, HiV2;
16925 std::tie(LoV1, HiV1) = SplitVector(V1);
16926 std::tie(LoV2, HiV2) = SplitVector(V2);
16927
16928 // Now create two 4-way blends of these half-width vectors.
16929 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
16930 bool &UseHiV1, bool &UseLoV2,
16931 bool &UseHiV2) {
16932 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
16933 for (int i = 0; i < SplitNumElements; ++i) {
16934 int M = HalfMask[i];
16935 if (M >= NumElements) {
16936 if (M >= NumElements + SplitNumElements)
16937 UseHiV2 = true;
16938 else
16939 UseLoV2 = true;
16940 } else if (M >= 0) {
16941 if (M >= SplitNumElements)
16942 UseHiV1 = true;
16943 else
16944 UseLoV1 = true;
16945 }
16946 }
16947 };
16948
16949 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
16950 if (!SimpleOnly)
16951 return true;
16952
16953 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
16954 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
16955
16956 return !(UseHiV1 || UseHiV2);
16957 };
16958
16959 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16960 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16961 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16962 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16963 for (int i = 0; i < SplitNumElements; ++i) {
16964 int M = HalfMask[i];
16965 if (M >= NumElements) {
16966 V2BlendMask[i] = M - NumElements;
16967 BlendMask[i] = SplitNumElements + i;
16968 } else if (M >= 0) {
16969 V1BlendMask[i] = M;
16970 BlendMask[i] = i;
16971 }
16972 }
16973
16974 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
16975 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
16976
16977 // Because the lowering happens after all combining takes place, we need to
16978 // manually combine these blend masks as much as possible so that we create
16979 // a minimal number of high-level vector shuffle nodes.
16980 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
16981
16982 // First try just blending the halves of V1 or V2.
16983 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16984 return DAG.getUNDEF(SplitVT);
16985 if (!UseLoV2 && !UseHiV2)
16986 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16987 if (!UseLoV1 && !UseHiV1)
16988 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16989
16990 SDValue V1Blend, V2Blend;
16991 if (UseLoV1 && UseHiV1) {
16992 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16993 } else {
16994 // We only use half of V1 so map the usage down into the final blend mask.
16995 V1Blend = UseLoV1 ? LoV1 : HiV1;
16996 for (int i = 0; i < SplitNumElements; ++i)
16997 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16998 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16999 }
17000 if (UseLoV2 && UseHiV2) {
17001 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17002 } else {
17003 // We only use half of V2 so map the usage down into the final blend mask.
17004 V2Blend = UseLoV2 ? LoV2 : HiV2;
17005 for (int i = 0; i < SplitNumElements; ++i)
17006 if (BlendMask[i] >= SplitNumElements)
17007 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
17008 }
17009 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
17010 };
17011
17012 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
17013 return SDValue();
17014
17015 SDValue Lo = HalfBlend(LoMask);
17016 SDValue Hi = HalfBlend(HiMask);
17017 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17018}
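// Worked example (illustrative sketch; the mask is assumed): a v8i32 shuffle
// with Mask = <0,8,1,9,6,14,7,15> splits into Lo = shuffle(LoV1, LoV2,
// <0,4,1,5>) and Hi = shuffle(HiV1, HiV2, <2,6,3,7>), which are concatenated
// back to v8i32. Each half-width blend only touches one half of each input,
// so no extra cross-half shuffles are generated.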
17019
17020/// Either split a vector in halves or decompose the shuffles and the
17021/// blend/unpack.
17022///
17023/// This is provided as a good fallback for many lowerings of non-single-input
17024/// shuffles with more than one 128-bit lane. In those cases, we want to select
17025/// between splitting the shuffle into 128-bit components and stitching those
17026/// back together vs. extracting the single-input shuffles and blending those
17027/// results.
17028static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
17029 SDValue V2, ArrayRef<int> Mask,
17030 const X86Subtarget &Subtarget,
17031 SelectionDAG &DAG) {
17032 assert(!V2.isUndef() && "This routine must not be used to lower single-input "(static_cast <bool> (!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? void (0) :
__assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17033, __extension__
__PRETTY_FUNCTION__))
20
'?' condition is true
17033 "shuffles as it could then recurse on itself.")(static_cast <bool> (!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? void (0) :
__assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17033, __extension__
__PRETTY_FUNCTION__))
;
17034 int Size = Mask.size();
17035
17036 // If this can be modeled as a broadcast of two elements followed by a blend,
17037 // prefer that lowering. This is especially important because broadcasts can
17038 // often fold with memory operands.
17039 auto DoBothBroadcast = [&] {
17040 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
17041 for (int M : Mask)
17042 if (M >= Size) {
17043 if (V2BroadcastIdx < 0)
17044 V2BroadcastIdx = M - Size;
17045 else if (M - Size != V2BroadcastIdx)
17046 return false;
17047 } else if (M >= 0) {
17048 if (V1BroadcastIdx < 0)
17049 V1BroadcastIdx = M;
17050 else if (M != V1BroadcastIdx)
17051 return false;
17052 }
17053 return true;
17054 };
17055 if (DoBothBroadcast())
21: Taking true branch
17056 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
22: Calling 'lowerShuffleAsDecomposedShuffleMerge'
17057 DAG);
17058
17059 // If the inputs all stem from a single 128-bit lane of each input, then we
17060 // split them rather than blending because the split will decompose to
17061 // unusually few instructions.
17062 int LaneCount = VT.getSizeInBits() / 128;
17063 int LaneSize = Size / LaneCount;
17064 SmallBitVector LaneInputs[2];
17065 LaneInputs[0].resize(LaneCount, false);
17066 LaneInputs[1].resize(LaneCount, false);
17067 for (int i = 0; i < Size; ++i)
17068 if (Mask[i] >= 0)
17069 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17070 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17071 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17072 /*SimpleOnly*/ false);
17073
17074 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17075 // requires that the decomposed single-input shuffles don't end up here.
17076 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17077 DAG);
17078}
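// Worked example (illustrative sketch; the mask is assumed): a v8f32 mask such
// as <2,10,2,10,2,10,2,10> reads a single element from each source (V1[2] and
// V2[2]), so DoBothBroadcast() returns true and the shuffle is decomposed into
// two single-input broadcasts that are then blended together.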
17079
17080// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17081// TODO: Extend to support v8f32 (+ 512-bit shuffles).
17082static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17083 SDValue V1, SDValue V2,
17084 ArrayRef<int> Mask,
17085 SelectionDAG &DAG) {
17086 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17087
17088 int LHSMask[4] = {-1, -1, -1, -1};
17089 int RHSMask[4] = {-1, -1, -1, -1};
17090 unsigned SHUFPMask = 0;
17091
17092 // As SHUFPD uses a single LHS/RHS element per lane, we can always
17093 // perform the shuffle once the lanes have been shuffled in place.
17094 for (int i = 0; i != 4; ++i) {
17095 int M = Mask[i];
17096 if (M < 0)
17097 continue;
17098 int LaneBase = i & ~1;
17099 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17100 LaneMask[LaneBase + (M & 1)] = M;
17101 SHUFPMask |= (M & 1) << i;
17102 }
17103
17104 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17105 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17106 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17107 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17108}
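// Worked example (illustrative sketch; the mask is assumed): for a v4f64
// Mask = <1,5,2,7> the loop builds LHSMask = <-1,1,2,-1>, RHSMask =
// <-1,5,-1,7> and SHUFPMask = 0xB, so SHUFP(LHS, RHS, 0xB) reassembles
// <1,5,2,7> once the two lane shuffles have put 1,2 and 5,7 in place.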
17109
17110/// Lower a vector shuffle crossing multiple 128-bit lanes as
17111/// a lane permutation followed by a per-lane permutation.
17112///
17113/// This is mainly for cases where we can have non-repeating permutes
17114/// in each lane.
17115///
17116/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17117/// we should investigate merging them.
17118static SDValue lowerShuffleAsLanePermuteAndPermute(
17119 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17120 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17121 int NumElts = VT.getVectorNumElements();
17122 int NumLanes = VT.getSizeInBits() / 128;
17123 int NumEltsPerLane = NumElts / NumLanes;
17124 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17125
17126 /// Attempts to find a sublane permute with the given size
17127 /// that gets all elements into their target lanes.
17128 ///
17129 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered shuffle.
17130 /// If unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
17131 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17132 int NumSublanesPerLane = NumSublanes / NumLanes;
17133 int NumEltsPerSublane = NumElts / NumSublanes;
17134
17135 SmallVector<int, 16> CrossLaneMask;
17136 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17137 // CrossLaneMask but one entry == one sublane.
17138 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17139
17140 for (int i = 0; i != NumElts; ++i) {
17141 int M = Mask[i];
17142 if (M < 0)
17143 continue;
17144
17145 int SrcSublane = M / NumEltsPerSublane;
17146 int DstLane = i / NumEltsPerLane;
17147
17148 // We only need to get the elements into the right lane, not sublane.
17149 // So search all sublanes that make up the destination lane.
17150 bool Found = false;
17151 int DstSubStart = DstLane * NumSublanesPerLane;
17152 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17153 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17154 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17155 continue;
17156
17157 Found = true;
17158 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17159 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17160 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17161 break;
17162 }
17163 if (!Found)
17164 return SDValue();
17165 }
17166
17167 // Fill CrossLaneMask using CrossLaneMaskLarge.
17168 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17169
17170 if (!CanUseSublanes) {
17171 // If we're only shuffling a single lowest lane and the rest are identity
17172 // then don't bother.
17173 // TODO - isShuffleMaskInputInPlace could be extended to something like
17174 // this.
17175 int NumIdentityLanes = 0;
17176 bool OnlyShuffleLowestLane = true;
17177 for (int i = 0; i != NumLanes; ++i) {
17178 int LaneOffset = i * NumEltsPerLane;
17179 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17180 i * NumEltsPerLane))
17181 NumIdentityLanes++;
17182 else if (CrossLaneMask[LaneOffset] != 0)
17183 OnlyShuffleLowestLane = false;
17184 }
17185 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17186 return SDValue();
17187 }
17188
17189 // Avoid returning the same shuffle operation. For example,
17190 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17191 // undef:v16i16
17192 if (CrossLaneMask == Mask || InLaneMask == Mask)
17193 return SDValue();
17194
17195 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17196 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17197 InLaneMask);
17198 };
17199
17200 // First attempt a solution with full lanes.
17201 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17202 return V;
17203
17204 // The rest of the solutions use sublanes.
17205 if (!CanUseSublanes)
17206 return SDValue();
17207
17208 // Then attempt a solution with 64-bit sublanes (vpermq).
17209 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17210 return V;
17211
17212 // If that doesn't work and we have fast variable cross-lane shuffle,
17213 // attempt 32-bit sublanes (vpermd).
17214 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17215 return SDValue();
17216
17217 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17218}
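// Worked example (illustrative sketch; the mask is assumed): a full reverse of
// v8f32, Mask = <7,6,5,4,3,2,1,0>, succeeds with whole-lane sublanes:
// CrossLaneMask = <4,5,6,7,0,1,2,3> swaps the 128-bit lanes and
// InLaneMask = <3,2,1,0,7,6,5,4> then reverses the elements within each lane.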
17219
17220 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
17221static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
17222 SmallVector<int> &InLaneMask) {
17223 int Size = Mask.size();
17224 InLaneMask.assign(Mask.begin(), Mask.end());
17225 for (int i = 0; i < Size; ++i) {
17226 int &M = InLaneMask[i];
17227 if (M < 0)
17228 continue;
17229 if (((M % Size) / LaneSize) != (i / LaneSize))
17230 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17231 }
17232}
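// Worked example (illustrative sketch; the mask is assumed): for v8f32
// (LaneSize = 4) and Mask = <6,7,4,5,2,3,0,1> every element crosses a lane, so
// each entry is remapped to the same in-lane slot of a second operand (offset
// by Size), giving InLaneMask = <10,11,8,9,14,15,12,13>. The caller supplies
// that operand by lane-flipping V1, making the shuffle purely in-lane.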
17233
17234/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17235/// source with a lane permutation.
17236///
17237/// This lowering strategy results in four instructions in the worst case for a
17238/// single-input cross lane shuffle which is lower than any other fully general
17239/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17240/// shuffle pattern should be handled prior to trying this lowering.
17241static SDValue lowerShuffleAsLanePermuteAndShuffle(
17242 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17243 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17244 // FIXME: This should probably be generalized for 512-bit vectors as well.
17245 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17246 int Size = Mask.size();
17247 int LaneSize = Size / 2;
17248
17249 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17250 // Only do this if the elements aren't all from the lower lane,
17251 // otherwise we're (probably) better off doing a split.
17252 if (VT == MVT::v4f64 &&
17253 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17254 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17255
17256 // If there are only inputs from one 128-bit lane, splitting will in fact be
17257 // less expensive. The flags track whether the given lane contains an element
17258 // that crosses to another lane.
17259 bool AllLanes;
17260 if (!Subtarget.hasAVX2()) {
17261 bool LaneCrossing[2] = {false, false};
17262 for (int i = 0; i < Size; ++i)
17263 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17264 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17265 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17266 } else {
17267 bool LaneUsed[2] = {false, false};
17268 for (int i = 0; i < Size; ++i)
17269 if (Mask[i] >= 0)
17270 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17271 AllLanes = LaneUsed[0] && LaneUsed[1];
17272 }
17273
17274 // TODO - we could support shuffling V2 in the Flipped input.
17275 assert(V2.isUndef() &&
17276        "This last part of this routine only works on single input shuffles");
17277
17278 SmallVector<int> InLaneMask;
17279 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17280
17281 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17282        "In-lane shuffle mask expected");
17283
17284 // If we're not using both lanes in each lane and the inlane mask is not
17285 // repeating, then we're better off splitting.
17286 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17287 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17288 /*SimpleOnly*/ false);
17289
17290 // Flip the lanes, and shuffle the results which should now be in-lane.
17291 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17292 SDValue Flipped = DAG.getBitcast(PVT, V1);
17293 Flipped =
17294 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17295 Flipped = DAG.getBitcast(VT, Flipped);
17296 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17297}
17298
17299/// Handle lowering 2-lane 128-bit shuffles.
17300static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17301 SDValue V2, ArrayRef<int> Mask,
17302 const APInt &Zeroable,
17303 const X86Subtarget &Subtarget,
17304 SelectionDAG &DAG) {
17305 if (V2.isUndef()) {
17306 // Attempt to match VBROADCAST*128 subvector broadcast load.
17307 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17308 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17309 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17310 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17311 MVT MemVT = VT.getHalfNumVectorElementsVT();
17312 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17313 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17314 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17315 VT, MemVT, Ld, Ofs, DAG))
17316 return BcstLd;
17317 }
17318
17319 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17320 if (Subtarget.hasAVX2())
17321 return SDValue();
17322 }
17323
17324 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17325
17326 SmallVector<int, 4> WidenedMask;
17327 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17328 return SDValue();
17329
17330 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17331 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17332
17333 // Try to use an insert into a zero vector.
17334 if (WidenedMask[0] == 0 && IsHighZero) {
17335 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17336 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17337 DAG.getIntPtrConstant(0, DL));
17338 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17339 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17340 DAG.getIntPtrConstant(0, DL));
17341 }
17342
17343 // TODO: If minimizing size and one of the inputs is a zero vector and the
17344 // zero vector has only one use, we could use a VPERM2X128 to save the
17345 // instruction bytes needed to explicitly generate the zero vector.
17346
17347 // Blends are faster and handle all the non-lane-crossing cases.
17348 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17349 Subtarget, DAG))
17350 return Blend;
17351
17352 // If either input operand is a zero vector, use VPERM2X128 because its mask
17353 // allows us to replace the zero input with an implicit zero.
17354 if (!IsLowZero && !IsHighZero) {
17355 // Check for patterns which can be matched with a single insert of a 128-bit
17356 // subvector.
17357 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17358 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17359
17360 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17361 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17362 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17363 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17364 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17365 OnlyUsesV1 ? V1 : V2,
17366 DAG.getIntPtrConstant(0, DL));
17367 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17368 DAG.getIntPtrConstant(2, DL));
17369 }
17370 }
17371
17372 // Try to use SHUF128 if possible.
17373 if (Subtarget.hasVLX()) {
17374 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17375 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17376 ((WidenedMask[1] % 2) << 1);
17377 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17378 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17379 }
17380 }
17381 }
17382
17383 // Otherwise form a 128-bit permutation. After accounting for undefs,
17384 // convert the 64-bit shuffle mask selection values into 128-bit
17385 // selection bits by dividing the indexes by 2 and shifting into positions
17386 // defined by a vperm2*128 instruction's immediate control byte.
17387
17388 // The immediate permute control byte looks like this:
17389 // [1:0] - select 128 bits from sources for low half of destination
17390 // [2] - ignore
17391 // [3] - zero low half of destination
17392 // [5:4] - select 128 bits from sources for high half of destination
17393 // [6] - ignore
17394 // [7] - zero high half of destination
17395
17396 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17397        (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17398
17399 unsigned PermMask = 0;
17400 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17401 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17402
17403 // Check the immediate mask and replace unused sources with undef.
17404 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17405 V1 = DAG.getUNDEF(VT);
17406 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17407 V2 = DAG.getUNDEF(VT);
17408
17409 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17410 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17411}
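// Worked example (illustrative sketch; the mask is assumed): a v4f64
// Mask = <2,3,4,5> widens to <1,2>, so PermMask = (1 << 0) | (2 << 4) = 0x21,
// the usual vperm2f128 selection of the high half of V1 and the low half of V2.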
17412
17413/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17414/// shuffling each lane.
17415///
17416/// This attempts to create a repeated lane shuffle where each lane uses one
17417/// or two of the lanes of the inputs. The lanes of the input vectors are
17418/// shuffled in one or two independent shuffles to get the lanes into the
17419/// position needed by the final shuffle.
17420static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17421 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17422 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17423 assert(!V2.isUndef() && "This is only useful with multiple inputs.")(static_cast <bool> (!V2.isUndef() && "This is only useful with multiple inputs."
) ? void (0) : __assert_fail ("!V2.isUndef() && \"This is only useful with multiple inputs.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 17423, __extension__
__PRETTY_FUNCTION__))
;
17424
17425 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17426 return SDValue();
17427
17428 int NumElts = Mask.size();
17429 int NumLanes = VT.getSizeInBits() / 128;
17430 int NumLaneElts = 128 / VT.getScalarSizeInBits();
17431 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17432 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17433
17434 // First pass will try to fill in the RepeatMask from lanes that need two
17435 // sources.
17436 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17437 int Srcs[2] = {-1, -1};
17438 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17439 for (int i = 0; i != NumLaneElts; ++i) {
17440 int M = Mask[(Lane * NumLaneElts) + i];
17441 if (M < 0)
17442 continue;
17443 // Determine which of the possible input lanes (NumLanes from each source)
17444 // this element comes from. Assign that as one of the sources for this
17445 // lane. We can assign up to 2 sources for this lane. If we run out of
17446 // sources we can't do anything.
17447 int LaneSrc = M / NumLaneElts;
17448 int Src;
17449 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17450 Src = 0;
17451 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17452 Src = 1;
17453 else
17454 return SDValue();
17455
17456 Srcs[Src] = LaneSrc;
17457 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17458 }
17459
17460 // If this lane has two sources, see if it fits with the repeat mask so far.
17461 if (Srcs[1] < 0)
17462 continue;
17463
17464 LaneSrcs[Lane][0] = Srcs[0];
17465 LaneSrcs[Lane][1] = Srcs[1];
17466
17467 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17468 assert(M1.size() == M2.size() && "Unexpected mask size");
17469 for (int i = 0, e = M1.size(); i != e; ++i)
17470 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17471 return false;
17472 return true;
17473 };
17474
17475 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17476 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17477 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17478 int M = Mask[i];
17479 if (M < 0)
17480 continue;
17481 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17482        "Unexpected mask element");
17483 MergedMask[i] = M;
17484 }
17485 };
17486
17487 if (MatchMasks(InLaneMask, RepeatMask)) {
17488 // Merge this lane mask into the final repeat mask.
17489 MergeMasks(InLaneMask, RepeatMask);
17490 continue;
17491 }
17492
17493 // Didn't find a match. Swap the operands and try again.
17494 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17495 ShuffleVectorSDNode::commuteMask(InLaneMask);
17496
17497 if (MatchMasks(InLaneMask, RepeatMask)) {
17498 // Merge this lane mask into the final repeat mask.
17499 MergeMasks(InLaneMask, RepeatMask);
17500 continue;
17501 }
17502
17503 // Couldn't find a match with the operands in either order.
17504 return SDValue();
17505 }
17506
17507 // Now handle any lanes with only one source.
17508 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17509 // If this lane has already been processed, skip it.
17510 if (LaneSrcs[Lane][0] >= 0)
17511 continue;
17512
17513 for (int i = 0; i != NumLaneElts; ++i) {
17514 int M = Mask[(Lane * NumLaneElts) + i];
17515 if (M < 0)
17516 continue;
17517
17518 // If RepeatMask isn't defined yet we can define it ourself.
17519 if (RepeatMask[i] < 0)
17520 RepeatMask[i] = M % NumLaneElts;
17521
17522 if (RepeatMask[i] < NumElts) {
17523 if (RepeatMask[i] != M % NumLaneElts)
17524 return SDValue();
17525 LaneSrcs[Lane][0] = M / NumLaneElts;
17526 } else {
17527 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17528 return SDValue();
17529 LaneSrcs[Lane][1] = M / NumLaneElts;
17530 }
17531 }
17532
17533 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17534 return SDValue();
17535 }
17536
17537 SmallVector<int, 16> NewMask(NumElts, -1);
17538 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17539 int Src = LaneSrcs[Lane][0];
17540 for (int i = 0; i != NumLaneElts; ++i) {
17541 int M = -1;
17542 if (Src >= 0)
17543 M = Src * NumLaneElts + i;
17544 NewMask[Lane * NumLaneElts + i] = M;
17545 }
17546 }
17547 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17548 // Ensure we didn't get back the shuffle we started with.
17549 // FIXME: This is a hack to make up for some splat handling code in
17550 // getVectorShuffle.
17551 if (isa<ShuffleVectorSDNode>(NewV1) &&
17552 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17553 return SDValue();
17554
17555 for (int Lane = 0; Lane != NumLanes; ++Lane) {
17556 int Src = LaneSrcs[Lane][1];
17557 for (int i = 0; i != NumLaneElts; ++i) {
17558 int M = -1;
17559 if (Src >= 0)
17560 M = Src * NumLaneElts + i;
17561 NewMask[Lane * NumLaneElts + i] = M;
17562 }
17563 }
17564 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17565 // Ensure we didn't get back the shuffle we started with.
17566 // FIXME: This is a hack to make up for some splat handling code in
17567 // getVectorShuffle.
17568 if (isa<ShuffleVectorSDNode>(NewV2) &&
17569 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17570 return SDValue();
17571
17572 for (int i = 0; i != NumElts; ++i) {
17573 if (Mask[i] < 0) {
17574 NewMask[i] = -1;
17575 continue;
17576 }
17577 NewMask[i] = RepeatMask[i % NumLaneElts];
17578 if (NewMask[i] < 0)
17579 continue;
17580
17581 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17582 }
17583 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17584}
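// Worked example (illustrative sketch; the mask is assumed): for v8f32
// Mask = <0,12,1,13,4,8,5,9> both lanes repeat the pattern <0,8,1,9>, with lane
// sources {V1 lane 0, V2 lane 1} and {V1 lane 1, V2 lane 0}. NewV1 stays V1,
// NewV2 permutes V2's lanes via <12,13,14,15,8,9,10,11>, and the final
// lane-repeated mask <0,8,1,9,4,12,5,13> is an UNPCKLPS of the two.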
17585
17586/// If the input shuffle mask results in a vector that is undefined in all upper
17587/// or lower half elements and that mask accesses only 2 halves of the
17588/// shuffle's operands, return true. A mask of half the width with mask indexes
17589/// adjusted to access the extracted halves of the original shuffle operands is
17590/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17591/// lower half of each input operand is accessed.
17592static bool
17593getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17594 int &HalfIdx1, int &HalfIdx2) {
17595 assert((Mask.size() == HalfMask.size() * 2) &&
17596        "Expected input mask to be twice as long as output");
17597
17598 // Exactly one half of the result must be undef to allow narrowing.
17599 bool UndefLower = isUndefLowerHalf(Mask);
17600 bool UndefUpper = isUndefUpperHalf(Mask);
17601 if (UndefLower == UndefUpper)
17602 return false;
17603
17604 unsigned HalfNumElts = HalfMask.size();
17605 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17606 HalfIdx1 = -1;
17607 HalfIdx2 = -1;
17608 for (unsigned i = 0; i != HalfNumElts; ++i) {
17609 int M = Mask[i + MaskIndexOffset];
17610 if (M < 0) {
17611 HalfMask[i] = M;
17612 continue;
17613 }
17614
17615 // Determine which of the 4 half vectors this element is from.
17616 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17617 int HalfIdx = M / HalfNumElts;
17618
17619 // Determine the element index into its half vector source.
17620 int HalfElt = M % HalfNumElts;
17621
17622 // We can shuffle with up to 2 half vectors, set the new 'half'
17623 // shuffle mask accordingly.
17624 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17625 HalfMask[i] = HalfElt;
17626 HalfIdx1 = HalfIdx;
17627 continue;
17628 }
17629 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17630 HalfMask[i] = HalfElt + HalfNumElts;
17631 HalfIdx2 = HalfIdx;
17632 continue;
17633 }
17634
17635 // Too many half vectors referenced.
17636 return false;
17637 }
17638
17639 return true;
17640}
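// Worked example (illustrative sketch; the mask is assumed): for a v8i32
// Mask = <u,u,u,u,0,12,1,13> the lower half of the result is undef, the defined
// half touches only the low half of V1 (HalfIdx1 = 0) and the high half of V2
// (HalfIdx2 = 3), and HalfMask becomes <0,4,1,5>.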
17641
17642/// Given the output values from getHalfShuffleMask(), create a half width
17643/// shuffle of extracted vectors followed by an insert back to full width.
17644static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17645 ArrayRef<int> HalfMask, int HalfIdx1,
17646 int HalfIdx2, bool UndefLower,
17647 SelectionDAG &DAG, bool UseConcat = false) {
17648 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17649 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17650
17651 MVT VT = V1.getSimpleValueType();
17652 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17653 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17654
17655 auto getHalfVector = [&](int HalfIdx) {
17656 if (HalfIdx < 0)
17657 return DAG.getUNDEF(HalfVT);
17658 SDValue V = (HalfIdx < 2 ? V1 : V2);
17659 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17660 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17661 DAG.getIntPtrConstant(HalfIdx, DL));
17662 };
17663
17664 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17665 SDValue Half1 = getHalfVector(HalfIdx1);
17666 SDValue Half2 = getHalfVector(HalfIdx2);
17667 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17668 if (UseConcat) {
17669 SDValue Op0 = V;
17670 SDValue Op1 = DAG.getUNDEF(HalfVT);
17671 if (UndefLower)
17672 std::swap(Op0, Op1);
17673 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17674 }
17675
17676 unsigned Offset = UndefLower ? HalfNumElts : 0;
17677 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17678 DAG.getIntPtrConstant(Offset, DL));
17679}
17680
17681/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17682/// This allows for fast cases such as subvector extraction/insertion
17683/// or shuffling smaller vector types which can lower more efficiently.
17684static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17685 SDValue V2, ArrayRef<int> Mask,
17686 const X86Subtarget &Subtarget,
17687 SelectionDAG &DAG) {
17688 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17689        "Expected 256-bit or 512-bit vector");
17690
17691 bool UndefLower = isUndefLowerHalf(Mask);
17692 if (!UndefLower && !isUndefUpperHalf(Mask))
17693 return SDValue();
17694
17695 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17696        "Completely undef shuffle mask should have been simplified already");
17697
17698 // Upper half is undef and lower half is whole upper subvector.
17699 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17700 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17701 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17702 if (!UndefLower &&
17703 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17704 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17705 DAG.getIntPtrConstant(HalfNumElts, DL));
17706 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17707 DAG.getIntPtrConstant(0, DL));
17708 }
17709
17710 // Lower half is undef and upper half is whole lower subvector.
17711 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17712 if (UndefLower &&
17713 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17714 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17715 DAG.getIntPtrConstant(0, DL));
17716 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17717 DAG.getIntPtrConstant(HalfNumElts, DL));
17718 }
17719
17720 int HalfIdx1, HalfIdx2;
17721 SmallVector<int, 8> HalfMask(HalfNumElts);
17722 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17723 return SDValue();
17724
17725 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17726
17727 // Only shuffle the halves of the inputs when useful.
17728 unsigned NumLowerHalves =
17729 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17730 unsigned NumUpperHalves =
17731 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17732 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17733
17734 // Determine the larger pattern of undef/halves, then decide if it's worth
17735 // splitting the shuffle based on subtarget capabilities and types.
17736 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17737 if (!UndefLower) {
17738 // XXXXuuuu: no insert is needed.
17739 // Always extract lowers when setting lower - these are all free subreg ops.
17740 if (NumUpperHalves == 0)
17741 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17742 UndefLower, DAG);
17743
17744 if (NumUpperHalves == 1) {
17745 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17746 if (Subtarget.hasAVX2()) {
17747 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
17748 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17749 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17750 (!isSingleSHUFPSMask(HalfMask) ||
17751 Subtarget.hasFastVariableCrossLaneShuffle()))
17752 return SDValue();
17753 // If this is a unary shuffle (assume that the 2nd operand is
17754 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17755 // are better off extracting the upper half of 1 operand and using a
17756 // narrow shuffle.
17757 if (EltWidth == 64 && V2.isUndef())
17758 return SDValue();
17759 }
17760 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17761 if (Subtarget.hasAVX512() && VT.is512BitVector())
17762 return SDValue();
17763 // Extract + narrow shuffle is better than the wide alternative.
17764 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17765 UndefLower, DAG);
17766 }
17767
17768 // Don't extract both uppers, instead shuffle and then extract.
17769 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17770 return SDValue();
17771 }
17772
17773 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17774 if (NumUpperHalves == 0) {
17775 // AVX2 has efficient 64-bit element cross-lane shuffles.
17776 // TODO: Refine to account for unary shuffle, splat, and other masks?
17777 if (Subtarget.hasAVX2() && EltWidth == 64)
17778 return SDValue();
17779 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17780 if (Subtarget.hasAVX512() && VT.is512BitVector())
17781 return SDValue();
17782 // Narrow shuffle + insert is better than the wide alternative.
17783 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17784 UndefLower, DAG);
17785 }
17786
17787 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17788 return SDValue();
17789}
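// Worked example (illustrative sketch; the mask is assumed): a v8f32
// Mask = <2,10,3,11,u,u,u,u> has an undef upper half and uses only the low
// halves of V1 and V2 (NumUpperHalves == 0), so it lowers to two free
// subvector extracts, a v4f32 shuffle <2,6,3,7>, and an insert at offset 0.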
17790
17791/// Handle case where shuffle sources are coming from the same 128-bit lane and
17792/// every lane can be represented as the same repeating mask - allowing us to
17793/// shuffle the sources with the repeating shuffle and then permute the result
17794/// to the destination lanes.
17795static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17796 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17797 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17798 int NumElts = VT.getVectorNumElements();
17799 int NumLanes = VT.getSizeInBits() / 128;
17800 int NumLaneElts = NumElts / NumLanes;
17801
17802 // On AVX2 we may be able to just shuffle the lowest elements and then
17803 // broadcast the result.
17804 if (Subtarget.hasAVX2()) {
17805 for (unsigned BroadcastSize : {16, 32, 64}) {
17806 if (BroadcastSize <= VT.getScalarSizeInBits())
17807 continue;
17808 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17809
17810 // Attempt to match a repeating pattern every NumBroadcastElts,
17811 // accounting for UNDEFs, but only referencing the lowest 128-bit
17812 // lane of the inputs.
17813 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17814 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17815 for (int j = 0; j != NumBroadcastElts; ++j) {
17816 int M = Mask[i + j];
17817 if (M < 0)
17818 continue;
17819 int &R = RepeatMask[j];
17820 if (0 != ((M % NumElts) / NumLaneElts))
17821 return false;
17822 if (0 <= R && R != M)
17823 return false;
17824 R = M;
17825 }
17826 return true;
17827 };
17828
17829 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17830 if (!FindRepeatingBroadcastMask(RepeatMask))
17831 continue;
17832
17833 // Shuffle the (lowest) repeated elements in place for broadcast.
17834 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17835
17836 // Shuffle the actual broadcast.
17837 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17838 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17839 for (int j = 0; j != NumBroadcastElts; ++j)
17840 BroadcastMask[i + j] = j;
17841 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17842 BroadcastMask);
17843 }
17844 }
17845
17846 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17847 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17848 return SDValue();
17849
17850 // Bail if we already have a repeated lane shuffle mask.
17851 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17852 return SDValue();
17853
17854 // Helper to look for a repeated mask in each split sublane, and check that
17855 // those sublanes can then be permuted into place.
17856 auto ShuffleSubLanes = [&](int SubLaneScale) {
17857 int NumSubLanes = NumLanes * SubLaneScale;
17858 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17859
17860 // Check that all the sources are coming from the same lane and see if we
17861 // can form a repeating shuffle mask (local to each sub-lane). At the same
17862 // time, determine the source sub-lane for each destination sub-lane.
17863 int TopSrcSubLane = -1;
17864 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17865 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17866 SubLaneScale,
17867 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17868
17869 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17870 // Extract the sub-lane mask, check that it all comes from the same lane
17871 // and normalize the mask entries to come from the first lane.
17872 int SrcLane = -1;
17873 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17874 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17875 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17876 if (M < 0)
17877 continue;
17878 int Lane = (M % NumElts) / NumLaneElts;
17879 if ((0 <= SrcLane) && (SrcLane != Lane))
17880 return SDValue();
17881 SrcLane = Lane;
17882 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17883 SubLaneMask[Elt] = LocalM;
17884 }
17885
17886 // Whole sub-lane is UNDEF.
17887 if (SrcLane < 0)
17888 continue;
17889
17890 // Attempt to match against the candidate repeated sub-lane masks.
17891 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17892 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17893 for (int i = 0; i != NumSubLaneElts; ++i) {
17894 if (M1[i] < 0 || M2[i] < 0)
17895 continue;
17896 if (M1[i] != M2[i])
17897 return false;
17898 }
17899 return true;
17900 };
17901
17902 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17903 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17904 continue;
17905
17906 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17907 for (int i = 0; i != NumSubLaneElts; ++i) {
17908 int M = SubLaneMask[i];
17909 if (M < 0)
17910 continue;
17911 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17912        "Unexpected mask element");
17913 RepeatedSubLaneMask[i] = M;
17914 }
17915
17916 // Track the top most source sub-lane - by setting the remaining to
17917 // UNDEF we can greatly simplify shuffle matching.
17918 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17919 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17920 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17921 break;
17922 }
17923
17924 // Bail if we failed to find a matching repeated sub-lane mask.
17925 if (Dst2SrcSubLanes[DstSubLane] < 0)
17926 return SDValue();
17927 }
17928 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17929        "Unexpected source lane");
17930
17931 // Create a repeating shuffle mask for the entire vector.
17932 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17933 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17934 int Lane = SubLane / SubLaneScale;
17935 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17936 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17937 int M = RepeatedSubLaneMask[Elt];
17938 if (M < 0)
17939 continue;
17940 int Idx = (SubLane * NumSubLaneElts) + Elt;
17941 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17942 }
17943 }
17944
17945 // Shuffle each source sub-lane to its destination.
17946 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17947 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17948 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17949 if (SrcSubLane < 0)
17950 continue;
17951 for (int j = 0; j != NumSubLaneElts; ++j)
17952 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17953 }
17954
17955 // Avoid returning the same shuffle operation.
17956 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
17957 if (RepeatedMask == Mask || SubLaneMask == Mask)
17958 return SDValue();
17959
17960 SDValue RepeatedShuffle =
17961 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17962
17963 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17964 SubLaneMask);
17965 };
17966
17967 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17968 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
17969 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
17970 // Otherwise we can only permute whole 128-bit lanes.
17971 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
17972 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
17973 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
17974 MinSubLaneScale = 2;
17975 MaxSubLaneScale =
17976 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
17977 }
17978 if (Subtarget.hasBWI() && VT == MVT::v64i8)
17979 MinSubLaneScale = MaxSubLaneScale = 4;
17980
17981 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
17982 if (SDValue Shuffle = ShuffleSubLanes(Scale))
17983 return Shuffle;
17984
17985 return SDValue();
17986}
17987
17988static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17989 bool &ForceV1Zero, bool &ForceV2Zero,
17990 unsigned &ShuffleImm, ArrayRef<int> Mask,
17991 const APInt &Zeroable) {
17992 int NumElts = VT.getVectorNumElements();
17993 assert(VT.getScalarSizeInBits() == 64 &&
17994        (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17995        "Unexpected data type for VSHUFPD");
17996 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17997        "Illegal shuffle mask");
17998
17999 bool ZeroLane[2] = { true, true };
18000 for (int i = 0; i < NumElts; ++i)
18001 ZeroLane[i & 1] &= Zeroable[i];
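// (Descriptive note, not in the original source: after this loop ZeroLane[0]
//  is true only if every even-indexed mask element is zeroable and ZeroLane[1]
//  only if every odd-indexed one is, i.e. the corresponding SHUFPD operand
//  (V1 feeds the even result slots, V2 the odd ones) can be replaced by a
//  zero vector below.)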
18002
18003 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
18004 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
18005 ShuffleImm = 0;
18006 bool ShufpdMask = true;
18007 bool CommutableMask = true;
18008 for (int i = 0; i < NumElts; ++i) {
18009 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
18010 continue;
18011 if (Mask[i] < 0)
18012 return false;
18013 int Val = (i & 6) + NumElts * (i & 1);
18014 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
18015 if (Mask[i] < Val || Mask[i] > Val + 1)
18016 ShufpdMask = false;
18017 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
18018 CommutableMask = false;
18019 ShuffleImm |= (Mask[i] % 2) << i;
18020 }
18021
18022 if (!ShufpdMask && !CommutableMask)
18023 return false;
18024
18025 if (!ShufpdMask && CommutableMask)
18026 std::swap(V1, V2);
18027
18028 ForceV1Zero = ZeroLane[0];
18029 ForceV2Zero = ZeroLane[1];
18030 return true;
18031}
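The following is a minimal standalone sketch (plain integers and printf, not LLVM code) of the immediate math in matchShuffleWithSHUFPD above, assuming a v4f64 mask of {1, 5, 2, 7}:

#include <cstdio>

int main() {
  const int NumElts = 4;
  int Mask[NumElts] = {1, 5, 2, 7};            // result element i takes source Mask[i]
  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1);     // SHUFPD slot i may only read {Val, Val+1}
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    ShuffleImm |= (Mask[i] % 2) << i;          // bit i picks the low/high element of that pair
  }
  std::printf("ShufpdMask=%d ShuffleImm=0x%x\n", ShufpdMask, ShuffleImm); // prints 1 and 0xb
  return 0;
}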
18032
18033static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
18034 SDValue V2, ArrayRef<int> Mask,
18035 const APInt &Zeroable,
18036 const X86Subtarget &Subtarget,
18037 SelectionDAG &DAG) {
18038 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
18039        "Unexpected data type for VSHUFPD");
18040
18041 unsigned Immediate = 0;
18042 bool ForceV1Zero = false, ForceV2Zero = false;
18043 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
18044 Mask, Zeroable))
18045 return SDValue();
18046
18047 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
18048 if (ForceV1Zero)
18049 V1 = getZeroVector(VT, Subtarget, DAG, DL);
18050 if (ForceV2Zero)
18051 V2 = getZeroVector(VT, Subtarget, DAG, DL);
18052
18053 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
18054 DAG.getTargetConstant(Immediate, DL, MVT::i8));
18055}
18056
18057// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
18058// by zeroable elements in the remaining 24 elements. Turn this into two
18059// vmovqb instructions shuffled together.
18060static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
18061 SDValue V1, SDValue V2,
18062 ArrayRef<int> Mask,
18063 const APInt &Zeroable,
18064 SelectionDAG &DAG) {
18065 assert(VT == MVT::v32i8 && "Unexpected type!");
18066
18067 // The first 8 indices should be every 8th element.
18068 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18069 return SDValue();
18070
18071 // Remaining elements need to be zeroable.
18072 if (Zeroable.countl_one() < (Mask.size() - 8))
18073 return SDValue();
18074
18075 V1 = DAG.getBitcast(MVT::v4i64, V1);
18076 V2 = DAG.getBitcast(MVT::v4i64, V2);
18077
18078 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18079 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18080
18081 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18082 // the upper bits of the result using an unpckldq.
18083 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18084 { 0, 1, 2, 3, 16, 17, 18, 19,
18085 4, 5, 6, 7, 20, 21, 22, 23 });
18086 // Insert the unpckldq into a zero vector to widen to v32i8.
18087 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18088 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18089 DAG.getIntPtrConstant(0, DL));
18090}
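As a rough standalone model (plain index arrays, not LLVM code) of the byte flow described above, label V1's bytes 0..31 and V2's bytes 32..63 and run them through the truncate-then-interleave steps:

#include <array>
#include <cstdio>

int main() {
  std::array<int, 32> V1, V2;                  // V1 bytes labelled 0..31, V2 bytes 32..63
  for (int I = 0; I < 32; ++I) { V1[I] = I; V2[I] = 32 + I; }

  std::array<int, 16> T1, T2;                  // VTRUNC v4i64 -> v16i8: keep the low byte of
  for (int I = 0; I < 4; ++I) {                // each qword; the upper 12 bytes become zero
    T1[I] = V1[I * 8];                         // (zeros marked as -1 here)
    T2[I] = V2[I * 8];
  }
  for (int I = 4; I < 16; ++I) { T1[I] = -1; T2[I] = -1; }

  std::array<int, 16> Unpack;                  // unpckldq-style interleave of 4-byte chunks
  for (int I = 0; I < 4; ++I) {
    Unpack[I] = T1[I];
    Unpack[I + 4] = T2[I];
    Unpack[I + 8] = T1[I + 4];
    Unpack[I + 12] = T2[I + 4];
  }
  for (int I = 0; I < 16; ++I)
    std::printf("%d ", Unpack[I]);             // 0 8 16 24 32 40 48 56 -1 ... -1
  std::printf("\n");
  return 0;
}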
18091
18092// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
18093// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
18094// =>
18095// ul = unpckl v1, v2
18096// uh = unpckh v1, v2
18097// a = vperm ul, uh
18098// b = vperm ul, uh
18099//
18100// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18101// and permute. We cannot directly match v3 because it is split into two
18102// 256-bit vectors in earlier isel stages. Therefore, this function matches a
18103// pair of 256-bit shuffles and makes sure the masks are consecutive.
18104//
18105// Once unpck and permute nodes are created, the permute corresponding to this
18106// shuffle is returned, while the other permute replaces the other half of the
18107// shuffle in the selection dag.
18108static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18109 SDValue V1, SDValue V2,
18110 ArrayRef<int> Mask,
18111 SelectionDAG &DAG) {
18112 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18113 VT != MVT::v32i8)
18114 return SDValue();
18115 // <B0, B1, B0+1, B1+1, ..., >
18116 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18117 unsigned Begin1) {
18118 size_t Size = Mask.size();
18119 assert(Size % 2 == 0 && "Expected even mask size");
18120 for (unsigned I = 0; I < Size; I += 2) {
18121 if (Mask[I] != (int)(Begin0 + I / 2) ||
18122 Mask[I + 1] != (int)(Begin1 + I / 2))
18123 return false;
18124 }
18125 return true;
18126 };
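  // (Illustrative note, not in the original source: for v8f32 the "first half"
  //  pattern IsInterleavingPattern(Mask, 0, 8) matches <0, 8, 1, 9, 2, 10, 3, 11>
  //  and the "second half" pattern IsInterleavingPattern(Mask, 4, 12) matches
  //  <4, 12, 5, 13, 6, 14, 7, 15>.)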
18127 // Check which half is this shuffle node
18128 int NumElts = VT.getVectorNumElements();
18129 size_t FirstQtr = NumElts / 2;
18130 size_t ThirdQtr = NumElts + NumElts / 2;
18131 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18132 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18133 if (!IsFirstHalf && !IsSecondHalf)
18134 return SDValue();
18135
18136 // Find the intersection between shuffle users of V1 and V2.
18137 SmallVector<SDNode *, 2> Shuffles;
18138 for (SDNode *User : V1->uses())
18139 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18140 User->getOperand(1) == V2)
18141 Shuffles.push_back(User);
18142 // Limit user size to two for now.
18143 if (Shuffles.size() != 2)
18144 return SDValue();
18145 // Find out which half of the 512-bit shuffles is each smaller shuffle
18146 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18147 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18148 SDNode *FirstHalf;
18149 SDNode *SecondHalf;
18150 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18151 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18152 FirstHalf = Shuffles[0];
18153 SecondHalf = Shuffles[1];
18154 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18155 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18156 FirstHalf = Shuffles[1];
18157 SecondHalf = Shuffles[0];
18158 } else {
18159 return SDValue();
18160 }
18161 // Lower into unpck and perm. Return the perm of this shuffle and replace
18162 // the other.
18163 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18164 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18165 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18166 DAG.getTargetConstant(0x20, DL, MVT::i8));
18167 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18168 DAG.getTargetConstant(0x31, DL, MVT::i8));
18169 if (IsFirstHalf) {
18170 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18171 return Perm1;
18172 }
18173 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18174 return Perm2;
18175}
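A small standalone check (plain index arrays, not LLVM code) that the UNPCKL/UNPCKH plus VPERM2X128 immediates 0x20 and 0x31 used above reproduce the two interleaving masks for a v8f32-sized pair:

#include <array>
#include <cstdio>

int main() {
  // Label V1's elements 0..7 and V2's 8..15, matching shuffle-mask numbering.
  std::array<int, 8> Unpckl, Unpckh, Perm20, Perm31;
  for (int Lane = 0; Lane < 2; ++Lane)
    for (int I = 0; I < 2; ++I) {
      Unpckl[Lane * 4 + 2 * I]     = Lane * 4 + I;      // V1 element
      Unpckl[Lane * 4 + 2 * I + 1] = 8 + Lane * 4 + I;  // V2 element
      Unpckh[Lane * 4 + 2 * I]     = Lane * 4 + 2 + I;
      Unpckh[Lane * 4 + 2 * I + 1] = 8 + Lane * 4 + 2 + I;
    }
  for (int I = 0; I < 4; ++I) {
    Perm20[I] = Unpckl[I];                               // imm 0x20: low 128 of each unpck
    Perm20[I + 4] = Unpckh[I];
    Perm31[I] = Unpckl[I + 4];                           // imm 0x31: high 128 of each unpck
    Perm31[I + 4] = Unpckh[I + 4];
  }
  for (int I = 0; I < 8; ++I) std::printf("%d ", Perm20[I]); // 0 8 1 9 2 10 3 11
  std::printf("\n");
  for (int I = 0; I < 8; ++I) std::printf("%d ", Perm31[I]); // 4 12 5 13 6 14 7 15
  std::printf("\n");
  return 0;
}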
18176
18177/// Handle lowering of 4-lane 64-bit floating point shuffles.
18178///
18179/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18180/// isn't available.
18181static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18182 const APInt &Zeroable, SDValue V1, SDValue V2,
18183 const X86Subtarget &Subtarget,
18184 SelectionDAG &DAG) {
18185 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18186 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18187 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18188
18189 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18190 Subtarget, DAG))
18191 return V;
18192
18193 if (V2.isUndef()) {
18194 // Check for being able to broadcast a single element.
18195 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18196 Mask, Subtarget, DAG))
18197 return Broadcast;
18198
18199 // Use low duplicate instructions for masks that match their pattern.
18200 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18201 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18202
18203 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18204 // Non-half-crossing single input shuffles can be lowered with an
18205 // interleaved permutation.
18206 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18207 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18208 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18209 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18210 }
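      // (Illustrative note, not in the original source: for the in-lane swap
      //  mask {1, 0, 3, 2}, the VPERMILPMask expression above evaluates to
      //  1 | 0 | 4 | 0, i.e. a VPERMILPD immediate of 0b0101.)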
18211
18212 // With AVX2 we have direct support for this permutation.
18213 if (Subtarget.hasAVX2())
18214 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18215 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18216
18217 // Try to create an in-lane repeating shuffle mask and then shuffle the
18218 // results into the target lanes.
18219 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18220 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18221 return V;
18222
18223 // Try to permute the lanes and then use a per-lane permute.
18224 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18225 Mask, DAG, Subtarget))
18226 return V;
18227
18228 // Otherwise, fall back.
18229 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18230 DAG, Subtarget);
18231 }
18232
18233 // Use dedicated unpack instructions for masks that match their pattern.
18234 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18235 return V;
18236
18237 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18238 Zeroable, Subtarget, DAG))
18239 return Blend;
18240
18241 // Check if the blend happens to exactly fit that of SHUFPD.
18242 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18243 Zeroable, Subtarget, DAG))
18244 return Op;
18245
18246 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18247 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18248
18249 // If we have lane crossing shuffles AND they don't all come from the lower
18250 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18251 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18252 // canonicalize to a blend of splat which isn't necessary for this combine.
18253 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18254 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18255 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18256 (V2.getOpcode() != ISD::BUILD_VECTOR))
18257 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18258
18259 // If we have one input in place, then we can permute the other input and
18260 // blend the result.
18261 if (V1IsInPlace || V2IsInPlace)
18262 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18263 Subtarget, DAG);
18264
18265 // Try to create an in-lane repeating shuffle mask and then shuffle the
18266 // results into the target lanes.
18267 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18268 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18269 return V;
18270
18271 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18272 // shuffle. However, if we have AVX2 and either inputs are already in place,
18273 // we will be able to shuffle even across lanes the other input in a single
18274 // instruction so skip this pattern.
18275 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18276 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18277 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18278 return V;
18279
18280 // If we have VLX support, we can use VEXPAND.
18281 if (Subtarget.hasVLX())
18282 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18283 DAG, Subtarget))
18284 return V;
18285
18286 // If we have AVX2 then we always want to lower with a blend because at v4 we
18287 // can fully permute the elements.
18288 if (Subtarget.hasAVX2())
18289 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18290 Subtarget, DAG);
18291
18292 // Otherwise fall back on generic lowering.
18293 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18294 Subtarget, DAG);
18295}
18296
18297/// Handle lowering of 4-lane 64-bit integer shuffles.
18298///
18299/// This routine is only called when we have AVX2 and thus a reasonable
18300/// instruction set for v4i64 shuffling.
18301static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18302 const APInt &Zeroable, SDValue V1, SDValue V2,
18303 const X86Subtarget &Subtarget,
18304 SelectionDAG &DAG) {
18305 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18306 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18307 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18308 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18309
18310 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18311 Subtarget, DAG))
18312 return V;
18313
18314 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18315 Zeroable, Subtarget, DAG))
18316 return Blend;
18317
18318 // Check for being able to broadcast a single element.
18319 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18320 Subtarget, DAG))
18321 return Broadcast;
18322
18323 // Try to use shift instructions if fast.
18324 if (Subtarget.preferLowerShuffleAsShift())
18325 if (SDValue Shift =
18326 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18327 Subtarget, DAG, /*BitwiseOnly*/ true))
18328 return Shift;
18329
18330 if (V2.isUndef()) {
18331 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18332 // can use lower latency instructions that will operate on both lanes.
18333 SmallVector<int, 2> RepeatedMask;
18334 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18335 SmallVector<int, 4> PSHUFDMask;
18336 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18337 return DAG.getBitcast(
18338 MVT::v4i64,
18339 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18340 DAG.getBitcast(MVT::v8i32, V1),
18341 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18342 }
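    // (Illustrative note, not in the original source: for Mask = {1, 0, 3, 2}
    //  the per-128-bit-lane repeated mask is {1, 0}, which narrowShuffleMaskElts
    //  expands to the 32-bit-element PSHUFD mask {2, 3, 0, 1}, i.e. the
    //  immediate 0x4E.)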
18343
18344 // AVX2 provides a direct instruction for permuting a single input across
18345 // lanes.
18346 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18347 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18348 }
18349
18350 // Try to use shift instructions.
18351 if (SDValue Shift =
18352 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
18353 DAG, /*BitwiseOnly*/ false))
18354 return Shift;
18355
18356 // If we have VLX support, we can use VALIGN or VEXPAND.
18357 if (Subtarget.hasVLX()) {
18358 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18359 Subtarget, DAG))
18360 return Rotate;
18361
18362 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18363 DAG, Subtarget))
18364 return V;
18365 }
18366
18367 // Try to use PALIGNR.
18368 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18369 Subtarget, DAG))
18370 return Rotate;
18371
18372 // Use dedicated unpack instructions for masks that match their pattern.
18373 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18374 return V;
18375
18376 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18377 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18378
18379 // If we have one input in place, then we can permute the other input and
18380 // blend the result.
18381 if (V1IsInPlace || V2IsInPlace)
18382 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18383 Subtarget, DAG);
18384
18385 // Try to create an in-lane repeating shuffle mask and then shuffle the
18386 // results into the target lanes.
18387 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18388 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18389 return V;
18390
18391 // Try to lower to PERMQ(BLENDD(V1,V2)).
18392 if (SDValue V =
18393 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18394 return V;
18395
18396 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18397 // shuffle. However, if we have AVX2 and either inputs are already in place,
18398 // we will be able to shuffle even across lanes the other input in a single
18399 // instruction so skip this pattern.
18400 if (!V1IsInPlace && !V2IsInPlace)
18401 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18402 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18403 return Result;
18404
18405 // Otherwise fall back on generic blend lowering.
18406 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18407 Subtarget, DAG);
18408}
18409
18410/// Handle lowering of 8-lane 32-bit floating point shuffles.
18411///
18412/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18413/// isn't available.
18414static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18415 const APInt &Zeroable, SDValue V1, SDValue V2,
18416 const X86Subtarget &Subtarget,
18417 SelectionDAG &DAG) {
18418 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
1: '?' condition is true
18419 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
2: '?' condition is true
18420 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
3: Assuming the condition is true
4: '?' condition is true
18421
18422 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
5: Taking false branch
18423 Zeroable, Subtarget, DAG))
18424 return Blend;
18425
18426 // Check for being able to broadcast a single element.
18427 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
6: Taking false branch
18428 Subtarget, DAG))
18429 return Broadcast;
18430
18431 if (!Subtarget.hasAVX2()) {
7: Taking true branch
18432 SmallVector<int> InLaneMask;
18433 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
18434
18435 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
8: Assuming the condition is false
9: Taking false branch
18436 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
18437 /*SimpleOnly*/ true))
18438 return R;
18439 }
18440 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
10: Taking false branch
18441 Zeroable, Subtarget, DAG))
18442 return DAG.getBitcast(MVT::v8f32, ZExt);
18443
18444 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18445 // options to efficiently lower the shuffle.
18446 SmallVector<int, 4> RepeatedMask;
18447 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
11: Assuming the condition is false
12: Taking false branch
18448 assert(RepeatedMask.size() == 4 &&
18449        "Repeated masks must be half the mask width!");
18450
18451 // Use even/odd duplicate instructions for masks that match their pattern.
18452 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18453 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18454 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18455 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
18456
18457 if (V2.isUndef())
18458 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18459 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18460
18461 // Use dedicated unpack instructions for masks that match their pattern.
18462 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18463 return V;
18464
18465 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18466 // have already handled any direct blends.
18467 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18468 }
18469
18470 // Try to create an in-lane repeating shuffle mask and then shuffle the
18471 // results into the target lanes.
18472 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13: Taking false branch
18473 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18474 return V;
18475
18476 // If we have a single input shuffle with different shuffle patterns in the
18477 // two 128-bit lanes use the variable mask to VPERMILPS.
18478 if (V2.isUndef()) {
14: Taking false branch
18479 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18480 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18481 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18482 }
18483 if (Subtarget.hasAVX2()) {
18484 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18485 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18486 }
18487 // Otherwise, fall back.
18488 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18489 DAG, Subtarget);
18490 }
18491
18492 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18493 // shuffle.
18494 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15: Taking false branch
18495 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18496 return Result;
18497
18498 // If we have VLX support, we can use VEXPAND.
18499 if (Subtarget.hasVLX())
16: Assuming the condition is false
18500 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18501 DAG, Subtarget))
18502 return V;
18503
18504 // Try to match an interleave of two v8f32s and lower them as unpck and
18505 // permutes using ymms. This needs to go before we try to split the vectors.
18506 //
18507 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18508 // this path inadvertently.
18509 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18510 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18511 Mask, DAG))
18512 return V;
18513
18514 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
18515 // since after split we get a more efficient code using vpunpcklwd and
18516 // vpunpckhwd instrs than vblend.
18517 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
17: Assuming the condition is true
18: Taking true branch
18518 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
19: Calling 'lowerShuffleAsSplitOrBlend'
18519 DAG);
18520
18521 // If we have AVX2 then we always want to lower with a blend because at v8 we
18522 // can fully permute the elements.
18523 if (Subtarget.hasAVX2())
18524 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18525 Subtarget, DAG);
18526
18527 // Otherwise fall back on generic lowering.
18528 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18529 Subtarget, DAG);
18530}
18531
18532/// Handle lowering of 8-lane 32-bit integer shuffles.
18533///
18534/// This routine is only called when we have AVX2 and thus a reasonable
18535/// instruction set for v8i32 shuffling.
18536static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18537 const APInt &Zeroable, SDValue V1, SDValue V2,
18538 const X86Subtarget &Subtarget,
18539 SelectionDAG &DAG) {
18540 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18541 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18542 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18543 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18544
18545 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
18546
18547 // Whenever we can lower this as a zext, that instruction is strictly faster
18548 // than any alternative. It also allows us to fold memory operands into the
18549 // shuffle in many cases.
18550 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18551 Zeroable, Subtarget, DAG))
18552 return ZExt;
18553
18554 // Try to match an interleave of two v8i32s and lower them as unpck and
18555 // permutes using ymms. This needs to go before we try to split the vectors.
18556 if (!Subtarget.hasAVX512())
18557 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18558 Mask, DAG))
18559 return V;
18560
18561 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
18562 // since after split we get a more efficient code than vblend by using
18563 // vpunpcklwd and vpunpckhwd instrs.
18564 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18565 !Subtarget.hasAVX512())
18566 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18567 DAG);
18568
18569 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18570 Zeroable, Subtarget, DAG))
18571 return Blend;
18572
18573 // Check for being able to broadcast a single element.
18574 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18575 Subtarget, DAG))
18576 return Broadcast;
18577
18578 // Try to use shift instructions if fast.
18579 if (Subtarget.preferLowerShuffleAsShift()) {
18580 if (SDValue Shift =
18581 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
18582 Subtarget, DAG, /*BitwiseOnly*/ true))
18583 return Shift;
18584 if (NumV2Elements == 0)
18585 if (SDValue Rotate =
18586 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18587 return Rotate;
18588 }
18589
18590 // If the shuffle mask is repeated in each 128-bit lane we can use more
18591 // efficient instructions that mirror the shuffles across the two 128-bit
18592 // lanes.
18593 SmallVector<int, 4> RepeatedMask;
18594 bool Is128BitLaneRepeatedShuffle =
18595 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18596 if (Is128BitLaneRepeatedShuffle) {
18597 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18598 if (V2.isUndef())
18599 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18600 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18601
18602 // Use dedicated unpack instructions for masks that match their pattern.
18603 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18604 return V;
18605 }
18606
18607 // Try to use shift instructions.
18608 if (SDValue Shift =
18609 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
18610 DAG, /*BitwiseOnly*/ false))
18611 return Shift;
18612
18613 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18614 if (SDValue Rotate =
18615 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18616 return Rotate;
18617
18618 // If we have VLX support, we can use VALIGN or EXPAND.
18619 if (Subtarget.hasVLX()) {
18620 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18621 Subtarget, DAG))
18622 return Rotate;
18623
18624 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18625 DAG, Subtarget))
18626 return V;
18627 }
18628
18629 // Try to use byte rotation instructions.
18630 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18631 Subtarget, DAG))
18632 return Rotate;
18633
18634 // Try to create an in-lane repeating shuffle mask and then shuffle the
18635 // results into the target lanes.
18636 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18637 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18638 return V;
18639
18640 if (V2.isUndef()) {
18641 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18642 // because that should be faster than the variable permute alternatives.
18643 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18644 return V;
18645
18646 // If the shuffle patterns aren't repeated but it's a single input, directly
18647 // generate a cross-lane VPERMD instruction.
18648 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18649 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18650 }
18651
18652 // Assume that a single SHUFPS is faster than an alternative sequence of
18653 // multiple instructions (even if the CPU has a domain penalty).
18654 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18655 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18656 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18657 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18658 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18659 CastV1, CastV2, DAG);
18660 return DAG.getBitcast(MVT::v8i32, ShufPS);
18661 }
18662
18663 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18664 // shuffle.
18665 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18666 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18667 return Result;
18668
18669 // Otherwise fall back on generic blend lowering.
18670 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18671 Subtarget, DAG);
18672}
18673
18674/// Handle lowering of 16-lane 16-bit integer shuffles.
18675///
18676/// This routine is only called when we have AVX2 and thus a reasonable
18677/// instruction set for v16i16 shuffling.
18678static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18679 const APInt &Zeroable, SDValue V1, SDValue V2,
18680 const X86Subtarget &Subtarget,
18681 SelectionDAG &DAG) {
18682 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18683 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18684 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18685 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18686
18687 // Whenever we can lower this as a zext, that instruction is strictly faster
18688 // than any alternative. It also allows us to fold memory operands into the
18689 // shuffle in many cases.
18690 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18691 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18692 return ZExt;
18693
18694 // Check for being able to broadcast a single element.
18695 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18696 Subtarget, DAG))
18697 return Broadcast;
18698
18699 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18700 Zeroable, Subtarget, DAG))
18701 return Blend;
18702
18703 // Use dedicated unpack instructions for masks that match their pattern.
18704 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18705 return V;
18706
18707 // Use dedicated pack instructions for masks that match their pattern.
18708 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18709 Subtarget))
18710 return V;
18711
18712 // Try to lower using a truncation.
18713 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18714 Subtarget, DAG))
18715 return V;
18716
18717 // Try to use shift instructions.
18718 if (SDValue Shift =
18719 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18720 Subtarget, DAG, /*BitwiseOnly*/ false))
18721 return Shift;
18722
18723 // Try to use byte rotation instructions.
18724 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18725 Subtarget, DAG))
18726 return Rotate;
18727
18728 // Try to create an in-lane repeating shuffle mask and then shuffle the
18729 // results into the target lanes.
18730 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18731 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18732 return V;
18733
18734 if (V2.isUndef()) {
18735 // Try to use bit rotation instructions.
18736 if (SDValue Rotate =
18737 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18738 return Rotate;
18739
18740 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18741 // because that should be faster than the variable permute alternatives.
18742 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18743 return V;
18744
18745 // There are no generalized cross-lane shuffle operations available on i16
18746 // element types.
18747 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18748 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18749 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18750 return V;
18751
18752 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18753 DAG, Subtarget);
18754 }
18755
18756 SmallVector<int, 8> RepeatedMask;
18757 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18758 // As this is a single-input shuffle, the repeated mask should be
18759 // a strictly valid v8i16 mask that we can pass through to the v8i16
18760 // lowering to handle even the v16 case.
18761 return lowerV8I16GeneralSingleInputShuffle(
18762 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18763 }
18764 }
18765
18766 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18767 Zeroable, Subtarget, DAG))
18768 return PSHUFB;
18769
18770 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18771 if (Subtarget.hasBWI())
18772 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18773
18774 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18775 // shuffle.
18776 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18777 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18778 return Result;
18779
18780 // Try to permute the lanes and then use a per-lane permute.
18781 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18782 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18783 return V;
18784
18785 // Try to match an interleave of two v16i16s and lower them as unpck and
18786 // permutes using ymms.
18787 if (!Subtarget.hasAVX512())
18788 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18789 Mask, DAG))
18790 return V;
18791
18792 // Otherwise fall back on generic lowering.
18793 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18794 Subtarget, DAG);
18795}
18796
18797/// Handle lowering of 32-lane 8-bit integer shuffles.
18798///
18799/// This routine is only called when we have AVX2 and thus a reasonable
18800/// instruction set for v32i8 shuffling.
18801static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18802 const APInt &Zeroable, SDValue V1, SDValue V2,
18803 const X86Subtarget &Subtarget,
18804 SelectionDAG &DAG) {
18805 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18806 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18807 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18808 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18809
18810 // Whenever we can lower this as a zext, that instruction is strictly faster
18811 // than any alternative. It also allows us to fold memory operands into the
18812 // shuffle in many cases.
18813 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18814 Zeroable, Subtarget, DAG))
18815 return ZExt;
18816
18817 // Check for being able to broadcast a single element.
18818 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18819 Subtarget, DAG))
18820 return Broadcast;
18821
18822 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18823 Zeroable, Subtarget, DAG))
18824 return Blend;
18825
18826 // Use dedicated unpack instructions for masks that match their pattern.
18827 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18828 return V;
18829
18830 // Use dedicated pack instructions for masks that match their pattern.
18831 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18832 Subtarget))
18833 return V;
18834
18835 // Try to lower using a truncation.
18836 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18837 Subtarget, DAG))
18838 return V;
18839
18840 // Try to use shift instructions.
18841 if (SDValue Shift =
18842 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
18843 DAG, /*BitwiseOnly*/ false))
18844 return Shift;
18845
18846 // Try to use byte rotation instructions.
18847 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18848 Subtarget, DAG))
18849 return Rotate;
18850
18851 // Try to use bit rotation instructions.
18852 if (V2.isUndef())
18853 if (SDValue Rotate =
18854 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18855 return Rotate;
18856
18857 // Try to create an in-lane repeating shuffle mask and then shuffle the
18858 // results into the target lanes.
18859 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18860 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18861 return V;
18862
18863 // There are no generalized cross-lane shuffle operations available on i8
18864 // element types.
18865 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18866 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18867 // because that should be faster than the variable permute alternatives.
18868 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18869 return V;
18870
18871 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18872 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18873 return V;
18874
18875 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18876 DAG, Subtarget);
18877 }
18878
18879 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18880 Zeroable, Subtarget, DAG))
18881 return PSHUFB;
18882
18883 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18884 if (Subtarget.hasVBMI())
18885 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18886
18887 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18888 // shuffle.
18889 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18890 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18891 return Result;
18892
18893 // Try to permute the lanes and then use a per-lane permute.
18894 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18895 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18896 return V;
18897
18898 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
18899 // by zeroable elements in the remaining 24 elements. Turn this into two
18900 // vmovqb instructions shuffled together.
18901 if (Subtarget.hasVLX())
18902 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18903 Mask, Zeroable, DAG))
18904 return V;
18905
18906 // Try to match an interleave of two v32i8s and lower them as unpck and
18907 // permutes using ymms.
18908 if (!Subtarget.hasAVX512())
18909 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
18910 Mask, DAG))
18911 return V;
18912
18913 // Otherwise fall back on generic lowering.
18914 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18915 Subtarget, DAG);
18916}
18917
18918/// High-level routine to lower various 256-bit x86 vector shuffles.
18919///
18920/// This routine either breaks down the specific type of a 256-bit x86 vector
18921/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18922/// together based on the available instructions.
18923static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18924 SDValue V1, SDValue V2, const APInt &Zeroable,
18925 const X86Subtarget &Subtarget,
18926 SelectionDAG &DAG) {
18927 // If we have a single input to the zero element, insert that into V1 if we
18928 // can do so cheaply.
18929 int NumElts = VT.getVectorNumElements();
18930 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18931
18932 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18933 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18934 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18935 return Insertion;
18936
18937 // Handle special cases where the lower or upper half is UNDEF.
18938 if (SDValue V =
18939 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18940 return V;
18941
18942 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18943 // can check for those subtargets here and avoid much of the subtarget
18944 // querying in the per-vector-type lowering routines. With AVX1 we have
18945 // essentially *zero* ability to manipulate a 256-bit vector with integer
18946 // types. Since we'll use floating point types there eventually, just
18947 // immediately cast everything to a float and operate entirely in that domain.
18948 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18949 int ElementBits = VT.getScalarSizeInBits();
18950 if (ElementBits < 32) {
18951 // No floating point type available, if we can't use the bit operations
18952 // for masking/blending then decompose into 128-bit vectors.
18953 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18954 Subtarget, DAG))
18955 return V;
18956 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18957 return V;
18958 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18959 }
18960
18961 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18962 VT.getVectorNumElements());
18963 V1 = DAG.getBitcast(FpVT, V1);
18964 V2 = DAG.getBitcast(FpVT, V2);
18965 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18966 }
18967
18968 if (VT == MVT::v16f16) {
18969 V1 = DAG.getBitcast(MVT::v16i16, V1);
18970 V2 = DAG.getBitcast(MVT::v16i16, V2);
18971 return DAG.getBitcast(MVT::v16f16,
18972 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18973 }
18974
18975 switch (VT.SimpleTy) {
18976 case MVT::v4f64:
18977 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18978 case MVT::v4i64:
18979 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18980 case MVT::v8f32:
18981 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18982 case MVT::v8i32:
18983 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18984 case MVT::v16i16:
18985 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18986 case MVT::v32i8:
18987 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18988
18989 default:
18990 llvm_unreachable("Not a valid 256-bit x86 vector type!");
18991 }
18992}
18993
18994/// Try to lower a vector shuffle as a 128-bit shuffles.
18995static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18996 const APInt &Zeroable, SDValue V1, SDValue V2,
18997 const X86Subtarget &Subtarget,
18998 SelectionDAG &DAG) {
18999 assert(VT.getScalarSizeInBits() == 64 &&
19000        "Unexpected element type size for 128bit shuffle.");
19001
19002 // Handling a 256-bit vector here would require VLX, and most probably
19003 // lowerV2X128VectorShuffle() is the better solution for that case.
19004 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
19005
19006 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
19007 SmallVector<int, 4> Widened128Mask;
19008 if (!canWidenShuffleElements(Mask, Widened128Mask))
19009 return SDValue();
19010 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
19011
19012 // Try to use an insert into a zero vector.
19013 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
19014 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
19015 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
19016 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
19017 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
19018 DAG.getIntPtrConstant(0, DL));
19019 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19020 getZeroVector(VT, Subtarget, DAG, DL), LoV,
19021 DAG.getIntPtrConstant(0, DL));
19022 }
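  // (Illustrative note, not in the original source: for a v8i64 shuffle, a mask
  //  like <0, 1, Z, Z, Z, Z, Z, Z> takes this path with NumElts == 2 and keeps
  //  only the low 128 bits of V1, while <0, 1, 2, 3, Z, Z, Z, Z> uses
  //  NumElts == 4 and keeps the low 256 bits; everything above that is zeroed.)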
19023
19024 // Check for patterns which can be matched with a single insert of a 256-bit
19025 // subvector.
19026 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
19027 if (OnlyUsesV1 ||
19028 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
19029 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
19030 SDValue SubVec =
19031 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
19032 DAG.getIntPtrConstant(0, DL));
19033 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
19034 DAG.getIntPtrConstant(4, DL));
19035 }
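  // (Illustrative note, not in the original source: <0, 1, 2, 3, 8, 9, 10, 11>
  //  concatenates the low 256-bit halves of V1 and V2, so inserting V2's low
  //  half at element 4 of V1 is sufficient; <0, 1, 2, 3, 0, 1, 2, 3> instead
  //  duplicates V1's low half into both halves.)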
19036
19037 // See if this is an insertion of the lower 128-bits of V2 into V1.
19038 bool IsInsert = true;
19039 int V2Index = -1;
19040 for (int i = 0; i < 4; ++i) {
19041 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19042 if (Widened128Mask[i] < 0)
19043 continue;
19044
19045 // Make sure all V1 subvectors are in place.
19046 if (Widened128Mask[i] < 4) {
19047 if (Widened128Mask[i] != i) {
19048 IsInsert = false;
19049 break;
19050 }
19051 } else {
19052 // Make sure we only have a single V2 index and its the lowest 128-bits.
19053 if (V2Index >= 0 || Widened128Mask[i] != 4) {
19054 IsInsert = false;
19055 break;
19056 }
19057 V2Index = i;
19058 }
19059 }
19060 if (IsInsert && V2Index >= 0) {
19061 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
19062 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
19063 DAG.getIntPtrConstant(0, DL));
19064 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
19065 }
19066
19067 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
19068 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
19069 // possible we at least ensure the lanes stay sequential to help later
19070 // combines.
19071 SmallVector<int, 2> Widened256Mask;
19072 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
19073 Widened128Mask.clear();
19074 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
19075 }
19076
19077 // Try to lower to vshuf64x2/vshuf32x4.
19078 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
19079 unsigned PermMask = 0;
19080 // Ensure elements came from the same Op.
19081 for (int i = 0; i < 4; ++i) {
19082 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19083 if (Widened128Mask[i] < 0)
19084 continue;
19085
19086 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
19087 unsigned OpIndex = i / 2;
19088 if (Ops[OpIndex].isUndef())
19089 Ops[OpIndex] = Op;
19090 else if (Ops[OpIndex] != Op)
19091 return SDValue();
19092
19093 // Convert the 128-bit shuffle mask selection values into 128-bit selection
19094 // bits defined by a vshuf64x2 instruction's immediate control byte.
19095 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
19096 }
19097
19098 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
19099 DAG.getTargetConstant(PermMask, DL, MVT::i8));
19100}
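Editor's note: the PermMask loop at lines 19079-19095 packs each selected 128-bit lane into a two-bit field of the vshuf64x2/vshuf32x4 control byte. The standalone sketch below is an editorial illustration only (buildShuf128Imm is a hypothetical helper, not part of LLVM) that reproduces the same computation for a concrete widened mask; undef lanes are skipped before the modulo, just as in the code above.

#include <array>
#include <cassert>
#include <cstdio>

// Hypothetical standalone re-creation of the PermMask construction above:
// 128-bit lane i contributes (Widened128Mask[i] % 4) at bit position i * 2,
// and undef lanes (-1) leave their two-bit field as zero.
static unsigned buildShuf128Imm(const std::array<int, 4> &Widened128Mask) {
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i) {
    if (Widened128Mask[i] < 0)
      continue; // undef lane
    assert(Widened128Mask[i] < 8 && "lane index out of range for two sources");
    PermMask |= (Widened128Mask[i] % 4) << (i * 2);
  }
  return PermMask;
}

int main() {
  // Low two lanes from V1 (indices 0,1), high two lanes from V2 (indices 4,5).
  std::array<int, 4> Mask = {0, 1, 4, 5};
  std::printf("imm = 0x%02x\n", buildShuf128Imm(Mask)); // prints imm = 0x44
  return 0;
}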
19101
19102/// Handle lowering of 8-lane 64-bit floating point shuffles.
19103static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19104 const APInt &Zeroable, SDValue V1, SDValue V2,
19105 const X86Subtarget &Subtarget,
19106 SelectionDAG &DAG) {
19107 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19108 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19109 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19110
19111 if (V2.isUndef()) {
19112 // Use low duplicate instructions for masks that match their pattern.
19113 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19114 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19115
19116 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19117 // Non-half-crossing single input shuffles can be lowered with an
19118 // interleaved permutation.
19119 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19120 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19121 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19122 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19123 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19124 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19125 }
19126
19127 SmallVector<int, 4> RepeatedMask;
19128 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19129 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19130 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19131 }
19132
19133 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19134 V2, Subtarget, DAG))
19135 return Shuf128;
19136
19137 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19138 return Unpck;
19139
19140 // Check if the blend happens to exactly fit that of SHUFPD.
19141 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19142 Zeroable, Subtarget, DAG))
19143 return Op;
19144
19145 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19146 DAG, Subtarget))
19147 return V;
19148
19149 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19150 Zeroable, Subtarget, DAG))
19151 return Blend;
19152
19153 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19154}
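Editor's note: the VPERMILPD immediate built at lines 19119-19122 sets bit i exactly when Mask[i] selects the odd element of its 128-bit pair, i.e. index (i | 1). The sketch below is an editorial illustration (buildVPermilPDImm is a hypothetical name, and undef handling is simplified), not LLVM code.

#include <array>
#include <cstdio>

// For a non-lane-crossing single-input v8f64 shuffle, bit i of the VPERMILPD
// immediate is set when element i picks the odd member of its 64-bit pair.
static unsigned buildVPermilPDImm(const std::array<int, 8> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 8; ++i)
    Imm |= (Mask[i] == (i | 1) ? 1u : 0u) << i;
  return Imm;
}

int main() {
  // Swap the two elements of every 128-bit pair: {1, 0, 3, 2, 5, 4, 7, 6}.
  std::array<int, 8> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  std::printf("imm = 0x%02x\n", buildVPermilPDImm(Mask)); // prints imm = 0x55
  return 0;
}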
19155
19156/// Handle lowering of 16-lane 32-bit floating point shuffles.
19157static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19158 const APInt &Zeroable, SDValue V1, SDValue V2,
19159 const X86Subtarget &Subtarget,
19160 SelectionDAG &DAG) {
19161 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19162 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19163 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19164
19165 // If the shuffle mask is repeated in each 128-bit lane, we have many more
19166 // options to efficiently lower the shuffle.
19167 SmallVector<int, 4> RepeatedMask;
19168 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19169 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19170
19171 // Use even/odd duplicate instructions for masks that match their pattern.
19172 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19173 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19174 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19175 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19176
19177 if (V2.isUndef())
19178 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19179 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19180
19181 // Use dedicated unpack instructions for masks that match their pattern.
19182 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19183 return V;
19184
19185 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19186 Zeroable, Subtarget, DAG))
19187 return Blend;
19188
19189 // Otherwise, fall back to a SHUFPS sequence.
19190 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19191 }
19192
19193 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19194 Zeroable, Subtarget, DAG))
19195 return Blend;
19196
19197 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19198 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19199 return DAG.getBitcast(MVT::v16f32, ZExt);
19200
19201 // Try to create an in-lane repeating shuffle mask and then shuffle the
19202 // results into the target lanes.
19203 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19204 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19205 return V;
19206
19207 // If we have a single input shuffle with different shuffle patterns in the
19208 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
19209 if (V2.isUndef() &&
19210 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19211 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19212 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19213 }
19214
19215 // If we have AVX512F support, we can use VEXPAND.
19216 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19217 V1, V2, DAG, Subtarget))
19218 return V;
19219
19220 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19221}
19222
19223/// Handle lowering of 8-lane 64-bit integer shuffles.
19224static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19225 const APInt &Zeroable, SDValue V1, SDValue V2,
19226 const X86Subtarget &Subtarget,
19227 SelectionDAG &DAG) {
19228 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19229 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19230 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19231
19232 // Try to use shift instructions if fast.
19233 if (Subtarget.preferLowerShuffleAsShift())
19234 if (SDValue Shift =
19235 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
19236 Subtarget, DAG, /*BitwiseOnly*/ true))
19237 return Shift;
19238
19239 if (V2.isUndef()) {
19240 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19241 // can use lower latency instructions that will operate on all four
19242 // 128-bit lanes.
19243 SmallVector<int, 2> Repeated128Mask;
19244 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19245 SmallVector<int, 4> PSHUFDMask;
19246 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19247 return DAG.getBitcast(
19248 MVT::v8i64,
19249 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19250 DAG.getBitcast(MVT::v16i32, V1),
19251 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19252 }
19253
19254 SmallVector<int, 4> Repeated256Mask;
19255 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19256 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19257 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19258 }
19259
19260 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19261 V2, Subtarget, DAG))
19262 return Shuf128;
19263
19264 // Try to use shift instructions.
19265 if (SDValue Shift =
19266 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
19267 DAG, /*BitwiseOnly*/ false))
19268 return Shift;
19269
19270 // Try to use VALIGN.
19271 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19272 Subtarget, DAG))
19273 return Rotate;
19274
19275 // Try to use PALIGNR.
19276 if (Subtarget.hasBWI())
19277 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19278 Subtarget, DAG))
19279 return Rotate;
19280
19281 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19282 return Unpck;
19283
19284 // If we have AVX512F support, we can use VEXPAND.
19285 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19286 DAG, Subtarget))
19287 return V;
19288
19289 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19290 Zeroable, Subtarget, DAG))
19291 return Blend;
19292
19293 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19294}
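Editor's note: the PSHUFD path at lines 19244-19251 relies on narrowShuffleMaskElts(2, ...) to rewrite a 64-bit-element mask as a 32-bit-element mask. The helper's body is not shown in this listing, so the sketch below only encodes the editor's reading of the call site: each wide index j expands to the pair {2j, 2j+1}, and undef stays undef.

#include <cstdio>
#include <vector>

// Assumed behaviour of narrowShuffleMaskElts with a scale of 2 (editorial
// sketch, not the LLVM implementation).
static std::vector<int> narrowMaskBy2(const std::vector<int> &WideMask) {
  std::vector<int> Narrow;
  for (int M : WideMask) {
    if (M < 0) {
      Narrow.push_back(-1); // undef expands to undef
      Narrow.push_back(-1);
    } else {
      Narrow.push_back(2 * M);
      Narrow.push_back(2 * M + 1);
    }
  }
  return Narrow;
}

int main() {
  // A per-128-bit-lane v2i64 swap {1, 0} becomes the v4i32 mask {2, 3, 0, 1},
  // which getV4X86ShuffleImm8ForMask can then encode as a PSHUFD immediate.
  std::vector<int> Repeated128Mask = {1, 0};
  for (int M : narrowMaskBy2(Repeated128Mask))
    std::printf("%d ", M); // prints: 2 3 0 1
  std::printf("\n");
  return 0;
}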
19295
19296/// Handle lowering of 16-lane 32-bit integer shuffles.
19297static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19298 const APInt &Zeroable, SDValue V1, SDValue V2,
19299 const X86Subtarget &Subtarget,
19300 SelectionDAG &DAG) {
19301 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19302 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19303 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19304
19305 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
19306
19307 // Whenever we can lower this as a zext, that instruction is strictly faster
19308 // than any alternative. It also allows us to fold memory operands into the
19309 // shuffle in many cases.
19310 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19311 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19312 return ZExt;
19313
19314 // Try to use shift instructions if fast.
19315 if (Subtarget.preferLowerShuffleAsShift()) {
19316 if (SDValue Shift =
19317 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19318 Subtarget, DAG, /*BitwiseOnly*/ true))
19319 return Shift;
19320 if (NumV2Elements == 0)
19321 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
19322 Subtarget, DAG))
19323 return Rotate;
19324 }
19325
19326 // If the shuffle mask is repeated in each 128-bit lane we can use more
19327 // efficient instructions that mirror the shuffles across the four 128-bit
19328 // lanes.
19329 SmallVector<int, 4> RepeatedMask;
19330 bool Is128BitLaneRepeatedShuffle =
19331 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19332 if (Is128BitLaneRepeatedShuffle) {
19333 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19334 if (V2.isUndef())
19335 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19336 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19337
19338 // Use dedicated unpack instructions for masks that match their pattern.
19339 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19340 return V;
19341 }
19342
19343 // Try to use shift instructions.
19344 if (SDValue Shift =
19345 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19346 Subtarget, DAG, /*BitwiseOnly*/ false))
19347 return Shift;
19348
19349 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
19350 if (SDValue Rotate =
19351 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
19352 return Rotate;
19353
19354 // Try to use VALIGN.
19355 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19356 Subtarget, DAG))
19357 return Rotate;
19358
19359 // Try to use byte rotation instructions.
19360 if (Subtarget.hasBWI())
19361 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19362 Subtarget, DAG))
19363 return Rotate;
19364
19365 // Assume that a single SHUFPS is faster than using a permv shuffle.
19366 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19367 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19368 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19369 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19370 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19371 CastV1, CastV2, DAG);
19372 return DAG.getBitcast(MVT::v16i32, ShufPS);
19373 }
19374
19375 // Try to create an in-lane repeating shuffle mask and then shuffle the
19376 // results into the target lanes.
19377 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19378 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19379 return V;
19380
19381 // If we have AVX512F support, we can use VEXPAND.
19382 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19383 DAG, Subtarget))
19384 return V;
19385
19386 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19387 Zeroable, Subtarget, DAG))
19388 return Blend;
19389
19390 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19391}
19392
19393/// Handle lowering of 32-lane 16-bit integer shuffles.
19394static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19395 const APInt &Zeroable, SDValue V1, SDValue V2,
19396 const X86Subtarget &Subtarget,
19397 SelectionDAG &DAG) {
19398 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19399 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19400 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19401 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19402
19403 // Whenever we can lower this as a zext, that instruction is strictly faster
19404 // than any alternative. It also allows us to fold memory operands into the
19405 // shuffle in many cases.
19406 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19407 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19408 return ZExt;
19409
19410 // Use dedicated unpack instructions for masks that match their pattern.
19411 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19412 return V;
19413
19414 // Use dedicated pack instructions for masks that match their pattern.
19415 if (SDValue V =
19416 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19417 return V;
19418
19419 // Try to use shift instructions.
19420 if (SDValue Shift =
19421 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
19422 Subtarget, DAG, /*BitwiseOnly*/ false))
19423 return Shift;
19424
19425 // Try to use byte rotation instructions.
19426 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19427 Subtarget, DAG))
19428 return Rotate;
19429
19430 if (V2.isUndef()) {
19431 // Try to use bit rotation instructions.
19432 if (SDValue Rotate =
19433 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19434 return Rotate;
19435
19436 SmallVector<int, 8> RepeatedMask;
19437 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19438 // As this is a single-input shuffle, the repeated mask should be
19439 // a strictly valid v8i16 mask that we can pass through to the v8i16
19440 // lowering to handle even the v32 case.
19441 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19442 RepeatedMask, Subtarget, DAG);
19443 }
19444 }
19445
19446 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19447 Zeroable, Subtarget, DAG))
19448 return Blend;
19449
19450 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19451 Zeroable, Subtarget, DAG))
19452 return PSHUFB;
19453
19454 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19455}
19456
19457/// Handle lowering of 64-lane 8-bit integer shuffles.
19458static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19459 const APInt &Zeroable, SDValue V1, SDValue V2,
19460 const X86Subtarget &Subtarget,
19461 SelectionDAG &DAG) {
19462 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19463 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19464 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19465 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19466
19467 // Whenever we can lower this as a zext, that instruction is strictly faster
19468 // than any alternative. It also allows us to fold memory operands into the
19469 // shuffle in many cases.
19470 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19471 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19472 return ZExt;
19473
19474 // Use dedicated unpack instructions for masks that match their pattern.
19475 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19476 return V;
19477
19478 // Use dedicated pack instructions for masks that match their pattern.
19479 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19480 Subtarget))
19481 return V;
19482
19483 // Try to use shift instructions.
19484 if (SDValue Shift =
19485 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
19486 DAG, /*BitwiseOnly*/ false))
19487 return Shift;
19488
19489 // Try to use byte rotation instructions.
19490 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19491 Subtarget, DAG))
19492 return Rotate;
19493
19494 // Try to use bit rotation instructions.
19495 if (V2.isUndef())
19496 if (SDValue Rotate =
19497 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19498 return Rotate;
19499
19500 // Lower as AND if possible.
19501 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19502 Zeroable, Subtarget, DAG))
19503 return Masked;
19504
19505 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19506 Zeroable, Subtarget, DAG))
19507 return PSHUFB;
19508
19509 // Try to create an in-lane repeating shuffle mask and then shuffle the
19510 // results into the target lanes.
19511 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19512 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19513 return V;
19514
19515 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19516 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19517 return Result;
19518
19519 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19520 Zeroable, Subtarget, DAG))
19521 return Blend;
19522
19523 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19524 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19525 // PALIGNR will be cheaper than the second PSHUFB+OR.
19526 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19527 Mask, Subtarget, DAG))
19528 return V;
19529
19530 // If we can't directly blend but can use PSHUFB, that will be better as it
19531 // can both shuffle and set up the inefficient blend.
19532 bool V1InUse, V2InUse;
19533 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19534 DAG, V1InUse, V2InUse);
19535 }
19536
19537 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19538 // shuffle.
19539 if (!V2.isUndef())
19540 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19541 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19542 return Result;
19543
19544 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19545 if (Subtarget.hasVBMI())
19546 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19547
19548 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19549}
19550
19551/// High-level routine to lower various 512-bit x86 vector shuffles.
19552///
19553/// This routine either breaks down the specific type of a 512-bit x86 vector
19554/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19555/// together based on the available instructions.
19556static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19557 MVT VT, SDValue V1, SDValue V2,
19558 const APInt &Zeroable,
19559 const X86Subtarget &Subtarget,
19560 SelectionDAG &DAG) {
19561 assert(Subtarget.hasAVX512() &&
19562 "Cannot lower 512-bit vectors w/ basic ISA!");
19563
19564 // If we have a single input to the zero element, insert that into V1 if we
19565 // can do so cheaply.
19566 int NumElts = Mask.size();
19567 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19568
19569 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19570 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19571 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19572 return Insertion;
19573
19574 // Handle special cases where the lower or upper half is UNDEF.
19575 if (SDValue V =
19576 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19577 return V;
19578
19579 // Check for being able to broadcast a single element.
19580 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19581 Subtarget, DAG))
19582 return Broadcast;
19583
19584 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19585 // Try using bit ops for masking and blending before falling back to
19586 // splitting.
19587 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19588 Subtarget, DAG))
19589 return V;
19590 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19591 return V;
19592
19593 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19594 }
19595
19596 if (VT == MVT::v32f16) {
19597 V1 = DAG.getBitcast(MVT::v32i16, V1);
19598 V2 = DAG.getBitcast(MVT::v32i16, V2);
19599 return DAG.getBitcast(MVT::v32f16,
19600 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19601 }
19602
19603 // Dispatch to each element type for lowering. If we don't have support for
19604 // specific element type shuffles at 512 bits, immediately split them and
19605 // lower them. Each lowering routine of a given type is allowed to assume that
19606 // the requisite ISA extensions for that element type are available.
19607 switch (VT.SimpleTy) {
19608 case MVT::v8f64:
19609 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19610 case MVT::v16f32:
19611 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19612 case MVT::v8i64:
19613 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19614 case MVT::v16i32:
19615 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19616 case MVT::v32i16:
19617 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19618 case MVT::v64i8:
19619 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19620
19621 default:
19622 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19622)
;
19623 }
19624}
19625
19626static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19627 MVT VT, SDValue V1, SDValue V2,
19628 const X86Subtarget &Subtarget,
19629 SelectionDAG &DAG) {
19630 // Shuffle should be unary.
19631 if (!V2.isUndef())
19632 return SDValue();
19633
19634 int ShiftAmt = -1;
19635 int NumElts = Mask.size();
19636 for (int i = 0; i != NumElts; ++i) {
19637 int M = Mask[i];
19638 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19639 "Unexpected mask index.");
19640 if (M < 0)
19641 continue;
19642
19643 // The first non-undef element determines our shift amount.
19644 if (ShiftAmt < 0) {
19645 ShiftAmt = M - i;
19646 // Need to be shifting right.
19647 if (ShiftAmt <= 0)
19648 return SDValue();
19649 }
19650 // All non-undef elements must shift by the same amount.
19651 if (ShiftAmt != M - i)
19652 return SDValue();
19653 }
19654 assert(ShiftAmt >= 0 && "All undef?");
19655
19656 // Great, we found a shift right.
19657 MVT WideVT = VT;
19658 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19659 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19660 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19661 DAG.getUNDEF(WideVT), V1,
19662 DAG.getIntPtrConstant(0, DL));
19663 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19664 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19665 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19666 DAG.getIntPtrConstant(0, DL));
19667}
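Editor's note: the shift-amount detection loop at lines 19636-19653 accepts a unary mask only when every defined element reads from index i + ShiftAmt for a single positive ShiftAmt. The standalone copy below (matchUnaryShiftRight is a hypothetical name; the all-undef case is reported as -1 instead of asserting) shows which masks qualify.

#include <cstdio>
#include <vector>

// Every defined mask element must equal i + ShiftAmt for one shared,
// strictly positive ShiftAmt; undef elements impose no constraint.
static int matchUnaryShiftRight(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;
      if (ShiftAmt <= 0)
        return -1; // must be shifting right
    }
    if (ShiftAmt != M - i)
      return -1; // inconsistent shift amount
  }
  return ShiftAmt;
}

int main() {
  std::printf("%d\n", matchUnaryShiftRight({3, 4, 5, 6, 7, -1, -1, -1})); // 3
  std::printf("%d\n", matchUnaryShiftRight({3, 4, 6, -1, -1, -1, -1, -1})); // -1
  return 0;
}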
19668
19669// Determine if this shuffle can be implemented with a KSHIFT instruction.
19670// Returns the shift amount if possible or -1 if not. This is a simplified
19671// version of matchShuffleAsShift.
19672static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19673 int MaskOffset, const APInt &Zeroable) {
19674 int Size = Mask.size();
19675
19676 auto CheckZeros = [&](int Shift, bool Left) {
19677 for (int j = 0; j < Shift; ++j)
19678 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19679 return false;
19680
19681 return true;
19682 };
19683
19684 auto MatchShift = [&](int Shift, bool Left) {
19685 unsigned Pos = Left ? Shift : 0;
19686 unsigned Low = Left ? 0 : Shift;
19687 unsigned Len = Size - Shift;
19688 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19689 };
19690
19691 for (int Shift = 1; Shift != Size; ++Shift)
19692 for (bool Left : {true, false})
19693 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19694 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19695 return Shift;
19696 }
19697
19698 return -1;
19699}
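Editor's note: a standalone restatement of match1BitShuffleAsKSHIFT, with the APInt Zeroable modelled as a std::vector<bool> and isSequentialOrUndefInRange inlined. This is an editorial sketch of the logic shown above, not the LLVM code itself.

#include <cstdio>
#include <vector>

// Returns the shift amount and sets Left for a KSHIFTL match (false means
// KSHIFTR), or returns -1 when no single KSHIFT reproduces the mask.
static int matchKShift(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int MaskOffset,
                       bool &Left) {
  int Size = (int)Mask.size();
  auto IsSeqOrUndef = [&](int Pos, int Len, int Low) {
    for (int i = 0; i != Len; ++i)
      if (Mask[Pos + i] >= 0 && Mask[Pos + i] != Low + i)
        return false;
    return true;
  };
  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool L : {true, false}) {
      bool ZerosOk = true;
      for (int j = 0; j != Shift; ++j)
        ZerosOk &= (bool)Zeroable[j + (L ? 0 : Size - Shift)];
      int Pos = L ? Shift : 0, Low = L ? 0 : Shift;
      if (ZerosOk && IsSeqOrUndef(Pos, Size - Shift, Low + MaskOffset)) {
        Left = L;
        return Shift;
      }
    }
  return -1;
}

int main() {
  // Elements move toward the low end by two and the vacated top two lanes
  // are known zero, so this matches a KSHIFTR by 2.
  bool Left = false;
  std::vector<int> Mask = {2, 3, 4, 5, 6, 7, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, false,
                                false, false, true,  true};
  int Amt = matchKShift(Mask, Zeroable, /*MaskOffset=*/0, Left);
  std::printf("%s by %d\n", Left ? "KSHIFTL" : "KSHIFTR", Amt); // KSHIFTR by 2
  return 0;
}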
19700
19701
19702// Lower vXi1 vector shuffles.
19703 // There is no dedicated instruction on AVX-512 that shuffles the masks.
19704 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19705 // vector, shuffle it, and then truncate it back.
19706static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19707 MVT VT, SDValue V1, SDValue V2,
19708 const APInt &Zeroable,
19709 const X86Subtarget &Subtarget,
19710 SelectionDAG &DAG) {
19711 assert(Subtarget.hasAVX512() &&
19712 "Cannot lower 512-bit vectors w/o basic ISA!");
19713
19714 int NumElts = Mask.size();
19715
19716 // Try to recognize shuffles that are just padding a subvector with zeros.
19717 int SubvecElts = 0;
19718 int Src = -1;
19719 for (int i = 0; i != NumElts; ++i) {
19720 if (Mask[i] >= 0) {
19721 // Grab the source from the first valid mask. All subsequent elements need
19722 // to use this same source.
19723 if (Src < 0)
19724 Src = Mask[i] / NumElts;
19725 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19726 break;
19727 }
19728
19729 ++SubvecElts;
19730 }
19731 assert(SubvecElts != NumElts && "Identity shuffle?");
19732
19733 // Clip to a power of 2.
19734 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19735
19736 // Make sure the number of zeroable bits in the top at least covers the bits
19737 // not covered by the subvector.
19738 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19739 assert(Src >= 0 && "Expected a source!");
19740 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19741 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19742 Src == 0 ? V1 : V2,
19743 DAG.getIntPtrConstant(0, DL));
19744 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19745 DAG.getConstant(0, DL, VT),
19746 Extract, DAG.getIntPtrConstant(0, DL));
19747 }
19748
19749 // Try a simple shift right with undef elements. Later we'll try with zeros.
19750 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19751 DAG))
19752 return Shift;
19753
19754 // Try to match KSHIFTs.
19755 unsigned Offset = 0;
19756 for (SDValue V : { V1, V2 }) {
19757 unsigned Opcode;
19758 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19759 if (ShiftAmt >= 0) {
19760 MVT WideVT = VT;
19761 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19762 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19763 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19764 DAG.getUNDEF(WideVT), V,
19765 DAG.getIntPtrConstant(0, DL));
19766 // Widened right shifts need two shifts to ensure we shift in zeroes.
19767 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19768 int WideElts = WideVT.getVectorNumElements();
19769 // Shift left to put the original vector in the MSBs of the new size.
19770 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19771 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19772 // Increase the shift amount to account for the left shift.
19773 ShiftAmt += WideElts - NumElts;
19774 }
19775
19776 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19777 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19778 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19779 DAG.getIntPtrConstant(0, DL));
19780 }
19781 Offset += NumElts; // Increment for next iteration.
19782 }
19783
19784 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19785 // TODO: What other unary shuffles would benefit from this?
19786 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19787 V1->hasOneUse()) {
19788 SDValue Op0 = V1.getOperand(0);
19789 SDValue Op1 = V1.getOperand(1);
19790 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19791 EVT OpVT = Op0.getValueType();
19792 return DAG.getSetCC(
19793 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19794 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19795 }
19796
19797 MVT ExtVT;
19798 switch (VT.SimpleTy) {
19799 default:
19800 llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19800)
;
19801 case MVT::v2i1:
19802 ExtVT = MVT::v2i64;
19803 break;
19804 case MVT::v4i1:
19805 ExtVT = MVT::v4i32;
19806 break;
19807 case MVT::v8i1:
19808 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
19809 // shuffle.
19810 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19811 break;
19812 case MVT::v16i1:
19813 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19814 // 256-bit operation available.
19815 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19816 break;
19817 case MVT::v32i1:
19818 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19819 // 256-bit operation available.
19820 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19821 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19822 break;
19823 case MVT::v64i1:
19824 // Fall back to scalarization. FIXME: We can do better if the shuffle
19825 // can be partitioned cleanly.
19826 if (!Subtarget.useBWIRegs())
19827 return SDValue();
19828 ExtVT = MVT::v64i8;
19829 break;
19830 }
19831
19832 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19833 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19834
19835 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19836 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
19837 int NumElems = VT.getVectorNumElements();
19838 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19839 (Subtarget.hasDQI() && (NumElems < 32)))
19840 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19841 Shuffle, ISD::SETGT);
19842
19843 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19844}
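Editor's note: the widened KSHIFTR sequence at lines 19766-19777 can be checked with plain integer arithmetic, since a kN mask register behaves like an N-bit integer with element i in bit i. The snippet below works the example of a v8i1 value widened to v16i1 and shifted right by 2: shift left by WideElts - NumElts = 8 first, then right by 8 + 2, so zeroes are shifted into the top elements.

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t V = 0xb5;               // original 8-element mask, element i = bit i
  int ShiftAmt = 2;
  uint16_t Wide = (uint16_t)(V << 8);                // KSHIFTL by WideElts - NumElts
  uint16_t Res = (uint16_t)(Wide >> (8 + ShiftAmt)); // KSHIFTR by the adjusted amount
  std::printf("0x%02x\n", (unsigned)Res);            // prints 0x2d, i.e. 0xb5 >> 2
  return 0;
}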
19845
19846/// Helper function that returns true if the shuffle mask should be
19847/// commuted to improve canonicalization.
19848static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19849 int NumElements = Mask.size();
19850
19851 int NumV1Elements = 0, NumV2Elements = 0;
19852 for (int M : Mask)
19853 if (M < 0)
19854 continue;
19855 else if (M < NumElements)
19856 ++NumV1Elements;
19857 else
19858 ++NumV2Elements;
19859
19860 // Commute the shuffle as needed such that more elements come from V1 than
19861 // V2. This allows us to match the shuffle pattern strictly on how many
19862 // elements come from V1 without handling the symmetric cases.
19863 if (NumV2Elements > NumV1Elements)
19864 return true;
19865
19866 assert(NumV1Elements > 0 && "No V1 indices");
19867
19868 if (NumV2Elements == 0)
19869 return false;
19870
19871 // When the number of V1 and V2 elements are the same, try to minimize the
19872 // number of uses of V2 in the low half of the vector. When that is tied,
19873 // ensure that the sum of indices for V1 is equal to or lower than the sum of
19874 // indices for V2. When those are equal, try to ensure that the number of odd
19875 // indices for V1 is lower than the number of odd indices for V2.
19876 if (NumV1Elements == NumV2Elements) {
19877 int LowV1Elements = 0, LowV2Elements = 0;
19878 for (int M : Mask.slice(0, NumElements / 2))
19879 if (M >= NumElements)
19880 ++LowV2Elements;
19881 else if (M >= 0)
19882 ++LowV1Elements;
19883 if (LowV2Elements > LowV1Elements)
19884 return true;
19885 if (LowV2Elements == LowV1Elements) {
19886 int SumV1Indices = 0, SumV2Indices = 0;
19887 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19888 if (Mask[i] >= NumElements)
19889 SumV2Indices += i;
19890 else if (Mask[i] >= 0)
19891 SumV1Indices += i;
19892 if (SumV2Indices < SumV1Indices)
19893 return true;
19894 if (SumV2Indices == SumV1Indices) {
19895 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19896 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19897 if (Mask[i] >= NumElements)
19898 NumV2OddIndices += i % 2;
19899 else if (Mask[i] >= 0)
19900 NumV1OddIndices += i % 2;
19901 if (NumV2OddIndices < NumV1OddIndices)
19902 return true;
19903 }
19904 }
19905 }
19906
19907 return false;
19908}
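Editor's note: a compact, slightly restructured restatement of the commute heuristic above (shouldCommute is a hypothetical name), together with one worked example: prefer more elements from V1, then fewer V2 elements in the low half, then a smaller index sum for V2, then fewer odd positions used by V2.

#include <cstdio>
#include <vector>

static bool shouldCommute(const std::vector<int> &Mask) {
  int N = (int)Mask.size();
  int NumV1 = 0, NumV2 = 0, LowV1 = 0, LowV2 = 0;
  int SumV1 = 0, SumV2 = 0, OddV1 = 0, OddV2 = 0;
  for (int i = 0; i != N; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef
    bool IsV2 = M >= N;
    (IsV2 ? NumV2 : NumV1)++;
    if (i < N / 2)
      (IsV2 ? LowV2 : LowV1)++;
    (IsV2 ? SumV2 : SumV1) += i;
    (IsV2 ? OddV2 : OddV1) += i % 2;
  }
  if (NumV2 != NumV1)
    return NumV2 > NumV1;
  if (LowV2 != LowV1)
    return LowV2 > LowV1;
  if (SumV2 != SumV1)
    return SumV2 < SumV1;
  return OddV2 < OddV1;
}

int main() {
  // {4, 1, 6, 3} on 4 elements: two indices from each source and a tied low
  // half, but V2's index sum (0 + 2) is below V1's (1 + 3), so commute.
  std::printf("%d\n", shouldCommute({4, 1, 6, 3})); // prints 1
  return 0;
}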
19909
19910static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
19911 const X86Subtarget &Subtarget) {
19912 if (!Subtarget.hasAVX512())
19913 return false;
19914
19915 MVT VT = V1.getSimpleValueType().getScalarType();
19916 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
19917 return false;
19918
19919 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
19920 // are preferable to blendw/blendvb/masked-mov.
19921 if ((VT == MVT::i16 || VT == MVT::i8) &&
19922 V1.getSimpleValueType().getSizeInBits() < 512)
19923 return false;
19924
19925 auto HasMaskOperation = [&](SDValue V) {
19926 // TODO: Currently we only check a limited set of opcodes. We could probably
19927 // extend it to all binary operations by checking TLI.isBinOp().
19928 switch (V->getOpcode()) {
19929 default:
19930 return false;
19931 case ISD::ADD:
19932 case ISD::SUB:
19933 case ISD::AND:
19934 case ISD::XOR:
19935 case ISD::OR:
19936 case ISD::SMAX:
19937 case ISD::SMIN:
19938 case ISD::UMAX:
19939 case ISD::UMIN:
19940 case ISD::ABS:
19941 case ISD::SHL:
19942 case ISD::SRL:
19943 case ISD::SRA:
19944 case ISD::MUL:
19945 break;
19946 }
19947 if (!V->hasOneUse())
19948 return false;
19949
19950 return true;
19951 };
19952
19953 if (HasMaskOperation(V1) || HasMaskOperation(V2))
19954 return true;
19955
19956 return false;
19957}
19958
19959// Forward declaration.
19960static SDValue canonicalizeShuffleMaskWithHorizOp(
19961 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19962 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19963 const X86Subtarget &Subtarget);
19964
19965 /// Top-level lowering for x86 vector shuffles.
19966///
19967/// This handles decomposition, canonicalization, and lowering of all x86
19968/// vector shuffles. Most of the specific lowering strategies are encapsulated
19969/// above in helper routines. The canonicalization attempts to widen shuffles
19970/// to involve fewer lanes of wider elements, consolidate symmetric patterns
19971/// s.t. only one of the two inputs needs to be tested, etc.
19972static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19973 SelectionDAG &DAG) {
19974 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19975 ArrayRef<int> OrigMask = SVOp->getMask();
19976 SDValue V1 = Op.getOperand(0);
19977 SDValue V2 = Op.getOperand(1);
19978 MVT VT = Op.getSimpleValueType();
19979 int NumElements = VT.getVectorNumElements();
19980 SDLoc DL(Op);
19981 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19982
19983 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19984 "Can't lower MMX shuffles");
19985
19986 bool V1IsUndef = V1.isUndef();
19987 bool V2IsUndef = V2.isUndef();
19988 if (V1IsUndef && V2IsUndef)
19989 return DAG.getUNDEF(VT);
19990
19991 // When we create a shuffle node we put the UNDEF node in the second operand,
19992 // but in some cases the first operand may be transformed to UNDEF.
19993 // In this case we should just commute the node.
19994 if (V1IsUndef)
19995 return DAG.getCommutedVectorShuffle(*SVOp);
19996
19997 // Check for non-undef masks pointing at an undef vector and make the masks
19998 // undef as well. This makes it easier to match the shuffle based solely on
19999 // the mask.
20000 if (V2IsUndef &&
20001 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
20002 SmallVector<int, 8> NewMask(OrigMask);
20003 for (int &M : NewMask)
20004 if (M >= NumElements)
20005 M = -1;
20006 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
20007 }
20008
20009 // Check for illegal shuffle mask element index values.
20010 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
20011 (void)MaskUpperLimit;
20012 assert(llvm::all_of(OrigMask,
20013 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
20014 "Out of bounds shuffle index");
20015
20016 // We actually see shuffles that are entirely re-arrangements of a set of
20017 // zero inputs. This mostly happens while decomposing complex shuffles into
20018 // simple ones. Directly lower these as a buildvector of zeros.
20019 APInt KnownUndef, KnownZero;
20020 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
20021
20022 APInt Zeroable = KnownUndef | KnownZero;
20023 if (Zeroable.isAllOnes())
20024 return getZeroVector(VT, Subtarget, DAG, DL);
20025
20026 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
20027
20028 // Try to collapse shuffles into using a vector type with fewer elements but
20029 // wider element types. We cap this to not form integers or floating point
20030 // elements wider than 64 bits. It does not seem beneficial to form i128
20031 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
20032 SmallVector<int, 16> WidenedMask;
20033 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
20034 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
20035 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
20036 // Shuffle mask widening should not interfere with a broadcast opportunity
20037 // by obfuscating the operands with bitcasts.
20038 // TODO: Avoid lowering directly from this top-level function: make this
20039 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
20040 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
20041 Subtarget, DAG))
20042 return Broadcast;
20043
20044 MVT NewEltVT = VT.isFloatingPoint()
20045 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
20046 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
20047 int NewNumElts = NumElements / 2;
20048 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
20049 // Make sure that the new vector type is legal. For example, v2f64 isn't
20050 // legal on SSE1.
20051 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
20052 if (V2IsZero) {
20053 // Modify the new Mask to take all zeros from the all-zero vector.
20054 // Choose indices that are blend-friendly.
20055 bool UsedZeroVector = false;
20056 assert(is_contained(WidenedMask, SM_SentinelZero) &&
20057 "V2's non-undef elements are used?!");
20058 for (int i = 0; i != NewNumElts; ++i)
20059 if (WidenedMask[i] == SM_SentinelZero) {
20060 WidenedMask[i] = i + NewNumElts;
20061 UsedZeroVector = true;
20062 }
20063 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
20064 // some elements to be undef.
20065 if (UsedZeroVector)
20066 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
20067 }
20068 V1 = DAG.getBitcast(NewVT, V1);
20069 V2 = DAG.getBitcast(NewVT, V2);
20070 return DAG.getBitcast(
20071 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
20072 }
20073 }
20074
20075 SmallVector<SDValue> Ops = {V1, V2};
20076 SmallVector<int> Mask(OrigMask);
20077
20078 // Canonicalize the shuffle with any horizontal ops inputs.
20079 // NOTE: This may update Ops and Mask.
20080 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
20081 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
20082 return DAG.getBitcast(VT, HOp);
20083
20084 V1 = DAG.getBitcast(VT, Ops[0]);
20085 V2 = DAG.getBitcast(VT, Ops[1]);
20086 assert(NumElements == (int)Mask.size() &&
20087 "canonicalizeShuffleMaskWithHorizOp "
20088 "shouldn't alter the shuffle mask size");
20089
20090 // Commute the shuffle if it will improve canonicalization.
20091 if (canonicalizeShuffleMaskWithCommute(Mask)) {
20092 ShuffleVectorSDNode::commuteMask(Mask);
20093 std::swap(V1, V2);
20094 }
20095
20096 // For each vector width, delegate to a specialized lowering routine.
20097 if (VT.is128BitVector())
20098 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20099
20100 if (VT.is256BitVector())
20101 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20102
20103 if (VT.is512BitVector())
20104 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20105
20106 if (Is1BitVector)
20107 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20108
20109 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20109)
;
20110}
20111
20112/// Try to lower a VSELECT instruction to a vector shuffle.
20113static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
20114 const X86Subtarget &Subtarget,
20115 SelectionDAG &DAG) {
20116 SDValue Cond = Op.getOperand(0);
20117 SDValue LHS = Op.getOperand(1);
20118 SDValue RHS = Op.getOperand(2);
20119 MVT VT = Op.getSimpleValueType();
20120
20121 // Only non-legal VSELECTs reach this lowering; convert those into generic
20122 // shuffles and re-use the shuffle lowering path for blends.
20123 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
20124 SmallVector<int, 32> Mask;
20125 if (createShuffleMaskFromVSELECT(Mask, Cond))
20126 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
20127 }
20128
20129 return SDValue();
20130}
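Editor's note: createShuffleMaskFromVSELECT is not shown in this listing, so the sketch below only illustrates the editor's assumption about the conversion: under the usual VSELECT semantics a true condition lane keeps LHS[i] (shuffle index i) and a false lane takes RHS[i] (shuffle index N + i).

#include <cstdio>
#include <vector>

// Assumed VSELECT -> shuffle-mask conversion for a constant condition vector
// (editorial sketch; maskFromConstantCond is a hypothetical helper).
static std::vector<int> maskFromConstantCond(const std::vector<bool> &Cond) {
  int N = (int)Cond.size();
  std::vector<int> Mask(N);
  for (int i = 0; i != N; ++i)
    Mask[i] = Cond[i] ? i : N + i;
  return Mask;
}

int main() {
  // Condition {true, false, false, true} on a v4i32 becomes the mask {0,5,6,3}.
  for (int M : maskFromConstantCond({true, false, false, true}))
    std::printf("%d ", M); // prints: 0 5 6 3
  std::printf("\n");
  return 0;
}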
20131
20132SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
20133 SDValue Cond = Op.getOperand(0);
20134 SDValue LHS = Op.getOperand(1);
20135 SDValue RHS = Op.getOperand(2);
20136
20137 SDLoc dl(Op);
20138 MVT VT = Op.getSimpleValueType();
20139 if (isSoftFP16(VT)) {
20140 MVT NVT = VT.changeVectorElementTypeToInteger();
20141 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
20142 DAG.getBitcast(NVT, LHS),
20143 DAG.getBitcast(NVT, RHS)));
20144 }
20145
20146 // A vselect where all conditions and data are constants can be optimized into
20147 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
20148 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
20149 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
20150 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
20151 return SDValue();
20152
20153 // Try to lower this to a blend-style vector shuffle. This can handle all
20154 // constant condition cases.
20155 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
20156 return BlendOp;
20157
20158 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
20159 // with patterns on the mask registers on AVX-512.
20160 MVT CondVT = Cond.getSimpleValueType();
20161 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20162 if (CondEltSize == 1)
20163 return Op;
20164
20165 // Variable blends are only legal from SSE4.1 onward.
20166 if (!Subtarget.hasSSE41())
20167 return SDValue();
20168
20169 unsigned EltSize = VT.getScalarSizeInBits();
20170 unsigned NumElts = VT.getVectorNumElements();
20171
20172 // Expand v32i16/v64i8 without BWI.
20173 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20174 return SDValue();
20175
20176 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20177 // into an i1 condition so that we can use the mask-based 512-bit blend
20178 // instructions.
20179 if (VT.getSizeInBits() == 512) {
20180 // Build a mask by testing the condition against zero.
20181 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20182 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20183 DAG.getConstant(0, dl, CondVT),
20184 ISD::SETNE);
20185 // Now return a new VSELECT using the mask.
20186 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20187 }
20188
20189 // SEXT/TRUNC cases where the mask doesn't match the destination size.
20190 if (CondEltSize != EltSize) {
20191 // If we don't have a sign splat, rely on the expansion.
20192 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20193 return SDValue();
20194
20195 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20196 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20197 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20198 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20199 }
20200
20201 // Only some types will be legal on some subtargets. If we can emit a legal
20202 // VSELECT-matching blend, return Op; if we need to expand, return
20203 // a null value.
20204 switch (VT.SimpleTy) {
20205 default:
20206 // Most of the vector types have blends past SSE4.1.
20207 return Op;
20208
20209 case MVT::v32i8:
20210 // The byte blends for AVX vectors were introduced only in AVX2.
20211 if (Subtarget.hasAVX2())
20212 return Op;
20213
20214 return SDValue();
20215
20216 case MVT::v8i16:
20217 case MVT::v16i16: {
20218 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
20219 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20220 Cond = DAG.getBitcast(CastVT, Cond);
20221 LHS = DAG.getBitcast(CastVT, LHS);
20222 RHS = DAG.getBitcast(CastVT, RHS);
20223 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20224 return DAG.getBitcast(VT, Select);
20225 }
20226 }
20227}
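
The v8i16/v16i16 case above works because a sign-splatted 16-bit condition lane bitcasts to two byte lanes with the same all-zeros/all-ones pattern, so a vXi8 byte blend makes the same choice for both halves. A small self-contained check of that assumption (hypothetical code, not part of the lowering):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t lane = 0xFFFF;              // "select LHS" in the i16 domain
      uint8_t lo = (uint8_t)(lane & 0xFF); // low byte of the same lane
      uint8_t hi = (uint8_t)(lane >> 8);   // high byte of the same lane
      std::printf("%02x %02x\n", lo, hi);  // both 0xff, so a byte blend also picks LHS
      return 0;
    }
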
20228
20229static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20230 MVT VT = Op.getSimpleValueType();
20231 SDValue Vec = Op.getOperand(0);
20232 SDValue Idx = Op.getOperand(1);
20233 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20234 SDLoc dl(Op);
20235
20236 if (!Vec.getSimpleValueType().is128BitVector())
20237 return SDValue();
20238
20239 if (VT.getSizeInBits() == 8) {
20240 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20241 // we're going to zero extend the register or fold the store.
20242 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20243 !X86::mayFoldIntoStore(Op))
20244 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20245 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20246 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20247
20248 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20249 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20250 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20251 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20252 }
20253
20254 if (VT == MVT::f32) {
20255 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20256 // the result back to FR32 register. It's only worth matching if the
20257 // result has a single use which is a store or a bitcast to i32. And in
20258 // the case of a store, it's not worth it if the index is a constant 0,
20259 // because a MOVSSmr can be used instead, which is smaller and faster.
20260 if (!Op.hasOneUse())
20261 return SDValue();
20262 SDNode *User = *Op.getNode()->use_begin();
20263 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20264 (User->getOpcode() != ISD::BITCAST ||
20265 User->getValueType(0) != MVT::i32))
20266 return SDValue();
20267 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20268 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20269 return DAG.getBitcast(MVT::f32, Extract);
20270 }
20271
20272 if (VT == MVT::i32 || VT == MVT::i64)
20273 return Op;
20274
20275 return SDValue();
20276}
20277
20278/// Extract one bit from mask vector, like v16i1 or v8i1.
20279/// AVX-512 feature.
20280static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20281 const X86Subtarget &Subtarget) {
20282 SDValue Vec = Op.getOperand(0);
20283 SDLoc dl(Vec);
20284 MVT VecVT = Vec.getSimpleValueType();
20285 SDValue Idx = Op.getOperand(1);
20286 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20287 MVT EltVT = Op.getSimpleValueType();
20288
20289 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20290        "Unexpected vector type in ExtractBitFromMaskVector");
20291
20292 // A variable index can't be handled in mask registers,
20293 // so extend the vector to VR512/VR128.
20294 if (!IdxC) {
20295 unsigned NumElts = VecVT.getVectorNumElements();
20296 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20297 // than extending to 128/256-bit.
20298 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20299 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20300 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20301 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20302 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20303 }
20304
20305 unsigned IdxVal = IdxC->getZExtValue();
20306 if (IdxVal == 0) // the operation is legal
20307 return Op;
20308
20309 // Extend to natively supported kshift.
20310 unsigned NumElems = VecVT.getVectorNumElements();
20311 MVT WideVecVT = VecVT;
20312 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20313 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20314 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20315 DAG.getUNDEF(WideVecVT), Vec,
20316 DAG.getIntPtrConstant(0, dl));
20317 }
20318
20319 // Use kshiftr instruction to move to the lower element.
20320 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20321 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20322
20323 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20324 DAG.getIntPtrConstant(0, dl));
20325}
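
A scalar analogue of the KSHIFTR trick used above (illustrative only; extractMaskBit is a hypothetical name): shifting the k-register-like mask right by the index moves the wanted bit into position 0, which is then the one element that is legal to extract directly.

    #include <cstdint>

    bool extractMaskBit(uint16_t KMask, unsigned IdxVal) {
      return ((KMask >> IdxVal) & 1u) != 0; // kshiftr by IdxVal, then read lane 0
    }
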
20326
20327SDValue
20328X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20329 SelectionDAG &DAG) const {
20330 SDLoc dl(Op);
20331 SDValue Vec = Op.getOperand(0);
20332 MVT VecVT = Vec.getSimpleValueType();
20333 SDValue Idx = Op.getOperand(1);
20334 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20335
20336 if (VecVT.getVectorElementType() == MVT::i1)
20337 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20338
20339 if (!IdxC) {
20340 // It's more profitable to go through memory (1 cycle throughput)
20341 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20342 // The IACA tool was used to get the performance estimate
20343 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20344 //
20345 // example : extractelement <16 x i8> %a, i32 %i
20346 //
20347 // Block Throughput: 3.00 Cycles
20348 // Throughput Bottleneck: Port5
20349 //
20350 // | Num Of | Ports pressure in cycles | |
20351 // | Uops | 0 - DV | 5 | 6 | 7 | |
20352 // ---------------------------------------------
20353 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20354 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20355 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20356 // Total Num Of Uops: 4
20357 //
20358 //
20359 // Block Throughput: 1.00 Cycles
20360 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20361 //
20362 // | | Ports pressure in cycles | |
20363 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20364 // ---------------------------------------------------------
20365 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20366 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20367 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20368 // Total Num Of Uops: 4
20369
20370 return SDValue();
20371 }
20372
20373 unsigned IdxVal = IdxC->getZExtValue();
20374
20375 // If this is a 256-bit vector result, first extract the 128-bit vector and
20376 // then extract the element from the 128-bit vector.
20377 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20378 // Get the 128-bit vector.
20379 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20380 MVT EltVT = VecVT.getVectorElementType();
20381
20382 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20383 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20384
20385 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20386 // this can be done with a mask.
20387 IdxVal &= ElemsPerChunk - 1;
20388 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20389 DAG.getIntPtrConstant(IdxVal, dl));
20390 }
20391
20392 assert(VecVT.is128BitVector() && "Unexpected vector length");
20393
20394 MVT VT = Op.getSimpleValueType();
20395
20396 if (VT == MVT::i16) {
20397 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20398 // we're going to zero extend the register or fold the store (SSE41 only).
20399 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20400 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20401 if (Subtarget.hasFP16())
20402 return Op;
20403
20404 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20405 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20406 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20407 }
20408
20409 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20410 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20411 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20412 }
20413
20414 if (Subtarget.hasSSE41())
20415 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20416 return Res;
20417
20418 // TODO: We only extract a single element from v16i8, we can probably afford
20419 // to be more aggressive here before using the default approach of spilling to
20420 // stack.
20421 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20422 // Extract either the lowest i32 or any i16, and extract the sub-byte.
20423 int DWordIdx = IdxVal / 4;
20424 if (DWordIdx == 0) {
20425 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20426 DAG.getBitcast(MVT::v4i32, Vec),
20427 DAG.getIntPtrConstant(DWordIdx, dl));
20428 int ShiftVal = (IdxVal % 4) * 8;
20429 if (ShiftVal != 0)
20430 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20431 DAG.getConstant(ShiftVal, dl, MVT::i8));
20432 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20433 }
20434
20435 int WordIdx = IdxVal / 2;
20436 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20437 DAG.getBitcast(MVT::v8i16, Vec),
20438 DAG.getIntPtrConstant(WordIdx, dl));
20439 int ShiftVal = (IdxVal % 2) * 8;
20440 if (ShiftVal != 0)
20441 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20442 DAG.getConstant(ShiftVal, dl, MVT::i8));
20443 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20444 }
20445
20446 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20447 if (IdxVal == 0)
20448 return Op;
20449
20450 // Shuffle the element to the lowest element, then movss or movsh.
20451 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20452 Mask[0] = static_cast<int>(IdxVal);
20453 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20454 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20455 DAG.getIntPtrConstant(0, dl));
20456 }
20457
20458 if (VT.getSizeInBits() == 64) {
20459 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20460 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20461 // to match extract_elt for f64.
20462 if (IdxVal == 0)
20463 return Op;
20464
20465 // UNPCKHPD the element to the lowest double word, then movsd.
20466 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20467 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20468 int Mask[2] = { 1, -1 };
20469 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20470 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20471 DAG.getIntPtrConstant(0, dl));
20472 }
20473
20474 return SDValue();
20475}
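
The sub-byte path above avoids a stack spill by extracting the containing i32/i16 lane and shifting the wanted byte down. A scalar sketch of that arithmetic (extractByteViaDWord is a hypothetical name, assuming the little-endian lane layout used here):

    #include <cstdint>

    uint8_t extractByteViaDWord(uint32_t DWordLane, unsigned IdxVal) {
      unsigned ShiftVal = (IdxVal % 4) * 8;    // byte position inside the i32 lane
      return (uint8_t)(DWordLane >> ShiftVal); // SRL + TRUNCATE
    }
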
20476
20477/// Insert one bit to mask vector, like v16i1 or v8i1.
20478/// AVX-512 feature.
20479static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20480 const X86Subtarget &Subtarget) {
20481 SDLoc dl(Op);
20482 SDValue Vec = Op.getOperand(0);
20483 SDValue Elt = Op.getOperand(1);
20484 SDValue Idx = Op.getOperand(2);
20485 MVT VecVT = Vec.getSimpleValueType();
20486
20487 if (!isa<ConstantSDNode>(Idx)) {
20488 // Non-constant index. Extend the source and destination,
20489 // insert the element, and then truncate the result.
20490 unsigned NumElts = VecVT.getVectorNumElements();
20491 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20492 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20493 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20494 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20495 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20496 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20497 }
20498
20499 // Copy into a k-register, extract to v1i1 and insert_subvector.
20500 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20501 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20502}
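
For intuition, a scalar analogue of inserting one i1 element into a mask register at a constant index (insertMaskBit is a hypothetical name; the actual lowering above uses SCALAR_TO_VECTOR plus INSERT_SUBVECTOR on k-registers rather than bit twiddling):

    #include <cstdint>

    uint16_t insertMaskBit(uint16_t KMask, bool Bit, unsigned Idx) {
      KMask &= (uint16_t)~(1u << Idx);             // drop the old lane
      KMask |= (uint16_t)((Bit ? 1u : 0u) << Idx); // place the new i1 element
      return KMask;
    }
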
20503
20504SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20505 SelectionDAG &DAG) const {
20506 MVT VT = Op.getSimpleValueType();
20507 MVT EltVT = VT.getVectorElementType();
20508 unsigned NumElts = VT.getVectorNumElements();
20509 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20510
20511 if (EltVT == MVT::i1)
20512 return InsertBitToMaskVector(Op, DAG, Subtarget);
20513
20514 SDLoc dl(Op);
20515 SDValue N0 = Op.getOperand(0);
20516 SDValue N1 = Op.getOperand(1);
20517 SDValue N2 = Op.getOperand(2);
20518 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20519
20520 if (!N2C) {
20521 // For variable insertion indices we're usually better off spilling to the stack,
20522 // but AVX512 can use a variable compare+select by comparing against all
20523 // possible vector indices, and FP insertion has less gpr->simd traffic.
20524 if (!(Subtarget.hasBWI() ||
20525 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20526 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20527 return SDValue();
20528
20529 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20530 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20531 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20532 return SDValue();
20533
20534 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20535 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20536 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20537
20538 SmallVector<SDValue, 16> RawIndices;
20539 for (unsigned I = 0; I != NumElts; ++I)
20540 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20541 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20542
20543 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20544 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20545 ISD::CondCode::SETEQ);
20546 }
20547
20548 if (N2C->getAPIntValue().uge(NumElts))
20549 return SDValue();
20550 uint64_t IdxVal = N2C->getZExtValue();
20551
20552 bool IsZeroElt = X86::isZeroNode(N1);
20553 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20554
20555 if (IsZeroElt || IsAllOnesElt) {
20556 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20557 // We don't deal with i8 0 since it appears to be handled elsewhere.
20558 if (IsAllOnesElt &&
20559 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20560 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20561 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20562 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20563 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20564 CstVectorElts[IdxVal] = OnesCst;
20565 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20566 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20567 }
20568 // See if we can do this more efficiently with a blend shuffle with a
20569 // rematerializable vector.
20570 if (Subtarget.hasSSE41() &&
20571 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20572 SmallVector<int, 8> BlendMask;
20573 for (unsigned i = 0; i != NumElts; ++i)
20574 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20575 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20576 : getOnesVector(VT, DAG, dl);
20577 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20578 }
20579 }
20580
20581 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20582 // into that, and then insert the subvector back into the result.
20583 if (VT.is256BitVector() || VT.is512BitVector()) {
20584 // With a 256-bit vector, we can insert into the zero element efficiently
20585 // using a blend if we have AVX or AVX2 and the right data type.
20586 if (VT.is256BitVector() && IdxVal == 0) {
20587 // TODO: It is worthwhile to cast integer to floating point and back
20588 // and incur a domain crossing penalty if that's what we'll end up
20589 // doing anyway after extracting to a 128-bit vector.
20590 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20591 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20592 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20593 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20594 DAG.getTargetConstant(1, dl, MVT::i8));
20595 }
20596 }
20597
20598 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20599 assert(isPowerOf2_32(NumEltsIn128) &&
20600        "Vectors will always have power-of-two number of elements.");
20601
20602 // If we are not inserting into the low 128-bit vector chunk,
20603 // then prefer the broadcast+blend sequence.
20604 // FIXME: relax the profitability check iff all N1 uses are insertions.
20605 if (IdxVal >= NumEltsIn128 &&
20606 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20607 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20608 X86::mayFoldLoad(N1, Subtarget)))) {
20609 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20610 SmallVector<int, 8> BlendMask;
20611 for (unsigned i = 0; i != NumElts; ++i)
20612 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20613 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20614 }
20615
20616 // Get the desired 128-bit vector chunk.
20617 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20618
20619 // Insert the element into the desired chunk.
20620 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20621 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20622
20623 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20624 DAG.getIntPtrConstant(IdxIn128, dl));
20625
20626 // Insert the changed part back into the bigger vector
20627 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20628 }
20629 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20630
20631 // This will be just movw/movd/movq/movsh/movss/movsd.
20632 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20633 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20634 EltVT == MVT::f16 || EltVT == MVT::i64) {
20635 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20636 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20637 }
20638
20639 // We can't directly insert an i8 or i16 into a vector, so zero extend
20640 // it to i32 first.
20641 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20642 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20643 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20644 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20645 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20646 return DAG.getBitcast(VT, N1);
20647 }
20648 }
20649
20650 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20651 // argument. SSE41 is required for pinsrb.
20652 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20653 unsigned Opc;
20654 if (VT == MVT::v8i16) {
20655 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20656 Opc = X86ISD::PINSRW;
20657 } else {
20658 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20659 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20660 Opc = X86ISD::PINSRB;
20661 }
20662
20663 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20664 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20665 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20666 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20667 }
20668
20669 if (Subtarget.hasSSE41()) {
20670 if (EltVT == MVT::f32) {
20671 // Bits [7:6] of the constant are the source select. This will always be
20672 // zero here. The DAG Combiner may combine an extract_elt index into
20673 // these bits. For example (insert (extract, 3), 2) could be matched by
20674 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20675 // Bits [5:4] of the constant are the destination select. This is the
20676 // value of the incoming immediate.
20677 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20678 // combine either bitwise AND or insert of float 0.0 to set these bits.
20679
20680 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20681 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20682 // If this is an insertion of 32-bits into the low 32-bits of
20683 // a vector, we prefer to generate a blend with immediate rather
20684 // than an insertps. Blends are simpler operations in hardware and so
20685 // will always have equal or better performance than insertps.
20686 // But if optimizing for size and there's a load folding opportunity,
20687 // generate insertps because blendps does not have a 32-bit memory
20688 // operand form.
20689 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20690 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20691 DAG.getTargetConstant(1, dl, MVT::i8));
20692 }
20693 // Create this as a scalar-to-vector.
20694 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20695 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20696 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20697 }
20698
20699 // PINSR* works with constant index.
20700 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20701 return Op;
20702 }
20703
20704 return SDValue();
20705}
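
The variable-index branch above ("inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0") has a simple scalar model, sketched here with a hypothetical insertVariableIdx helper: every lane compares its own position against the splatted index and keeps either the splatted new element or its old value.

    #include <vector>

    std::vector<int> insertVariableIdx(std::vector<int> V, int Elt, unsigned Idx) {
      for (unsigned i = 0; i != V.size(); ++i)
        V[i] = (i == Idx) ? Elt : V[i]; // SplatN2 == {0,1,2,...} ? SplatN1 : N0
      return V;
    }
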
20706
20707static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20708 SelectionDAG &DAG) {
20709 SDLoc dl(Op);
20710 MVT OpVT = Op.getSimpleValueType();
20711
20712 // It's always cheaper to replace a xor+movd with xorps, and doing so simplifies
20713 // further combines.
20714 if (X86::isZeroNode(Op.getOperand(0)))
20715 return getZeroVector(OpVT, Subtarget, DAG, dl);
20716
20717 // If this is a 256-bit vector result, first insert into a 128-bit
20718 // vector and then insert into the 256-bit vector.
20719 if (!OpVT.is128BitVector()) {
20720 // Insert into a 128-bit vector.
20721 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20722 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20723 OpVT.getVectorNumElements() / SizeFactor);
20724
20725 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20726
20727 // Insert the 128-bit vector.
20728 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20729 }
20730 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20731        "Expected an SSE type!");
20732
20733 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20734 // tblgen.
20735 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20736 return Op;
20737
20738 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20739 return DAG.getBitcast(
20740 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20741}
20742
20743// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20744// simple superregister reference or explicit instructions to insert
20745// the upper bits of a vector.
20746static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20747 SelectionDAG &DAG) {
20748 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20749
20750 return insert1BitVector(Op, DAG, Subtarget);
20751}
20752
20753static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20754 SelectionDAG &DAG) {
20755 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20756        "Only vXi1 extract_subvectors need custom lowering");
20757
20758 SDLoc dl(Op);
20759 SDValue Vec = Op.getOperand(0);
20760 uint64_t IdxVal = Op.getConstantOperandVal(1);
20761
20762 if (IdxVal == 0) // the operation is legal
20763 return Op;
20764
20765 MVT VecVT = Vec.getSimpleValueType();
20766 unsigned NumElems = VecVT.getVectorNumElements();
20767
20768 // Extend to natively supported kshift.
20769 MVT WideVecVT = VecVT;
20770 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20771 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20772 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20773 DAG.getUNDEF(WideVecVT), Vec,
20774 DAG.getIntPtrConstant(0, dl));
20775 }
20776
20777 // Shift to the LSB.
20778 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20779 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20780
20781 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20782 DAG.getIntPtrConstant(0, dl));
20783}
20784
20785// Returns the appropriate wrapper opcode for a global reference.
20786unsigned X86TargetLowering::getGlobalWrapperKind(
20787 const GlobalValue *GV, const unsigned char OpFlags) const {
20788 // References to absolute symbols are never PC-relative.
20789 if (GV && GV->isAbsoluteSymbolRef())
20790 return X86ISD::Wrapper;
20791
20792 CodeModel::Model M = getTargetMachine().getCodeModel();
20793 if (Subtarget.isPICStyleRIPRel() &&
20794 (M == CodeModel::Small || M == CodeModel::Kernel))
20795 return X86ISD::WrapperRIP;
20796
20797 // In the medium model, functions can always be referenced RIP-relatively,
20798 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20799 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
20800 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20801 return X86ISD::WrapperRIP;
20802
20803 // GOTPCREL references must always use RIP.
20804 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20805 return X86ISD::WrapperRIP;
20806
20807 return X86ISD::Wrapper;
20808}
20809
20810// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20811// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
20812 // one of the above-mentioned nodes. It has to be wrapped because otherwise
20813 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20814 // be used to form an addressing mode. These wrapped nodes will be selected
20815// into MOV32ri.
20816SDValue
20817X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20818 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20819
20820 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20821 // global base reg.
20822 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20823
20824 auto PtrVT = getPointerTy(DAG.getDataLayout());
20825 SDValue Result = DAG.getTargetConstantPool(
20826 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20827 SDLoc DL(CP);
20828 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20829 // With PIC, the address is actually $g + Offset.
20830 if (OpFlag) {
20831 Result =
20832 DAG.getNode(ISD::ADD, DL, PtrVT,
20833 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20834 }
20835
20836 return Result;
20837}
20838
20839SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20840 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20841
20842 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20843 // global base reg.
20844 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20845
20846 auto PtrVT = getPointerTy(DAG.getDataLayout());
20847 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20848 SDLoc DL(JT);
20849 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20850
20851 // With PIC, the address is actually $g + Offset.
20852 if (OpFlag)
20853 Result =
20854 DAG.getNode(ISD::ADD, DL, PtrVT,
20855 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20856
20857 return Result;
20858}
20859
20860SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20861 SelectionDAG &DAG) const {
20862 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20863}
20864
20865SDValue
20866X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20867 // Create the TargetBlockAddressAddress node.
20868 unsigned char OpFlags =
20869 Subtarget.classifyBlockAddressReference();
20870 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20871 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20872 SDLoc dl(Op);
20873 auto PtrVT = getPointerTy(DAG.getDataLayout());
20874 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20875 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20876
20877 // With PIC, the address is actually $g + Offset.
20878 if (isGlobalRelativeToPICBase(OpFlags)) {
20879 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20880 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20881 }
20882
20883 return Result;
20884}
20885
20886/// Creates target global address or external symbol nodes for calls or
20887/// other uses.
20888SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20889 bool ForCall) const {
20890 // Unpack the global address or external symbol.
20891 const SDLoc &dl = SDLoc(Op);
20892 const GlobalValue *GV = nullptr;
20893 int64_t Offset = 0;
20894 const char *ExternalSym = nullptr;
20895 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20896 GV = G->getGlobal();
20897 Offset = G->getOffset();
20898 } else {
20899 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20900 ExternalSym = ES->getSymbol();
20901 }
20902
20903 // Calculate some flags for address lowering.
20904 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20905 unsigned char OpFlags;
20906 if (ForCall)
20907 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20908 else
20909 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20910 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20911 bool NeedsLoad = isGlobalStubReference(OpFlags);
20912
20913 CodeModel::Model M = DAG.getTarget().getCodeModel();
20914 auto PtrVT = getPointerTy(DAG.getDataLayout());
20915 SDValue Result;
20916
20917 if (GV) {
20918 // Create a target global address if this is a global. If possible, fold the
20919 // offset into the global address reference. Otherwise, ADD it on later.
20920 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20921 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20922 // relocation will compute to a negative value, which is invalid.
20923 int64_t GlobalOffset = 0;
20924 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20925 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20926 std::swap(GlobalOffset, Offset);
20927 }
20928 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20929 } else {
20930 // If this is not a global address, this must be an external symbol.
20931 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20932 }
20933
20934 // If this is a direct call, avoid the wrapper if we don't need to do any
20935 // loads or adds. This allows SDAG ISel to match direct calls.
20936 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20937 return Result;
20938
20939 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20940
20941 // With PIC, the address is actually $g + Offset.
20942 if (HasPICReg) {
20943 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20944 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20945 }
20946
20947 // For globals that require a load from a stub to get the address, emit the
20948 // load.
20949 if (NeedsLoad)
20950 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20951 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20952
20953 // If there was a non-zero offset that we didn't fold, create an explicit
20954 // addition for it.
20955 if (Offset != 0)
20956 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20957 DAG.getConstant(Offset, dl, PtrVT));
20958
20959 return Result;
20960}
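
A minimal sketch of the offset-folding decision made above (splitGlobalOffset and FitsCodeModel are hypothetical names standing in for the X86::isOffsetSuitableForCodeModel check): non-negative offsets that the code model can encode are folded into the TargetGlobalAddress, and anything else becomes an explicit ADD afterwards.

    #include <cstdint>
    #include <utility>

    std::pair<int64_t, int64_t> splitGlobalOffset(int64_t Offset, bool FitsCodeModel) {
      int64_t Folded = 0;
      if (Offset >= 0 && FitsCodeModel)
        std::swap(Folded, Offset); // fold into the address node
      return {Folded, Offset};     // the second part is added explicitly
    }
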
20961
20962SDValue
20963X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20964 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20965}
20966
20967static SDValue
20968GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20969 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
20970 unsigned char OperandFlags, bool LocalDynamic = false) {
20971 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20972 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20973 SDLoc dl(GA);
20974 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20975 GA->getValueType(0),
20976 GA->getOffset(),
20977 OperandFlags);
20978
20979 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20980 : X86ISD::TLSADDR;
20981
20982 if (InGlue) {
20983 SDValue Ops[] = { Chain, TGA, *InGlue };
20984 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20985 } else {
20986 SDValue Ops[] = { Chain, TGA };
20987 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20988 }
20989
20990 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
20991 MFI.setAdjustsStack(true);
20992 MFI.setHasCalls(true);
20993
20994 SDValue Glue = Chain.getValue(1);
20995 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
20996}
20997
20998// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20999static SDValue
21000LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21001 const EVT PtrVT) {
21002 SDValue InGlue;
21003 SDLoc dl(GA); // ? function entry point might be better
21004 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21005 DAG.getNode(X86ISD::GlobalBaseReg,
21006 SDLoc(), PtrVT), InGlue);
21007 InGlue = Chain.getValue(1);
21008
21009 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
21010}
21011
21012// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
21013static SDValue
21014LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21015 const EVT PtrVT) {
21016 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21017 X86::RAX, X86II::MO_TLSGD);
21018}
21019
21020// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
21021static SDValue
21022LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21023 const EVT PtrVT) {
21024 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21025 X86::EAX, X86II::MO_TLSGD);
21026}
21027
21028static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
21029 SelectionDAG &DAG, const EVT PtrVT,
21030 bool Is64Bit, bool Is64BitLP64) {
21031 SDLoc dl(GA);
21032
21033 // Get the start address of the TLS block for this module.
21034 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
21035 .getInfo<X86MachineFunctionInfo>();
21036 MFI->incNumLocalDynamicTLSAccesses();
21037
21038 SDValue Base;
21039 if (Is64Bit) {
21040 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
21041 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
21042 X86II::MO_TLSLD, /*LocalDynamic=*/true);
21043 } else {
21044 SDValue InGlue;
21045 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21046 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
21047 InGlue = Chain.getValue(1);
21048 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
21049 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
21050 }
21051
21052 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
21053 // of Base.
21054
21055 // Build x@dtpoff.
21056 unsigned char OperandFlags = X86II::MO_DTPOFF;
21057 unsigned WrapperKind = X86ISD::Wrapper;
21058 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21059 GA->getValueType(0),
21060 GA->getOffset(), OperandFlags);
21061 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21062
21063 // Add x@dtpoff with the base.
21064 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
21065}
21066
21067// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
21068static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21069 const EVT PtrVT, TLSModel::Model model,
21070 bool is64Bit, bool isPIC) {
21071 SDLoc dl(GA);
21072
21073 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
21074 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
21075 is64Bit ? 257 : 256));
21076
21077 SDValue ThreadPointer =
21078 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
21079 MachinePointerInfo(Ptr));
21080
21081 unsigned char OperandFlags = 0;
21082 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
21083 // initialexec.
21084 unsigned WrapperKind = X86ISD::Wrapper;
21085 if (model == TLSModel::LocalExec) {
21086 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
21087 } else if (model == TLSModel::InitialExec) {
21088 if (is64Bit) {
21089 OperandFlags = X86II::MO_GOTTPOFF;
21090 WrapperKind = X86ISD::WrapperRIP;
21091 } else {
21092 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
21093 }
21094 } else {
21095 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21095)
;
21096 }
21097
21098 // emit "addl x@ntpoff,%eax" (local exec)
21099 // or "addl x@indntpoff,%eax" (initial exec)
21100 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
21101 SDValue TGA =
21102 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
21103 GA->getOffset(), OperandFlags);
21104 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21105
21106 if (model == TLSModel::InitialExec) {
21107 if (isPIC && !is64Bit) {
21108 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
21109 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21110 Offset);
21111 }
21112
21113 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
21114 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21115 }
21116
21117 // The address of the thread local variable is the add of the thread
21118 // pointer with the offset of the variable.
21119 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
21120}
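
Conceptually (a simplification, not real codegen), the exec TLS models above compute the address as the thread pointer plus a per-variable offset that the linker or dynamic loader resolves (@tpoff/@ntpoff/@gottpoff):

    #include <cstdint>

    uint64_t tlsExecAddress(uint64_t ThreadPointer, int64_t VarOffset) {
      return ThreadPointer + (uint64_t)VarOffset; // ADD of %fs:0 / %gs:0 and the offset
    }
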
21121
21122SDValue
21123X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
21124
21125 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
21126
21127 if (DAG.getTarget().useEmulatedTLS())
21128 return LowerToTLSEmulatedModel(GA, DAG);
21129
21130 const GlobalValue *GV = GA->getGlobal();
21131 auto PtrVT = getPointerTy(DAG.getDataLayout());
21132 bool PositionIndependent = isPositionIndependent();
21133
21134 if (Subtarget.isTargetELF()) {
21135 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
21136 switch (model) {
21137 case TLSModel::GeneralDynamic:
21138 if (Subtarget.is64Bit()) {
21139 if (Subtarget.isTarget64BitLP64())
21140 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
21141 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
21142 }
21143 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
21144 case TLSModel::LocalDynamic:
21145 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
21146 Subtarget.isTarget64BitLP64());
21147 case TLSModel::InitialExec:
21148 case TLSModel::LocalExec:
21149 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
21150 PositionIndependent);
21151 }
21152 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21152)
;
21153 }
21154
21155 if (Subtarget.isTargetDarwin()) {
21156 // Darwin only has one model of TLS. Lower to that.
21157 unsigned char OpFlag = 0;
21158 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21159 X86ISD::WrapperRIP : X86ISD::Wrapper;
21160
21161 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21162 // global base reg.
21163 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21164 if (PIC32)
21165 OpFlag = X86II::MO_TLVP_PIC_BASE;
21166 else
21167 OpFlag = X86II::MO_TLVP;
21168 SDLoc DL(Op);
21169 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21170 GA->getValueType(0),
21171 GA->getOffset(), OpFlag);
21172 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21173
21174 // With PIC32, the address is actually $g + Offset.
21175 if (PIC32)
21176 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21177 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21178 Offset);
21179
21180 // Lowering the machine isd will make sure everything is in the right
21181 // location.
21182 SDValue Chain = DAG.getEntryNode();
21183 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21184 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21185 SDValue Args[] = { Chain, Offset };
21186 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21187 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21188
21189 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
21190 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21191 MFI.setAdjustsStack(true);
21192
21193 // And our return value (tls address) is in the standard call return value
21194 // location.
21195 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21196 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21197 }
21198
21199 if (Subtarget.isOSWindows()) {
21200 // Just use the implicit TLS architecture
21201 // Need to generate something similar to:
21202 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21203 // ; from TEB
21204 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
21205 // mov rcx, qword [rdx+rcx*8]
21206 // mov eax, .tls$:tlsvar
21207 // [rax+rcx] contains the address
21208 // Windows 64bit: gs:0x58
21209 // Windows 32bit: fs:__tls_array
21210
21211 SDLoc dl(GA);
21212 SDValue Chain = DAG.getEntryNode();
21213
21214 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21215 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21216 // use its literal value of 0x2C.
21217 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21218 ? Type::getInt8PtrTy(*DAG.getContext(),
21219 256)
21220 : Type::getInt32PtrTy(*DAG.getContext(),
21221 257));
21222
21223 SDValue TlsArray = Subtarget.is64Bit()
21224 ? DAG.getIntPtrConstant(0x58, dl)
21225 : (Subtarget.isTargetWindowsGNU()
21226 ? DAG.getIntPtrConstant(0x2C, dl)
21227 : DAG.getExternalSymbol("_tls_array", PtrVT));
21228
21229 SDValue ThreadPointer =
21230 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21231
21232 SDValue res;
21233 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21234 res = ThreadPointer;
21235 } else {
21236 // Load the _tls_index variable
21237 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21238 if (Subtarget.is64Bit())
21239 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21240 MachinePointerInfo(), MVT::i32);
21241 else
21242 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21243
21244 const DataLayout &DL = DAG.getDataLayout();
21245 SDValue Scale =
21246 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21247 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21248
21249 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21250 }
21251
21252 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21253
21254 // Get the offset of start of .tls section
21255 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21256 GA->getValueType(0),
21257 GA->getOffset(), X86II::MO_SECREL);
21258 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21259
21260 // The address of the thread local variable is the add of the thread
21261 // pointer with the offset of the variable.
21262 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21263 }
21264
21265 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 21265)
;
21266}
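
The Windows implicit-TLS sequence above boils down to the following pointer arithmetic (winTlsAddress is a hypothetical illustration; TlsArray stands for the pointer loaded from gs:0x58 or fs:__tls_array):

    #include <cstdint>

    char *winTlsAddress(char **TlsArray, uint32_t TlsIndex, uint64_t SecRelOffset) {
      char *ModuleTlsBlock = TlsArray[TlsIndex]; // mov rcx, qword [rdx+rcx*8]
      return ModuleTlsBlock + SecRelOffset;      // [rax+rcx] is the variable's address
    }
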
21267
21268/// Lower SRA_PARTS and friends, which return two i32 values
21269/// and take a 2 x i32 value to shift plus a shift amount.
21270/// TODO: Can this be moved to general expansion code?
21271static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21272 SDValue Lo, Hi;
21273 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21274 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21275}
21276
21277// Try to use a packed vector operation to handle i64 on 32-bit targets when
21278// AVX512DQ is enabled.
21279static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21280 const X86Subtarget &Subtarget) {
21281 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21282         Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21283         Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21284         Op.getOpcode() == ISD::UINT_TO_FP) &&
21285        "Unexpected opcode!");
21286 bool IsStrict = Op->isStrictFPOpcode();
21287 unsigned OpNo = IsStrict ? 1 : 0;
21288 SDValue Src = Op.getOperand(OpNo);
21289 MVT SrcVT = Src.getSimpleValueType();
21290 MVT VT = Op.getSimpleValueType();
21291
21292 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21293 (VT != MVT::f32 && VT != MVT::f64))
21294 return SDValue();
21295
21296 // Pack the i64 into a vector, do the operation and extract.
21297
21298 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
21299 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21300 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21301 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21302
21303 SDLoc dl(Op);
21304 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21305 if (IsStrict) {
21306 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21307 {Op.getOperand(0), InVec});
21308 SDValue Chain = CvtVec.getValue(1);
21309 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21310 DAG.getIntPtrConstant(0, dl));
21311 return DAG.getMergeValues({Value, Chain}, dl);
21312 }
21313
21314 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21315
21316 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21317 DAG.getIntPtrConstant(0, dl));
21318}
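
A scalar analogue of the pack-convert-extract pattern above (cvtViaVector is a hypothetical name; the real lowering emits a packed AVX512DQ conversion rather than a loop): the i64 is placed in lane 0 of a wider vector, the whole vector is converted, and lane 0 of the result is read back.

    #include <array>
    #include <cstdint>

    double cvtViaVector(int64_t Src) {
      std::array<int64_t, 4> InVec{Src, 0, 0, 0}; // SCALAR_TO_VECTOR
      std::array<double, 4> CvtVec{};
      for (int i = 0; i != 4; ++i)
        CvtVec[i] = (double)InVec[i];             // packed SINT_TO_FP
      return CvtVec[0];                           // EXTRACT_VECTOR_ELT lane 0
    }
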
21319
21320// Try to use a packed vector operation to handle i64 on 32-bit targets.
21321static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21322 const X86Subtarget &Subtarget) {
21323 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21324         Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21325         Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21326         Op.getOpcode() == ISD::UINT_TO_FP) &&
21327        "Unexpected opcode!");
21328 bool IsStrict = Op->isStrictFPOpcode();
21329 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21330 MVT SrcVT = Src.getSimpleValueType();
21331 MVT VT = Op.getSimpleValueType();
21332
21333 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21334 return SDValue();
21335
21336 // Pack the i64 into a vector, do the operation and extract.
21337
21338 assert(Subtarget.hasFP16() && "Expected FP16");
21339
21340 SDLoc dl(Op);
21341 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21342 if (IsStrict) {
21343 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21344 {Op.getOperand(0), InVec});
21345 SDValue Chain = CvtVec.getValue(1);
21346 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21347 DAG.getIntPtrConstant(0, dl));
21348 return DAG.getMergeValues({Value, Chain}, dl);
21349 }
21350
21351 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21352
21353 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21354 DAG.getIntPtrConstant(0, dl));
21355}
21356
21357static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21358 const X86Subtarget &Subtarget) {
21359 switch (Opcode) {
21360 case ISD::SINT_TO_FP:
21361 // TODO: Handle wider types with AVX/AVX512.
21362 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21363 return false;
21364 // CVTDQ2PS or (V)CVTDQ2PD
21365 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21366
21367 case ISD::UINT_TO_FP:
21368 // TODO: Handle wider types and i64 elements.
21369 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21370 return false;
21371 // VCVTUDQ2PS or VCVTUDQ2PD
21372 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21373
21374 default:
21375 return false;
21376 }
21377}
21378
21379/// Given a scalar cast operation that is extracted from a vector, try to
21380/// vectorize the cast op followed by extraction. This will avoid an expensive
21381/// round-trip between XMM and GPR.
21382static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21383 const X86Subtarget &Subtarget) {
21384 // TODO: This could be enhanced to handle smaller integer types by peeking
21385 // through an extend.
21386 SDValue Extract = Cast.getOperand(0);
21387 MVT DestVT = Cast.getSimpleValueType();
21388 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21389 !isa<ConstantSDNode>(Extract.getOperand(1)))
21390 return SDValue();
21391
21392 // See if we have a 128-bit vector cast op for this type of cast.
21393 SDValue VecOp = Extract.getOperand(0);
21394 MVT FromVT = VecOp.getSimpleValueType();
21395 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21396 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21397 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21398 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21399 return SDValue();
21400
21401 // If we are extracting from a non-zero element, first shuffle the source
21402 // vector to allow extracting from element zero.
21403 SDLoc DL(Cast);
21404 if (!isNullConstant(Extract.getOperand(1))) {
21405 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21406 Mask[0] = Extract.getConstantOperandVal(1);
21407 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21408 }
21409 // If the source vector is wider than 128-bits, extract the low part. Do not
21410 // create an unnecessarily wide vector cast op.
21411 if (FromVT != Vec128VT)
21412 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21413
21414 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21415 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21416 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21417 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21418 DAG.getIntPtrConstant(0, DL));
21419}
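
A minimal C++ sketch of the source-level pattern this transform targets, assuming Clang/GCC vector extensions; the helper name is hypothetical and this is not part of the LLVM sources:

    typedef int v4si __attribute__((vector_size(16)));

    double lane_to_double(v4si v) {
      // Naively, v[2] moves to a GPR and is converted there. The lowering above
      // instead shuffles lane 2 to lane 0, converts the whole 128-bit vector
      // (e.g. cvtdq2pd), and extracts element 0 of the vector result.
      return (double)v[2];
    }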
21420
21421/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21422/// try to vectorize the cast ops. This will avoid an expensive round-trip
21423/// between XMM and GPR.
21424static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21425 const X86Subtarget &Subtarget) {
21426 // TODO: Allow FP_TO_UINT.
21427 SDValue CastToInt = CastToFP.getOperand(0);
21428 MVT VT = CastToFP.getSimpleValueType();
21429 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21430 return SDValue();
21431
21432 MVT IntVT = CastToInt.getSimpleValueType();
21433 SDValue X = CastToInt.getOperand(0);
21434 MVT SrcVT = X.getSimpleValueType();
21435 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21436 return SDValue();
21437
21438 // See if we have 128-bit vector cast instructions for this type of cast.
21439 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21440 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21441 IntVT != MVT::i32)
21442 return SDValue();
21443
21444 unsigned SrcSize = SrcVT.getSizeInBits();
21445 unsigned IntSize = IntVT.getSizeInBits();
21446 unsigned VTSize = VT.getSizeInBits();
21447 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21448 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21449 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21450
21451 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21452 unsigned ToIntOpcode =
21453 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21454 unsigned ToFPOpcode =
21455 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21456
21457 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21458 //
21459 // We are not defining the high elements (for example, zero them) because
21460 // that could nullify any performance advantage that we hoped to gain from
21461 // this vector op hack. We do not expect any adverse effects (like denorm
21462 // penalties) with cast ops.
21463 SDLoc DL(CastToFP);
21464 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21465 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21466 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21467 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21468 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21469}
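
A hedged scalar illustration of the pattern this routine vectorizes (hypothetical helper name): a float-to-int-to-float round trip, which truncates toward zero for in-range inputs and which the transform keeps entirely in XMM via cvttps2dq/cvttpd2dq followed by cvtdq2ps/cvtdq2pd:

    float trunc_via_int(float x) {
      return (float)(int)x;  // sint_to_fp (fp_to_sint x), kept in XMM after the transform
    }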
21470
21471static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21472 const X86Subtarget &Subtarget) {
21473 SDLoc DL(Op);
21474 bool IsStrict = Op->isStrictFPOpcode();
21475 MVT VT = Op->getSimpleValueType(0);
21476 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21477
21478 if (Subtarget.hasDQI()) {
21479 assert(!Subtarget.hasVLX() && "Unexpected features");
21480
21481 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21482 Src.getSimpleValueType() == MVT::v4i64) &&
21483 "Unsupported custom type");
21484
21485 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21486 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21487 "Unexpected VT!");
21488 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21489
21490 // Need to concat with zero vector for strict fp to avoid spurious
21491 // exceptions.
21492 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21493 : DAG.getUNDEF(MVT::v8i64);
21494 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21495 DAG.getIntPtrConstant(0, DL));
21496 SDValue Res, Chain;
21497 if (IsStrict) {
21498 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21499 {Op->getOperand(0), Src});
21500 Chain = Res.getValue(1);
21501 } else {
21502 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21503 }
21504
21505 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21506 DAG.getIntPtrConstant(0, DL));
21507
21508 if (IsStrict)
21509 return DAG.getMergeValues({Res, Chain}, DL);
21510 return Res;
21511 }
21512
21513 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21514 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21515 if (VT != MVT::v4f32 || IsSigned)
21516 return SDValue();
21517
21518 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21519 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21520 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21521 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21522 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21523 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21524 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21525 SmallVector<SDValue, 4> SignCvts(4);
21526 SmallVector<SDValue, 4> Chains(4);
21527 for (int i = 0; i != 4; ++i) {
21528 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21529 DAG.getIntPtrConstant(i, DL));
21530 if (IsStrict) {
21531 SignCvts[i] =
21532 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21533 {Op.getOperand(0), Elt});
21534 Chains[i] = SignCvts[i].getValue(1);
21535 } else {
21536 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21537 }
21538 }
21539 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21540
21541 SDValue Slow, Chain;
21542 if (IsStrict) {
21543 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21544 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21545 {Chain, SignCvt, SignCvt});
21546 Chain = Slow.getValue(1);
21547 } else {
21548 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21549 }
21550
21551 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21552 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21553
21554 if (IsStrict)
21555 return DAG.getMergeValues({Cvt, Chain}, DL);
21556
21557 return Cvt;
21558}
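
A minimal scalar model of the unsigned v4i64-to-v4f32 path above, assuming round-to-nearest and C++ with <cstdint>; it illustrates the halving trick, not the DAG code itself:

    #include <cstdint>

    float u64_to_f32_halving(uint64_t x) {
      if ((int64_t)x >= 0)
        return (float)(int64_t)x;          // value fits in the signed range
      uint64_t half = (x >> 1) | (x & 1);  // Sign = (Src >> 1) | (Src & 1), low bit kept sticky
      float f = (float)(int64_t)half;      // signed conversion of the halved value
      return f + f;                        // the FADD of SignCvt with itself doubles it back
    }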
21559
21560static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21561 bool IsStrict = Op->isStrictFPOpcode();
21562 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21563 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21564 MVT VT = Op.getSimpleValueType();
21565 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21566 SDLoc dl(Op);
21567
21568 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21569 if (IsStrict)
21570 return DAG.getNode(
21571 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21572 {Chain,
21573 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21574 Rnd});
21575 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21576 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21577}
21578
21579static bool isLegalConversion(MVT VT, bool IsSigned,
21580 const X86Subtarget &Subtarget) {
21581 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21582 return true;
21583 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21584 return true;
21585 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21586 return true;
21587 if (Subtarget.useAVX512Regs()) {
21588 if (VT == MVT::v16i32)
21589 return true;
21590 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21591 return true;
21592 }
21593 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21594 (VT == MVT::v2i64 || VT == MVT::v4i64))
21595 return true;
21596 return false;
21597}
21598
21599SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21600 SelectionDAG &DAG) const {
21601 bool IsStrict = Op->isStrictFPOpcode();
21602 unsigned OpNo = IsStrict ? 1 : 0;
21603 SDValue Src = Op.getOperand(OpNo);
21604 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21605 MVT SrcVT = Src.getSimpleValueType();
21606 MVT VT = Op.getSimpleValueType();
21607 SDLoc dl(Op);
21608
21609 if (isSoftFP16(VT))
21610 return promoteXINT_TO_FP(Op, DAG);
21611 else if (isLegalConversion(SrcVT, true, Subtarget))
21612 return Op;
21613
21614 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21615 return LowerWin64_INT128_TO_FP(Op, DAG);
21616
21617 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21618 return Extract;
21619
21620 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21621 return R;
21622
21623 if (SrcVT.isVector()) {
21624 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21625 // Note: Since v2f64 is a legal type. We don't need to zero extend the
21626 // source for strict FP.
21627 if (IsStrict)
21628 return DAG.getNode(
21629 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21630 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21631 DAG.getUNDEF(SrcVT))});
21632 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21633 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21634 DAG.getUNDEF(SrcVT)));
21635 }
21636 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21637 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21638
21639 return SDValue();
21640 }
21641
21642 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21643 "Unknown SINT_TO_FP to lower!");
21644
21645 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21646
21647 // These are really Legal; return the operand so the caller accepts it as
21648 // Legal.
21649 if (SrcVT == MVT::i32 && UseSSEReg)
21650 return Op;
21651 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21652 return Op;
21653
21654 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21655 return V;
21656 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21657 return V;
21658
21659 // SSE doesn't have an i16 conversion so we need to promote.
21660 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21661 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21662 if (IsStrict)
21663 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21664 {Chain, Ext});
21665
21666 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21667 }
21668
21669 if (VT == MVT::f128 || !Subtarget.hasX87())
21670 return SDValue();
21671
21672 SDValue ValueToStore = Src;
21673 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21674 // Bitcasting to f64 here allows us to do a single 64-bit store from
21675 // an SSE register, avoiding the store forwarding penalty that would come
21676 // with two 32-bit stores.
21677 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21678
21679 unsigned Size = SrcVT.getStoreSize();
21680 Align Alignment(Size);
21681 MachineFunction &MF = DAG.getMachineFunction();
21682 auto PtrVT = getPointerTy(MF.getDataLayout());
21683 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21684 MachinePointerInfo MPI =
21685 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21686 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21687 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21688 std::pair<SDValue, SDValue> Tmp =
21689 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21690
21691 if (IsStrict)
21692 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21693
21694 return Tmp.first;
21695}
21696
21697std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21698 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21699 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21700 // Build the FILD
21701 SDVTList Tys;
21702 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21703 if (useSSE)
21704 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21705 else
21706 Tys = DAG.getVTList(DstVT, MVT::Other);
21707
21708 SDValue FILDOps[] = {Chain, Pointer};
21709 SDValue Result =
21710 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21711 Alignment, MachineMemOperand::MOLoad);
21712 Chain = Result.getValue(1);
21713
21714 if (useSSE) {
21715 MachineFunction &MF = DAG.getMachineFunction();
21716 unsigned SSFISize = DstVT.getStoreSize();
21717 int SSFI =
21718 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21719 auto PtrVT = getPointerTy(MF.getDataLayout());
21720 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21721 Tys = DAG.getVTList(MVT::Other);
21722 SDValue FSTOps[] = {Chain, Result, StackSlot};
21723 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21724 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21725 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21726
21727 Chain =
21728 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21729 Result = DAG.getLoad(
21730 DstVT, DL, Chain, StackSlot,
21731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21732 Chain = Result.getValue(1);
21733 }
21734
21735 return { Result, Chain };
21736}
21737
21738/// Horizontal vector math instructions may be slower than normal math with
21739/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21740/// implementation, and likely shuffle complexity of the alternate sequence.
21741static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21742 const X86Subtarget &Subtarget) {
21743 bool IsOptimizingSize = DAG.shouldOptForSize();
21744 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21745 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21746}
21747
21748/// 64-bit unsigned integer to double expansion.
21749static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21750 const X86Subtarget &Subtarget) {
21751 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
21752 // when converting 0 while rounding toward negative infinity. The caller will
21753 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
21754 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21755 // This algorithm is not obvious. Here it is what we're trying to output:
21756 /*
21757 movq %rax, %xmm0
21758 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21759 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21760 #ifdef __SSE3__
21761 haddpd %xmm0, %xmm0
21762 #else
21763 pshufd $0x4e, %xmm0, %xmm1
21764 addpd %xmm1, %xmm0
21765 #endif
21766 */
21767
21768 SDLoc dl(Op);
21769 LLVMContext *Context = DAG.getContext();
21770
21771 // Build some magic constants.
21772 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21773 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21774 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21775 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21776
21777 SmallVector<Constant*,2> CV1;
21778 CV1.push_back(
21779 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21780 APInt(64, 0x4330000000000000ULL))));
21781 CV1.push_back(
21782 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21783 APInt(64, 0x4530000000000000ULL))));
21784 Constant *C1 = ConstantVector::get(CV1);
21785 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21786
21787 // Load the 64-bit value into an XMM register.
21788 SDValue XR1 =
21789 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21790 SDValue CLod0 = DAG.getLoad(
21791 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21792 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21793 SDValue Unpck1 =
21794 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21795
21796 SDValue CLod1 = DAG.getLoad(
21797 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21798 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21799 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21800 // TODO: Are there any fast-math-flags to propagate here?
21801 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21802 SDValue Result;
21803
21804 if (Subtarget.hasSSE3() &&
21805 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21806 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21807 } else {
21808 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21809 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21810 }
21811 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21812 DAG.getIntPtrConstant(0, dl));
21813 return Result;
21814}
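
A scalar sketch of the magic-constant trick described in the assembly comment above, assuming C++20 std::bit_cast; it models the punpckldq/subpd/haddpd sequence, with a single rounding in the final add:

    #include <bit>
    #include <cstdint>

    double u64_to_f64_magic(uint64_t x) {
      // 0x4330... is 2^52; OR-ing the low 32 bits into its mantissa is exact.
      double lo = std::bit_cast<double>(0x4330000000000000ULL | (x & 0xffffffffULL));
      // 0x4530... is 2^84; OR-ing the high 32 bits gives exactly 2^84 + hi * 2^32.
      double hi = std::bit_cast<double>(0x4530000000000000ULL | (x >> 32));
      // Subtract the two constants (the subpd) and sum the halves (the haddpd).
      return (lo - 0x1.0p52) + (hi - 0x1.0p84);
    }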
21815
21816/// 32-bit unsigned integer to float expansion.
21817static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21818 const X86Subtarget &Subtarget) {
21819 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21820 SDLoc dl(Op);
21821 // FP constant to bias correct the final result.
21822 SDValue Bias = DAG.getConstantFP(
21823 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21824
21825 // Load the 32-bit value into an XMM register.
21826 SDValue Load =
21827 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21828
21829 // Zero out the upper parts of the register.
21830 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21831
21832 // Or the load with the bias.
21833 SDValue Or = DAG.getNode(
21834 ISD::OR, dl, MVT::v2i64,
21835 DAG.getBitcast(MVT::v2i64, Load),
21836 DAG.getBitcast(MVT::v2i64,
21837 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21838 Or =
21839 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21840 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21841
21842 if (Op.getNode()->isStrictFPOpcode()) {
21843 // Subtract the bias.
21844 // TODO: Are there any fast-math-flags to propagate here?
21845 SDValue Chain = Op.getOperand(0);
21846 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21847 {Chain, Or, Bias});
21848
21849 if (Op.getValueType() == Sub.getValueType())
21850 return Sub;
21851
21852 // Handle final rounding.
21853 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21854 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21855
21856 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21857 }
21858
21859 // Subtract the bias.
21860 // TODO: Are there any fast-math-flags to propagate here?
21861 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21862
21863 // Handle final rounding.
21864 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21865}
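
A scalar model of the bias trick used above, assuming C++20 std::bit_cast; not the SelectionDAG code itself:

    #include <bit>
    #include <cstdint>

    double u32_to_f64_bias(uint32_t x) {
      // OR x into the mantissa of 2^52: the double is then exactly 2^52 + x.
      double biased = std::bit_cast<double>(0x4330000000000000ULL | (uint64_t)x);
      return biased - 0x1.0p52;  // exact subtraction leaves just the integer value
    }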
21866
21867static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21868 const X86Subtarget &Subtarget,
21869 const SDLoc &DL) {
21870 if (Op.getSimpleValueType() != MVT::v2f64)
21871 return SDValue();
21872
21873 bool IsStrict = Op->isStrictFPOpcode();
21874
21875 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21876 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21877
21878 if (Subtarget.hasAVX512()) {
21879 if (!Subtarget.hasVLX()) {
21880 // Let generic type legalization widen this.
21881 if (!IsStrict)
21882 return SDValue();
21883 // Otherwise pad the integer input with 0s and widen the operation.
21884 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21885 DAG.getConstant(0, DL, MVT::v2i32));
21886 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21887 {Op.getOperand(0), N0});
21888 SDValue Chain = Res.getValue(1);
21889 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21890 DAG.getIntPtrConstant(0, DL));
21891 return DAG.getMergeValues({Res, Chain}, DL);
21892 }
21893
21894 // Legalize to v4i32 type.
21895 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21896 DAG.getUNDEF(MVT::v2i32));
21897 if (IsStrict)
21898 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21899 {Op.getOperand(0), N0});
21900 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21901 }
21902
21903 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21904 // This gives us the floating point equivalent of 2^52 + the i32 integer
21905 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21906 // point leaving just our i32 integers in double format.
21907 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21908 SDValue VBias = DAG.getConstantFP(
21909 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
21910 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21911 DAG.getBitcast(MVT::v2i64, VBias));
21912 Or = DAG.getBitcast(MVT::v2f64, Or);
21913
21914 if (IsStrict)
21915 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21916 {Op.getOperand(0), Or, VBias});
21917 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21918}
21919
21920static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21921 const X86Subtarget &Subtarget) {
21922 SDLoc DL(Op);
21923 bool IsStrict = Op->isStrictFPOpcode();
21924 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21925 MVT VecIntVT = V.getSimpleValueType();
21926 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21927 "Unsupported custom type");
21928
21929 if (Subtarget.hasAVX512()) {
21930 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21931 assert(!Subtarget.hasVLX() && "Unexpected features");
21932 MVT VT = Op->getSimpleValueType(0);
21933
21934 // v8i32->v8f64 is legal with AVX512 so just return it.
21935 if (VT == MVT::v8f64)
21936 return Op;
21937
21938 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21939 "Unexpected VT!");
21940 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21941 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21942 // Need to concat with zero vector for strict fp to avoid spurious
21943 // exceptions.
21944 SDValue Tmp =
21945 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21946 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21947 DAG.getIntPtrConstant(0, DL));
21948 SDValue Res, Chain;
21949 if (IsStrict) {
21950 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21951 {Op->getOperand(0), V});
21952 Chain = Res.getValue(1);
21953 } else {
21954 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21955 }
21956
21957 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21958 DAG.getIntPtrConstant(0, DL));
21959
21960 if (IsStrict)
21961 return DAG.getMergeValues({Res, Chain}, DL);
21962 return Res;
21963 }
21964
21965 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21966 Op->getSimpleValueType(0) == MVT::v4f64) {
21967 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21968 Constant *Bias = ConstantFP::get(
21969 *DAG.getContext(),
21970 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21971 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21972 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21973 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21974 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21975 SDValue VBias = DAG.getMemIntrinsicNode(
21976 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21977 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21978 MachineMemOperand::MOLoad);
21979
21980 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21981 DAG.getBitcast(MVT::v4i64, VBias));
21982 Or = DAG.getBitcast(MVT::v4f64, Or);
21983
21984 if (IsStrict)
21985 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21986 {Op.getOperand(0), Or, VBias});
21987 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21988 }
21989
21990 // The algorithm is the following:
21991 // #ifdef __SSE4_1__
21992 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21993 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21994 // (uint4) 0x53000000, 0xaa);
21995 // #else
21996 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21997 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21998 // #endif
21999 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22000 // return (float4) lo + fhi;
22001
22002 bool Is128 = VecIntVT == MVT::v4i32;
22003 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
22004 // If we convert to something else than the supported type, e.g., to v4f64,
22005 // abort early.
22006 if (VecFloatVT != Op->getSimpleValueType(0))
22007 return SDValue();
22008
22009 // In the #idef/#else code, we have in common:
22010 // - The vector of constants:
22011 // -- 0x4b000000
22012 // -- 0x53000000
22013 // - A shift:
22014 // -- v >> 16
22015
22016 // Create the splat vector for 0x4b000000.
22017 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
22018 // Create the splat vector for 0x53000000.
22019 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
22020
22021 // Create the right shift.
22022 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
22023 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
22024
22025 SDValue Low, High;
22026 if (Subtarget.hasSSE41()) {
22027 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
22028 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22029 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
22030 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
22031 // Low will be bitcasted right away, so do not bother bitcasting back to its
22032 // original type.
22033 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
22034 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22035 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22036 // (uint4) 0x53000000, 0xaa);
22037 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
22038 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
22039 // High will be bitcasted right away, so do not bother bitcasting back to
22040 // its original type.
22041 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
22042 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22043 } else {
22044 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
22045 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22046 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
22047 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
22048
22049 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22050 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
22051 }
22052
22053 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
22054 SDValue VecCstFSub = DAG.getConstantFP(
22055 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
22056
22057 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22058 // NOTE: By using fsub of a positive constant instead of fadd of a negative
22059 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
22060 // enabled. See PR24512.
22061 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
22062 // TODO: Are there any fast-math-flags to propagate here?
22063 // (float4) lo;
22064 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
22065 // return (float4) lo + fhi;
22066 if (IsStrict) {
22067 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
22068 {Op.getOperand(0), HighBitcast, VecCstFSub});
22069 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
22070 {FHigh.getValue(1), LowBitcast, FHigh});
22071 }
22072
22073 SDValue FHigh =
22074 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
22075 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
22076}
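
A scalar model of the lo/hi split in the algorithm comment above, assuming C++20 std::bit_cast; only the final add can round:

    #include <bit>
    #include <cstdint>

    float u32_to_f32_split(uint32_t v) {
      float lo  = std::bit_cast<float>(uint32_t(0x4b000000u | (v & 0xffffu))); // 2^23 + low 16 bits
      float hi  = std::bit_cast<float>(uint32_t(0x53000000u | (v >> 16)));     // 2^39 + high 16 bits * 2^16
      float fhi = hi - std::bit_cast<float>(uint32_t(0x53000080u));            // subtract 0x1.0p39f + 0x1.0p23f
      return lo + fhi;  // the 2^23 baked into lo cancels against fhi, leaving (float)v
    }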
22077
22078static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
22079 const X86Subtarget &Subtarget) {
22080 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22081 SDValue N0 = Op.getOperand(OpNo);
22082 MVT SrcVT = N0.getSimpleValueType();
22083 SDLoc dl(Op);
22084
22085 switch (SrcVT.SimpleTy) {
22086 default:
22087 llvm_unreachable("Custom UINT_TO_FP is not supported!");
22088 case MVT::v2i32:
22089 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
22090 case MVT::v4i32:
22091 case MVT::v8i32:
22092 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
22093 case MVT::v2i64:
22094 case MVT::v4i64:
22095 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
22096 }
22097}
22098
22099SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
22100 SelectionDAG &DAG) const {
22101 bool IsStrict = Op->isStrictFPOpcode();
22102 unsigned OpNo = IsStrict ? 1 : 0;
22103 SDValue Src = Op.getOperand(OpNo);
22104 SDLoc dl(Op);
22105 auto PtrVT = getPointerTy(DAG.getDataLayout());
22106 MVT SrcVT = Src.getSimpleValueType();
22107 MVT DstVT = Op->getSimpleValueType(0);
22108 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22109
22110 // Bail out when we don't have native conversion instructions.
22111 if (DstVT == MVT::f128)
22112 return SDValue();
22113
22114 if (isSoftFP16(DstVT))
22115 return promoteXINT_TO_FP(Op, DAG);
22116 else if (isLegalConversion(SrcVT, false, Subtarget))
22117 return Op;
22118
22119 if (DstVT.isVector())
22120 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
22121
22122 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
22123 return LowerWin64_INT128_TO_FP(Op, DAG);
22124
22125 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
22126 return Extract;
22127
22128 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
22129 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
22130 // Conversions from unsigned i32 to f32/f64 are legal,
22131 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
22132 return Op;
22133 }
22134
22135 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
22136 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
22137 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
22138 if (IsStrict)
22139 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
22140 {Chain, Src});
22141 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
22142 }
22143
22144 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
22145 return V;
22146 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
22147 return V;
22148
22149 // The transform for i64->f64 isn't correct for 0 when rounding to negative
22150 // infinity. It produces -0.0, so disable under strictfp.
22151 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
22152 !IsStrict)
22153 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
22154 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
22155 // negative infinity. So disable under strictfp. Using FILD instead.
22156 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22157 !IsStrict)
22158 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22159 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22160 (DstVT == MVT::f32 || DstVT == MVT::f64))
22161 return SDValue();
22162
22163 // Make a 64-bit buffer, and use it to build an FILD.
22164 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22165 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22166 Align SlotAlign(8);
22167 MachinePointerInfo MPI =
22168 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22169 if (SrcVT == MVT::i32) {
22170 SDValue OffsetSlot =
22171 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22172 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22173 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22174 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22175 std::pair<SDValue, SDValue> Tmp =
22176 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22177 if (IsStrict)
22178 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22179
22180 return Tmp.first;
22181 }
22182
22183 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22184 SDValue ValueToStore = Src;
22185 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22186 // Bitcasting to f64 here allows us to do a single 64-bit store from
22187 // an SSE register, avoiding the store forwarding penalty that would come
22188 // with two 32-bit stores.
22189 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22190 }
22191 SDValue Store =
22192 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22193 // For i64 source, we need to add the appropriate power of 2 if the input
22194 // was negative. We must be careful to do the computation in x87 extended
22195 // precision, not in SSE.
22196 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22197 SDValue Ops[] = { Store, StackSlot };
22198 SDValue Fild =
22199 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22200 SlotAlign, MachineMemOperand::MOLoad);
22201 Chain = Fild.getValue(1);
22202
22203
22204 // Check whether the sign bit is set.
22205 SDValue SignSet = DAG.getSetCC(
22206 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22207 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22208
22209 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22210 APInt FF(64, 0x5F80000000000000ULL);
22211 SDValue FudgePtr = DAG.getConstantPool(
22212 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22213 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22214
22215 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22216 SDValue Zero = DAG.getIntPtrConstant(0, dl);
22217 SDValue Four = DAG.getIntPtrConstant(4, dl);
22218 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22219 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22220
22221 // Load the value out, extending it from f32 to f80.
22222 SDValue Fudge = DAG.getExtLoad(
22223 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22224 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22225 CPAlignment);
22226 Chain = Fudge.getValue(1);
22227 // Extend everything to 80 bits to force it to be done on x87.
22228 // TODO: Are there any fast-math-flags to propagate here?
22229 if (IsStrict) {
22230 unsigned Opc = ISD::STRICT_FADD;
22231 // Windows needs the precision control changed to 80bits around this add.
22232 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22233 Opc = X86ISD::STRICT_FP80_ADD;
22234
22235 SDValue Add =
22236 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22237 // STRICT_FP_ROUND can't handle equal types.
22238 if (DstVT == MVT::f80)
22239 return Add;
22240 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22241 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22242 }
22243 unsigned Opc = ISD::FADD;
22244 // Windows needs the precision control changed to 80bits around this add.
22245 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22246 Opc = X86ISD::FP80_ADD;
22247
22248 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22249 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22250 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22251}
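
A scalar model of the FILD-plus-fudge path above, assuming an x86 target where long double is the 80-bit x87 format; the final FP_ROUND to the destination type is omitted:

    #include <cstdint>

    long double u64_to_f80_fild(uint64_t x) {
      long double v = (long double)(int64_t)x;  // FILD always reads the 64-bit slot as signed
      if ((int64_t)x < 0)
        v += 0x1.0p64L;  // the 0x5F800000 fudge constant is 2^64 as a float
      return v;
    }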
22252
22253// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22254// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22255// just return an SDValue().
22256// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22257// to i16, i32 or i64, and we lower it to a legal sequence and return the
22258// result.
22259SDValue
22260X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22261 bool IsSigned, SDValue &Chain) const {
22262 bool IsStrict = Op->isStrictFPOpcode();
22263 SDLoc DL(Op);
22264
22265 EVT DstTy = Op.getValueType();
22266 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22267 EVT TheVT = Value.getValueType();
22268 auto PtrVT = getPointerTy(DAG.getDataLayout());
22269
22270 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22271 // f16 must be promoted before using the lowering in this routine.
22272 // fp128 does not use this lowering.
22273 return SDValue();
22274 }
22275
22276 // If using FIST to compute an unsigned i64, we'll need some fixup
22277 // to handle values above the maximum signed i64. A FIST is always
22278 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22279 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22280
22281 // FIXME: This does not generate an invalid exception if the input does not
22282 // fit in i32. PR44019
22283 if (!IsSigned && DstTy != MVT::i64) {
22284 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22285 // The low 32 bits of the fist result will have the correct uint32 result.
22286 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22287 DstTy = MVT::i64;
22288 }
22289
22290 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22291 DstTy.getSimpleVT() >= MVT::i16 &&
22292 "Unknown FP_TO_INT to lower!");
22293
22294 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22295 // stack slot.
22296 MachineFunction &MF = DAG.getMachineFunction();
22297 unsigned MemSize = DstTy.getStoreSize();
22298 int SSFI =
22299 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22300 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22301
22302 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22303
22304 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22305
22306 if (UnsignedFixup) {
22307 //
22308 // Conversion to unsigned i64 is implemented with a select,
22309 // depending on whether the source value fits in the range
22310 // of a signed i64. Let Thresh be the FP equivalent of
22311 // 0x8000000000000000ULL.
22312 //
22313 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22314 // FltOfs = (Value >= Thresh) ? Thresh : 0;
22315 // FistSrc = (Value - FltOfs);
22316 // Fist-to-mem64 FistSrc
22317 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22318 // to XOR'ing the high 32 bits with Adjust.
22319 //
22320 // Being a power of 2, Thresh is exactly representable in all FP formats.
22321 // For X87 we'd like to use the smallest FP type for this constant, but
22322 // for DAG type consistency we have to match the FP operand type.
22323
22324 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22325 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22326 bool LosesInfo = false;
22327 if (TheVT == MVT::f64)
22328 // The rounding mode is irrelevant as the conversion should be exact.
22329 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22330 &LosesInfo);
22331 else if (TheVT == MVT::f80)
22332 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22333 APFloat::rmNearestTiesToEven, &LosesInfo);
22334
22335 assert(Status == APFloat::opOK && !LosesInfo &&
22336 "FP conversion should have been exact");
22337
22338 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22339
22340 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22341 *DAG.getContext(), TheVT);
22342 SDValue Cmp;
22343 if (IsStrict) {
22344 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22345 /*IsSignaling*/ true);
22346 Chain = Cmp.getValue(1);
22347 } else {
22348 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22349 }
22350
22351 // Our preferred lowering of
22352 //
22353 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22354 //
22355 // is
22356 //
22357 // (Value >= Thresh) << 63
22358 //
22359 // but since we can get here after LegalOperations, DAGCombine might do the
22360 // wrong thing if we create a select. So, directly create the preferred
22361 // version.
22362 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22363 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22364 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22365
22366 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22367 DAG.getConstantFP(0.0, DL, TheVT));
22368
22369 if (IsStrict) {
22370 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22371 { Chain, Value, FltOfs });
22372 Chain = Value.getValue(1);
22373 } else
22374 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22375 }
22376
22377 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22378
22379 // FIXME This causes a redundant load/store if the SSE-class value is already
22380 // in memory, such as if it is on the callstack.
22381 if (isScalarFPTypeInSSEReg(TheVT)) {
22382 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22383 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22384 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22385 SDValue Ops[] = { Chain, StackSlot };
22386
22387 unsigned FLDSize = TheVT.getStoreSize();
22388 assert(FLDSize <= MemSize && "Stack slot not big enough");
22389 MachineMemOperand *MMO = MF.getMachineMemOperand(
22390 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22391 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22392 Chain = Value.getValue(1);
22393 }
22394
22395 // Build the FP_TO_INT*_IN_MEM
22396 MachineMemOperand *MMO = MF.getMachineMemOperand(
22397 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22398 SDValue Ops[] = { Chain, Value, StackSlot };
22399 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22400 DAG.getVTList(MVT::Other),
22401 Ops, DstTy, MMO);
22402
22403 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22404 Chain = Res.getValue(1);
22405
22406 // If we need an unsigned fixup, XOR the result with adjust.
22407 if (UnsignedFixup)
22408 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22409
22410 return Res;
22411}
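
A scalar model of the unsigned-i64 fixup above, assuming a finite input below 2^64; a signed cast stands in for the FISTP64-through-memory sequence:

    #include <cstdint>

    uint64_t f64_to_u64_fist(double value) {
      const double thresh = 0x1.0p63;                         // Thresh = 2^63, exact in f32/f64/f80
      uint64_t adjust = value >= thresh ? (1ULL << 63) : 0;   // (Value >= Thresh) << 63
      double fltofs   = value >= thresh ? thresh : 0.0;       // FltOfs
      int64_t fist    = (int64_t)(value - fltofs);            // FistSrc; the cast stands in for FISTP64
      return (uint64_t)fist ^ adjust;                         // XOR the high bit back in
    }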
22412
22413static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22414 const X86Subtarget &Subtarget) {
22415 MVT VT = Op.getSimpleValueType();
22416 SDValue In = Op.getOperand(0);
22417 MVT InVT = In.getSimpleValueType();
22418 SDLoc dl(Op);
22419 unsigned Opc = Op.getOpcode();
22420
22421 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22422 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22423 "Unexpected extension opcode");
22424 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22425 "Expected same number of elements");
22426 assert((VT.getVectorElementType() == MVT::i16 ||
22427 VT.getVectorElementType() == MVT::i32 ||
22428 VT.getVectorElementType() == MVT::i64) &&
22429 "Unexpected element type");
22430 assert((InVT.getVectorElementType() == MVT::i8 ||
22431         InVT.getVectorElementType() == MVT::i16 ||
22432         InVT.getVectorElementType() == MVT::i32) &&
22433        "Unexpected element type");
22434
22435 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22436
22437 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22438 assert(InVT == MVT::v32i8 && "Unexpected VT!");
22439 return splitVectorIntUnary(Op, DAG);
22440 }
22441
22442 if (Subtarget.hasInt256())
22443 return Op;
22444
22445 // Optimize vectors in AVX mode:
22446 //
22447 // v8i16 -> v8i32
22448 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
22449 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22450 // Concat upper and lower parts.
22451 //
22452 // v4i32 -> v4i64
22453 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
22454 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
22455 // Concat upper and lower parts.
22456 //
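// Illustrative sketch of the ZERO_EXTEND case for v8i16 <a0..a7> (example
// lane values and intrinsic names are illustrative only, not from this file):
//   vpmovzxwd on the low half yields  <a0,a1,a2,a3>          as zero-extended dwords,
//   vpunpckhwd with a zero vector gives <a4,0,a5,0,a6,0,a7,0> as words, which
//   reinterpreted as v4i32 is exactly  <a4,a5,a6,a7>          zero-extended,
//   and the final concat (typically vinsertf128) produces the full v8i32. Roughly:
//     __m128i lo = _mm_cvtepu16_epi32(in);                    // SSE4.1 pmovzxwd
//     __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128());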
22457 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22458 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22459
22460 // Short-circuit if we can determine that each 128-bit half is the same value.
22461 // Otherwise, this is difficult to match and optimize.
22462 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22463 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22464 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22465
22466 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22467 SDValue Undef = DAG.getUNDEF(InVT);
22468 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22469 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22470 OpHi = DAG.getBitcast(HalfVT, OpHi);
22471
22472 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22473}
22474
22475// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22476static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22477 const SDLoc &dl, SelectionDAG &DAG) {
22478 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22479 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22480 DAG.getIntPtrConstant(0, dl));
22481 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22482 DAG.getIntPtrConstant(8, dl));
22483 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22484 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22485 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22486 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22487}
22488
22489static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22490 const X86Subtarget &Subtarget,
22491 SelectionDAG &DAG) {
22492 MVT VT = Op->getSimpleValueType(0);
22493 SDValue In = Op->getOperand(0);
22494 MVT InVT = In.getSimpleValueType();
22495 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22496 SDLoc DL(Op);
22497 unsigned NumElts = VT.getVectorNumElements();
22498
22499 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22500 // avoids a constant pool load.
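// Illustration (assumed example, zero-extending v8i1 to v8i16): sign_extend
// turns each i1 lane into 0x0000 or 0xFFFF, and the logical shift right by
// VT.getScalarSizeInBits()-1 == 15 then leaves 0 or 1 in every lane, so no
// constant-pool "ones" vector is needed.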
22501 if (VT.getVectorElementType() != MVT::i8) {
22502 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22503 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22504 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22505 }
22506
22507 // Extend VT if BWI is not supported.
22508 MVT ExtVT = VT;
22509 if (!Subtarget.hasBWI()) {
22510 // If v16i32 is to be avoided, we'll need to split and concatenate.
22511 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22512 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22513
22514 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22515 }
22516
22517 // Widen to 512-bits if VLX is not supported.
22518 MVT WideVT = ExtVT;
22519 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22520 NumElts *= 512 / ExtVT.getSizeInBits();
22521 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22522 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22523 In, DAG.getIntPtrConstant(0, DL));
22524 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22525 NumElts);
22526 }
22527
22528 SDValue One = DAG.getConstant(1, DL, WideVT);
22529 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22530
22531 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22532
22533 // Truncate if we had to extend above.
22534 if (VT != ExtVT) {
22535 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22536 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22537 }
22538
22539 // Extract back to 128/256-bit if we widened.
22540 if (WideVT != VT)
22541 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22542 DAG.getIntPtrConstant(0, DL));
22543
22544 return SelectedVal;
22545}
22546
22547static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22548 SelectionDAG &DAG) {
22549 SDValue In = Op.getOperand(0);
22550 MVT SVT = In.getSimpleValueType();
22551
22552 if (SVT.getVectorElementType() == MVT::i1)
22553 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22554
22555 assert(Subtarget.hasAVX() && "Expected AVX support");
22556 return LowerAVXExtend(Op, DAG, Subtarget);
22557}
22558
22559/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22560/// It makes use of the fact that vectors with enough leading sign/zero bits
22561/// prevent the PACKSS/PACKUS from saturating the results.
22562/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22563/// within each 128-bit lane.
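/// e.g. (illustrative values) PACKSSDW saturates each i32 lane to
/// [-32768, 32767], so packing <70000, -5, 3, 32768> would give
/// <32767, -5, 3, 32767> rather than the bit-truncation <4464, -5, 3, -32768>;
/// only when every lane already fits in i16 (at least 17 sign bits) does the
/// saturating pack equal the truncation. A rough intrinsic-level equivalent of
/// the exact case (names illustrative, not used below):
///   __m128i a = _mm_setr_epi32(1, -2, 32767, -32768); // all fit in i16
///   __m128i t = _mm_packs_epi32(a, a);                // == bit-truncation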
22564static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22565 const SDLoc &DL, SelectionDAG &DAG,
22566 const X86Subtarget &Subtarget) {
22567 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22568        "Unexpected PACK opcode");
22569 assert(DstVT.isVector() && "VT not a vector?");
22570
22571 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22572 if (!Subtarget.hasSSE2())
22573 return SDValue();
22574
22575 EVT SrcVT = In.getValueType();
22576
22577 // No truncation required, we might get here due to recursive calls.
22578 if (SrcVT == DstVT)
22579 return In;
22580
22581 // We only support vector truncation to 64bits or greater from a
22582 // 128bits or greater source.
22583 unsigned DstSizeInBits = DstVT.getSizeInBits();
22584 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22585 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22586 return SDValue();
22587
22588 unsigned NumElems = SrcVT.getVectorNumElements();
22589 if (!isPowerOf2_32(NumElems))
22590 return SDValue();
22591
22592 LLVMContext &Ctx = *DAG.getContext();
22593 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22594 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22595
22596 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22597
22598 // Pack to the largest type possible:
22599 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22600 EVT InVT = MVT::i16, OutVT = MVT::i8;
22601 if (SrcVT.getScalarSizeInBits() > 16 &&
22602 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22603 InVT = MVT::i32;
22604 OutVT = MVT::i16;
22605 }
22606
22607 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22608 if (SrcVT.is128BitVector()) {
22609 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22610 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22611 In = DAG.getBitcast(InVT, In);
22612 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22613 Res = extractSubVector(Res, 0, DAG, DL, 64);
22614 return DAG.getBitcast(DstVT, Res);
22615 }
22616
22617 // Split lower/upper subvectors.
22618 SDValue Lo, Hi;
22619 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22620
22621 unsigned SubSizeInBits = SrcSizeInBits / 2;
22622 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22623 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22624
22625 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22626 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22627 Lo = DAG.getBitcast(InVT, Lo);
22628 Hi = DAG.getBitcast(InVT, Hi);
22629 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22630 return DAG.getBitcast(DstVT, Res);
22631 }
22632
22633 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22634 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22635 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22636 Lo = DAG.getBitcast(InVT, Lo);
22637 Hi = DAG.getBitcast(InVT, Hi);
22638 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22639
22640 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22641 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22642 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22643 SmallVector<int, 64> Mask;
22644 int Scale = 64 / OutVT.getScalarSizeInBits();
22645 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22646 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22647
22648 if (DstVT.is256BitVector())
22649 return DAG.getBitcast(DstVT, Res);
22650
22651 // If 512bit -> 128bit truncate another stage.
22652 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22653 Res = DAG.getBitcast(PackedVT, Res);
22654 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22655 }
22656
22657 // Recursively pack lower/upper subvectors, concat result and pack again.
22658 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22659 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22660 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22661 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22662
22663 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22664 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22665 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22666}
22667
22668static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22669 const X86Subtarget &Subtarget) {
22670
22671 SDLoc DL(Op);
22672 MVT VT = Op.getSimpleValueType();
22673 SDValue In = Op.getOperand(0);
22674 MVT InVT = In.getSimpleValueType();
22675
22676 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22677
22678 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22679 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22680 if (InVT.getScalarSizeInBits() <= 16) {
22681 if (Subtarget.hasBWI()) {
22682 // legal, will go to VPMOVB2M, VPMOVW2M
22683 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22684 // We need to shift to get the lsb into sign position.
22685 // Shifting packed bytes is not supported natively, so bitcast to words.
22686 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22687 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22688 DAG.getBitcast(ExtVT, In),
22689 DAG.getConstant(ShiftInx, DL, ExtVT));
22690 In = DAG.getBitcast(InVT, In);
22691 }
22692 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22693 In, ISD::SETGT);
22694 }
22695 // Use TESTD/Q, extended vector to packed dword/qword.
22696 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22697        "Unexpected vector type.");
22698 unsigned NumElts = InVT.getVectorNumElements();
22699 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22700 // We need to change to a wider element type that we have support for.
22701 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22702 // For 16 element vectors we extend to v16i32 unless we are explicitly
22703 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22704 // we need to split into two 8 element vectors which we can extend to v8i32,
22705 // truncate and concat the results. There's an additional complication if
22706 // the original type is v16i8. In that case we can't split the v16i8
22707 // directly, so we need to shuffle high elements to low and use
22708 // sign_extend_vector_inreg.
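// Illustration of the v16i8 case (example lanes only): for In = <b0..b15>,
// the shuffle mask {8..15, -1 x 8} moves b8..b15 into the low half, and
// SIGN_EXTEND_VECTOR_INREG to v8i32 extends only the low 8 bytes of its
// operand, so Lo becomes <sext b0..b7> and Hi becomes <sext b8..b15>.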
22709 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22710 SDValue Lo, Hi;
22711 if (InVT == MVT::v16i8) {
22712 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22713 Hi = DAG.getVectorShuffle(
22714 InVT, DL, In, In,
22715 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22716 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22717 } else {
22718 assert(InVT == MVT::v16i16 && "Unexpected VT!");
22719 Lo = extract128BitVector(In, 0, DAG, DL);
22720 Hi = extract128BitVector(In, 8, DAG, DL);
22721 }
22722 // We're split now, just emit two truncates and a concat. The two
22723 // truncates will trigger legalization to come back to this function.
22724 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22725 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22726 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22727 }
22728 // We either have 8 elements or we're allowed to use 512-bit vectors.
22729 // If we have VLX, we want to use the narrowest vector that can get the
22730 // job done so we use vXi32.
22731 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22732 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22733 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22734 InVT = ExtVT;
22735 ShiftInx = InVT.getScalarSizeInBits() - 1;
22736 }
22737
22738 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22739 // We need to shift to get the lsb into sign position.
22740 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22741 DAG.getConstant(ShiftInx, DL, InVT));
22742 }
22743 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22744 if (Subtarget.hasDQI())
22745 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22746 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22747}
22748
22749SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22750 SDLoc DL(Op);
22751 MVT VT = Op.getSimpleValueType();
22752 SDValue In = Op.getOperand(0);
22753 MVT InVT = In.getSimpleValueType();
22754 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22755
22756 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22757        "Invalid TRUNCATE operation");
22758
22759 // If we're called by the type legalizer, handle a few cases.
22760 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22761 if (!TLI.isTypeLegal(InVT)) {
22762 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22763 VT.is128BitVector()) {
22764 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22765        "Unexpected subtarget!");
22766 // The default behavior is to truncate one step, concatenate, and then
22767 // truncate the remainder. We'd rather produce two 64-bit results and
22768 // concatenate those.
22769 SDValue Lo, Hi;
22770 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22771
22772 EVT LoVT, HiVT;
22773 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22774
22775 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22776 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22777 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22778 }
22779
22780 // Otherwise let default legalization handle it.
22781 return SDValue();
22782 }
22783
22784 if (VT.getVectorElementType() == MVT::i1)
22785 return LowerTruncateVecI1(Op, DAG, Subtarget);
22786
22787 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22788 if (Subtarget.hasAVX512()) {
22789 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22790 assert(VT == MVT::v32i8 && "Unexpected VT!");
22791 return splitVectorIntUnary(Op, DAG);
22792 }
22793
22794 // Word to byte is only legal under BWI. Otherwise we have to promote to v16i32
22795 // and then truncate that. But we should only do that if we haven't been
22796 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22797 // handled by isel patterns.
22798 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22799 Subtarget.canExtendTo512DQ())
22800 return Op;
22801 }
22802
22803 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22804 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22805
22806 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22807 // that extend all the way to the packed/truncated value.
22808 // Pre-SSE41 we can only use PACKUSWB.
22809 KnownBits Known = DAG.computeKnownBits(In);
22810 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22811 if (SDValue V =
22812 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22813 return V;
22814
22815 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22816 // extend all the way to the packed/truncated value.
22817 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22818 if (SDValue V =
22819 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22820 return V;
22821
22822 // Handle truncation of V256 to V128 using shuffles.
22823 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22824
22825 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22826 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22827 if (Subtarget.hasInt256()) {
22828 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22829 In = DAG.getBitcast(MVT::v8i32, In);
22830 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22831 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22832 DAG.getIntPtrConstant(0, DL));
22833 }
22834
22835 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22836 DAG.getIntPtrConstant(0, DL));
22837 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22838 DAG.getIntPtrConstant(2, DL));
22839 static const int ShufMask[] = {0, 2, 4, 6};
22840 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22841 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22842 }
22843
22844 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22845 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22846 if (Subtarget.hasInt256()) {
22847 // The PSHUFB mask:
22848 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22849 -1, -1, -1, -1, -1, -1, -1, -1,
22850 16, 17, 20, 21, 24, 25, 28, 29,
22851 -1, -1, -1, -1, -1, -1, -1, -1 };
22852 In = DAG.getBitcast(MVT::v32i8, In);
22853 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22854 In = DAG.getBitcast(MVT::v4i64, In);
22855
22856 static const int ShufMask2[] = {0, 2, -1, -1};
22857 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22858 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22859 DAG.getIntPtrConstant(0, DL));
22860 return DAG.getBitcast(MVT::v8i16, In);
22861 }
22862
22863 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22864 DAG.getIntPtrConstant(0, DL));
22865 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22866 DAG.getIntPtrConstant(4, DL));
22867
22868 // The PSHUFB mask:
22869 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22870
22871 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22872 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22873
22874 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22875 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22876
22877 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22878 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22879
22880 // The MOVLHPS Mask:
22881 static const int ShufMask2[] = {0, 1, 4, 5};
22882 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22883 return DAG.getBitcast(MVT::v8i16, res);
22884 }
22885
22886 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22887 // Use an AND to zero the upper bits for PACKUS.
22888 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22889
22890 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22891 DAG.getIntPtrConstant(0, DL));
22892 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22893 DAG.getIntPtrConstant(8, DL));
22894 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22895 }
22896
22897 llvm_unreachable("All 256->128 cases should have been handled above!")::llvm::llvm_unreachable_internal("All 256->128 cases should have been handled above!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22897)
;
22898}
22899
22900// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22901// behaves on out of range inputs to generate optimized conversions.
22902static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22903 SelectionDAG &DAG,
22904 const X86Subtarget &Subtarget) {
22905 MVT SrcVT = Src.getSimpleValueType();
22906 unsigned DstBits = VT.getScalarSizeInBits();
22907 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
22908
22909 // Calculate the converted result for values in the range 0 to
22910 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22911 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
22912 SDValue Big =
22913 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
22914 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
22915 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
22916
22917 // The "CVTTP2SI" instruction conveniently sets the sign bit if
22918 // and only if the value was out of range. So we can use that
22919 // as our indicator that we should use "Big" instead of "Small".
22920 //
22921 // Use "Small" if "IsOverflown" has all bits cleared
22922 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22923
22924 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22925 // use the slightly slower blendv select instead.
22926 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22927 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22928 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22929 }
22930
22931 SDValue IsOverflown =
22932 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22933 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22934 return DAG.getNode(ISD::OR, dl, VT, Small,
22935 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22936}
22937
22938SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22939 bool IsStrict = Op->isStrictFPOpcode();
22940 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22941 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22942 MVT VT = Op->getSimpleValueType(0);
22943 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22944 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22945 MVT SrcVT = Src.getSimpleValueType();
22946 SDLoc dl(Op);
22947
22948 SDValue Res;
22949 if (isSoftFP16(SrcVT)) {
22950 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
22951 if (IsStrict)
22952 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
22953 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
22954 {NVT, MVT::Other}, {Chain, Src})});
22955 return DAG.getNode(Op.getOpcode(), dl, VT,
22956 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
22957 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
22958 return Op;
22959 }
22960
22961 if (VT.isVector()) {
22962 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22963 MVT ResVT = MVT::v4i32;
22964 MVT TruncVT = MVT::v4i1;
22965 unsigned Opc;
22966 if (IsStrict)
22967 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22968 else
22969 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22970
22971 if (!IsSigned && !Subtarget.hasVLX()) {
22972 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
22973 // Widen to 512-bits.
22974 ResVT = MVT::v8i32;
22975 TruncVT = MVT::v8i1;
22976 Opc = Op.getOpcode();
22977 // Need to concat with zero vector for strict fp to avoid spurious
22978 // exceptions.
22979 // TODO: Should we just do this for non-strict as well?
22980 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
22981 : DAG.getUNDEF(MVT::v8f64);
22982 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
22983 DAG.getIntPtrConstant(0, dl));
22984 }
22985 if (IsStrict) {
22986 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
22987 Chain = Res.getValue(1);
22988 } else {
22989 Res = DAG.getNode(Opc, dl, ResVT, Src);
22990 }
22991
22992 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
22993 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
22994 DAG.getIntPtrConstant(0, dl));
22995 if (IsStrict)
22996 return DAG.getMergeValues({Res, Chain}, dl);
22997 return Res;
22998 }
22999
23000 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
23001 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
23002 return Op;
23003
23004 MVT ResVT = VT;
23005 MVT EleVT = VT.getVectorElementType();
23006 if (EleVT != MVT::i64)
23007 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
23008
23009 if (SrcVT != MVT::v8f16) {
23010 SDValue Tmp =
23011 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
23012 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
23013 Ops[0] = Src;
23014 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
23015 }
23016
23017 if (IsStrict) {
23018 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
23019 : X86ISD::STRICT_CVTTP2UI,
23020 dl, {ResVT, MVT::Other}, {Chain, Src});
23021 Chain = Res.getValue(1);
23022 } else {
23023 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
23024 ResVT, Src);
23025 }
23026
23027 // TODO: Need to add exception check code for strict FP.
23028 if (EleVT.getSizeInBits() < 16) {
23029 ResVT = MVT::getVectorVT(EleVT, 8);
23030 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
23031 }
23032
23033 if (ResVT != VT)
23034 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23035 DAG.getIntPtrConstant(0, dl));
23036
23037 if (IsStrict)
23038 return DAG.getMergeValues({Res, Chain}, dl);
23039 return Res;
23040 }
23041
23042 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
23043 if (VT.getVectorElementType() == MVT::i16) {
23044 assert((SrcVT.getVectorElementType() == MVT::f32 ||
23045         SrcVT.getVectorElementType() == MVT::f64) &&
23046        "Expected f32/f64 vector!");
23047 MVT NVT = VT.changeVectorElementType(MVT::i32);
23048 if (IsStrict) {
23049 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
23050 : ISD::STRICT_FP_TO_UINT,
23051 dl, {NVT, MVT::Other}, {Chain, Src});
23052 Chain = Res.getValue(1);
23053 } else {
23054 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
23055 NVT, Src);
23056 }
23057
23058 // TODO: Need to add exception check code for strict FP.
23059 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23060
23061 if (IsStrict)
23062 return DAG.getMergeValues({Res, Chain}, dl);
23063 return Res;
23064 }
23065
23066 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
23067 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
23068 assert(!IsSigned && "Expected unsigned conversion!");
23069 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
23070 return Op;
23071 }
23072
23073 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
23074 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
23075 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
23076 Subtarget.useAVX512Regs()) {
23077 assert(!IsSigned && "Expected unsigned conversion!");
23078 assert(!Subtarget.hasVLX() && "Unexpected features!");
23079 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
23080 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
23081 // Need to concat with zero vector for strict fp to avoid spurious
23082 // exceptions.
23083 // TODO: Should we just do this for non-strict as well?
23084 SDValue Tmp =
23085 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23086 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23087 DAG.getIntPtrConstant(0, dl));
23088
23089 if (IsStrict) {
23090 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
23091 {Chain, Src});
23092 Chain = Res.getValue(1);
23093 } else {
23094 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
23095 }
23096
23097 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23098 DAG.getIntPtrConstant(0, dl));
23099
23100 if (IsStrict)
23101 return DAG.getMergeValues({Res, Chain}, dl);
23102 return Res;
23103 }
23104
23105 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
23106 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
23107 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
23108 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
23109 assert(!Subtarget.hasVLX() && "Unexpected features!");
23110 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
23111 // Need to concat with zero vector for strict fp to avoid spurious
23112 // exceptions.
23113 // TODO: Should we just do this for non-strict as well?
23114 SDValue Tmp =
23115 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23116 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23117 DAG.getIntPtrConstant(0, dl));
23118
23119 if (IsStrict) {
23120 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23121 {Chain, Src});
23122 Chain = Res.getValue(1);
23123 } else {
23124 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
23125 }
23126
23127 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23128 DAG.getIntPtrConstant(0, dl));
23129
23130 if (IsStrict)
23131 return DAG.getMergeValues({Res, Chain}, dl);
23132 return Res;
23133 }
23134
23135 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
23136 if (!Subtarget.hasVLX()) {
23137 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
23138 // legalizer and then widened again by vector op legalization.
23139 if (!IsStrict)
23140 return SDValue();
23141
23142 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
23143 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
23144 {Src, Zero, Zero, Zero});
23145 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23146 {Chain, Tmp});
23147 SDValue Chain = Tmp.getValue(1);
23148 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
23149 DAG.getIntPtrConstant(0, dl));
23150 return DAG.getMergeValues({Tmp, Chain}, dl);
23151 }
23152
23153 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
23154 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23155 DAG.getUNDEF(MVT::v2f32));
23156 if (IsStrict) {
23157 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23158 : X86ISD::STRICT_CVTTP2UI;
23159 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23160 }
23161 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23162 return DAG.getNode(Opc, dl, VT, Tmp);
23163 }
23164
23165 // Generate optimized instructions for pre AVX512 unsigned conversions from
23166 // vXf32 to vXi32.
23167 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23168 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23169 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23170 assert(!IsSigned && "Expected unsigned conversion!");
23171 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23172 }
23173
23174 return SDValue();
23175 }
23176
23177 assert(!VT.isVector());
23178
23179 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23180
23181 if (!IsSigned && UseSSEReg) {
23182 // Conversions from f32/f64 with AVX512 should be legal.
23183 if (Subtarget.hasAVX512())
23184 return Op;
23185
23186 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23187 // behaves on out of range inputs to generate optimized conversions.
23188 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23189 (VT == MVT::i64 && Subtarget.is64Bit()))) {
23190 unsigned DstBits = VT.getScalarSizeInBits();
23191 APInt UIntLimit = APInt::getSignMask(DstBits);
23192 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23193 DAG.getConstant(UIntLimit, dl, VT));
23194 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23195
23196 // Calculate the converted result for values in the range:
23197 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23198 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23199 SDValue Small =
23200 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23201 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23202 SDValue Big = DAG.getNode(
23203 X86ISD::CVTTS2SI, dl, VT,
23204 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23205 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23206
23207 // The "CVTTS2SI" instruction conveniently sets the sign bit if
23208 // and only if the value was out of range. So we can use that
23209 // as our indicator that we should use "Big" instead of "Small".
23210 //
23211 // Use "Small" if "IsOverflown" has all bits cleared
23212 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23213 SDValue IsOverflown = DAG.getNode(
23214 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23215 return DAG.getNode(ISD::OR, dl, VT, Small,
23216 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23217 }
23218
23219 // Use default expansion for i64.
23220 if (VT == MVT::i64)
23221 return SDValue();
23222
23223 assert(VT == MVT::i32 && "Unexpected VT!");
23224
23225 // Promote i32 to i64 and use a signed operation on 64-bit targets.
23226 // FIXME: This does not generate an invalid exception if the input does not
23227 // fit in i32. PR44019
23228 if (Subtarget.is64Bit()) {
23229 if (IsStrict) {
23230 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23231 {Chain, Src});
23232 Chain = Res.getValue(1);
23233 } else
23234 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23235
23236 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23237 if (IsStrict)
23238 return DAG.getMergeValues({Res, Chain}, dl);
23239 return Res;
23240 }
23241
23242 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23243 // use fisttp which will be handled later.
23244 if (!Subtarget.hasSSE3())
23245 return SDValue();
23246 }
23247
23248 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23249 // FIXME: This does not generate an invalid exception if the input does not
23250 // fit in i16. PR44019
23251 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23252 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23253 if (IsStrict) {
23254 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23255 {Chain, Src});
23256 Chain = Res.getValue(1);
23257 } else
23258 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23259
23260 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23261 if (IsStrict)
23262 return DAG.getMergeValues({Res, Chain}, dl);
23263 return Res;
23264 }
23265
23266 // If this is a FP_TO_SINT using SSEReg we're done.
23267 if (UseSSEReg && IsSigned)
23268 return Op;
23269
23270 // fp128 needs to use a libcall.
23271 if (SrcVT == MVT::f128) {
23272 RTLIB::Libcall LC;
23273 if (IsSigned)
23274 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23275 else
23276 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23277
23278 MakeLibCallOptions CallOptions;
23279 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23280 SDLoc(Op), Chain);
23281
23282 if (IsStrict)
23283 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23284
23285 return Tmp.first;
23286 }
23287
23288 // Fall back to X87.
23289 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23290 if (IsStrict)
23291 return DAG.getMergeValues({V, Chain}, dl);
23292 return V;
23293 }
23294
23295 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 23295)
;
23296}
23297
23298SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23299 SelectionDAG &DAG) const {
23300 SDValue Src = Op.getOperand(0);
23301 MVT SrcVT = Src.getSimpleValueType();
23302
23303 if (SrcVT == MVT::f16)
23304 return SDValue();
23305
23306 // If the source is in an SSE register, the node is Legal.
23307 if (isScalarFPTypeInSSEReg(SrcVT))
23308 return Op;
23309
23310 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23311}
23312
23313SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23314 SelectionDAG &DAG) const {
23315 EVT DstVT = N->getValueType(0);
23316 SDValue Src = N->getOperand(0);
23317 EVT SrcVT = Src.getValueType();
23318
23319 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23320 // f16 must be promoted before using the lowering in this routine.
23321 // fp128 does not use this lowering.
23322 return SDValue();
23323 }
23324
23325 SDLoc DL(N);
23326 SDValue Chain = DAG.getEntryNode();
23327
23328 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23329
23330 // If we're converting from SSE, the stack slot needs to hold both types.
23331 // Otherwise it only needs to hold the DstVT.
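// For instance (illustrative): lowering llvm.lrint.i64.f64 with the input in
// an SSE register stores the f64 to the slot, reloads it into the x87 stack
// with FLD, and FIST then writes the i64 result back to the same slot, so the
// slot must be large enough for both the f64 source and the i64 result.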
23332 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23333 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23334 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23335 MachinePointerInfo MPI =
23336 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23337
23338 if (UseSSE) {
23339 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23340 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23341 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23342 SDValue Ops[] = { Chain, StackPtr };
23343
23344 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23345 /*Align*/ std::nullopt,
23346 MachineMemOperand::MOLoad);
23347 Chain = Src.getValue(1);
23348 }
23349
23350 SDValue StoreOps[] = { Chain, Src, StackPtr };
23351 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23352 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23353 MachineMemOperand::MOStore);
23354
23355 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23356}
23357
23358SDValue
23359X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23360 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23361 // but making use of X86 specifics to produce better instruction sequences.
23362 SDNode *Node = Op.getNode();
23363 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23364 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23365 SDLoc dl(SDValue(Node, 0));
23366 SDValue Src = Node->getOperand(0);
23367
23368 // There are three types involved here: SrcVT is the source floating point
23369 // type, DstVT is the type of the result, and TmpVT is the result of the
23370 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23371 // DstVT).
23372 EVT SrcVT = Src.getValueType();
23373 EVT DstVT = Node->getValueType(0);
23374 EVT TmpVT = DstVT;
23375
23376 // This code is only for floats and doubles. Fall back to generic code for
23377 // anything else.
23378 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23379 return SDValue();
23380
23381 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23382 unsigned SatWidth = SatVT.getScalarSizeInBits();
23383 unsigned DstWidth = DstVT.getScalarSizeInBits();
23384 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23385 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23386        "Expected saturation width smaller than result width");
23387
23388 // Promote result of FP_TO_*INT to at least 32 bits.
23389 if (TmpWidth < 32) {
23390 TmpVT = MVT::i32;
23391 TmpWidth = 32;
23392 }
23393
23394 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23395 // us to use a native signed conversion instead.
23396 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23397 TmpVT = MVT::i64;
23398 TmpWidth = 64;
23399 }
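// For example (illustrative): a saturating f32 -> u32 conversion on x86-64 can
// run the native signed cvttss2si into a 64-bit temporary, since every value
// in [0, 2^32-1] is representable as a non-negative i64; the clamp and
// truncate below then produce the final u32 result without needing an
// unsigned conversion instruction.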
23400
23401 // If the saturation width is smaller than the size of the temporary result,
23402 // we can always use signed conversion, which is native.
23403 if (SatWidth < TmpWidth)
23404 FpToIntOpcode = ISD::FP_TO_SINT;
23405
23406 // Determine minimum and maximum integer values and their corresponding
23407 // floating-point values.
23408 APInt MinInt, MaxInt;
23409 if (IsSigned) {
23410 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23411 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23412 } else {
23413 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23414 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23415 }
23416
23417 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23418 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23419
23420 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23421 MinInt, IsSigned, APFloat::rmTowardZero);
23422 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23423 MaxInt, IsSigned, APFloat::rmTowardZero);
23424 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23425 && !(MaxStatus & APFloat::opStatus::opInexact);
23426
23427 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23428 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23429
23430 // If the integer bounds are exactly representable as floats, emit a
23431 // min+max+fptoi sequence. Otherwise use comparisons and selects.
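// Example (illustrative): for llvm.fptosi.sat.i8.f32 the bounds -128.0f and
// 127.0f are exact in f32, so the lowering is roughly
//   maxss(-128.0f, src) -> minss(127.0f, ...) -> cvttss2si -> trunc to i8,
// and a NaN source propagates through the clamps, converts to 0x80000000,
// and truncates to 0, matching the required saturating semantics.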
23432 if (AreExactFloatBounds) {
23433 if (DstVT != TmpVT) {
23434 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23435 SDValue MinClamped = DAG.getNode(
23436 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23437 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23438 SDValue BothClamped = DAG.getNode(
23439 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23440 // Convert clamped value to integer.
23441 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23442
23443 // NaN will become INDVAL, with the top bit set and the rest zero.
23444 // Truncation will discard the top bit, resulting in zero.
23445 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23446 }
23447
23448 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23449 SDValue MinClamped = DAG.getNode(
23450 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23451 // Clamp by MaxFloat from above. NaN cannot occur.
23452 SDValue BothClamped = DAG.getNode(
23453 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23454 // Convert clamped value to integer.
23455 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23456
23457 if (!IsSigned) {
23458 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23459 // which is zero.
23460 return FpToInt;
23461 }
23462
23463 // Otherwise, select zero if Src is NaN.
23464 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23465 return DAG.getSelectCC(
23466 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23467 }
23468
23469 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23470 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23471
23472 // Result of direct conversion, which may be selected away.
23473 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23474
23475 if (DstVT != TmpVT) {
23476 // NaN will become INDVAL, with the top bit set and the rest zero.
23477 // Truncation will discard the top bit, resulting in zero.
23478 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23479 }
23480
23481 SDValue Select = FpToInt;
23482 // For signed conversions where we saturate to the same size as the
23483 // result type of the fptoi instructions, INDVAL coincides with integer
23484 // minimum, so we don't need to explicitly check it.
23485 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23486 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23487 // MinInt if Src is NaN.
23488 Select = DAG.getSelectCC(
23489 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23490 }
23491
23492 // If Src OGT MaxFloat, select MaxInt.
23493 Select = DAG.getSelectCC(
23494 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23495
23496 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23497 // is already zero. The promoted case was already handled above.
23498 if (!IsSigned || DstVT != TmpVT) {
23499 return Select;
23500 }
23501
23502 // Otherwise, select 0 if Src is NaN.
23503 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23504 return DAG.getSelectCC(
23505 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23506}
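Aside (not part of the analyzed file): a standalone C++ sketch of why the integer bounds above must be converted with round-toward-zero and checked for exactness before the FMIN/FMAX fast path is taken. Values assume an ordinary IEEE-754 float.

#include <cstdint>
#include <cstdio>

int main() {
  // INT32_MAX (2147483647) is not exactly representable as a float. On a
  // round-to-nearest host it converts to 2147483648.0f, so a naive clamp at
  // that value would let inputs just above INT32_MAX through.
  float NearestMax    = static_cast<float>(INT32_MAX); // typically 2147483648.0f
  float TowardZeroMax = 2147483520.0f;                 // largest float <= INT32_MAX
  std::printf("nearest: %.1f  toward-zero: %.1f\n", NearestMax, TowardZeroMax);
  // Because MaxInt does not convert exactly (opInexact in the code above),
  // AreExactFloatBounds is false for f32 -> i32 and the lowering falls back to
  // the compare+select sequence instead of the min+max+fptoi clamp.
  return 0;
}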
23507
23508SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23509 bool IsStrict = Op->isStrictFPOpcode();
23510
23511 SDLoc DL(Op);
23512 MVT VT = Op.getSimpleValueType();
23513 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23514 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23515 MVT SVT = In.getSimpleValueType();
23516
23517 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23518 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23519 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23520 !Subtarget.getTargetTriple().isOSDarwin()))
23521 return SDValue();
23522
23523 if (SVT == MVT::f16) {
23524 if (Subtarget.hasFP16())
23525 return Op;
23526
23527 if (VT != MVT::f32) {
23528 if (IsStrict)
23529 return DAG.getNode(
23530 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23531 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23532 {MVT::f32, MVT::Other}, {Chain, In})});
23533
23534 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23535 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23536 }
23537
23538 if (!Subtarget.hasF16C()) {
23539 if (!Subtarget.getTargetTriple().isOSDarwin())
23540 return SDValue();
23541
23542       assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23543
23544 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23545 TargetLowering::CallLoweringInfo CLI(DAG);
23546 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23547
23548 In = DAG.getBitcast(MVT::i16, In);
23549 TargetLowering::ArgListTy Args;
23550 TargetLowering::ArgListEntry Entry;
23551 Entry.Node = In;
23552 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23553 Entry.IsSExt = false;
23554 Entry.IsZExt = true;
23555 Args.push_back(Entry);
23556
23557 SDValue Callee = DAG.getExternalSymbol(
23558 getLibcallName(RTLIB::FPEXT_F16_F32),
23559 getPointerTy(DAG.getDataLayout()));
23560 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23561 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23562 std::move(Args));
23563
23564 SDValue Res;
23565 std::tie(Res,Chain) = LowerCallTo(CLI);
23566 if (IsStrict)
23567 Res = DAG.getMergeValues({Res, Chain}, DL);
23568
23569 return Res;
23570 }
23571
23572 In = DAG.getBitcast(MVT::i16, In);
23573 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23574 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23575 DAG.getIntPtrConstant(0, DL));
23576 SDValue Res;
23577 if (IsStrict) {
23578 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23579 {Chain, In});
23580 Chain = Res.getValue(1);
23581 } else {
23582 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23583 DAG.getTargetConstant(4, DL, MVT::i32));
23584 }
23585 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23586 DAG.getIntPtrConstant(0, DL));
23587 if (IsStrict)
23588 return DAG.getMergeValues({Res, Chain}, DL);
23589 return Res;
23590 }
23591
23592 if (!SVT.isVector())
23593 return Op;
23594
23595 if (SVT.getVectorElementType() == MVT::f16) {
23596     assert(Subtarget.hasF16C() && "Unexpected features!");
23597 if (SVT == MVT::v2f16)
23598 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23599 DAG.getUNDEF(MVT::v2f16));
23600 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23601 DAG.getUNDEF(MVT::v4f16));
23602 if (IsStrict)
23603 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23604 {Op->getOperand(0), Res});
23605 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23606 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23607 return Op;
23608 }
23609
23610   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23611
23612 SDValue Res =
23613 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23614 if (IsStrict)
23615 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23616 {Op->getOperand(0), Res});
23617 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23618}
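Aside (illustrative only, not from the listing): the scalar f16 -> f32 sequence above, written with F16C intrinsics. The helper name is made up and the code assumes compilation with -mf16c.

#include <immintrin.h>
#include <cstdint>

float HalfToFloatF16C(uint16_t Half) {
  __m128i V = _mm_cvtsi32_si128(Half); // Half in lane 0, remaining lanes zero
  __m128 F = _mm_cvtph_ps(V);          // VCVTPH2PS: low 4 x f16 -> 4 x f32
  return _mm_cvtss_f32(F);             // extract the lane-0 result
}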
23619
23620SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23621 bool IsStrict = Op->isStrictFPOpcode();
23622
23623 SDLoc DL(Op);
23624 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23625 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23626 MVT VT = Op.getSimpleValueType();
23627 MVT SVT = In.getSimpleValueType();
23628
23629 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23630 return SDValue();
23631
23632 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23633 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23634 if (!Subtarget.getTargetTriple().isOSDarwin())
23635 return SDValue();
23636
23637 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23638 TargetLowering::CallLoweringInfo CLI(DAG);
23639 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23640
23641 TargetLowering::ArgListTy Args;
23642 TargetLowering::ArgListEntry Entry;
23643 Entry.Node = In;
23644 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23645 Entry.IsSExt = false;
23646 Entry.IsZExt = true;
23647 Args.push_back(Entry);
23648
23649 SDValue Callee = DAG.getExternalSymbol(
23650 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23651 : RTLIB::FPROUND_F32_F16),
23652 getPointerTy(DAG.getDataLayout()));
23653 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23654 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23655 std::move(Args));
23656
23657 SDValue Res;
23658 std::tie(Res, Chain) = LowerCallTo(CLI);
23659
23660 Res = DAG.getBitcast(MVT::f16, Res);
23661
23662 if (IsStrict)
23663 Res = DAG.getMergeValues({Res, Chain}, DL);
23664
23665 return Res;
23666 }
23667
23668 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23669 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23670 return SDValue();
23671
23672 if (VT.isVector())
23673 return Op;
23674
23675 SDValue Res;
23676 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23677 MVT::i32);
23678 if (IsStrict) {
23679 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23680 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23681 DAG.getIntPtrConstant(0, DL));
23682 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23683 {Chain, Res, Rnd});
23684 Chain = Res.getValue(1);
23685 } else {
23686 // FIXME: Should we use zeros for upper elements for non-strict?
23687 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23688 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23689 }
23690
23691 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23692 DAG.getIntPtrConstant(0, DL));
23693 Res = DAG.getBitcast(MVT::f16, Res);
23694
23695 if (IsStrict)
23696 return DAG.getMergeValues({Res, Chain}, DL);
23697
23698 return Res;
23699 }
23700
23701 return Op;
23702}
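Aside (illustrative only): the reverse direction used by LowerFP_ROUND, as F16C intrinsics; _MM_FROUND_CUR_DIRECTION is the same rounding immediate (4) that the lowering passes as a target constant. The helper name is hypothetical; assumes -mf16c.

#include <immintrin.h>
#include <cstdint>

uint16_t FloatToHalfF16C(float F) {
  __m128 V  = _mm_set_ss(F);                              // F in lane 0
  __m128i H = _mm_cvtps_ph(V, _MM_FROUND_CUR_DIRECTION);  // VCVTPS2PH
  return static_cast<uint16_t>(_mm_extract_epi16(H, 0));  // low 16-bit lane
}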
23703
23704static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23705 bool IsStrict = Op->isStrictFPOpcode();
23706 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23707   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23708          "Unexpected VT!");
23709
23710 SDLoc dl(Op);
23711 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23712 DAG.getConstant(0, dl, MVT::v8i16), Src,
23713 DAG.getIntPtrConstant(0, dl));
23714
23715 SDValue Chain;
23716 if (IsStrict) {
23717 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23718 {Op.getOperand(0), Res});
23719 Chain = Res.getValue(1);
23720 } else {
23721 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23722 }
23723
23724 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23725 DAG.getIntPtrConstant(0, dl));
23726
23727 if (IsStrict)
23728 return DAG.getMergeValues({Res, Chain}, dl);
23729
23730 return Res;
23731}
23732
23733static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23734 bool IsStrict = Op->isStrictFPOpcode();
23735 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23736   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23737          "Unexpected VT!");
23738
23739 SDLoc dl(Op);
23740 SDValue Res, Chain;
23741 if (IsStrict) {
23742 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23743 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23744 DAG.getIntPtrConstant(0, dl));
23745 Res = DAG.getNode(
23746 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23747 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23748 Chain = Res.getValue(1);
23749 } else {
23750 // FIXME: Should we use zeros for upper elements for non-strict?
23751 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23752 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23753 DAG.getTargetConstant(4, dl, MVT::i32));
23754 }
23755
23756 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23757 DAG.getIntPtrConstant(0, dl));
23758
23759 if (IsStrict)
23760 return DAG.getMergeValues({Res, Chain}, dl);
23761
23762 return Res;
23763}
23764
23765SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23766 SelectionDAG &DAG) const {
23767 SDLoc DL(Op);
23768 MakeLibCallOptions CallOptions;
23769 RTLIB::Libcall LC =
23770 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23771 SDValue Res =
23772 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23773 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23774 DAG.getBitcast(MVT::i32, Res));
23775}
23776
23777/// Depending on uarch and/or optimizing for size, we might prefer to use a
23778/// vector operation in place of the typical scalar operation.
23779static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23780 const X86Subtarget &Subtarget) {
23781 // If both operands have other uses, this is probably not profitable.
23782 SDValue LHS = Op.getOperand(0);
23783 SDValue RHS = Op.getOperand(1);
23784 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23785 return Op;
23786
23787 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23788 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23789 if (IsFP && !Subtarget.hasSSE3())
23790 return Op;
23791 if (!IsFP && !Subtarget.hasSSSE3())
23792 return Op;
23793
23794 // Extract from a common vector.
23795 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23796 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23797 LHS.getOperand(0) != RHS.getOperand(0) ||
23798 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23799 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23800 !shouldUseHorizontalOp(true, DAG, Subtarget))
23801 return Op;
23802
23803 // Allow commuted 'hadd' ops.
23804 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23805 unsigned HOpcode;
23806 switch (Op.getOpcode()) {
23807 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23808 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23809 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23810 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23811 default:
23812     llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23813 }
23814 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23815 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23816 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23817 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23818 std::swap(LExtIndex, RExtIndex);
23819
23820 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23821 return Op;
23822
23823 SDValue X = LHS.getOperand(0);
23824 EVT VecVT = X.getValueType();
23825 unsigned BitWidth = VecVT.getSizeInBits();
23826 unsigned NumLanes = BitWidth / 128;
23827 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23828   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23829          "Not expecting illegal vector widths here");
23830
23831 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23832 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23833 SDLoc DL(Op);
23834 if (BitWidth == 256 || BitWidth == 512) {
23835 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23836 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23837 LExtIndex %= NumEltsPerLane;
23838 }
23839
23840 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23841 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23842 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23843 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23844 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23845 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23846 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23847}
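Aside (illustrative only, assumes SSE3): the rewrite above in intrinsic form -- the scalar add of lanes 0 and 1 becomes one HADDPS with the source used for both operands, followed by a lane-0 extract.

#include <immintrin.h>

float AddLowTwoLanes(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X); // lane 0 of the result is X[0] + X[1]
  return _mm_cvtss_f32(H);      // extractelt (hadd X, X), 0
}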
23848
23849/// Depending on uarch and/or optimizing for size, we might prefer to use a
23850/// vector operation in place of the typical scalar operation.
23851SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23852   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23853          "Only expecting float/double");
23854 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23855}
23856
23857/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23858/// This mode isn't supported in hardware on X86. But as long as we aren't
23859/// compiling with trapping math, we can emulate this with
23860/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23861static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23862 SDValue N0 = Op.getOperand(0);
23863 SDLoc dl(Op);
23864 MVT VT = Op.getSimpleValueType();
23865
23866 // N0 += copysign(nextafter(0.5, 0.0), N0)
23867 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23868 bool Ignored;
23869 APFloat Point5Pred = APFloat(0.5f);
23870 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23871 Point5Pred.next(/*nextDown*/true);
23872
23873 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23874 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23875 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23876
23877 // Truncate the result to remove fraction.
23878 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23879}
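Aside (standalone sketch): the emulation named in the comment above, in plain C++, plus the reason nextafter(0.5, 0.0) is used instead of 0.5 itself.

#include <cmath>
#include <cstdio>

double EmulatedRound(double X) {
  // copysign(nextafter(0.5, 0.0), X) is the value just below 0.5 carrying X's sign.
  double Adder = std::copysign(std::nextafter(0.5, 0.0), X);
  return std::trunc(X + Adder);
}

int main() {
  // With plain 0.5, the largest double below 0.5 (0.49999999999999994) would
  // round up: adding 0.5 to it rounds to 1.0 under round-to-nearest-even, so
  // trunc would give 1 instead of 0. The predecessor of 0.5 avoids that.
  std::printf("%g %g %g\n", EmulatedRound(2.5), EmulatedRound(-2.5),
              EmulatedRound(0.49999999999999994)); // 3 -3 0
  return 0;
}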
23880
23881/// The only differences between FABS and FNEG are the mask and the logic op.
23882/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23883static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23884   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23885          "Wrong opcode for lowering FABS or FNEG.");
23886
23887 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23888
23889 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23890 // into an FNABS. We'll lower the FABS after that if it is still in use.
23891 if (IsFABS)
23892 for (SDNode *User : Op->uses())
23893 if (User->getOpcode() == ISD::FNEG)
23894 return Op;
23895
23896 SDLoc dl(Op);
23897 MVT VT = Op.getSimpleValueType();
23898
23899 bool IsF128 = (VT == MVT::f128);
23900   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23901          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23902          "Unexpected type in LowerFABSorFNEG");
23903
23904 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
23905 // decide if we should generate a 16-byte constant mask when we only need 4 or
23906 // 8 bytes for the scalar case.
23907
23908 // There are no scalar bitwise logical SSE/AVX instructions, so we
23909 // generate a 16-byte vector constant and logic op even for the scalar case.
23910 // Using a 16-byte mask allows folding the load of the mask with
23911 // the logic op, so it can save (~4 bytes) on code size.
23912 bool IsFakeVector = !VT.isVector() && !IsF128;
23913 MVT LogicVT = VT;
23914 if (IsFakeVector)
23915 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23916 : (VT == MVT::f32) ? MVT::v4f32
23917 : MVT::v8f16;
23918
23919 unsigned EltBits = VT.getScalarSizeInBits();
23920 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
23921 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
23922 APInt::getSignMask(EltBits);
23923 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23924 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
23925
23926 SDValue Op0 = Op.getOperand(0);
23927 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
23928 unsigned LogicOp = IsFABS ? X86ISD::FAND :
23929 IsFNABS ? X86ISD::FOR :
23930 X86ISD::FXOR;
23931 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
23932
23933 if (VT.isVector() || IsF128)
23934 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23935
23936 // For the scalar case extend to a 128-bit vector, perform the logic op,
23937 // and extract the scalar result back out.
23938 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
23939 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23940 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
23941 DAG.getIntPtrConstant(0, dl));
23942}
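Aside (standalone sketch): the scalar bit-level equivalents of the mask-based lowering above. The real lowering applies the same 0x7f.../0x80... masks through a 16-byte FAND/FXOR so the constant load can be folded into the logic op.

#include <cstdint>
#include <cstring>

float FAbsBits(float X) {            // FABS: clear the sign bit
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;               // APInt::getSignedMaxValue(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}

float FNegBits(float X) {            // FNEG: flip the sign bit
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;               // APInt::getSignMask(32)
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}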
23943
23944static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
23945 SDValue Mag = Op.getOperand(0);
23946 SDValue Sign = Op.getOperand(1);
23947 SDLoc dl(Op);
23948
23949 // If the sign operand is smaller, extend it first.
23950 MVT VT = Op.getSimpleValueType();
23951 if (Sign.getSimpleValueType().bitsLT(VT))
23952 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
23953
23954 // And if it is bigger, shrink it first.
23955 if (Sign.getSimpleValueType().bitsGT(VT))
23956 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
23957 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
23958
23959 // At this point the operands and the result should have the same
23960 // type, and that won't be f80 since that is not custom lowered.
23961 bool IsF128 = (VT == MVT::f128);
23962   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23963          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23964          "Unexpected type in LowerFCOPYSIGN");
23965
23966 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23967
23968 // Perform all scalar logic operations as 16-byte vectors because there are no
23969 // scalar FP logic instructions in SSE.
23970 // TODO: This isn't necessary. If we used scalar types, we might avoid some
23971 // unnecessary splats, but we might miss load folding opportunities. Should
23972 // this decision be based on OptimizeForSize?
23973 bool IsFakeVector = !VT.isVector() && !IsF128;
23974 MVT LogicVT = VT;
23975 if (IsFakeVector)
23976 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23977 : (VT == MVT::f32) ? MVT::v4f32
23978 : MVT::v8f16;
23979
23980 // The mask constants are automatically splatted for vector types.
23981 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23982 SDValue SignMask = DAG.getConstantFP(
23983 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
23984 SDValue MagMask = DAG.getConstantFP(
23985 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
23986
23987 // First, clear all bits but the sign bit from the second operand (sign).
23988 if (IsFakeVector)
23989 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
23990 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
23991
23992 // Next, clear the sign bit from the first operand (magnitude).
23993 // TODO: If we had general constant folding for FP logic ops, this check
23994 // wouldn't be necessary.
23995 SDValue MagBits;
23996 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
23997 APFloat APF = Op0CN->getValueAPF();
23998 APF.clearSign();
23999 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
24000 } else {
24001 // If the magnitude operand wasn't a constant, we need to AND out the sign.
24002 if (IsFakeVector)
24003 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
24004 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
24005 }
24006
24007 // OR the magnitude value with the sign bit.
24008 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
24009 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
24010 DAG.getIntPtrConstant(0, dl));
24011}
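Aside (standalone sketch): the AND/AND/OR dataflow LowerFCOPYSIGN builds, expressed on scalar f32 bits; the lowering performs the same steps on a 16-byte vector.

#include <cstdint>
#include <cstring>

float CopySignBits(float Mag, float Sign) {
  uint32_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(MagBits));
  std::memcpy(&SignBits, &Sign, sizeof(SignBits));
  // Keep only the sign bit of Sign, everything but the sign bit of Mag, then OR.
  uint32_t OutBits = (MagBits & 0x7fffffffu) | (SignBits & 0x80000000u);
  float Out;
  std::memcpy(&Out, &OutBits, sizeof(Out));
  return Out;
}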
24012
24013static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
24014 SDValue N0 = Op.getOperand(0);
24015 SDLoc dl(Op);
24016 MVT VT = Op.getSimpleValueType();
24017
24018 MVT OpVT = N0.getSimpleValueType();
24019   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
24020          "Unexpected type for FGETSIGN");
24021
24022 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
24023 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
24024 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
24025 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
24026 Res = DAG.getZExtOrTrunc(Res, dl, VT);
24027 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
24028 return Res;
24029}
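Aside (illustrative only, assumes SSE): the MOVMSK-based sign read that LowerFGETSIGN emits, as intrinsics; the helper is hypothetical.

#include <immintrin.h>

int GetSignBit(float X) {
  __m128 V = _mm_set_ss(X);      // X in lane 0, zeros elsewhere
  return _mm_movemask_ps(V) & 1; // MOVMSKPS packs lane sign bits; keep lane 0
}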
24030
24031/// Helper for attempting to create a X86ISD::BT node.
24032static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
24033 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
24034 // instruction. Since the shift amount is in-range-or-undefined, we know
24035 // that doing a bittest on the i32 value is ok. We extend to i32 because
24036 // the encoding for the i16 version is larger than the i32 version.
24037 // Also promote i16 to i32 for performance / code size reason.
24038 if (Src.getValueType().getScalarSizeInBits() < 32)
24039 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
24040
24041 // No legal type found, give up.
24042 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
24043 return SDValue();
24044
24045 // See if we can use the 32-bit instruction instead of the 64-bit one for a
24046 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
24047 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
24048 // known to be zero.
24049 if (Src.getValueType() == MVT::i64 &&
24050 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
24051 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
24052
24053 // If the operand types disagree, extend the shift amount to match. Since
24054 // BT ignores high bits (like shifts) we can use anyextend.
24055 if (Src.getValueType() != BitNo.getValueType()) {
24056 // Peek through a mask/modulo operation.
24057 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
24058 // we probably need a better IsDesirableToPromoteOp to handle this as well.
24059 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
24060 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
24061 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24062 BitNo.getOperand(0)),
24063 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24064 BitNo.getOperand(1)));
24065 else
24066 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
24067 }
24068
24069 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
24070}
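Aside (standalone sketch): the BT indexing rule the i64 -> i32 narrowing above relies on. The register forms of BT reduce the bit index modulo the operand width, so the two widths agree exactly when bit 5 of the index is known to be zero.

#include <cstdint>

bool BitTest64(uint64_t Src, uint64_t BitNo) {
  return (Src >> (BitNo & 63)) & 1; // BT r64, r64: index taken modulo 64
}

bool BitTest32(uint32_t Src, uint32_t BitNo) {
  return (Src >> (BitNo & 31)) & 1; // BT r32, r32: index taken modulo 32
}

// If (BitNo & 32) == 0 then BitNo % 64 == BitNo % 32 < 32, so the tested bit
// lives in the low 32 bits of Src and BitTest32(uint32_t(Src), BitNo) gives
// the same answer as BitTest64(Src, BitNo) with a shorter encoding.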
24071
24072/// Helper for creating a X86ISD::SETCC node.
24073static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
24074 SelectionDAG &DAG) {
24075 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
24076 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
24077}
24078
24079/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
24080/// recognizable memcmp expansion.
24081static bool isOrXorXorTree(SDValue X, bool Root = true) {
24082 if (X.getOpcode() == ISD::OR)
24083 return isOrXorXorTree(X.getOperand(0), false) &&
24084 isOrXorXorTree(X.getOperand(1), false);
24085 if (Root)
24086 return false;
24087 return X.getOpcode() == ISD::XOR;
24088}
24089
24090/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
24091/// expansion.
24092template <typename F>
24093static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
24094 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
24095 SDValue Op0 = X.getOperand(0);
24096 SDValue Op1 = X.getOperand(1);
24097 if (X.getOpcode() == ISD::OR) {
24098 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24099 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24100 if (VecVT != CmpVT)
24101 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
24102 if (HasPT)
24103 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
24104 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
24105 }
24106 if (X.getOpcode() == ISD::XOR) {
24107 SDValue A = SToV(Op0);
24108 SDValue B = SToV(Op1);
24109 if (VecVT != CmpVT)
24110 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
24111 if (HasPT)
24112 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
24113 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
24114 }
24115   llvm_unreachable("Impossible");
24116}
24117
24118/// Try to map a 128-bit or larger integer comparison to vector instructions
24119/// before type legalization splits it up into chunks.
24120static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
24121 ISD::CondCode CC,
24122 const SDLoc &DL,
24123 SelectionDAG &DAG,
24124 const X86Subtarget &Subtarget) {
24125   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
24126
24127 // We're looking for an oversized integer equality comparison.
24128 EVT OpVT = X.getValueType();
24129 unsigned OpSize = OpVT.getSizeInBits();
24130 if (!OpVT.isScalarInteger() || OpSize < 128)
24131 return SDValue();
24132
24133 // Ignore a comparison with zero because that gets special treatment in
24134 // EmitTest(). But make an exception for the special case of a pair of
24135 // logically-combined vector-sized operands compared to zero. This pattern may
24136 // be generated by the memcmp expansion pass with oversized integer compares
24137 // (see PR33325).
24138 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
24139 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
24140 return SDValue();
24141
24142 // Don't perform this combine if constructing the vector will be expensive.
24143 auto IsVectorBitCastCheap = [](SDValue X) {
24144 X = peekThroughBitcasts(X);
24145 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
24146 X.getOpcode() == ISD::LOAD;
24147 };
24148 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
24149 !IsOrXorXorTreeCCZero)
24150 return SDValue();
24151
24152 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
24153 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
24154 // Otherwise use PCMPEQ (plus AND) and mask testing.
24155 bool NoImplicitFloatOps =
24156 DAG.getMachineFunction().getFunction().hasFnAttribute(
24157 Attribute::NoImplicitFloat);
24158 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
24159 ((OpSize == 128 && Subtarget.hasSSE2()) ||
24160 (OpSize == 256 && Subtarget.hasAVX()) ||
24161 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
24162 bool HasPT = Subtarget.hasSSE41();
24163
24164 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
24165 // vector registers are essentially free. (Technically, widening registers
24166 // prevents load folding, but the tradeoff is worth it.)
24167 bool PreferKOT = Subtarget.preferMaskRegisters();
24168 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
24169
24170 EVT VecVT = MVT::v16i8;
24171 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
24172 if (OpSize == 256) {
24173 VecVT = MVT::v32i8;
24174 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
24175 }
24176 EVT CastVT = VecVT;
24177 bool NeedsAVX512FCast = false;
24178 if (OpSize == 512 || NeedZExt) {
24179 if (Subtarget.hasBWI()) {
24180 VecVT = MVT::v64i8;
24181 CmpVT = MVT::v64i1;
24182 if (OpSize == 512)
24183 CastVT = VecVT;
24184 } else {
24185 VecVT = MVT::v16i32;
24186 CmpVT = MVT::v16i1;
24187 CastVT = OpSize == 512 ? VecVT
24188 : OpSize == 256 ? MVT::v8i32
24189 : MVT::v4i32;
24190 NeedsAVX512FCast = true;
24191 }
24192 }
24193
24194 auto ScalarToVector = [&](SDValue X) -> SDValue {
24195 bool TmpZext = false;
24196 EVT TmpCastVT = CastVT;
24197 if (X.getOpcode() == ISD::ZERO_EXTEND) {
24198 SDValue OrigX = X.getOperand(0);
24199 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
24200 if (OrigSize < OpSize) {
24201 if (OrigSize == 128) {
24202 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
24203 X = OrigX;
24204 TmpZext = true;
24205 } else if (OrigSize == 256) {
24206 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
24207 X = OrigX;
24208 TmpZext = true;
24209 }
24210 }
24211 }
24212 X = DAG.getBitcast(TmpCastVT, X);
24213 if (!NeedZExt && !TmpZext)
24214 return X;
24215 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
24216 DAG.getConstant(0, DL, VecVT), X,
24217 DAG.getVectorIdxConstant(0, DL));
24218 };
24219
24220 SDValue Cmp;
24221 if (IsOrXorXorTreeCCZero) {
24222 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
24223 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
24224 // Use 2 vector equality compares and 'and' the results before doing a
24225 // MOVMSK.
24226 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
24227 } else {
24228 SDValue VecX = ScalarToVector(X);
24229 SDValue VecY = ScalarToVector(Y);
24230 if (VecVT != CmpVT) {
24231 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
24232 } else if (HasPT) {
24233 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
24234 } else {
24235 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
24236 }
24237 }
24238 // AVX512 should emit a setcc that will lower to kortest.
24239 if (VecVT != CmpVT) {
24240 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
24241 : CmpVT == MVT::v32i1 ? MVT::i32
24242 : MVT::i16;
24243 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
24244 DAG.getConstant(0, DL, KRegVT), CC);
24245 }
24246 if (HasPT) {
24247 SDValue BCCmp =
24248 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
24249 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
24250 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24251 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
24252 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
24253 }
24254 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
24255 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
24256 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
24257     assert(Cmp.getValueType() == MVT::v16i8 &&
24258            "Non 128-bit vector on pre-SSE41 target");
24259 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
24260 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
24261 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
24262 }
24263
24264 return SDValue();
24265}
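Aside (illustrative only, assumes SSE2): the pre-SSE4.1 fallback shape named in the comments above -- a 16-byte equality compare via PCMPEQB and PMOVMSKB tested against 0xFFFF. The helper is hypothetical.

#include <immintrin.h>

bool Equal16Bytes(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);    // 0xFF in each byte lane that matches
  return _mm_movemask_epi8(Eq) == 0xFFFF; // all 16 bytes equal
}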
24266
24267/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
24268/// style scalarized (associative) reduction patterns. Partial reductions
24269/// are supported when the pointer SrcMask is non-null.
24270/// TODO - move this to SelectionDAG?
24271static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
24272 SmallVectorImpl<SDValue> &SrcOps,
24273 SmallVectorImpl<APInt> *SrcMask = nullptr) {
24274 SmallVector<SDValue, 8> Opnds;
24275 DenseMap<SDValue, APInt> SrcOpMap;
24276 EVT VT = MVT::Other;
24277
24278 // Recognize a special case where a vector is casted into wide integer to
24279 // test all 0s.
24280   assert(Op.getOpcode() == unsigned(BinOp) &&
24281          "Unexpected bit reduction opcode");
24282 Opnds.push_back(Op.getOperand(0));
24283 Opnds.push_back(Op.getOperand(1));
24284
24285 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
24286 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
24287 // BFS traverse all BinOp operands.
24288 if (I->getOpcode() == unsigned(BinOp)) {
24289 Opnds.push_back(I->getOperand(0));
24290 Opnds.push_back(I->getOperand(1));
24291 // Re-evaluate the number of nodes to be traversed.
24292 e += 2; // 2 more nodes (LHS and RHS) are pushed.
24293 continue;
24294 }
24295
24296 // Quit if a non-EXTRACT_VECTOR_ELT
24297 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24298 return false;
24299
24300 // Quit if without a constant index.
24301 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
24302 if (!Idx)
24303 return false;
24304
24305 SDValue Src = I->getOperand(0);
24306 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
24307 if (M == SrcOpMap.end()) {
24308 VT = Src.getValueType();
24309 // Quit if not the same type.
24310 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
24311 return false;
24312 unsigned NumElts = VT.getVectorNumElements();
24313 APInt EltCount = APInt::getZero(NumElts);
24314 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
24315 SrcOps.push_back(Src);
24316 }
24317
24318 // Quit if element already used.
24319 unsigned CIdx = Idx->getZExtValue();
24320 if (M->second[CIdx])
24321 return false;
24322 M->second.setBit(CIdx);
24323 }
24324
24325 if (SrcMask) {
24326 // Collect the source partial masks.
24327 for (SDValue &SrcOp : SrcOps)
24328 SrcMask->push_back(SrcOpMap[SrcOp]);
24329 } else {
24330 // Quit if not all elements are used.
24331 for (const auto &I : SrcOpMap)
24332 if (!I.second.isAllOnes())
24333 return false;
24334 }
24335
24336 return true;
24337}
24338
24339// Helper function for comparing all bits of two vectors.
24340static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
24341 ISD::CondCode CC, const APInt &OriginalMask,
24342 const X86Subtarget &Subtarget,
24343 SelectionDAG &DAG, X86::CondCode &X86CC) {
24344 EVT VT = LHS.getValueType();
24345 unsigned ScalarSize = VT.getScalarSizeInBits();
24346 if (OriginalMask.getBitWidth() != ScalarSize) {
24347     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24348 return SDValue();
24349 }
24350
24351   // Quit if not convertible to a legal scalar or 128/256-bit vector.
24352 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24353 return SDValue();
24354
24355 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
24356 if (VT.isFloatingPoint())
24357 return SDValue();
24358
24359   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24360 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24361
24362 APInt Mask = OriginalMask;
24363
24364 auto MaskBits = [&](SDValue Src) {
24365 if (Mask.isAllOnes())
24366 return Src;
24367 EVT SrcVT = Src.getValueType();
24368 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24369 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24370 };
24371
24372 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24373 if (VT.getSizeInBits() < 128) {
24374 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24375 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
24376 if (IntVT != MVT::i64)
24377 return SDValue();
24378 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
24379 MVT::i32, MVT::i32);
24380 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
24381 MVT::i32, MVT::i32);
24382 SDValue Lo =
24383 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
24384 SDValue Hi =
24385 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
24386 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24387 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
24388 DAG.getConstant(0, DL, MVT::i32));
24389 }
24390 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24391 DAG.getBitcast(IntVT, MaskBits(LHS)),
24392 DAG.getBitcast(IntVT, MaskBits(RHS)));
24393 }
24394
24395 // Without PTEST, a masked v2i64 or-reduction is not faster than
24396 // scalarization.
24397 bool UseKORTEST = Subtarget.useAVX512Regs();
24398 bool UsePTEST = Subtarget.hasSSE41();
24399 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
24400 return SDValue();
24401
24402 // Split down to 128/256/512-bit vector.
24403 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
24404
24405 // If the input vector has vector elements wider than the target test size,
24406 // then cast to <X x i64> so it will safely split.
24407 if (ScalarSize > TestSize) {
24408 if (!Mask.isAllOnes())
24409 return SDValue();
24410 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
24411 LHS = DAG.getBitcast(VT, LHS);
24412 RHS = DAG.getBitcast(VT, RHS);
24413 Mask = APInt::getAllOnes(64);
24414 }
24415
24416 if (VT.getSizeInBits() > TestSize) {
24417 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
24418 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
24419 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
24420 while (VT.getSizeInBits() > TestSize) {
24421 auto Split = DAG.SplitVector(LHS, DL);
24422 VT = Split.first.getValueType();
24423 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24424 }
24425 RHS = DAG.getAllOnesConstant(DL, VT);
24426 } else if (!UsePTEST && !KnownRHS.isZero()) {
24427 // MOVMSK Special Case:
24428 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
24429 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
24430 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
24431 LHS = DAG.getBitcast(VT, MaskBits(LHS));
24432 RHS = DAG.getBitcast(VT, MaskBits(RHS));
24433 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
24434 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
24435 V = DAG.getSExtOrTrunc(V, DL, VT);
24436 while (VT.getSizeInBits() > TestSize) {
24437 auto Split = DAG.SplitVector(V, DL);
24438 VT = Split.first.getValueType();
24439 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24440 }
24441 V = DAG.getNOT(DL, V, VT);
24442 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24443 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24444 DAG.getConstant(0, DL, MVT::i32));
24445 } else {
24446 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
24447 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24448 while (VT.getSizeInBits() > TestSize) {
24449 auto Split = DAG.SplitVector(V, DL);
24450 VT = Split.first.getValueType();
24451 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24452 }
24453 LHS = V;
24454 RHS = DAG.getConstant(0, DL, VT);
24455 }
24456 }
24457
24458 if (UseKORTEST && VT.is512BitVector()) {
24459 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
24460 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
24461 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24462 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24463 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
24464 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
24465 }
24466
24467 if (UsePTEST) {
24468 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
24469 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24470 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24471 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
24472 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24473 }
24474
24475   assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
24476 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
24477 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
24478 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
24479 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
24480 V = DAG.getNOT(DL, V, MaskVT);
24481 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24482 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24483 DAG.getConstant(0, DL, MVT::i32));
24484}
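Aside (illustrative only, assumes SSE4.1): the XOR + PTEST shape preferred above when UsePTEST is true -- the operands are equal exactly when their XOR tests as all zeros.

#include <immintrin.h>

bool Equal128PTEST(__m128i A, __m128i B) {
  __m128i Diff = _mm_xor_si128(A, B);
  return _mm_testz_si128(Diff, Diff) != 0; // ZF set <=> Diff is all-zero
}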
24485
24486// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
24487// to CMP(MOVMSK(PCMPEQB(X,Y))).
24488static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
24489 ISD::CondCode CC, const SDLoc &DL,
24490 const X86Subtarget &Subtarget,
24491 SelectionDAG &DAG,
24492 X86::CondCode &X86CC) {
24493   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24494
24495 bool CmpNull = isNullConstant(RHS);
24496 bool CmpAllOnes = isAllOnesConstant(RHS);
24497 if (!CmpNull && !CmpAllOnes)
24498 return SDValue();
24499
24500 SDValue Op = LHS;
24501 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
24502 return SDValue();
24503
24504 // Check whether we're masking/truncating an OR-reduction result, in which
24505 // case track the masked bits.
24506 // TODO: Add CmpAllOnes support.
24507 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
24508 if (CmpNull) {
24509 switch (Op.getOpcode()) {
24510 case ISD::TRUNCATE: {
24511 SDValue Src = Op.getOperand(0);
24512 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
24513 Op.getScalarValueSizeInBits());
24514 Op = Src;
24515 break;
24516 }
24517 case ISD::AND: {
24518 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
24519 Mask = Cst->getAPIntValue();
24520 Op = Op.getOperand(0);
24521 }
24522 break;
24523 }
24524 }
24525 }
24526
24527 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
24528
24529 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
24530 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
24531 SmallVector<SDValue, 8> VecIns;
24532 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
24533 EVT VT = VecIns[0].getValueType();
24534     assert(llvm::all_of(VecIns,
24535                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
24536            "Reduction source vector mismatch");
24537
24538 // Quit if not splittable to scalar/128/256/512-bit vector.
24539 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24540 return SDValue();
24541
24542 // If more than one full vector is evaluated, AND/OR them first before
24543 // PTEST.
24544 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24545 Slot += 2, e += 1) {
24546 // Each iteration will AND/OR 2 nodes and append the result until there is
24547 // only 1 node left, i.e. the final value of all vectors.
24548 SDValue LHS = VecIns[Slot];
24549 SDValue RHS = VecIns[Slot + 1];
24550 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
24551 }
24552
24553 return LowerVectorAllEqual(DL, VecIns.back(),
24554 CmpNull ? DAG.getConstant(0, DL, VT)
24555 : DAG.getAllOnesConstant(DL, VT),
24556 CC, Mask, Subtarget, DAG, X86CC);
24557 }
24558
24559 // Match icmp(reduce_or(X),0) anyof reduction patterns.
24560 // Match icmp(reduce_and(X),-1) allof reduction patterns.
24561 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24562 ISD::NodeType BinOp;
24563 if (SDValue Match =
24564 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
24565 EVT MatchVT = Match.getValueType();
24566 return LowerVectorAllEqual(DL, Match,
24567 CmpNull ? DAG.getConstant(0, DL, MatchVT)
24568 : DAG.getAllOnesConstant(DL, MatchVT),
24569 CC, Mask, Subtarget, DAG, X86CC);
24570 }
24571 }
24572
24573 if (Mask.isAllOnes()) {
24574     assert(!Op.getValueType().isVector() &&
24575            "Illegal vector type for reduction pattern");
24576 SDValue Src = peekThroughBitcasts(Op);
24577 if (Src.getValueType().isFixedLengthVector() &&
24578 Src.getValueType().getScalarType() == MVT::i1) {
24579 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
24580 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
24581 if (Src.getOpcode() == ISD::SETCC) {
24582 SDValue LHS = Src.getOperand(0);
24583 SDValue RHS = Src.getOperand(1);
24584 EVT LHSVT = LHS.getValueType();
24585 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
24586 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
24587 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
24588 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
24589 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
24590 X86CC);
24591 }
24592 }
24593 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
24594 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
24595 // Peek through truncation, mask the LSB and compare against zero/LSB.
24596 if (Src.getOpcode() == ISD::TRUNCATE) {
24597 SDValue Inner = Src.getOperand(0);
24598 EVT InnerVT = Inner.getValueType();
24599 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
24600 unsigned BW = InnerVT.getScalarSizeInBits();
24601 APInt SrcMask = APInt(BW, 1);
24602 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
24603 return LowerVectorAllEqual(DL, Inner,
24604 DAG.getConstant(Cmp, DL, InnerVT), CC,
24605 SrcMask, Subtarget, DAG, X86CC);
24606 }
24607 }
24608 }
24609 }
24610
24611 return SDValue();
24612}
24613
24614/// Return true if \c Op has a use that doesn't just read flags.
24615static bool hasNonFlagsUse(SDValue Op) {
24616 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24617 ++UI) {
24618 SDNode *User = *UI;
24619 unsigned UOpNo = UI.getOperandNo();
24620 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24621       // Look past the truncate.
24622 UOpNo = User->use_begin().getOperandNo();
24623 User = *User->use_begin();
24624 }
24625
24626 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24627 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24628 return true;
24629 }
24630 return false;
24631}
24632
24633// Transform to an x86-specific ALU node with flags if there is a chance of
24634// using an RMW op or only the flags are used. Otherwise, leave
24635// the node alone and emit a 'cmp' or 'test' instruction.
24636static bool isProfitableToUseFlagOp(SDValue Op) {
24637 for (SDNode *U : Op->uses())
24638 if (U->getOpcode() != ISD::CopyToReg &&
24639 U->getOpcode() != ISD::SETCC &&
24640 U->getOpcode() != ISD::STORE)
24641 return false;
24642
24643 return true;
24644}
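// Illustrative sketch, not part of the LLVM source: the heuristic above keeps
// the flag-producing form only when every user is a store, a register copy or
// a flag read. A typical C-level shape it targets is shown below, where the
// flags of the subtraction can feed the zero test directly (often folding
// into a sub-and-branch) instead of requiring a separate cmp/test:
static bool decrementAndTest(int &Counter, int Step) {
  Counter -= Step;     // value is stored back (read-modify-write candidate)...
  return Counter == 0; // ...and otherwise only the flags of the sub are read
}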
24645
24646/// Emit nodes that will be selected as "test Op0,Op0", or something
24647/// equivalent.
24648static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24649 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24650 // CF and OF aren't always set the way we want. Determine which
24651 // of these we need.
24652 bool NeedCF = false;
24653 bool NeedOF = false;
24654 switch (X86CC) {
24655 default: break;
24656 case X86::COND_A: case X86::COND_AE:
24657 case X86::COND_B: case X86::COND_BE:
24658 NeedCF = true;
24659 break;
24660 case X86::COND_G: case X86::COND_GE:
24661 case X86::COND_L: case X86::COND_LE:
24662 case X86::COND_O: case X86::COND_NO: {
24663 // Check if we really need to set the
24664 // Overflow flag. If NoSignedWrap is present
24665 // that is not actually needed.
24666 switch (Op->getOpcode()) {
24667 case ISD::ADD:
24668 case ISD::SUB:
24669 case ISD::MUL:
24670 case ISD::SHL:
24671 if (Op.getNode()->getFlags().hasNoSignedWrap())
24672 break;
24673 [[fallthrough]];
24674 default:
24675 NeedOF = true;
24676 break;
24677 }
24678 break;
24679 }
24680 }
24681 // See if we can use the EFLAGS value from the operand instead of
24682 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24683 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24684 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24685 // Emit a CMP with 0, which is the TEST pattern.
24686 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24687 DAG.getConstant(0, dl, Op.getValueType()));
24688 }
24689 unsigned Opcode = 0;
24690 unsigned NumOperands = 0;
24691
24692 SDValue ArithOp = Op;
24693
24694 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24695 // which may be the result of a CAST. We use the variable 'Op', which is the
24696 // non-casted variable when we check for possible users.
24697 switch (ArithOp.getOpcode()) {
24698 case ISD::AND:
24699 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24700 // because a TEST instruction will be better.
24701 if (!hasNonFlagsUse(Op))
24702 break;
24703
24704 [[fallthrough]];
24705 case ISD::ADD:
24706 case ISD::SUB:
24707 case ISD::OR:
24708 case ISD::XOR:
24709 if (!isProfitableToUseFlagOp(Op))
24710 break;
24711
24712 // Otherwise use a regular EFLAGS-setting instruction.
24713 switch (ArithOp.getOpcode()) {
24714     default: llvm_unreachable("unexpected operator!");
24715 case ISD::ADD: Opcode = X86ISD::ADD; break;
24716 case ISD::SUB: Opcode = X86ISD::SUB; break;
24717 case ISD::XOR: Opcode = X86ISD::XOR; break;
24718 case ISD::AND: Opcode = X86ISD::AND; break;
24719 case ISD::OR: Opcode = X86ISD::OR; break;
24720 }
24721
24722 NumOperands = 2;
24723 break;
24724 case X86ISD::ADD:
24725 case X86ISD::SUB:
24726 case X86ISD::OR:
24727 case X86ISD::XOR:
24728 case X86ISD::AND:
24729 return SDValue(Op.getNode(), 1);
24730 case ISD::SSUBO:
24731 case ISD::USUBO: {
24732     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
24733 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24734 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24735 Op->getOperand(1)).getValue(1);
24736 }
24737 default:
24738 break;
24739 }
24740
24741 if (Opcode == 0) {
24742 // Emit a CMP with 0, which is the TEST pattern.
24743 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24744 DAG.getConstant(0, dl, Op.getValueType()));
24745 }
24746 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24747 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24748
24749 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24750 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24751 return SDValue(New.getNode(), 1);
24752}
24753
24754/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24755/// equivalent.
24756static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24757 const SDLoc &dl, SelectionDAG &DAG,
24758 const X86Subtarget &Subtarget) {
24759 if (isNullConstant(Op1))
24760 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24761
24762 EVT CmpVT = Op0.getValueType();
24763
24764   assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24765           CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24766
24767 // Only promote the compare up to I32 if it is a 16 bit operation
24768 // with an immediate. 16 bit immediates are to be avoided.
24769 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24770 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24771 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24772 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24773 // Don't do this if the immediate can fit in 8-bits.
24774 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24775 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24776 unsigned ExtendOp =
24777 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24778 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24779 // For equality comparisons try to use SIGN_EXTEND if the input was
24780 // truncate from something with enough sign bits.
24781 if (Op0.getOpcode() == ISD::TRUNCATE) {
24782 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24783 ExtendOp = ISD::SIGN_EXTEND;
24784 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24785 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24786 ExtendOp = ISD::SIGN_EXTEND;
24787 }
24788 }
24789
24790 CmpVT = MVT::i32;
24791 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24792 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24793 }
24794 }
24795
24796 // Try to shrink i64 compares if the input has enough zero bits.
24797 // FIXME: Do this for non-constant compares for constant on LHS?
24798 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24799 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24800 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24801 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24802 CmpVT = MVT::i32;
24803 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24804 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24805 }
24806
24807 // 0-x == y --> x+y == 0
24808 // 0-x != y --> x+y != 0
24809 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24810 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24811 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24812 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24813 return Add.getValue(1);
24814 }
24815
24816 // x == 0-y --> x+y == 0
24817 // x != 0-y --> x+y != 0
24818 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24819 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24820 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24821 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24822 return Add.getValue(1);
24823 }
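// Illustrative sketch, not part of the LLVM source: for EQ/NE the two folds
// above are exact in wrapping two's-complement arithmetic, as a quick scalar
// check makes visible (this hypothetical helper returns true for every X, Y):
#include <cstdint>

static bool negCompareFoldHolds(uint32_t X, uint32_t Y) {
  bool Original = (0u - X) == Y; // 0-x == y
  bool Folded = (X + Y) == 0u;   // x+y == 0
  return Original == Folded;
}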
24824
24825 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24826 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24827 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24828 return Sub.getValue(1);
24829}
24830
24831/// Check if replacement of SQRT with RSQRT should be disabled.
24832bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24833 EVT VT = Op.getValueType();
24834
24835 // We don't need to replace SQRT with RSQRT for half type.
24836 if (VT.getScalarType() == MVT::f16)
24837 return true;
24838
24839 // We never want to use both SQRT and RSQRT instructions for the same input.
24840 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24841 return false;
24842
24843 if (VT.isVector())
24844 return Subtarget.hasFastVectorFSQRT();
24845 return Subtarget.hasFastScalarFSQRT();
24846}
24847
24848/// The minimum architected relative accuracy is 2^-12. We need one
24849/// Newton-Raphson step to have a good float result (24 bits of precision).
24850SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24851 SelectionDAG &DAG, int Enabled,
24852 int &RefinementSteps,
24853 bool &UseOneConstNR,
24854 bool Reciprocal) const {
24855 SDLoc DL(Op);
24856 EVT VT = Op.getValueType();
24857
24858 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24859 // It is likely not profitable to do this for f64 because a double-precision
24860 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24861 // instructions: convert to single, rsqrtss, convert back to double, refine
24862 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24863 // along with FMA, this could be a throughput win.
24864 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24865 // after legalize types.
24866 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24867 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24868 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24869 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24870 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24871 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24872 RefinementSteps = 1;
24873
24874 UseOneConstNR = false;
24875 // There is no FSQRT for 512-bits, but there is RSQRT14.
24876 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24877 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24878 if (RefinementSteps == 0 && !Reciprocal)
24879 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24880 return Estimate;
24881 }
24882
24883 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24884 Subtarget.hasFP16()) {
24885     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24886 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24887 RefinementSteps = 0;
24888
24889 if (VT == MVT::f16) {
24890 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24891 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24892 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24893 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24894 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24895 }
24896
24897 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24898 }
24899 return SDValue();
24900}
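// Illustrative sketch, not part of the LLVM source: the hardware estimate
// returned above is only guaranteed to about 12 bits; the surrounding
// target-independent expansion refines it. One step of the standard
// Newton-Raphson iteration for 1/sqrt(A) roughly doubles the correct bits:
static float refineRsqrtOnce(float A, float Est /* ~1/sqrt(A), ~12-bit */) {
  // e' = e * (1.5 - 0.5 * A * e * e); a 2^-12 relative error becomes ~2^-24.
  return Est * (1.5f - 0.5f * A * Est * Est);
}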
24901
24902/// The minimum architected relative accuracy is 2^-12. We need one
24903/// Newton-Raphson step to have a good float result (24 bits of precision).
24904SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
24905 int Enabled,
24906 int &RefinementSteps) const {
24907 SDLoc DL(Op);
24908 EVT VT = Op.getValueType();
24909
24910 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
24911 // It is likely not profitable to do this for f64 because a double-precision
24912 // reciprocal estimate with refinement on x86 prior to FMA requires
24913 // 15 instructions: convert to single, rcpss, convert back to double, refine
24914 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
24915 // along with FMA, this could be a throughput win.
24916
24917 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24918 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
24919 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24920 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24921 // Enable estimate codegen with 1 refinement step for vector division.
24922 // Scalar division estimates are disabled because they break too much
24923 // real-world code. These defaults are intended to match GCC behavior.
24924 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
24925 return SDValue();
24926
24927 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24928 RefinementSteps = 1;
24929
24930 // There is no FSQRT for 512-bits, but there is RCP14.
24931 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
24932 return DAG.getNode(Opcode, DL, VT, Op);
24933 }
24934
24935 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24936 Subtarget.hasFP16()) {
24937 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24938 RefinementSteps = 0;
24939
24940 if (VT == MVT::f16) {
24941 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24942 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24943 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24944 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
24945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24946 }
24947
24948 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
24949 }
24950 return SDValue();
24951}
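// Illustrative sketch, not part of the LLVM source: the same refinement idea
// applies to the reciprocal estimate; one standard Newton-Raphson step is:
static float refineRecipOnce(float A, float Est /* ~1/A, ~12-bit */) {
  // e' = e * (2 - A * e); the relative error is roughly squared per step.
  return Est * (2.0f - A * Est);
}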
24952
24953/// If we have at least two divisions that use the same divisor, convert to
24954/// multiplication by a reciprocal. This may need to be adjusted for a given
24955/// CPU if a division's cost is not at least twice the cost of a multiplication.
24956/// This is because we still need one division to calculate the reciprocal and
24957/// then we need two multiplies by that reciprocal as replacements for the
24958/// original divisions.
24959unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
24960 return 2;
24961}
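// Illustrative sketch, not part of the LLVM source: the threshold of 2 is the
// break-even point described above. With two divisions sharing a divisor, one
// divide plus two multiplies replaces two divides (this rewrite is only done
// under reciprocal-allowed/fast-math semantics):
static float sumOfQuotients(float A, float B, float D) {
  float Recip = 1.0f / D;       // one division...
  return A * Recip + B * Recip; // ...and two multiplies instead of A/D + B/D
}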
24962
24963SDValue
24964X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
24965 SelectionDAG &DAG,
24966 SmallVectorImpl<SDNode *> &Created) const {
24967 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
24968 if (isIntDivCheap(N->getValueType(0), Attr))
24969 return SDValue(N,0); // Lower SDIV as SDIV
24970
24971   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
24972          "Unexpected divisor!");
24973
24974 // Only perform this transform if CMOV is supported otherwise the select
24975 // below will become a branch.
24976 if (!Subtarget.canUseCMOV())
24977 return SDValue();
24978
24979 // fold (sdiv X, pow2)
24980 EVT VT = N->getValueType(0);
24981 // FIXME: Support i8.
24982 if (VT != MVT::i16 && VT != MVT::i32 &&
24983 !(Subtarget.is64Bit() && VT == MVT::i64))
24984 return SDValue();
24985
24986 unsigned Lg2 = Divisor.countr_zero();
24987
24988 // If the divisor is 2 or -2, the default expansion is better.
24989 if (Lg2 == 1)
24990 return SDValue();
24991
24992 SDLoc DL(N);
24993 SDValue N0 = N->getOperand(0);
24994 SDValue Zero = DAG.getConstant(0, DL, VT);
24995 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
24996 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
24997
24998 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
24999 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
25000 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
25001 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
25002
25003 Created.push_back(Cmp.getNode());
25004 Created.push_back(Add.getNode());
25005 Created.push_back(CMov.getNode());
25006
25007 // Divide by pow2.
25008 SDValue SRA =
25009 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
25010
25011 // If we're dividing by a positive value, we're done. Otherwise, we must
25012 // negate the result.
25013 if (Divisor.isNonNegative())
25014 return SRA;
25015
25016 Created.push_back(SRA.getNode());
25017 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
25018}
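// Illustrative sketch, not part of the LLVM source: the select/shift sequence
// built above gives round-toward-zero signed division, which a plain
// arithmetic shift does not provide for negative inputs. Scalar equivalent
// for a divisor of 4 (Lg2 = 2):
static int sdivBy4(int X) {
  int Biased = (X < 0) ? X + 3 : X; // add (Pow2 - 1) only when X is negative
  return Biased >> 2;               // X = -7: (-7 + 3) >> 2 == -1 == -7 / 4,
                                    // whereas -7 >> 2 alone would give -2
}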
25019
25020/// Result of 'and' is compared against zero. Change to a BT node if possible.
25021/// Returns the BT node and the condition code needed to use it.
25022static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
25023 SelectionDAG &DAG, X86::CondCode &X86CC) {
25024   assert(And.getOpcode() == ISD::AND && "Expected AND node!");
25025 SDValue Op0 = And.getOperand(0);
25026 SDValue Op1 = And.getOperand(1);
25027 if (Op0.getOpcode() == ISD::TRUNCATE)
25028 Op0 = Op0.getOperand(0);
25029 if (Op1.getOpcode() == ISD::TRUNCATE)
25030 Op1 = Op1.getOperand(0);
25031
25032 SDValue Src, BitNo;
25033 if (Op1.getOpcode() == ISD::SHL)
25034 std::swap(Op0, Op1);
25035 if (Op0.getOpcode() == ISD::SHL) {
25036 if (isOneConstant(Op0.getOperand(0))) {
25037 // If we looked past a truncate, check that it's only truncating away
25038 // known zeros.
25039 unsigned BitWidth = Op0.getValueSizeInBits();
25040 unsigned AndBitWidth = And.getValueSizeInBits();
25041 if (BitWidth > AndBitWidth) {
25042 KnownBits Known = DAG.computeKnownBits(Op0);
25043 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
25044 return SDValue();
25045 }
25046 Src = Op1;
25047 BitNo = Op0.getOperand(1);
25048 }
25049 } else if (Op1.getOpcode() == ISD::Constant) {
25050 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
25051 uint64_t AndRHSVal = AndRHS->getZExtValue();
25052 SDValue AndLHS = Op0;
25053
25054 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
25055 Src = AndLHS.getOperand(0);
25056 BitNo = AndLHS.getOperand(1);
25057 } else {
25058 // Use BT if the immediate can't be encoded in a TEST instruction or we
25059       // are optimizing for size and the immediate won't fit in a byte.
25060 bool OptForSize = DAG.shouldOptForSize();
25061 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
25062 isPowerOf2_64(AndRHSVal)) {
25063 Src = AndLHS;
25064 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
25065 Src.getValueType());
25066 }
25067 }
25068 }
25069
25070 // No patterns found, give up.
25071 if (!Src.getNode())
25072 return SDValue();
25073
25074 // Remove any bit flip.
25075 if (isBitwiseNot(Src)) {
25076 Src = Src.getOperand(0);
25077 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
25078 }
25079
25080 // Attempt to create the X86ISD::BT node.
25081 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
25082 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25083 return BT;
25084 }
25085
25086 return SDValue();
25087}
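// Illustrative sketch, not part of the LLVM source: both shapes recognised
// above test a single bit, which is exactly what BT computes (the selected
// bit lands in CF, read back via SETB/SETAE). At the C level:
#include <cstdint>

static bool bitIsSetShl(uint64_t X, unsigned N) {
  return (X & (1ull << N)) != 0; // the shl-of-1 form
}
static bool bitIsSetSrl(uint64_t X, unsigned N) {
  return ((X >> N) & 1) != 0;    // the srl-and-1 form; same result
}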
25088
25089// Check if pre-AVX condcode can be performed by a single FCMP op.
25090static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
25091 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
25092}
25093
25094/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
25095/// CMPs.
25096static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
25097 SDValue &Op1, bool &IsAlwaysSignaling) {
25098 unsigned SSECC;
25099 bool Swap = false;
25100
25101 // SSE Condition code mapping:
25102 // 0 - EQ
25103 // 1 - LT
25104 // 2 - LE
25105 // 3 - UNORD
25106 // 4 - NEQ
25107 // 5 - NLT
25108 // 6 - NLE
25109 // 7 - ORD
25110 switch (SetCCOpcode) {
25111   default: llvm_unreachable("Unexpected SETCC condition");
25112 case ISD::SETOEQ:
25113 case ISD::SETEQ: SSECC = 0; break;
25114 case ISD::SETOGT:
25115 case ISD::SETGT: Swap = true; [[fallthrough]];
25116 case ISD::SETLT:
25117 case ISD::SETOLT: SSECC = 1; break;
25118 case ISD::SETOGE:
25119 case ISD::SETGE: Swap = true; [[fallthrough]];
25120 case ISD::SETLE:
25121 case ISD::SETOLE: SSECC = 2; break;
25122 case ISD::SETUO: SSECC = 3; break;
25123 case ISD::SETUNE:
25124 case ISD::SETNE: SSECC = 4; break;
25125 case ISD::SETULE: Swap = true; [[fallthrough]];
25126 case ISD::SETUGE: SSECC = 5; break;
25127 case ISD::SETULT: Swap = true; [[fallthrough]];
25128 case ISD::SETUGT: SSECC = 6; break;
25129 case ISD::SETO: SSECC = 7; break;
25130 case ISD::SETUEQ: SSECC = 8; break;
25131 case ISD::SETONE: SSECC = 12; break;
25132 }
25133 if (Swap)
25134 std::swap(Op0, Op1);
25135
25136 switch (SetCCOpcode) {
25137 default:
25138 IsAlwaysSignaling = true;
25139 break;
25140 case ISD::SETEQ:
25141 case ISD::SETOEQ:
25142 case ISD::SETUEQ:
25143 case ISD::SETNE:
25144 case ISD::SETONE:
25145 case ISD::SETUNE:
25146 case ISD::SETO:
25147 case ISD::SETUO:
25148 IsAlwaysSignaling = false;
25149 break;
25150 }
25151
25152 return SSECC;
25153}
25154
25155/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
25156/// concatenate the result back.
25157static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
25158 ISD::CondCode Cond, SelectionDAG &DAG,
25159 const SDLoc &dl) {
25160   assert(VT.isInteger() && VT == LHS.getValueType() &&
25161          VT == RHS.getValueType() && "Unsupported VTs!");
25162
25163 SDValue CC = DAG.getCondCode(Cond);
25164
25165 // Extract the LHS Lo/Hi vectors
25166 SDValue LHS1, LHS2;
25167 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
25168
25169 // Extract the RHS Lo/Hi vectors
25170 SDValue RHS1, RHS2;
25171 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
25172
25173 // Issue the operation on the smaller types and concatenate the result back
25174 EVT LoVT, HiVT;
25175 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
25176 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25177 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
25178 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
25179}
25180
25181static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
25182
25183 SDValue Op0 = Op.getOperand(0);
25184 SDValue Op1 = Op.getOperand(1);
25185 SDValue CC = Op.getOperand(2);
25186 MVT VT = Op.getSimpleValueType();
25187 SDLoc dl(Op);
25188
25189   assert(VT.getVectorElementType() == MVT::i1 &&
25190          "Cannot set masked compare for this operation");
25191
25192 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
25193
25194 // Prefer SETGT over SETLT.
25195 if (SetCCOpcode == ISD::SETLT) {
25196 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
25197 std::swap(Op0, Op1);
25198 }
25199
25200 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
25201}
25202
25203/// Given a buildvector constant, return a new vector constant with each element
25204/// incremented or decremented. If incrementing or decrementing would result in
25205/// unsigned overflow or underflow or this is not a simple vector constant,
25206/// return an empty value.
25207static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
25208 bool NSW) {
25209 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
25210 if (!BV || !V.getValueType().isSimple())
25211 return SDValue();
25212
25213 MVT VT = V.getSimpleValueType();
25214 MVT EltVT = VT.getVectorElementType();
25215 unsigned NumElts = VT.getVectorNumElements();
25216 SmallVector<SDValue, 8> NewVecC;
25217 SDLoc DL(V);
25218 for (unsigned i = 0; i < NumElts; ++i) {
25219 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
25220 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
25221 return SDValue();
25222
25223 // Avoid overflow/underflow.
25224 const APInt &EltC = Elt->getAPIntValue();
25225 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
25226 return SDValue();
25227 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
25228 (!IsInc && EltC.isMinSignedValue())))
25229 return SDValue();
25230
25231 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
25232 }
25233
25234 return DAG.getBuildVector(VT, DL, NewVecC);
25235}
25236
25237/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
25238/// Op0 u<= Op1:
25239/// t = psubus Op0, Op1
25240/// pcmpeq t, <0..0>
25241static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
25242 ISD::CondCode Cond, const SDLoc &dl,
25243 const X86Subtarget &Subtarget,
25244 SelectionDAG &DAG) {
25245 if (!Subtarget.hasSSE2())
25246 return SDValue();
25247
25248 MVT VET = VT.getVectorElementType();
25249 if (VET != MVT::i8 && VET != MVT::i16)
25250 return SDValue();
25251
25252 switch (Cond) {
25253 default:
25254 return SDValue();
25255 case ISD::SETULT: {
25256 // If the comparison is against a constant we can turn this into a
25257 // setule. With psubus, setule does not require a swap. This is
25258 // beneficial because the constant in the register is no longer
25259     // clobbered as the destination, so it can be hoisted out of a loop.
25260 // Only do this pre-AVX since vpcmp* is no longer destructive.
25261 if (Subtarget.hasAVX())
25262 return SDValue();
25263 SDValue ULEOp1 =
25264 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
25265 if (!ULEOp1)
25266 return SDValue();
25267 Op1 = ULEOp1;
25268 break;
25269 }
25270 case ISD::SETUGT: {
25271 // If the comparison is against a constant, we can turn this into a setuge.
25272 // This is beneficial because materializing a constant 0 for the PCMPEQ is
25273 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
25274 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
25275 SDValue UGEOp1 =
25276 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
25277 if (!UGEOp1)
25278 return SDValue();
25279 Op1 = Op0;
25280 Op0 = UGEOp1;
25281 break;
25282 }
25283 // Psubus is better than flip-sign because it requires no inversion.
25284 case ISD::SETUGE:
25285 std::swap(Op0, Op1);
25286 break;
25287 case ISD::SETULE:
25288 break;
25289 }
25290
25291 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
25292 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
25293 DAG.getConstant(0, dl, VT));
25294}
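// Illustrative sketch, not part of the LLVM source: the rewrite above relies
// on saturating subtraction hitting zero exactly when there is nothing left
// to subtract. Per scalar element:
#include <cstdint>

static bool uleViaUsubsat(uint8_t A, uint8_t B) {
  uint8_t Sat = A > B ? uint8_t(A - B) : uint8_t(0); // usubsat(A, B)
  return Sat == 0;                                   // equals (A u<= B)
}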
25295
25296static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
25297 SelectionDAG &DAG) {
25298 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25299 Op.getOpcode() == ISD::STRICT_FSETCCS;
25300 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25301 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25302 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
25303 MVT VT = Op->getSimpleValueType(0);
25304 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
25305 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
25306 SDLoc dl(Op);
25307
25308 if (isFP) {
25309 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
25310     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
25311 if (isSoftFP16(EltVT, Subtarget))
25312 return SDValue();
25313
25314 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25315 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25316
25317 // If we have a strict compare with a vXi1 result and the input is 128/256
25318 // bits we can't use a masked compare unless we have VLX. If we use a wider
25319 // compare like we do for non-strict, we might trigger spurious exceptions
25320     // from the upper elements. Instead emit an AVX compare and convert to mask.
25321 unsigned Opc;
25322 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
25323 (!IsStrict || Subtarget.hasVLX() ||
25324 Op0.getSimpleValueType().is512BitVector())) {
25325#ifndef NDEBUG
25326 unsigned Num = VT.getVectorNumElements();
25327     assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
25328#endif
25329 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
25330 } else {
25331 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
25332 // The SSE/AVX packed FP comparison nodes are defined with a
25333 // floating-point vector result that matches the operand type. This allows
25334 // them to work with an SSE1 target (integer vector types are not legal).
25335 VT = Op0.getSimpleValueType();
25336 }
25337
25338 SDValue Cmp;
25339 bool IsAlwaysSignaling;
25340 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
25341 if (!Subtarget.hasAVX()) {
25342 // TODO: We could use following steps to handle a quiet compare with
25343 // signaling encodings.
25344 // 1. Get ordered masks from a quiet ISD::SETO
25345 // 2. Use the masks to mask potential unordered elements in operand A, B
25346 // 3. Get the compare results of masked A, B
25347 // 4. Calculating final result using the mask and result from 3
25348 // But currently, we just fall back to scalar operations.
25349 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
25350 return SDValue();
25351
25352 // Insert an extra signaling instruction to raise exception.
25353 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
25354 SDValue SignalCmp = DAG.getNode(
25355 Opc, dl, {VT, MVT::Other},
25356 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
25357 // FIXME: It seems we need to update the flags of all new strict nodes.
25358 // Otherwise, mayRaiseFPException in MI will return false due to
25359 // NoFPExcept = false by default. However, I didn't find it in other
25360 // patches.
25361 SignalCmp->setFlags(Op->getFlags());
25362 Chain = SignalCmp.getValue(1);
25363 }
25364
25365 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
25366 // emit two comparisons and a logic op to tie them together.
25367 if (!cheapX86FSETCC_SSE(Cond)) {
25368 // LLVM predicate is SETUEQ or SETONE.
25369 unsigned CC0, CC1;
25370 unsigned CombineOpc;
25371 if (Cond == ISD::SETUEQ) {
25372 CC0 = 3; // UNORD
25373 CC1 = 0; // EQ
25374 CombineOpc = X86ISD::FOR;
25375 } else {
25376         assert(Cond == ISD::SETONE);
25377 CC0 = 7; // ORD
25378 CC1 = 4; // NEQ
25379 CombineOpc = X86ISD::FAND;
25380 }
25381
25382 SDValue Cmp0, Cmp1;
25383 if (IsStrict) {
25384 Cmp0 = DAG.getNode(
25385 Opc, dl, {VT, MVT::Other},
25386 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
25387 Cmp1 = DAG.getNode(
25388 Opc, dl, {VT, MVT::Other},
25389 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
25390 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
25391 Cmp1.getValue(1));
25392 } else {
25393 Cmp0 = DAG.getNode(
25394 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
25395 Cmp1 = DAG.getNode(
25396 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
25397 }
25398 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
25399 } else {
25400 if (IsStrict) {
25401 Cmp = DAG.getNode(
25402 Opc, dl, {VT, MVT::Other},
25403 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25404 Chain = Cmp.getValue(1);
25405 } else
25406 Cmp = DAG.getNode(
25407 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25408 }
25409 } else {
25410 // Handle all other FP comparisons here.
25411 if (IsStrict) {
25412 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
25413 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
25414 Cmp = DAG.getNode(
25415 Opc, dl, {VT, MVT::Other},
25416 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25417 Chain = Cmp.getValue(1);
25418 } else
25419 Cmp = DAG.getNode(
25420 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25421 }
25422
25423 if (VT.getFixedSizeInBits() >
25424 Op.getSimpleValueType().getFixedSizeInBits()) {
25425 // We emitted a compare with an XMM/YMM result. Finish converting to a
25426 // mask register using a vptestm.
25427 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
25428 Cmp = DAG.getBitcast(CastVT, Cmp);
25429 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
25430 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
25431 } else {
25432 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
25433 // the result type of SETCC. The bitcast is expected to be optimized
25434 // away during combining/isel.
25435 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
25436 }
25437
25438 if (IsStrict)
25439 return DAG.getMergeValues({Cmp, Chain}, dl);
25440
25441 return Cmp;
25442 }
25443
25444   assert(!IsStrict && "Strict SETCC only handles FP operands.");
25445
25446 MVT VTOp0 = Op0.getSimpleValueType();
25447 (void)VTOp0;
25448   assert(VTOp0 == Op1.getSimpleValueType() &&
25449          "Expected operands with same type!");
25450   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
25451          "Invalid number of packed elements for source and destination!");
25452
25453 // The non-AVX512 code below works under the assumption that source and
25454 // destination types are the same.
25455   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
25456          "Value types for source and destination must be the same!");
25457
25458 // The result is boolean, but operands are int/float
25459 if (VT.getVectorElementType() == MVT::i1) {
25460 // In AVX-512 architecture setcc returns mask with i1 elements,
25461 // But there is no compare instruction for i8 and i16 elements in KNL.
25462     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
25463            "Unexpected operand type");
25464 return LowerIntVSETCC_AVX512(Op, DAG);
25465 }
25466
25467 // Lower using XOP integer comparisons.
25468 if (VT.is128BitVector() && Subtarget.hasXOP()) {
25469 // Translate compare code to XOP PCOM compare mode.
25470 unsigned CmpMode = 0;
25471 switch (Cond) {
25472     default: llvm_unreachable("Unexpected SETCC condition");
25473 case ISD::SETULT:
25474 case ISD::SETLT: CmpMode = 0x00; break;
25475 case ISD::SETULE:
25476 case ISD::SETLE: CmpMode = 0x01; break;
25477 case ISD::SETUGT:
25478 case ISD::SETGT: CmpMode = 0x02; break;
25479 case ISD::SETUGE:
25480 case ISD::SETGE: CmpMode = 0x03; break;
25481 case ISD::SETEQ: CmpMode = 0x04; break;
25482 case ISD::SETNE: CmpMode = 0x05; break;
25483 }
25484
25485 // Are we comparing unsigned or signed integers?
25486 unsigned Opc =
25487 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25488
25489 return DAG.getNode(Opc, dl, VT, Op0, Op1,
25490 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25491 }
25492
25493 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25494 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25495 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25496 SDValue BC0 = peekThroughBitcasts(Op0);
25497 if (BC0.getOpcode() == ISD::AND) {
25498 APInt UndefElts;
25499 SmallVector<APInt, 64> EltBits;
25500 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25501 VT.getScalarSizeInBits(), UndefElts,
25502 EltBits, false, false)) {
25503 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25504 Cond = ISD::SETEQ;
25505 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25506 }
25507 }
25508 }
25509 }
25510
25511 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
25512 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25513 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25514 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25515 if (C1 && C1->getAPIntValue().isPowerOf2()) {
25516 unsigned BitWidth = VT.getScalarSizeInBits();
25517 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25518
25519 SDValue Result = Op0.getOperand(0);
25520 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25521 DAG.getConstant(ShiftAmt, dl, VT));
25522 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25523 DAG.getConstant(BitWidth - 1, dl, VT));
25524 return Result;
25525 }
25526 }
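// Illustrative sketch, not part of the LLVM source: for a vector setcc the
// "true" result is an all-ones lane, so the shift pair above just broadcasts
// the tested bit across the element. Per i32 lane with C = 8 (log2(C) = 3,
// so ShiftAmt = 32 - 3 - 1 = 28):
#include <cstdint>

static int32_t cmpEqAndMask8(int32_t X) {
  // Equivalent to ((X & 8) == 8) ? -1 : 0.
  return (int32_t)((uint32_t)X << 28) >> 31;
}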
25527
25528 // Break 256-bit integer vector compare into smaller ones.
25529 if (VT.is256BitVector() && !Subtarget.hasInt256())
25530 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25531
25532 // Break 512-bit integer vector compare into smaller ones.
25533 // TODO: Try harder to use VPCMPx + VPMOV2x?
25534 if (VT.is512BitVector())
25535 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25536
25537 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25538 // not-of-PCMPEQ:
25539 // X != INT_MIN --> X >s INT_MIN
25540 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25541 // +X != 0 --> +X >s 0
25542 APInt ConstValue;
25543 if (Cond == ISD::SETNE &&
25544 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25545 if (ConstValue.isMinSignedValue())
25546 Cond = ISD::SETGT;
25547 else if (ConstValue.isMaxSignedValue())
25548 Cond = ISD::SETLT;
25549 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25550 Cond = ISD::SETGT;
25551 }
25552
25553 // If both operands are known non-negative, then an unsigned compare is the
25554 // same as a signed compare and there's no need to flip signbits.
25555 // TODO: We could check for more general simplifications here since we're
25556 // computing known bits.
25557 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25558 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25559
25560 // Special case: Use min/max operations for unsigned compares.
25561 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25562 if (ISD::isUnsignedIntSetCC(Cond) &&
25563 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25564 TLI.isOperationLegal(ISD::UMIN, VT)) {
25565 // If we have a constant operand, increment/decrement it and change the
25566 // condition to avoid an invert.
25567 if (Cond == ISD::SETUGT) {
25568 // X > C --> X >= (C+1) --> X == umax(X, C+1)
25569 if (SDValue UGTOp1 =
25570 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25571 Op1 = UGTOp1;
25572 Cond = ISD::SETUGE;
25573 }
25574 }
25575 if (Cond == ISD::SETULT) {
25576 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25577 if (SDValue ULTOp1 =
25578 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25579 Op1 = ULTOp1;
25580 Cond = ISD::SETULE;
25581 }
25582 }
25583 bool Invert = false;
25584 unsigned Opc;
25585 switch (Cond) {
25586     default: llvm_unreachable("Unexpected condition code");
25587 case ISD::SETUGT: Invert = true; [[fallthrough]];
25588 case ISD::SETULE: Opc = ISD::UMIN; break;
25589 case ISD::SETULT: Invert = true; [[fallthrough]];
25590 case ISD::SETUGE: Opc = ISD::UMAX; break;
25591 }
25592
25593 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25594 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25595
25596 // If the logical-not of the result is required, perform that now.
25597 if (Invert)
25598 Result = DAG.getNOT(dl, Result, VT);
25599
25600 return Result;
25601 }
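// Illustrative sketch, not part of the LLVM source: the min/max rewrite above
// uses the identities X u<= Y <=> umin(X, Y) == X and X u>= Y <=> umax(X, Y)
// == X, with an extra invert for the strict u< / u> forms. Per lane:
#include <algorithm>
#include <cstdint>

static bool uleViaUmin(uint32_t X, uint32_t Y) {
  return std::min(X, Y) == X; // equals (X u<= Y)
}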
25602
25603 // Try to use SUBUS and PCMPEQ.
25604 if (FlipSigns)
25605 if (SDValue V =
25606 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25607 return V;
25608
25609 // We are handling one of the integer comparisons here. Since SSE only has
25610 // GT and EQ comparisons for integer, swapping operands and multiple
25611 // operations may be required for some comparisons.
25612 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25613 : X86ISD::PCMPGT;
25614 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25615 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25616 bool Invert = Cond == ISD::SETNE ||
25617 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25618
25619 if (Swap)
25620 std::swap(Op0, Op1);
25621
25622 // Check that the operation in question is available (most are plain SSE2,
25623 // but PCMPGTQ and PCMPEQQ have different requirements).
25624 if (VT == MVT::v2i64) {
25625 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25626       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25627
25628 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25629 // the odd elements over the even elements.
25630 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25631 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25632 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25633
25634 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25635 static const int MaskHi[] = { 1, 1, 3, 3 };
25636 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25637
25638 return DAG.getBitcast(VT, Result);
25639 }
25640
25641 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25642 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25643 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25644
25645 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25646 static const int MaskHi[] = { 1, 1, 3, 3 };
25647 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25648
25649 return DAG.getBitcast(VT, Result);
25650 }
25651
25652 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25653 // bits of the inputs before performing those operations. The lower
25654 // compare is always unsigned.
25655 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25656 : 0x0000000080000000ULL,
25657 dl, MVT::v2i64);
25658
25659 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25660 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25661
25662 // Cast everything to the right type.
25663 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25664 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25665
25666 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25667 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25668 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25669
25670 // Create masks for only the low parts/high parts of the 64 bit integers.
25671 static const int MaskHi[] = { 1, 1, 3, 3 };
25672 static const int MaskLo[] = { 0, 0, 2, 2 };
25673 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25674 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25675 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25676
25677 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25678 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25679
25680 if (Invert)
25681 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25682
25683 return DAG.getBitcast(VT, Result);
25684 }
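// Illustrative sketch, not part of the LLVM source: the scalar shape of the
// PCMPGTQ emulation above for a signed 64-bit compare. The high dwords are
// compared signed; the low dwords must be compared unsigned, and XORing them
// with 0x80000000 (the 0x0000000080000000 constant above) turns that unsigned
// compare into the signed one PCMPGT provides:
#include <cstdint>

static bool sgt64ViaDwords(int64_t A, int64_t B) {
  int32_t AHi = (int32_t)(A >> 32), BHi = (int32_t)(B >> 32);
  int32_t ALo = (int32_t)((uint32_t)A ^ 0x80000000u); // flip low-dword sign
  int32_t BLo = (int32_t)((uint32_t)B ^ 0x80000000u);
  return (AHi > BHi) || (AHi == BHi && ALo > BLo);    // == (A > B)
}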
25685
25686 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25687 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25688 // pcmpeqd + pshufd + pand.
25689       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25690
25691 // First cast everything to the right type.
25692 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25693 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25694
25695 // Do the compare.
25696 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25697
25698 // Make sure the lower and upper halves are both all-ones.
25699 static const int Mask[] = { 1, 0, 3, 2 };
25700 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25701 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25702
25703 if (Invert)
25704 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25705
25706 return DAG.getBitcast(VT, Result);
25707 }
25708 }
25709
25710 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25711 // bits of the inputs before performing those operations.
25712 if (FlipSigns) {
25713 MVT EltVT = VT.getVectorElementType();
25714 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25715 VT);
25716 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25717 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25718 }
25719
25720 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25721
25722 // If the logical-not of the result is required, perform that now.
25723 if (Invert)
25724 Result = DAG.getNOT(dl, Result, VT);
25725
25726 return Result;
25727}
25728
25729// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25730static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25731 const SDLoc &dl, SelectionDAG &DAG,
25732 const X86Subtarget &Subtarget,
25733 SDValue &X86CC) {
25734   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
25735
25736 // Must be a bitcast from vXi1.
25737 if (Op0.getOpcode() != ISD::BITCAST)
25738 return SDValue();
25739
25740 Op0 = Op0.getOperand(0);
25741 MVT VT = Op0.getSimpleValueType();
25742 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25743 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25744 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25745 return SDValue();
25746
25747 X86::CondCode X86Cond;
25748 if (isNullConstant(Op1)) {
25749 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25750 } else if (isAllOnesConstant(Op1)) {
25751 // C flag is set for all ones.
25752 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25753 } else
25754 return SDValue();
25755
25756   // If the input is an AND, we can combine its operands into the KTEST.
25757 bool KTestable = false;
25758 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25759 KTestable = true;
25760 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25761 KTestable = true;
25762 if (!isNullConstant(Op1))
25763 KTestable = false;
25764 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25765 SDValue LHS = Op0.getOperand(0);
25766 SDValue RHS = Op0.getOperand(1);
25767 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25768 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25769 }
25770
25771   // If the input is an OR, we can combine its operands into the KORTEST.
25772 SDValue LHS = Op0;
25773 SDValue RHS = Op0;
25774 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25775 LHS = Op0.getOperand(0);
25776 RHS = Op0.getOperand(1);
25777 }
25778
25779 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25780 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25781}
25782
25783/// Emit flags for the given setcc condition and operands. Also returns the
25784/// corresponding X86 condition code constant in X86CC.
25785SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25786 ISD::CondCode CC, const SDLoc &dl,
25787 SelectionDAG &DAG,
25788 SDValue &X86CC) const {
25789 // Equality Combines.
25790 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
25791 X86::CondCode X86CondCode;
25792
25793 // Optimize to BT if possible.
25794 // Lower (X & (1 << N)) == 0 to BT(X, N).
25795 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25796 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25797 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
25798 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25799 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25800 return BT;
25801 }
25802 }
25803
25804 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
25805 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
25806 X86CondCode)) {
25807 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25808 return CmpZ;
25809 }
25810
25811 // Try to lower using KORTEST or KTEST.
25812 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25813 return Test;
25814
25815 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
25816 // of these.
25817 if (isOneConstant(Op1) || isNullConstant(Op1)) {
25818 // If the input is a setcc, then reuse the input setcc or use a new one
25819 // with the inverted condition.
25820 if (Op0.getOpcode() == X86ISD::SETCC) {
25821 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25822
25823 X86CC = Op0.getOperand(0);
25824 if (Invert) {
25825 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25826 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
25827 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25828 }
25829
25830 return Op0.getOperand(1);
25831 }
25832 }
25833
25834     // Try to use the carry flag from the add in place of a separate CMP for:
25835 // (seteq (add X, -1), -1). Similar for setne.
25836 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25837 Op0.getOperand(1) == Op1) {
25838 if (isProfitableToUseFlagOp(Op0)) {
25839 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25840
25841 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25842 Op0.getOperand(1));
25843 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25844 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25845 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25846 return SDValue(New.getNode(), 1);
25847 }
25848 }
25849 }
25850
25851 X86::CondCode CondCode =
25852 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25853 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25854
25855 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25856 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25857 return EFLAGS;
25858}
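// A hedged sketch of the BT equality combine above (assuming the usual
// LowerAndToBT behaviour): for a scalar test such as
//   setcc (and %x, (1 << 7)), 0, seteq
// the lowering would typically produce
//   %flags = X86ISD::BT %x, 7        // CF = bit 7 of %x
// with X86CC = COND_AE (CF clear) for seteq, or COND_B for setne, so no
// separate AND + CMP pair is materialized.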
25859
25860SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25861
25862 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25863 Op.getOpcode() == ISD::STRICT_FSETCCS;
25864 MVT VT = Op->getSimpleValueType(0);
25865
25866 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25867
25868 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25869 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25870 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25871 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25872 SDLoc dl(Op);
25873 ISD::CondCode CC =
25874 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25875
25876 if (isSoftFP16(Op0.getValueType()))
25877 return SDValue();
25878
25879 // Handle f128 first, since one possible outcome is a normal integer
25880 // comparison which gets handled by emitFlagsForSetcc.
25881 if (Op0.getValueType() == MVT::f128) {
25882 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25883 Op.getOpcode() == ISD::STRICT_FSETCCS);
25884
25885 // If softenSetCCOperands returned a scalar, use it.
25886 if (!Op1.getNode()) {
25887 assert(Op0.getValueType() == Op.getValueType() &&
25888        "Unexpected setcc expansion!");
25889 if (IsStrict)
25890 return DAG.getMergeValues({Op0, Chain}, dl);
25891 return Op0;
25892 }
25893 }
25894
25895 if (Op0.getSimpleValueType().isInteger()) {
25896 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
25897 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
25898 // this may translate to fewer uops depending on the uarch implementation. The
25899 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25900 // canonicalize to that CondCode.
25901 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25902 // encoding size - so it must either already be an i8 or i32 immediate, or it
25903 // shrinks down to that. We don't do this for any i64's to avoid additional
25904 // constant materializations.
25905 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
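// Worked example for the canonicalization above (an illustrative sketch):
//   setugt %x, 15  ->  setuge %x, 16   (COND_A reads CF and ZF, COND_AE only CF)
//   setgt  %x, 15  ->  setge  %x, 16   (COND_G reads ZF/SF/OF, COND_GE only SF/OF)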
25906 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
25907 const APInt &Op1Val = Op1C->getAPIntValue();
25908 if (!Op1Val.isZero()) {
25909 // Ensure the constant+1 doesn't overflow.
25910 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
25911 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
25912 APInt Op1ValPlusOne = Op1Val + 1;
25913 if (Op1ValPlusOne.isSignedIntN(32) &&
25914 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
25915 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
25916 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
25917 : ISD::CondCode::SETUGE;
25918 }
25919 }
25920 }
25921 }
25922
25923 SDValue X86CC;
25924 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
25925 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25926 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25927 }
25928
25929 // Handle floating point.
25930 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
25931 if (CondCode == X86::COND_INVALID)
25932 return SDValue();
25933
25934 SDValue EFLAGS;
25935 if (IsStrict) {
25936 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25937 EFLAGS =
25938 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
25939 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
25940 Chain = EFLAGS.getValue(1);
25941 } else {
25942 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
25943 }
25944
25945 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25946 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25947 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25948}
25949
25950SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
25951 SDValue LHS = Op.getOperand(0);
25952 SDValue RHS = Op.getOperand(1);
25953 SDValue Carry = Op.getOperand(2);
25954 SDValue Cond = Op.getOperand(3);
25955 SDLoc DL(Op);
25956
25957 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
25958 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
25959
25960 // Recreate the carry if needed.
25961 EVT CarryVT = Carry.getValueType();
25962 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
25963 Carry, DAG.getAllOnesConstant(DL, CarryVT));
25964
25965 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
25966 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
25967 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
25968}
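// Worked example for the carry re-creation above (a sketch, not taken from the
// source): the incoming Carry operand is a 0/1 value, so ADD(Carry, -1) sets
// CF exactly when Carry == 1:
//   Carry = 1:  1 + 0xFF..FF wraps to 0        -> CF = 1
//   Carry = 0:  0 + 0xFF..FF = 0xFF..FF        -> CF = 0
// The following SBB then computes LHS - RHS - CF and the translated condition
// code is evaluated on its flags result.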
25969
25970// This function returns three things: the arithmetic computation itself
25971// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
25972// flag and the condition code define the case in which the arithmetic
25973// computation overflows.
25974static std::pair<SDValue, SDValue>
25975getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
25976 assert(Op.getResNo() == 0 && "Unexpected result number!");
25977 SDValue Value, Overflow;
25978 SDValue LHS = Op.getOperand(0);
25979 SDValue RHS = Op.getOperand(1);
25980 unsigned BaseOp = 0;
25981 SDLoc DL(Op);
25982 switch (Op.getOpcode()) {
25983 default: llvm_unreachable("Unknown ovf instruction!");
25984 case ISD::SADDO:
25985 BaseOp = X86ISD::ADD;
25986 Cond = X86::COND_O;
25987 break;
25988 case ISD::UADDO:
25989 BaseOp = X86ISD::ADD;
25990 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
25991 break;
25992 case ISD::SSUBO:
25993 BaseOp = X86ISD::SUB;
25994 Cond = X86::COND_O;
25995 break;
25996 case ISD::USUBO:
25997 BaseOp = X86ISD::SUB;
25998 Cond = X86::COND_B;
25999 break;
26000 case ISD::SMULO:
26001 BaseOp = X86ISD::SMUL;
26002 Cond = X86::COND_O;
26003 break;
26004 case ISD::UMULO:
26005 BaseOp = X86ISD::UMUL;
26006 Cond = X86::COND_O;
26007 break;
26008 }
26009
26010 if (BaseOp) {
26011 // Also sets EFLAGS.
26012 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
26013 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
26014 Overflow = Value.getValue(1);
26015 }
26016
26017 return std::make_pair(Value, Overflow);
26018}
26019
26020static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
26021 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
26022 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
26023 // looks for this combo and may remove the "setcc" instruction if the "setcc"
26024 // has only one use.
26025 SDLoc DL(Op);
26026 X86::CondCode Cond;
26027 SDValue Value, Overflow;
26028 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
26029
26030 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
26031 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
26032 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
26033}
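// Sketch of the resulting DAG for a typical case (illustrative only):
//   {%v, %ovf} = uaddo %a, %b
// becomes roughly
//   %t   = X86ISD::ADD %a, %b            // result 0 = value, result 1 = EFLAGS
//   %ovf = X86ISD::SETCC COND_B, %t:1    // CF set => unsigned overflow
//   merge_values %t:0, %ovf
// (for uaddo with a constant 1, getX86XALUOOp uses COND_E instead, since x + 1
// wraps exactly when the result is zero).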
26034
26035/// Return true if opcode is a X86 logical comparison.
26036static bool isX86LogicalCmp(SDValue Op) {
26037 unsigned Opc = Op.getOpcode();
26038 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
26039 Opc == X86ISD::FCMP)
26040 return true;
26041 if (Op.getResNo() == 1 &&
26042 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
26043 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
26044 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
26045 return true;
26046
26047 return false;
26048}
26049
26050static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
26051 if (V.getOpcode() != ISD::TRUNCATE)
26052 return false;
26053
26054 SDValue VOp0 = V.getOperand(0);
26055 unsigned InBits = VOp0.getValueSizeInBits();
26056 unsigned Bits = V.getValueSizeInBits();
26057 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
26058}
26059
26060SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
26061 bool AddTest = true;
26062 SDValue Cond = Op.getOperand(0);
26063 SDValue Op1 = Op.getOperand(1);
26064 SDValue Op2 = Op.getOperand(2);
26065 SDLoc DL(Op);
26066 MVT VT = Op1.getSimpleValueType();
26067 SDValue CC;
26068
26069 if (isSoftFP16(VT)) {
26070 MVT NVT = VT.changeTypeToInteger();
26071 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
26072 DAG.getBitcast(NVT, Op1),
26073 DAG.getBitcast(NVT, Op2)));
26074 }
26075
26076 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
26077 // are available or VBLENDV if AVX is available.
26078 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
26079 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
26080 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
26081 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
26082 bool IsAlwaysSignaling;
26083 unsigned SSECC =
26084 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
26085 CondOp0, CondOp1, IsAlwaysSignaling);
26086
26087 if (Subtarget.hasAVX512()) {
26088 SDValue Cmp =
26089 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
26090 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26091 assert(!VT.isVector() && "Not a scalar type?");
26092 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26093 }
26094
26095 if (SSECC < 8 || Subtarget.hasAVX()) {
26096 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
26097 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26098
26099 // If we have AVX, we can use a variable vector select (VBLENDV) instead
26100 // of 3 logic instructions for size savings and potentially speed.
26101 // Unfortunately, there is no scalar form of VBLENDV.
26102
26103 // If either operand is a +0.0 constant, don't try this. We can expect to
26104 // optimize away at least one of the logic instructions later in that
26105 // case, so that sequence would be faster than a variable blend.
26106
26107 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
26108 // uses XMM0 as the selection register. That may need just as many
26109 // instructions as the AND/ANDN/OR sequence due to register moves, so
26110 // don't bother.
26111 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
26112 !isNullFPConstant(Op2)) {
26113 // Convert to vectors, do a VSELECT, and convert back to scalar.
26114 // All of the conversions should be optimized away.
26115 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
26116 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
26117 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
26118 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
26119
26120 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
26121 VCmp = DAG.getBitcast(VCmpVT, VCmp);
26122
26123 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
26124
26125 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
26126 VSel, DAG.getIntPtrConstant(0, DL));
26127 }
26128 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
26129 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
26130 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
26131 }
26132 }
26133
26134 // AVX512 fallback is to lower selects of scalar floats to masked moves.
26135 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
26136 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
26137 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26138 }
26139
26140 if (Cond.getOpcode() == ISD::SETCC &&
26141 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
26142 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
26143 Cond = NewCond;
26144 // If the condition was updated, it's possible that the operands of the
26145 // select were also updated (for example, EmitTest has a RAUW). Refresh
26146 // the local references to the select operands in case they got stale.
26147 Op1 = Op.getOperand(1);
26148 Op2 = Op.getOperand(2);
26149 }
26150 }
26151
26152 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
26153 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
26154 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
26155 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
26156 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
26157 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
26158 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26159 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26160 if (Cond.getOpcode() == X86ISD::SETCC &&
26161 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
26162 isNullConstant(Cond.getOperand(1).getOperand(1))) {
26163 SDValue Cmp = Cond.getOperand(1);
26164 SDValue CmpOp0 = Cmp.getOperand(0);
26165 unsigned CondCode = Cond.getConstantOperandVal(0);
26166
26167 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
26168 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
26169 // handling to keep the CMP with 0. This should be removed by
26170 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
26171 // cttz_zero_undef.
26172 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
26173 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
26174 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
26175 };
26176 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
26177 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
26178 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
26179 // Keep Cmp.
26180 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26181 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
26182 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
26183 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
26184
26185 // 'X - 1' sets the carry flag if X == 0.
26186 // '0 - X' sets the carry flag if X != 0.
26187 // Convert the carry flag to a -1/0 mask with sbb:
26188 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
26189 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
26190 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
26191 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
26192 SDValue Sub;
26193 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
26194 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
26195 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
26196 } else {
26197 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
26198 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
26199 }
26200 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26201 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
26202 Sub.getValue(1));
26203 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
26204 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
26205 CmpOp0.getOpcode() == ISD::AND &&
26206 isOneConstant(CmpOp0.getOperand(1))) {
26207 SDValue Src1, Src2;
26208 // True if Op2 is an XOR or OR operator and one of its operands
26209 // is equal to Op1
26210 // ( a , a op b) || ( b , a op b)
26211 auto isOrXorPattern = [&]() {
26212 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
26213 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
26214 Src1 =
26215 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
26216 Src2 = Op1;
26217 return true;
26218 }
26219 return false;
26220 };
26221
26222 if (isOrXorPattern()) {
26223 SDValue Neg;
26224 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
26225 // We need a mask of all zeros or all ones with the same size as the other
26226 // operands.
26227 if (CmpSz > VT.getSizeInBits())
26228 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
26229 else if (CmpSz < VT.getSizeInBits())
26230 Neg = DAG.getNode(ISD::AND, DL, VT,
26231 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
26232 DAG.getConstant(1, DL, VT));
26233 else
26234 Neg = CmpOp0;
26235 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
26236 Neg); // -(and (x, 0x1))
26237 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
26238 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
26239 }
26240 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
26241 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
26242 ((CondCode == X86::COND_S) || // smin(x, 0)
26243 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
26244 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26245 //
26246 // If the comparison is testing for a positive value, we have to invert
26247 // the sign bit mask, so only do that transform if the target has a
26248 // bitwise 'and not' instruction (the invert is free).
26249 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26250 unsigned ShCt = VT.getSizeInBits() - 1;
26251 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
26252 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
26253 if (CondCode == X86::COND_G)
26254 Shift = DAG.getNOT(DL, Shift, VT);
26255 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
26256 }
26257 }
26258
26259 // Look past (and (setcc_carry (cmp ...)), 1).
26260 if (Cond.getOpcode() == ISD::AND &&
26261 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
26262 isOneConstant(Cond.getOperand(1)))
26263 Cond = Cond.getOperand(0);
26264
26265 // If condition flag is set by a X86ISD::CMP, then use it as the condition
26266 // setting operand in place of the X86ISD::SETCC.
26267 unsigned CondOpcode = Cond.getOpcode();
26268 if (CondOpcode == X86ISD::SETCC ||
26269 CondOpcode == X86ISD::SETCC_CARRY) {
26270 CC = Cond.getOperand(0);
26271
26272 SDValue Cmp = Cond.getOperand(1);
26273 bool IllegalFPCMov = false;
26274 if (VT.isFloatingPoint() && !VT.isVector() &&
26275 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
26276 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
26277
26278 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
26279 Cmp.getOpcode() == X86ISD::BT) { // FIXME
26280 Cond = Cmp;
26281 AddTest = false;
26282 }
26283 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
26284 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
26285 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
26286 SDValue Value;
26287 X86::CondCode X86Cond;
26288 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26289
26290 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
26291 AddTest = false;
26292 }
26293
26294 if (AddTest) {
26295 // Look past the truncate if the high bits are known zero.
26296 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26297 Cond = Cond.getOperand(0);
26298
26299 // We know the result of AND is compared against zero. Try to match
26300 // it to BT.
26301 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
26302 X86::CondCode X86CondCode;
26303 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
26304 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
26305 Cond = BT;
26306 AddTest = false;
26307 }
26308 }
26309 }
26310
26311 if (AddTest) {
26312 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
26313 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
26314 }
26315
26316 // a < b ? -1 : 0 -> RES = ~setcc_carry
26317 // a < b ? 0 : -1 -> RES = setcc_carry
26318 // a >= b ? -1 : 0 -> RES = setcc_carry
26319 // a >= b ? 0 : -1 -> RES = ~setcc_carry
26320 if (Cond.getOpcode() == X86ISD::SUB) {
26321 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
26322
26323 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
26324 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26325 (isNullConstant(Op1) || isNullConstant(Op2))) {
26326 SDValue Res =
26327 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
26328 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
26329 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
26330 return DAG.getNOT(DL, Res, Res.getValueType());
26331 return Res;
26332 }
26333 }
26334
26335 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
26336 // widen the cmov and push the truncate through. This avoids introducing a new
26337 // branch during isel and doesn't add any extensions.
26338 if (Op.getValueType() == MVT::i8 &&
26339 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
26340 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
26341 if (T1.getValueType() == T2.getValueType() &&
26342 // Exclude CopyFromReg to avoid partial register stalls.
26343 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
26344 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
26345 CC, Cond);
26346 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26347 }
26348 }
26349
26350 // Or finally, promote i8 cmovs if we have CMOV,
26351 // or i16 cmovs if it won't prevent folding a load.
26352 // FIXME: we should not limit promotion of the i8 case to only when the CMOV is
26353 // legal, but EmitLoweredSelect() cannot deal with these extensions
26354 // being inserted between two CMOVs. (in the i16 case too TBN)
26355 // https://bugs.llvm.org/show_bug.cgi?id=40974
26356 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
26357 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
26358 !X86::mayFoldLoad(Op2, Subtarget))) {
26359 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
26360 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
26361 SDValue Ops[] = { Op2, Op1, CC, Cond };
26362 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
26363 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26364 }
26365
26366 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
26367 // the condition is true.
26368 SDValue Ops[] = { Op2, Op1, CC, Cond };
26369 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
26370}
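// Pseudo-asm sketch of the sbb-based select transform handled above (hedged;
// actual instruction selection and register choices may differ):
//   select (x == 0), -1, y
//   ->  t = x - 1           // CF = 1 iff x == 0
//       m = sbb(m, m)       // m = CF ? -1 : 0   (SETCC_CARRY)
//       r = m | y           // r = -1 when x == 0, otherwise y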
26371
26372static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
26373 const X86Subtarget &Subtarget,
26374 SelectionDAG &DAG) {
26375 MVT VT = Op->getSimpleValueType(0);
26376 SDValue In = Op->getOperand(0);
26377 MVT InVT = In.getSimpleValueType();
26378 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
26379 MVT VTElt = VT.getVectorElementType();
26380 SDLoc dl(Op);
26381
26382 unsigned NumElts = VT.getVectorNumElements();
26383
26384 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
26385 MVT ExtVT = VT;
26386 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
26387 // If v16i32 is to be avoided, we'll need to split and concatenate.
26388 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
26389 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
26390
26391 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
26392 }
26393
26394 // Widen to 512-bits if VLX is not supported.
26395 MVT WideVT = ExtVT;
26396 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
26397 NumElts *= 512 / ExtVT.getSizeInBits();
26398 InVT = MVT::getVectorVT(MVT::i1, NumElts);
26399 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
26400 In, DAG.getIntPtrConstant(0, dl));
26401 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
26402 }
26403
26404 SDValue V;
26405 MVT WideEltVT = WideVT.getVectorElementType();
26406 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
26407 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
26408 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
26409 } else {
26410 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
26411 SDValue Zero = DAG.getConstant(0, dl, WideVT);
26412 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
26413 }
26414
26415 // Truncate if we had to extend i16/i8 above.
26416 if (VT != ExtVT) {
26417 WideVT = MVT::getVectorVT(VTElt, NumElts);
26418 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
26419 }
26420
26421 // Extract back to 128/256-bit if we widened.
26422 if (WideVT != VT)
26423 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
26424 DAG.getIntPtrConstant(0, dl));
26425
26426 return V;
26427}
26428
26429static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26430 SelectionDAG &DAG) {
26431 SDValue In = Op->getOperand(0);
26432 MVT InVT = In.getSimpleValueType();
26433
26434 if (InVT.getVectorElementType() == MVT::i1)
26435 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26436
26437 assert(Subtarget.hasAVX() && "Expected AVX support");
26438 return LowerAVXExtend(Op, DAG, Subtarget);
26439}
26440
26441// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
26442// For sign extend this needs to handle all vector sizes and SSE4.1 and
26443// non-SSE4.1 targets. For zero extend this should only handle inputs of
26444// MVT::v64i8 when BWI is not supported, but AVX512 is.
26445static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
26446 const X86Subtarget &Subtarget,
26447 SelectionDAG &DAG) {
26448 SDValue In = Op->getOperand(0);
26449 MVT VT = Op->getSimpleValueType(0);
26450 MVT InVT = In.getSimpleValueType();
26451
26452 MVT SVT = VT.getVectorElementType();
26453 MVT InSVT = InVT.getVectorElementType();
26454 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
26455
26456 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
26457 return SDValue();
26458 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
26459 return SDValue();
26460 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
26461 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
26462 !(VT.is512BitVector() && Subtarget.hasAVX512()))
26463 return SDValue();
26464
26465 SDLoc dl(Op);
26466 unsigned Opc = Op.getOpcode();
26467 unsigned NumElts = VT.getVectorNumElements();
26468
26469 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
26470 // For 512-bit vectors, we need 128-bits or 256-bits.
26471 if (InVT.getSizeInBits() > 128) {
26472 // Input needs to be at least the same number of elements as output, and
26473 // at least 128-bits.
26474 int InSize = InSVT.getSizeInBits() * NumElts;
26475 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
26476 InVT = In.getSimpleValueType();
26477 }
26478
26479 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26480 // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
26481 // need to be handled here for 256/512-bit results.
26482 if (Subtarget.hasInt256()) {
26483 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26484
26485 if (InVT.getVectorNumElements() != NumElts)
26486 return DAG.getNode(Op.getOpcode(), dl, VT, In);
26487
26488 // FIXME: Apparently we create inreg operations that could be regular
26489 // extends.
26490 unsigned ExtOpc =
26491 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26492 : ISD::ZERO_EXTEND;
26493 return DAG.getNode(ExtOpc, dl, VT, In);
26494 }
26495
26496 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26497 if (Subtarget.hasAVX()) {
26498 assert(VT.is256BitVector() && "256-bit vector expected");
26499 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26500 int HalfNumElts = HalfVT.getVectorNumElements();
26501
26502 unsigned NumSrcElts = InVT.getVectorNumElements();
26503 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26504 for (int i = 0; i != HalfNumElts; ++i)
26505 HiMask[i] = HalfNumElts + i;
26506
26507 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26508 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26509 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26510 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26511 }
26512
26513 // We should only get here for sign extend.
26514 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26515 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26516
26517 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26518 SDValue Curr = In;
26519 SDValue SignExt = Curr;
26520
26521 // As SRAI is only available on i16/i32 types, we expand only up to i32
26522 // and handle i64 separately.
26523 if (InVT != MVT::v4i32) {
26524 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26525
26526 unsigned DestWidth = DestVT.getScalarSizeInBits();
26527 unsigned Scale = DestWidth / InSVT.getSizeInBits();
26528
26529 unsigned InNumElts = InVT.getVectorNumElements();
26530 unsigned DestElts = DestVT.getVectorNumElements();
26531
26532 // Build a shuffle mask that takes each input element and places it in the
26533 // MSBs of the new element size.
26534 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26535 for (unsigned i = 0; i != DestElts; ++i)
26536 Mask[i * Scale + (Scale - 1)] = i;
26537
26538 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26539 Curr = DAG.getBitcast(DestVT, Curr);
26540
26541 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26542 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26543 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26544 }
26545
26546 if (VT == MVT::v2i64) {
26547 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26548 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26549 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26550 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26551 SignExt = DAG.getBitcast(VT, SignExt);
26552 }
26553
26554 return SignExt;
26555}
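// Concrete pre-SSE4.1 example of the path above (an illustrative sketch): for
// sign_extend_vector_inreg v16i8 -> v4i32, the shuffle places source byte i in
// the high byte of dword i (mask element i*4+3 = i), the result is bitcast to
// v4i32, and a VSRAI by 24 then replicates each sign bit across the upper 24
// bits of its element.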
26556
26557static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26558 SelectionDAG &DAG) {
26559 MVT VT = Op->getSimpleValueType(0);
26560 SDValue In = Op->getOperand(0);
26561 MVT InVT = In.getSimpleValueType();
26562 SDLoc dl(Op);
26563
26564 if (InVT.getVectorElementType() == MVT::i1)
26565 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26566
26567 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26568 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26569        "Expected same number of elements");
26570 assert((VT.getVectorElementType() == MVT::i16 ||
26571         VT.getVectorElementType() == MVT::i32 ||
26572         VT.getVectorElementType() == MVT::i64) &&
26573        "Unexpected element type");
26574 assert((InVT.getVectorElementType() == MVT::i8 ||
26575         InVT.getVectorElementType() == MVT::i16 ||
26576         InVT.getVectorElementType() == MVT::i32) &&
26577        "Unexpected element type");
26578
26579 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26580 assert(InVT == MVT::v32i8 && "Unexpected VT!");
26581 return splitVectorIntUnary(Op, DAG);
26582 }
26583
26584 if (Subtarget.hasInt256())
26585 return Op;
26586
26587 // Optimize vectors in AVX mode:
26588 // sign extend v8i16 to v8i32 and
26589 // v4i32 to v4i64.
26590 //
26591 // Divide the input vector into two parts;
26592 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
26593 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
26594 // then concat the vectors back to the original VT.
26595 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26596 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26597
26598 unsigned NumElems = InVT.getVectorNumElements();
26599 SmallVector<int,8> ShufMask(NumElems, -1);
26600 for (unsigned i = 0; i != NumElems/2; ++i)
26601 ShufMask[i] = i + NumElems/2;
26602
26603 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26604 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26605
26606 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26607}
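// AVX1 example of the split above (sketch): for sign_extend v8i16 -> v8i32,
// OpLo sign-extends elements 0..3 in place, the shuffle mask
// {4,5,6,7,-1,-1,-1,-1} moves the high half down, OpHi sign-extends it, and
// the two v4i32 halves are concatenated back into a v8i32.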
26608
26609/// Change a vector store into a pair of half-size vector stores.
26610static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26611 SDValue StoredVal = Store->getValue();
26612 assert((StoredVal.getValueType().is256BitVector() ||
26613         StoredVal.getValueType().is512BitVector()) &&
26614        "Expecting 256/512-bit op");
26615
26616 // Splitting volatile memory ops is not allowed unless the operation was not
26617 // legal to begin with. Assume the input store is legal (this transform is
26618 // only used for targets with AVX). Note: It is possible that we have an
26619 // illegal type like v2i128, and so we could allow splitting a volatile store
26620 // in that case if that is important.
26621 if (!Store->isSimple())
26622 return SDValue();
26623
26624 SDLoc DL(Store);
26625 SDValue Value0, Value1;
26626 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26627 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26628 SDValue Ptr0 = Store->getBasePtr();
26629 SDValue Ptr1 =
26630 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26631 SDValue Ch0 =
26632 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26633 Store->getOriginalAlign(),
26634 Store->getMemOperand()->getFlags());
26635 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26636 Store->getPointerInfo().getWithOffset(HalfOffset),
26637 Store->getOriginalAlign(),
26638 Store->getMemOperand()->getFlags());
26639 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26640}
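// For instance (hedged sketch): a 32-byte store of a 256-bit value becomes two
// 16-byte stores, Value0 at the original pointer and Value1 at pointer + 16,
// joined by a TokenFactor so both store chains are preserved.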
26641
26642/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26643/// type.
26644static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26645 SelectionDAG &DAG) {
26646 SDValue StoredVal = Store->getValue();
26647 assert(StoreVT.is128BitVector() &&
26648        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26649 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26650
26651 // Splitting volatile memory ops is not allowed unless the operation was not
26652 // legal to begin with. We are assuming the input op is legal (this transform
26653 // is only used for targets with AVX).
26654 if (!Store->isSimple())
26655 return SDValue();
26656
26657 MVT StoreSVT = StoreVT.getScalarType();
26658 unsigned NumElems = StoreVT.getVectorNumElements();
26659 unsigned ScalarSize = StoreSVT.getStoreSize();
26660
26661 SDLoc DL(Store);
26662 SmallVector<SDValue, 4> Stores;
26663 for (unsigned i = 0; i != NumElems; ++i) {
26664 unsigned Offset = i * ScalarSize;
26665 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26666 TypeSize::Fixed(Offset), DL);
26667 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26668 DAG.getIntPtrConstant(i, DL));
26669 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26670 Store->getPointerInfo().getWithOffset(Offset),
26671 Store->getOriginalAlign(),
26672 Store->getMemOperand()->getFlags());
26673 Stores.push_back(Ch);
26674 }
26675 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26676}
26677
26678static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26679 SelectionDAG &DAG) {
26680 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26681 SDLoc dl(St);
26682 SDValue StoredVal = St->getValue();
26683
26684 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26685 if (StoredVal.getValueType().isVector() &&
26686 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26687 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26688 assert(NumElts <= 8 && "Unexpected VT");
26689 assert(!St->isTruncatingStore() && "Expected non-truncating store");
26690 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26691        "Expected AVX512F without AVX512DQI");
26692
26693 // We must pad with zeros to ensure we store zeroes to any unused bits.
26694 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26695 DAG.getUNDEF(MVT::v16i1), StoredVal,
26696 DAG.getIntPtrConstant(0, dl));
26697 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26698 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26699 // Make sure we store zeros in the extra bits.
26700 if (NumElts < 8)
26701 StoredVal = DAG.getZeroExtendInReg(
26702 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26703
26704 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26705 St->getPointerInfo(), St->getOriginalAlign(),
26706 St->getMemOperand()->getFlags());
26707 }
26708
26709 if (St->isTruncatingStore())
26710 return SDValue();
26711
26712 // If this is a 256-bit store of concatenated ops, we are better off splitting
26713 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26714 // and each half can execute independently. Some cores would split the op into
26715 // halves anyway, so the concat (vinsertf128) is purely an extra op.
26716 MVT StoreVT = StoredVal.getSimpleValueType();
26717 if (StoreVT.is256BitVector() ||
26718 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26719 !Subtarget.hasBWI())) {
26720 SmallVector<SDValue, 4> CatOps;
26721 if (StoredVal.hasOneUse() &&
26722 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26723 return splitVectorStore(St, DAG);
26724 return SDValue();
26725 }
26726
26727 if (StoreVT.is32BitVector())
26728 return SDValue();
26729
26730 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26731 assert(StoreVT.is64BitVector() && "Unexpected VT");
26732 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26733        TargetLowering::TypeWidenVector &&
26734        "Unexpected type action!");
26735
26736 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26737 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26738 DAG.getUNDEF(StoreVT));
26739
26740 if (Subtarget.hasSSE2()) {
26741 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26742 // and store it.
26743 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26744 MVT CastVT = MVT::getVectorVT(StVT, 2);
26745 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26746 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26747 DAG.getIntPtrConstant(0, dl));
26748
26749 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26750 St->getPointerInfo(), St->getOriginalAlign(),
26751 St->getMemOperand()->getFlags());
26752 }
26753 assert(Subtarget.hasSSE1() && "Expected SSE");
26754 SDVTList Tys = DAG.getVTList(MVT::Other);
26755 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26756 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26757 St->getMemOperand());
26758}
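// Example of the v*i1 path above (illustrative): a v4i1 store is widened by
// insert_subvector into a v16i1, bitcast to i16, truncated to i8, and then
// zero-extended in-register from i4 so bits 4..7 of the stored byte are
// guaranteed to be zero; the final operation is a plain i8 store.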
26759
26760// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26761// may emit an illegal shuffle but the expansion is still better than scalar
26762 // code. We generate sext/sext_invec for SEXTLOADs if available, otherwise
26763 // we'll emit a shuffle and an arithmetic shift.
26764// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26765// TODO: It is possible to support ZExt by zeroing the undef values during
26766// the shuffle phase or after the shuffle.
26767static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26768 SelectionDAG &DAG) {
26769 MVT RegVT = Op.getSimpleValueType();
26770 assert(RegVT.isVector() && "We only custom lower vector loads.");
26771 assert(RegVT.isInteger() &&
26772        "We only custom lower integer vector loads.");
26773
26774 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26775 SDLoc dl(Ld);
26776
26777 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26778 if (RegVT.getVectorElementType() == MVT::i1) {
26779 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26780 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26781 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26782        "Expected AVX512F without AVX512DQI");
26783
26784 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26785 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26786 Ld->getMemOperand()->getFlags());
26787
26788 // Replace chain users with the new chain.
26789 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26790
26791 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26792 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26793 DAG.getBitcast(MVT::v16i1, Val),
26794 DAG.getIntPtrConstant(0, dl));
26795 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26796 }
26797
26798 return SDValue();
26799}
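// Example of the i1-vector path above (a sketch): a v8i1 load becomes an i8
// scalar load, an any_extend to i16, a bitcast to v16i1, and an
// extract_subvector of the low 8 lanes; the new load's chain is returned
// alongside the value via getMergeValues.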
26800
26801/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26802/// each of which has no other use apart from the AND / OR.
26803static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26804 Opc = Op.getOpcode();
26805 if (Opc != ISD::OR && Opc != ISD::AND)
26806 return false;
26807 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26808 Op.getOperand(0).hasOneUse() &&
26809 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26810 Op.getOperand(1).hasOneUse());
26811}
26812
26813SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26814 SDValue Chain = Op.getOperand(0);
26815 SDValue Cond = Op.getOperand(1);
26816 SDValue Dest = Op.getOperand(2);
26817 SDLoc dl(Op);
26818
26819 // Bail out when we don't have native compare instructions.
26820 if (Cond.getOpcode() == ISD::SETCC &&
26821 Cond.getOperand(0).getValueType() != MVT::f128 &&
26822 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26823 SDValue LHS = Cond.getOperand(0);
26824 SDValue RHS = Cond.getOperand(1);
26825 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26826
26827 // Special case for
26828 // setcc([su]{add,sub,mul}o == 0)
26829 // setcc([su]{add,sub,mul}o != 1)
26830 if (ISD::isOverflowIntrOpRes(LHS) &&
26831 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26832 (isNullConstant(RHS) || isOneConstant(RHS))) {
26833 SDValue Value, Overflow;
26834 X86::CondCode X86Cond;
26835 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26836
26837 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26838 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
26839
26840 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26841 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26842 Overflow);
26843 }
26844
26845 if (LHS.getSimpleValueType().isInteger()) {
26846 SDValue CCVal;
26847 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26848 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26849 EFLAGS);
26850 }
26851
26852 if (CC == ISD::SETOEQ) {
26853 // For FCMP_OEQ, we can emit
26854 // two branches instead of an explicit AND instruction with a
26855 // separate test. However, we only do this if this block doesn't
26856 // have a fall-through edge, because this requires an explicit
26857 // jmp when the condition is false.
26858 if (Op.getNode()->hasOneUse()) {
26859 SDNode *User = *Op.getNode()->use_begin();
26860 // Look for an unconditional branch following this conditional branch.
26861 // We need this because we need to reverse the successors in order
26862 // to implement FCMP_OEQ.
26863 if (User->getOpcode() == ISD::BR) {
26864 SDValue FalseBB = User->getOperand(1);
26865 SDNode *NewBR =
26866 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26867           assert(NewBR == User);
26868 (void)NewBR;
26869 Dest = FalseBB;
26870
26871 SDValue Cmp =
26872 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26873 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26874 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26875 CCVal, Cmp);
26876 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26877 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26878 Cmp);
26879 }
26880 }
26881 } else if (CC == ISD::SETUNE) {
26882 // For FCMP_UNE, we can emit
26883 // two branches instead of an explicit OR instruction with a
26884 // separate test.
26885 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26886 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26887 Chain =
26888 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26889 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26890 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26891 Cmp);
26892 } else {
26893 X86::CondCode X86Cond =
26894 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26895 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26896 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26897 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26898 Cmp);
26899 }
26900 }
26901
26902 if (ISD::isOverflowIntrOpRes(Cond)) {
26903 SDValue Value, Overflow;
26904 X86::CondCode X86Cond;
26905 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26906
26907 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26908 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26909 Overflow);
26910 }
26911
26912 // Look past the truncate if the high bits are known zero.
26913 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26914 Cond = Cond.getOperand(0);
26915
26916 EVT CondVT = Cond.getValueType();
26917
26918 // Add an AND with 1 if we don't already have one.
26919 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
26920 Cond =
26921 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
26922
26923 SDValue LHS = Cond;
26924 SDValue RHS = DAG.getConstant(0, dl, CondVT);
26925
26926 SDValue CCVal;
26927 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
26928 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26929 EFLAGS);
26930}
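
The FCMP_OEQ/FCMP_UNE paths above lean on how the x86 unordered compare sets EFLAGS. As a minimal standalone sketch of that flag logic (an illustrative aside, not part of X86ISelLowering.cpp):

#include <cmath>

// UCOMISS/UCOMISD flag results:
//   unordered (a NaN operand): ZF=1, PF=1, CF=1
//   greater:                   ZF=0, PF=0, CF=0
//   less:                      ZF=0, PF=0, CF=1
//   equal:                     ZF=1, PF=0, CF=0
// FCMP_OEQ ("ordered and equal") is ZF && !PF, so its negation !ZF || PF is
// exactly the JNE-then-JP pair emitted toward the false block above, and
// FCMP_UNE reuses the same pair toward the true block.
static bool fcmpOEQFromFlags(bool ZF, bool PF) { return ZF && !PF; }

static bool fcmpOEQ(double A, double B) {
  bool Unordered = std::isnan(A) || std::isnan(B);
  return fcmpOEQFromFlags(/*ZF=*/Unordered || A == B, /*PF=*/Unordered);
}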
26931
26932// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
26933// Calls to _alloca are needed to probe the stack when allocating more than 4k
26934// bytes in one go. Touching the stack at 4K increments is necessary to ensure
26935// that the guard pages used by the OS virtual memory manager are allocated in
26936// correct sequence.
26937SDValue
26938X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
26939 SelectionDAG &DAG) const {
26940 MachineFunction &MF = DAG.getMachineFunction();
26941 bool SplitStack = MF.shouldSplitStack();
26942 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
26943 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
26944 SplitStack || EmitStackProbeCall;
26945 SDLoc dl(Op);
26946
26947 // Get the inputs.
26948 SDNode *Node = Op.getNode();
26949 SDValue Chain = Op.getOperand(0);
26950 SDValue Size = Op.getOperand(1);
26951 MaybeAlign Alignment(Op.getConstantOperandVal(2));
26952 EVT VT = Node->getValueType(0);
26953
26954 // Chain the dynamic stack allocation so that it doesn't modify the stack
26955 // pointer when other instructions are using the stack.
26956 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
26957
26958 bool Is64Bit = Subtarget.is64Bit();
26959 MVT SPTy = getPointerTy(DAG.getDataLayout());
26960
26961 SDValue Result;
26962 if (!Lower) {
26963 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26964 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
26965     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
26966                     " not tell us which reg is the stack pointer!");
26967
26968 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26969 const Align StackAlign = TFI.getStackAlign();
26970 if (hasInlineStackProbe(MF)) {
26971 MachineRegisterInfo &MRI = MF.getRegInfo();
26972
26973 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26974 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
26975 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
26976 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
26977 DAG.getRegister(Vreg, SPTy));
26978 } else {
26979 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
26980 Chain = SP.getValue(1);
26981 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
26982 }
26983 if (Alignment && *Alignment > StackAlign)
26984 Result =
26985 DAG.getNode(ISD::AND, dl, VT, Result,
26986 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
26987 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
26988 } else if (SplitStack) {
26989 MachineRegisterInfo &MRI = MF.getRegInfo();
26990
26991 if (Is64Bit) {
26992 // The 64-bit implementation of segmented stacks needs to clobber both
26993 // r10 and r11. This makes it impossible to use it along with nested parameters.
26994 const Function &F = MF.getFunction();
26995 for (const auto &A : F.args()) {
26996 if (A.hasNestAttr())
26997 report_fatal_error("Cannot use segmented stacks with functions that "
26998 "have nested arguments.");
26999 }
27000 }
27001
27002 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27003 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27004 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27005 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
27006 DAG.getRegister(Vreg, SPTy));
27007 } else {
27008 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
27009 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
27010 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
27011
27012 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27013 Register SPReg = RegInfo->getStackRegister();
27014 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
27015 Chain = SP.getValue(1);
27016
27017 if (Alignment) {
27018 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
27019 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27020 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
27021 }
27022
27023 Result = SP;
27024 }
27025
27026 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
27027
27028 SDValue Ops[2] = {Result, Chain};
27029 return DAG.getMergeValues(Ops, dl);
27030}
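
The ISD::AND with ~(Alignment->value() - 1ULL) used in both branches above is the standard power-of-two align-down mask applied to the freshly adjusted stack pointer. A small sketch of the arithmetic, assuming a power-of-two alignment (illustrative only):

#include <cassert>
#include <cstdint>

// Rounds Ptr down to a multiple of Align; mirrors the mask applied to the
// SP - Size result for overaligned dynamic allocas.
static uint64_t alignDown(uint64_t Ptr, uint64_t Align) {
  assert(Align && (Align & (Align - 1)) == 0 && "power of two expected");
  return Ptr & ~(Align - 1);
}
// e.g. alignDown(0x7fffffffe3c7, 32) == 0x7fffffffe3c0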
27031
27032SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
27033 MachineFunction &MF = DAG.getMachineFunction();
27034 auto PtrVT = getPointerTy(MF.getDataLayout());
27035 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27036
27037 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27038 SDLoc DL(Op);
27039
27040 if (!Subtarget.is64Bit() ||
27041 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
27042 // vastart just stores the address of the VarArgsFrameIndex slot into the
27043 // memory location argument.
27044 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27045 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
27046 MachinePointerInfo(SV));
27047 }
27048
27049 // __va_list_tag:
27050 // gp_offset (0 - 6 * 8)
27051 // fp_offset (48 - 48 + 8 * 16)
27052 // overflow_arg_area (point to parameters coming in memory).
27053 // reg_save_area
27054 SmallVector<SDValue, 8> MemOps;
27055 SDValue FIN = Op.getOperand(1);
27056 // Store gp_offset
27057 SDValue Store = DAG.getStore(
27058 Op.getOperand(0), DL,
27059 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
27060 MachinePointerInfo(SV));
27061 MemOps.push_back(Store);
27062
27063 // Store fp_offset
27064 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
27065 Store = DAG.getStore(
27066 Op.getOperand(0), DL,
27067 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
27068 MachinePointerInfo(SV, 4));
27069 MemOps.push_back(Store);
27070
27071 // Store ptr to overflow_arg_area
27072 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
27073 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27074 Store =
27075 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
27076 MemOps.push_back(Store);
27077
27078 // Store ptr to reg_save_area.
27079 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
27080 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
27081 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
27082 Store = DAG.getStore(
27083 Op.getOperand(0), DL, RSFIN, FIN,
27084 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
27085 MemOps.push_back(Store);
27086 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
27087}
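
The four stores above fill in the System V x86-64 __va_list_tag. A reference sketch of that layout (field names follow the ABI; offsets and size match the LP64 path here, and the 24-byte size is also the length LowerVACOPY copies below):

#include <cstdint>

// System V AMD64 va_list element. LowerVASTART writes, in order:
//   gp_offset at +0, fp_offset at +4, overflow_arg_area at +8,
//   reg_save_area at +16 (LP64).
struct VaListTag {
  uint32_t gp_offset;      // 0 .. 48, stepped by 8 (6 integer registers)
  uint32_t fp_offset;      // 48 .. 48 + 8 * 16 (8 vector registers)
  void *overflow_arg_area; // arguments that were passed on the stack
  void *reg_save_area;     // spilled register arguments
};
static_assert(sizeof(VaListTag) == 24, "LP64 layout");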
27088
27089SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
27090   assert(Subtarget.is64Bit() &&
27091          "LowerVAARG only handles 64-bit va_arg!");
27092   assert(Op.getNumOperands() == 4);
27093
27094 MachineFunction &MF = DAG.getMachineFunction();
27095 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
27096 // The Win64 ABI uses char* instead of a structure.
27097 return DAG.expandVAArg(Op.getNode());
27098
27099 SDValue Chain = Op.getOperand(0);
27100 SDValue SrcPtr = Op.getOperand(1);
27101 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27102 unsigned Align = Op.getConstantOperandVal(3);
27103 SDLoc dl(Op);
27104
27105 EVT ArgVT = Op.getNode()->getValueType(0);
27106 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27107 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
27108 uint8_t ArgMode;
27109
27110 // Decide which area this value should be read from.
27111 // TODO: Implement the AMD64 ABI in its entirety. This simple
27112 // selection mechanism works only for the basic types.
27113   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
27114 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
27115 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
27116 } else {
27117     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
27118            "Unhandled argument type in LowerVAARG");
27119 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
27120 }
27121
27122 if (ArgMode == 2) {
27123 // Make sure using fp_offset makes sense.
27124     assert(!Subtarget.useSoftFloat() &&
27125            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
27126            Subtarget.hasSSE1());
27127 }
27128
27129 // Insert VAARG node into the DAG
27130 // VAARG returns two values: Variable Argument Address, Chain
27131 SDValue InstOps[] = {Chain, SrcPtr,
27132 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
27133 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
27134 DAG.getTargetConstant(Align, dl, MVT::i32)};
27135 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
27136 SDValue VAARG = DAG.getMemIntrinsicNode(
27137 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
27138 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
27139 /*Alignment=*/std::nullopt,
27140 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
27141 Chain = VAARG.getValue(1);
27142
27143 // Load the next argument and return it
27144 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
27145}
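
The X86ISD::VAARG_64 pseudo emitted above is expanded later into the usual AMD64 va_arg sequence. Roughly, the gp_offset case (ArgMode == 1) behaves like this hypothetical helper (a sketch, not the actual expansion):

#include <cstdint>
#include <cstring>

struct VaListTag {
  uint32_t gp_offset, fp_offset;
  void *overflow_arg_area, *reg_save_area;
};

// Reads one 8-byte integer argument: take it from the register save area
// while gp_offset < 48, otherwise fall back to the overflow (stack) area.
// Floating-point arguments (ArgMode == 2) do the same dance with fp_offset.
static int64_t vaArgInt64(VaListTag &VL) {
  void *Addr;
  if (VL.gp_offset < 6 * 8) {
    Addr = static_cast<char *>(VL.reg_save_area) + VL.gp_offset;
    VL.gp_offset += 8;
  } else {
    Addr = VL.overflow_arg_area;
    VL.overflow_arg_area = static_cast<char *>(VL.overflow_arg_area) + 8;
  }
  int64_t V;
  std::memcpy(&V, Addr, sizeof(V));
  return V;
}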
27146
27147static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
27148 SelectionDAG &DAG) {
27149 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
27150 // where a va_list is still an i8*.
27151   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
27152 if (Subtarget.isCallingConvWin64(
27153 DAG.getMachineFunction().getFunction().getCallingConv()))
27154 // Probably a Win64 va_copy.
27155 return DAG.expandVACopy(Op.getNode());
27156
27157 SDValue Chain = Op.getOperand(0);
27158 SDValue DstPtr = Op.getOperand(1);
27159 SDValue SrcPtr = Op.getOperand(2);
27160 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
27161 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27162 SDLoc DL(Op);
27163
27164 return DAG.getMemcpy(
27165 Chain, DL, DstPtr, SrcPtr,
27166 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
27167 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
27168 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
27169}
27170
27171// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
27172static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
27173 switch (Opc) {
27174 case ISD::SHL:
27175 case X86ISD::VSHL:
27176 case X86ISD::VSHLI:
27177 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
27178 case ISD::SRL:
27179 case X86ISD::VSRL:
27180 case X86ISD::VSRLI:
27181 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
27182 case ISD::SRA:
27183 case X86ISD::VSRA:
27184 case X86ISD::VSRAI:
27185 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
27186 }
27187 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27187)
;
27188}
27189
27190/// Handle vector element shifts where the shift amount is a constant.
27191/// Takes immediate version of shift as input.
27192static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
27193 SDValue SrcOp, uint64_t ShiftAmt,
27194 SelectionDAG &DAG) {
27195 MVT ElementType = VT.getVectorElementType();
27196
27197 // Bitcast the source vector to the output type, this is mainly necessary for
27198 // vXi8/vXi64 shifts.
27199 if (VT != SrcOp.getSimpleValueType())
27200 SrcOp = DAG.getBitcast(VT, SrcOp);
27201
27202 // Fold this packed shift into its first operand if ShiftAmt is 0.
27203 if (ShiftAmt == 0)
27204 return SrcOp;
27205
27206 // Check for ShiftAmt >= element width
27207 if (ShiftAmt >= ElementType.getSizeInBits()) {
27208 if (Opc == X86ISD::VSRAI)
27209 ShiftAmt = ElementType.getSizeInBits() - 1;
27210 else
27211 return DAG.getConstant(0, dl, VT);
27212 }
27213
27214   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
27215          && "Unknown target vector shift-by-constant node");
27216
27217 // Fold this packed vector shift into a build vector if SrcOp is a
27218 // vector of Constants or UNDEFs.
27219 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
27220 unsigned ShiftOpc;
27221 switch (Opc) {
27222     default: llvm_unreachable("Unknown opcode!");
27223 case X86ISD::VSHLI:
27224 ShiftOpc = ISD::SHL;
27225 break;
27226 case X86ISD::VSRLI:
27227 ShiftOpc = ISD::SRL;
27228 break;
27229 case X86ISD::VSRAI:
27230 ShiftOpc = ISD::SRA;
27231 break;
27232 }
27233
27234 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
27235 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
27236 return C;
27237 }
27238
27239 return DAG.getNode(Opc, dl, VT, SrcOp,
27240 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
27241}
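
The out-of-range handling above (fold to zero for logical shifts, clamp to BitWidth - 1 for arithmetic shifts) mirrors the scalar semantics of the underlying SSE instructions. A tiny per-lane sketch for i16 elements:

#include <cstdint>

// One v8i16 lane when the immediate is >= 16: VSRAI saturates to a shift by
// 15 (all sign bits), while VSRLI/VSHLI simply produce zero, matching
// getTargetVShiftByConstNode above.
static int16_t sraLane(int16_t X, unsigned Amt) {
  return static_cast<int16_t>(X >> (Amt >= 16 ? 15 : Amt));
}
static uint16_t srlLane(uint16_t X, unsigned Amt) {
  return Amt >= 16 ? 0 : static_cast<uint16_t>(X >> Amt);
}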
27242
27243/// Handle vector element shifts by a splat shift amount
27244static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
27245 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
27246 const X86Subtarget &Subtarget,
27247 SelectionDAG &DAG) {
27248 MVT AmtVT = ShAmt.getSimpleValueType();
27249   assert(AmtVT.isVector() && "Vector shift type mismatch");
27250   assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
27251          "Illegal vector splat index");
27252
27253 // Move the splat element to the bottom element.
27254 if (ShAmtIdx != 0) {
27255 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
27256 Mask[0] = ShAmtIdx;
27257 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
27258 }
27259
27260 // Peek through any zext node if we can get back to a 128-bit source.
27261 if (AmtVT.getScalarSizeInBits() == 64 &&
27262 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
27263 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
27264 ShAmt.getOperand(0).getValueType().isSimple() &&
27265 ShAmt.getOperand(0).getValueType().is128BitVector()) {
27266 ShAmt = ShAmt.getOperand(0);
27267 AmtVT = ShAmt.getSimpleValueType();
27268 }
27269
27270 // See if we can mask off the upper elements using the existing source node.
27271 // The shift uses the entire lower 64-bits of the amount vector, so no need to
27272 // do this for vXi64 types.
27273 bool IsMasked = false;
27274 if (AmtVT.getScalarSizeInBits() < 64) {
27275 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
27276 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
27277 // If the shift amount has come from a scalar, then zero-extend the scalar
27278 // before moving to the vector.
27279 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
27280 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27281 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
27282 AmtVT = MVT::v4i32;
27283 IsMasked = true;
27284 } else if (ShAmt.getOpcode() == ISD::AND) {
27285 // See if the shift amount is already masked (e.g. for rotation modulo),
27286 // then we can zero-extend it by setting all the other mask elements to
27287 // zero.
27288 SmallVector<SDValue> MaskElts(
27289 AmtVT.getVectorNumElements(),
27290 DAG.getConstant(0, dl, AmtVT.getScalarType()));
27291 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
27292 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
27293 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
27294 {ShAmt.getOperand(1), Mask}))) {
27295 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
27296 IsMasked = true;
27297 }
27298 }
27299 }
27300
27301 // Extract if the shift amount vector is larger than 128-bits.
27302 if (AmtVT.getSizeInBits() > 128) {
27303 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
27304 AmtVT = ShAmt.getSimpleValueType();
27305 }
27306
27307 // Zero-extend bottom element to v2i64 vector type, either by extension or
27308 // shuffle masking.
27309 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
27310 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
27311 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
27312 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
27313 } else if (Subtarget.hasSSE41()) {
27314 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
27315 MVT::v2i64, ShAmt);
27316 } else {
27317 SDValue ByteShift = DAG.getTargetConstant(
27318 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
27319 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
27320 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27321 ByteShift);
27322 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27323 ByteShift);
27324 }
27325 }
27326
27327 // Change opcode to non-immediate version.
27328 Opc = getTargetVShiftUniformOpcode(Opc, true);
27329
27330 // The return type has to be a 128-bit type with the same element
27331 // type as the input type.
27332 MVT EltVT = VT.getVectorElementType();
27333 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
27334
27335 ShAmt = DAG.getBitcast(ShVT, ShAmt);
27336 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
27337}
27338
27339/// Return Mask with the necessary casting or extending
27340/// for \p Mask according to \p MaskVT when lowering masking intrinsics
27341static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
27342 const X86Subtarget &Subtarget, SelectionDAG &DAG,
27343 const SDLoc &dl) {
27344
27345 if (isAllOnesConstant(Mask))
27346 return DAG.getConstant(1, dl, MaskVT);
27347 if (X86::isZeroNode(Mask))
27348 return DAG.getConstant(0, dl, MaskVT);
27349
27350   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
27351
27352 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
27353     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
27354     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
27355     // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
27356 SDValue Lo, Hi;
27357 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
27358 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27359 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27360 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27361 } else {
27362 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
27363 Mask.getSimpleValueType().getSizeInBits());
27364     // When MaskVT is v2i1 or v4i1, only the low 2 or 4 elements are
27365     // extracted by EXTRACT_SUBVECTOR.
27366 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
27367 DAG.getBitcast(BitcastVT, Mask),
27368 DAG.getIntPtrConstant(0, dl));
27369 }
27370}
27371
27372/// Return (and \p Op, \p Mask) for compare instructions or
27373/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
27374/// necessary casting or extending for \p Mask when lowering masking intrinsics
27375static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
27376 SDValue PreservedSrc,
27377 const X86Subtarget &Subtarget,
27378 SelectionDAG &DAG) {
27379 MVT VT = Op.getSimpleValueType();
27380 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27381 unsigned OpcodeSelect = ISD::VSELECT;
27382 SDLoc dl(Op);
27383
27384 if (isAllOnesConstant(Mask))
27385 return Op;
27386
27387 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27388
27389 if (PreservedSrc.isUndef())
27390 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27391 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
27392}
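
Element-wise, the vselect built here (with the mask converted by getMaskNode) reduces to a simple per-lane choice. A sketch for an 8-lane vector driven by an i8 mask:

#include <array>
#include <cstdint>

// Bit I of Mask selects Op[I]; a clear bit keeps PassThru[I] (which the
// code above replaces with zero when the pass-through operand is undef).
template <typename T>
std::array<T, 8> maskSelect(uint8_t Mask, const std::array<T, 8> &Op,
                            const std::array<T, 8> &PassThru) {
  std::array<T, 8> R{};
  for (unsigned I = 0; I != 8; ++I)
    R[I] = ((Mask >> I) & 1) ? Op[I] : PassThru[I];
  return R;
}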
27393
27394/// Creates an SDNode for a predicated scalar operation.
27395/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
27396/// The mask is coming as MVT::i8 and it should be transformed
27397/// to MVT::v1i1 while lowering masking intrinsics.
27398/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
27399/// "X86select" instead of "vselect". We just can't create the "vselect" node
27400/// for a scalar instruction.
27401static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
27402 SDValue PreservedSrc,
27403 const X86Subtarget &Subtarget,
27404 SelectionDAG &DAG) {
27405
27406 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
27407 if (MaskConst->getZExtValue() & 0x1)
27408 return Op;
27409
27410 MVT VT = Op.getSimpleValueType();
27411 SDLoc dl(Op);
27412
27413   assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
27414 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
27415 DAG.getBitcast(MVT::v8i1, Mask),
27416 DAG.getIntPtrConstant(0, dl));
27417 if (Op.getOpcode() == X86ISD::FSETCCM ||
27418 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
27419 Op.getOpcode() == X86ISD::VFPCLASSS)
27420 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
27421
27422 if (PreservedSrc.isUndef())
27423 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27424 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
27425}
27426
27427static int getSEHRegistrationNodeSize(const Function *Fn) {
27428 if (!Fn->hasPersonalityFn())
27429 report_fatal_error(
27430 "querying registration node size for function without personality");
27431 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
27432 // WinEHStatePass for the full struct definition.
27433 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
27434 case EHPersonality::MSVC_X86SEH: return 24;
27435 case EHPersonality::MSVC_CXX: return 16;
27436 default: break;
27437 }
27438 report_fatal_error(
27439 "can only recover FP for 32-bit MSVC EH personality functions");
27440}
27441
27442/// When the MSVC runtime transfers control to us, either to an outlined
27443/// function or when returning to a parent frame after catching an exception, we
27444/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
27445/// Here's the math:
27446/// RegNodeBase = EntryEBP - RegNodeSize
27447/// ParentFP = RegNodeBase - ParentFrameOffset
27448/// Subtracting RegNodeSize takes us to the offset of the registration node, and
27449/// subtracting the offset (negative on x86) takes us back to the parent FP.
27450static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
27451 SDValue EntryEBP) {
27452 MachineFunction &MF = DAG.getMachineFunction();
27453 SDLoc dl;
27454
27455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27456 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27457
27458 // It's possible that the parent function no longer has a personality function
27459 // if the exceptional code was optimized away, in which case we just return
27460 // the incoming EBP.
27461 if (!Fn->hasPersonalityFn())
27462 return EntryEBP;
27463
27464 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
27465 // registration, or the .set_setframe offset.
27466 MCSymbol *OffsetSym =
27467 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
27468 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27469 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
27470 SDValue ParentFrameOffset =
27471 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
27472
27473 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27474 // prologue to RBP in the parent function.
27475 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27476 if (Subtarget.is64Bit())
27477 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27478
27479 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27480 // RegNodeBase = EntryEBP - RegNodeSize
27481 // ParentFP = RegNodeBase - ParentFrameOffset
27482 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27483 DAG.getConstant(RegNodeSize, dl, PtrVT));
27484 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27485}
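
Combining the comment above with the RegNodeSize values from getSEHRegistrationNodeSize, the 32-bit recovery is just two subtractions. A small sketch with hypothetical names:

#include <cstdint>

// 32-bit MSVC EH frame recovery:
//   RegNodeBase = EntryEBP - RegNodeSize  (24 for MSVC_X86SEH, 16 for MSVC_CXX)
//   ParentFP    = RegNodeBase - ParentFrameOffset
// ParentFrameOffset is what the LOCAL_RECOVER node resolves to; it is
// negative on x86, so subtracting it moves back up to the parent FP.
static uintptr_t recoverParentFP(uintptr_t EntryEBP, unsigned RegNodeSize,
                                 intptr_t ParentFrameOffset) {
  uintptr_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - ParentFrameOffset;
}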
27486
27487SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27488 SelectionDAG &DAG) const {
27489 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27490 auto isRoundModeCurDirection = [](SDValue Rnd) {
27491 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27492 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27493
27494 return false;
27495 };
27496 auto isRoundModeSAE = [](SDValue Rnd) {
27497 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27498 unsigned RC = C->getZExtValue();
27499 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27500 // Clear the NO_EXC bit and check remaining bits.
27501 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27502 // As a convenience we allow no other bits or explicitly
27503 // current direction.
27504 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27505 }
27506 }
27507
27508 return false;
27509 };
27510 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27511 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27512 RC = C->getZExtValue();
27513 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27514 // Clear the NO_EXC bit and check remaining bits.
27515 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27516 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27517 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27518 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27519 RC == X86::STATIC_ROUNDING::TO_ZERO;
27520 }
27521 }
27522
27523 return false;
27524 };
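  // Aside (annotation, not in the original file): the rounding operand uses
  // the X86::STATIC_ROUNDING encoding -- TO_NEAREST_INT..TO_ZERO are 0..3,
  // CUR_DIRECTION is 4, and NO_EXC is the 0x8 bit. For example, an intrinsic
  // called with _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC passes 0xB; the
  // isRoundModeSAEToX lambda strips the 0x8 bit and reports RC == TO_ZERO,
  // while a plain CUR_DIRECTION (4) operand takes the default-rounding path
  // in the cases below.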
27525
27526 SDLoc dl(Op);
27527 unsigned IntNo = Op.getConstantOperandVal(0);
27528 MVT VT = Op.getSimpleValueType();
27529 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27530
27531 // Propagate flags from original node to transformed node(s).
27532 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27533
27534 if (IntrData) {
27535 switch(IntrData->Type) {
27536 case INTR_TYPE_1OP: {
27537 // We specify 2 possible opcodes for intrinsics with rounding modes.
27538 // First, we check if the intrinsic may have non-default rounding mode,
27539 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27540 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27541 if (IntrWithRoundingModeOpcode != 0) {
27542 SDValue Rnd = Op.getOperand(2);
27543 unsigned RC = 0;
27544 if (isRoundModeSAEToX(Rnd, RC))
27545 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27546 Op.getOperand(1),
27547 DAG.getTargetConstant(RC, dl, MVT::i32));
27548 if (!isRoundModeCurDirection(Rnd))
27549 return SDValue();
27550 }
27551 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27552 Op.getOperand(1));
27553 }
27554 case INTR_TYPE_1OP_SAE: {
27555 SDValue Sae = Op.getOperand(2);
27556
27557 unsigned Opc;
27558 if (isRoundModeCurDirection(Sae))
27559 Opc = IntrData->Opc0;
27560 else if (isRoundModeSAE(Sae))
27561 Opc = IntrData->Opc1;
27562 else
27563 return SDValue();
27564
27565 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27566 }
27567 case INTR_TYPE_2OP: {
27568 SDValue Src2 = Op.getOperand(2);
27569
27570 // We specify 2 possible opcodes for intrinsics with rounding modes.
27571 // First, we check if the intrinsic may have non-default rounding mode,
27572 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27573 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27574 if (IntrWithRoundingModeOpcode != 0) {
27575 SDValue Rnd = Op.getOperand(3);
27576 unsigned RC = 0;
27577 if (isRoundModeSAEToX(Rnd, RC))
27578 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27579 Op.getOperand(1), Src2,
27580 DAG.getTargetConstant(RC, dl, MVT::i32));
27581 if (!isRoundModeCurDirection(Rnd))
27582 return SDValue();
27583 }
27584
27585 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27586 Op.getOperand(1), Src2);
27587 }
27588 case INTR_TYPE_2OP_SAE: {
27589 SDValue Sae = Op.getOperand(3);
27590
27591 unsigned Opc;
27592 if (isRoundModeCurDirection(Sae))
27593 Opc = IntrData->Opc0;
27594 else if (isRoundModeSAE(Sae))
27595 Opc = IntrData->Opc1;
27596 else
27597 return SDValue();
27598
27599 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27600 Op.getOperand(2));
27601 }
27602 case INTR_TYPE_3OP:
27603 case INTR_TYPE_3OP_IMM8: {
27604 SDValue Src1 = Op.getOperand(1);
27605 SDValue Src2 = Op.getOperand(2);
27606 SDValue Src3 = Op.getOperand(3);
27607
27608 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27609 Src3.getValueType() != MVT::i8) {
27610 Src3 = DAG.getTargetConstant(
27611 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27612 }
27613
27614 // We specify 2 possible opcodes for intrinsics with rounding modes.
27615 // First, we check if the intrinsic may have non-default rounding mode,
27616 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27617 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27618 if (IntrWithRoundingModeOpcode != 0) {
27619 SDValue Rnd = Op.getOperand(4);
27620 unsigned RC = 0;
27621 if (isRoundModeSAEToX(Rnd, RC))
27622 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27623 Src1, Src2, Src3,
27624 DAG.getTargetConstant(RC, dl, MVT::i32));
27625 if (!isRoundModeCurDirection(Rnd))
27626 return SDValue();
27627 }
27628
27629 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27630 {Src1, Src2, Src3});
27631 }
27632 case INTR_TYPE_4OP_IMM8: {
27633       assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27634 SDValue Src4 = Op.getOperand(4);
27635 if (Src4.getValueType() != MVT::i8) {
27636 Src4 = DAG.getTargetConstant(
27637 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27638 }
27639
27640 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27641 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27642 Src4);
27643 }
27644 case INTR_TYPE_1OP_MASK: {
27645 SDValue Src = Op.getOperand(1);
27646 SDValue PassThru = Op.getOperand(2);
27647 SDValue Mask = Op.getOperand(3);
27648 // We add rounding mode to the Node when
27649 // - RC Opcode is specified and
27650 // - RC is not "current direction".
27651 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27652 if (IntrWithRoundingModeOpcode != 0) {
27653 SDValue Rnd = Op.getOperand(4);
27654 unsigned RC = 0;
27655 if (isRoundModeSAEToX(Rnd, RC))
27656 return getVectorMaskingNode(
27657 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27658 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27659 Mask, PassThru, Subtarget, DAG);
27660 if (!isRoundModeCurDirection(Rnd))
27661 return SDValue();
27662 }
27663 return getVectorMaskingNode(
27664 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27665 Subtarget, DAG);
27666 }
27667 case INTR_TYPE_1OP_MASK_SAE: {
27668 SDValue Src = Op.getOperand(1);
27669 SDValue PassThru = Op.getOperand(2);
27670 SDValue Mask = Op.getOperand(3);
27671 SDValue Rnd = Op.getOperand(4);
27672
27673 unsigned Opc;
27674 if (isRoundModeCurDirection(Rnd))
27675 Opc = IntrData->Opc0;
27676 else if (isRoundModeSAE(Rnd))
27677 Opc = IntrData->Opc1;
27678 else
27679 return SDValue();
27680
27681 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27682 Subtarget, DAG);
27683 }
27684 case INTR_TYPE_SCALAR_MASK: {
27685 SDValue Src1 = Op.getOperand(1);
27686 SDValue Src2 = Op.getOperand(2);
27687 SDValue passThru = Op.getOperand(3);
27688 SDValue Mask = Op.getOperand(4);
27689 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27690 // There are 2 kinds of intrinsics in this group:
27691 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
27692 // (2) With rounding mode and sae - 7 operands.
27693 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27694 if (Op.getNumOperands() == (5U + HasRounding)) {
27695 if (HasRounding) {
27696 SDValue Rnd = Op.getOperand(5);
27697 unsigned RC = 0;
27698 if (isRoundModeSAEToX(Rnd, RC))
27699 return getScalarMaskingNode(
27700 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27701 DAG.getTargetConstant(RC, dl, MVT::i32)),
27702 Mask, passThru, Subtarget, DAG);
27703 if (!isRoundModeCurDirection(Rnd))
27704 return SDValue();
27705 }
27706 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27707 Src2),
27708 Mask, passThru, Subtarget, DAG);
27709 }
27710
27711       assert(Op.getNumOperands() == (6U + HasRounding) &&
27712              "Unexpected intrinsic form");
27713 SDValue RoundingMode = Op.getOperand(5);
27714 unsigned Opc = IntrData->Opc0;
27715 if (HasRounding) {
27716 SDValue Sae = Op.getOperand(6);
27717 if (isRoundModeSAE(Sae))
27718 Opc = IntrWithRoundingModeOpcode;
27719 else if (!isRoundModeCurDirection(Sae))
27720 return SDValue();
27721 }
27722 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27723 Src2, RoundingMode),
27724 Mask, passThru, Subtarget, DAG);
27725 }
27726 case INTR_TYPE_SCALAR_MASK_RND: {
27727 SDValue Src1 = Op.getOperand(1);
27728 SDValue Src2 = Op.getOperand(2);
27729 SDValue passThru = Op.getOperand(3);
27730 SDValue Mask = Op.getOperand(4);
27731 SDValue Rnd = Op.getOperand(5);
27732
27733 SDValue NewOp;
27734 unsigned RC = 0;
27735 if (isRoundModeCurDirection(Rnd))
27736 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27737 else if (isRoundModeSAEToX(Rnd, RC))
27738 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27739 DAG.getTargetConstant(RC, dl, MVT::i32));
27740 else
27741 return SDValue();
27742
27743 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27744 }
27745 case INTR_TYPE_SCALAR_MASK_SAE: {
27746 SDValue Src1 = Op.getOperand(1);
27747 SDValue Src2 = Op.getOperand(2);
27748 SDValue passThru = Op.getOperand(3);
27749 SDValue Mask = Op.getOperand(4);
27750 SDValue Sae = Op.getOperand(5);
27751 unsigned Opc;
27752 if (isRoundModeCurDirection(Sae))
27753 Opc = IntrData->Opc0;
27754 else if (isRoundModeSAE(Sae))
27755 Opc = IntrData->Opc1;
27756 else
27757 return SDValue();
27758
27759 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27760 Mask, passThru, Subtarget, DAG);
27761 }
27762 case INTR_TYPE_2OP_MASK: {
27763 SDValue Src1 = Op.getOperand(1);
27764 SDValue Src2 = Op.getOperand(2);
27765 SDValue PassThru = Op.getOperand(3);
27766 SDValue Mask = Op.getOperand(4);
27767 SDValue NewOp;
27768 if (IntrData->Opc1 != 0) {
27769 SDValue Rnd = Op.getOperand(5);
27770 unsigned RC = 0;
27771 if (isRoundModeSAEToX(Rnd, RC))
27772 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27773 DAG.getTargetConstant(RC, dl, MVT::i32));
27774 else if (!isRoundModeCurDirection(Rnd))
27775 return SDValue();
27776 }
27777 if (!NewOp)
27778 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27779 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27780 }
27781 case INTR_TYPE_2OP_MASK_SAE: {
27782 SDValue Src1 = Op.getOperand(1);
27783 SDValue Src2 = Op.getOperand(2);
27784 SDValue PassThru = Op.getOperand(3);
27785 SDValue Mask = Op.getOperand(4);
27786
27787 unsigned Opc = IntrData->Opc0;
27788 if (IntrData->Opc1 != 0) {
27789 SDValue Sae = Op.getOperand(5);
27790 if (isRoundModeSAE(Sae))
27791 Opc = IntrData->Opc1;
27792 else if (!isRoundModeCurDirection(Sae))
27793 return SDValue();
27794 }
27795
27796 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27797 Mask, PassThru, Subtarget, DAG);
27798 }
27799 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27800 SDValue Src1 = Op.getOperand(1);
27801 SDValue Src2 = Op.getOperand(2);
27802 SDValue Src3 = Op.getOperand(3);
27803 SDValue PassThru = Op.getOperand(4);
27804 SDValue Mask = Op.getOperand(5);
27805 SDValue Sae = Op.getOperand(6);
27806 unsigned Opc;
27807 if (isRoundModeCurDirection(Sae))
27808 Opc = IntrData->Opc0;
27809 else if (isRoundModeSAE(Sae))
27810 Opc = IntrData->Opc1;
27811 else
27812 return SDValue();
27813
27814 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27815 Mask, PassThru, Subtarget, DAG);
27816 }
27817 case INTR_TYPE_3OP_MASK_SAE: {
27818 SDValue Src1 = Op.getOperand(1);
27819 SDValue Src2 = Op.getOperand(2);
27820 SDValue Src3 = Op.getOperand(3);
27821 SDValue PassThru = Op.getOperand(4);
27822 SDValue Mask = Op.getOperand(5);
27823
27824 unsigned Opc = IntrData->Opc0;
27825 if (IntrData->Opc1 != 0) {
27826 SDValue Sae = Op.getOperand(6);
27827 if (isRoundModeSAE(Sae))
27828 Opc = IntrData->Opc1;
27829 else if (!isRoundModeCurDirection(Sae))
27830 return SDValue();
27831 }
27832 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27833 Mask, PassThru, Subtarget, DAG);
27834 }
27835 case BLENDV: {
27836 SDValue Src1 = Op.getOperand(1);
27837 SDValue Src2 = Op.getOperand(2);
27838 SDValue Src3 = Op.getOperand(3);
27839
27840 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27841 Src3 = DAG.getBitcast(MaskVT, Src3);
27842
27843 // Reverse the operands to match VSELECT order.
27844 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27845 }
27846 case VPERM_2OP : {
27847 SDValue Src1 = Op.getOperand(1);
27848 SDValue Src2 = Op.getOperand(2);
27849
27850 // Swap Src1 and Src2 in the node creation
27851 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
27852 }
27853 case CFMA_OP_MASKZ:
27854 case CFMA_OP_MASK: {
27855 SDValue Src1 = Op.getOperand(1);
27856 SDValue Src2 = Op.getOperand(2);
27857 SDValue Src3 = Op.getOperand(3);
27858 SDValue Mask = Op.getOperand(4);
27859 MVT VT = Op.getSimpleValueType();
27860
27861 SDValue PassThru = Src3;
27862 if (IntrData->Type == CFMA_OP_MASKZ)
27863 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27864
27865 // We add rounding mode to the Node when
27866 // - RC Opcode is specified and
27867 // - RC is not "current direction".
27868 SDValue NewOp;
27869 if (IntrData->Opc1 != 0) {
27870 SDValue Rnd = Op.getOperand(5);
27871 unsigned RC = 0;
27872 if (isRoundModeSAEToX(Rnd, RC))
27873 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27874 DAG.getTargetConstant(RC, dl, MVT::i32));
27875 else if (!isRoundModeCurDirection(Rnd))
27876 return SDValue();
27877 }
27878 if (!NewOp)
27879 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27880 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27881 }
27882 case IFMA_OP:
27883 // NOTE: We need to swizzle the operands to pass the multiply operands
27884 // first.
27885 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27886 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27887 case FPCLASSS: {
27888 SDValue Src1 = Op.getOperand(1);
27889 SDValue Imm = Op.getOperand(2);
27890 SDValue Mask = Op.getOperand(3);
27891 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27892 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27893 Subtarget, DAG);
27894 // Need to fill with zeros to ensure the bitcast will produce zeroes
27895 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27896 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27897 DAG.getConstant(0, dl, MVT::v8i1),
27898 FPclassMask, DAG.getIntPtrConstant(0, dl));
27899 return DAG.getBitcast(MVT::i8, Ins);
27900 }
27901
27902 case CMP_MASK_CC: {
27903 MVT MaskVT = Op.getSimpleValueType();
27904 SDValue CC = Op.getOperand(3);
27905 SDValue Mask = Op.getOperand(4);
27906 // We specify 2 possible opcodes for intrinsics with rounding modes.
27907 // First, we check if the intrinsic may have non-default rounding mode,
27908 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27909 if (IntrData->Opc1 != 0) {
27910 SDValue Sae = Op.getOperand(5);
27911 if (isRoundModeSAE(Sae))
27912 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
27913 Op.getOperand(2), CC, Mask, Sae);
27914 if (!isRoundModeCurDirection(Sae))
27915 return SDValue();
27916 }
27917 //default rounding mode
27918 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
27919 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
27920 }
27921 case CMP_MASK_SCALAR_CC: {
27922 SDValue Src1 = Op.getOperand(1);
27923 SDValue Src2 = Op.getOperand(2);
27924 SDValue CC = Op.getOperand(3);
27925 SDValue Mask = Op.getOperand(4);
27926
27927 SDValue Cmp;
27928 if (IntrData->Opc1 != 0) {
27929 SDValue Sae = Op.getOperand(5);
27930 if (isRoundModeSAE(Sae))
27931 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
27932 else if (!isRoundModeCurDirection(Sae))
27933 return SDValue();
27934 }
27935 //default rounding mode
27936 if (!Cmp.getNode())
27937 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
27938
27939 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
27940 Subtarget, DAG);
27941 // Need to fill with zeros to ensure the bitcast will produce zeroes
27942 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27943 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27944 DAG.getConstant(0, dl, MVT::v8i1),
27945 CmpMask, DAG.getIntPtrConstant(0, dl));
27946 return DAG.getBitcast(MVT::i8, Ins);
27947 }
27948 case COMI: { // Comparison intrinsics
27949 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
27950 SDValue LHS = Op.getOperand(1);
27951 SDValue RHS = Op.getOperand(2);
27952 // Some conditions require the operands to be swapped.
27953 if (CC == ISD::SETLT || CC == ISD::SETLE)
27954 std::swap(LHS, RHS);
27955
27956 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
27957 SDValue SetCC;
27958 switch (CC) {
27959 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
27960 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
27961 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
27962 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
27963 break;
27964 }
27965 case ISD::SETNE: { // (ZF = 1 or PF = 1)
27966 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
27967 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
27968 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
27969 break;
27970 }
27971 case ISD::SETGT: // (CF = 0 and ZF = 0)
27972 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
27973 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
27974 break;
27975 }
27976 case ISD::SETGE: // CF = 0
27977 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
27978 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
27979 break;
27980 default:
27981 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27981)
;
27982 }
27983 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27984 }
27985 case COMI_RM: { // Comparison intrinsics with Sae
27986 SDValue LHS = Op.getOperand(1);
27987 SDValue RHS = Op.getOperand(2);
27988 unsigned CondVal = Op.getConstantOperandVal(3);
27989 SDValue Sae = Op.getOperand(4);
27990
27991 SDValue FCmp;
27992 if (isRoundModeCurDirection(Sae))
27993 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
27994 DAG.getTargetConstant(CondVal, dl, MVT::i8));
27995 else if (isRoundModeSAE(Sae))
27996 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
27997 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
27998 else
27999 return SDValue();
28000 // Need to fill with zeros to ensure the bitcast will produce zeroes
28001 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28002 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
28003 DAG.getConstant(0, dl, MVT::v16i1),
28004 FCmp, DAG.getIntPtrConstant(0, dl));
28005 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
28006 DAG.getBitcast(MVT::i16, Ins));
28007 }
28008 case VSHIFT: {
28009 SDValue SrcOp = Op.getOperand(1);
28010 SDValue ShAmt = Op.getOperand(2);
28011       assert(ShAmt.getValueType() == MVT::i32 &&
28012              "Unexpected VSHIFT amount type");
28013
28014 // Catch shift-by-constant.
28015 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
28016 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
28017 Op.getSimpleValueType(), SrcOp,
28018 CShAmt->getZExtValue(), DAG);
28019
28020 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
28021 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
28022 SrcOp, ShAmt, 0, Subtarget, DAG);
28023 }
28024 case COMPRESS_EXPAND_IN_REG: {
28025 SDValue Mask = Op.getOperand(3);
28026 SDValue DataToCompress = Op.getOperand(1);
28027 SDValue PassThru = Op.getOperand(2);
28028 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
28029 return Op.getOperand(1);
28030
28031 // Avoid false dependency.
28032 if (PassThru.isUndef())
28033 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28034
28035 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
28036 Mask);
28037 }
28038 case FIXUPIMM:
28039 case FIXUPIMM_MASKZ: {
28040 SDValue Src1 = Op.getOperand(1);
28041 SDValue Src2 = Op.getOperand(2);
28042 SDValue Src3 = Op.getOperand(3);
28043 SDValue Imm = Op.getOperand(4);
28044 SDValue Mask = Op.getOperand(5);
28045 SDValue Passthru = (IntrData->Type == FIXUPIMM)
28046 ? Src1
28047 : getZeroVector(VT, Subtarget, DAG, dl);
28048
28049 unsigned Opc = IntrData->Opc0;
28050 if (IntrData->Opc1 != 0) {
28051 SDValue Sae = Op.getOperand(6);
28052 if (isRoundModeSAE(Sae))
28053 Opc = IntrData->Opc1;
28054 else if (!isRoundModeCurDirection(Sae))
28055 return SDValue();
28056 }
28057
28058 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
28059
28060 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
28061 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28062
28063 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28064 }
28065 case ROUNDP: {
28066 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
28067 // Clear the upper bits of the rounding immediate so that the legacy
28068 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28069 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
28070 SDValue RoundingMode =
28071 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28072 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28073 Op.getOperand(1), RoundingMode);
28074 }
28075 case ROUNDS: {
28076 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
28077 // Clear the upper bits of the rounding immediate so that the legacy
28078 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28079 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
28080 SDValue RoundingMode =
28081 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28082 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28083 Op.getOperand(1), Op.getOperand(2), RoundingMode);
28084 }
28085 case BEXTRI: {
28086 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
28087
28088 uint64_t Imm = Op.getConstantOperandVal(2);
28089 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
28090 Op.getValueType());
28091 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28092 Op.getOperand(1), Control);
28093 }
28094 // ADC/ADCX/SBB
28095 case ADX: {
28096 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
28097 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
28098
28099 SDValue Res;
28100 // If the carry in is zero, then we should just use ADD/SUB instead of
28101 // ADC/SBB.
28102 if (isNullConstant(Op.getOperand(1))) {
28103 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
28104 Op.getOperand(3));
28105 } else {
28106 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
28107 DAG.getConstant(-1, dl, MVT::i8));
28108 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
28109 Op.getOperand(3), GenCF.getValue(1));
28110 }
28111 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
28112 SDValue Results[] = { SetCC, Res };
28113 return DAG.getMergeValues(Results, dl);
28114 }
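
As the comment above notes, a constant-zero carry-in lets the ADX case select a plain ADD/SUB, while any other carry-in is first turned into CF and consumed by ADC/SBB. A user-level sketch of the corresponding carry-chain intrinsic, assuming <immintrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned lo, hi;
  // Carry-in of 0: eligible for the plain-ADD path in the lowering above.
  unsigned char carry = _addcarry_u32(0, 0xffffffffu, 1u, &lo);
  // Carry-in produced by the previous step: needs ADC.
  carry = _addcarry_u32(carry, 10u, 20u, &hi);
  std::printf("lo=%u hi=%u carry=%u\n", lo, hi, (unsigned)carry);
  return 0;
}
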
28115 case CVTPD2PS_MASK:
28116 case CVTPD2DQ_MASK:
28117 case CVTQQ2PS_MASK:
28118 case TRUNCATE_TO_REG: {
28119 SDValue Src = Op.getOperand(1);
28120 SDValue PassThru = Op.getOperand(2);
28121 SDValue Mask = Op.getOperand(3);
28122
28123 if (isAllOnesConstant(Mask))
28124 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28125
28126 MVT SrcVT = Src.getSimpleValueType();
28127 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28128 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28129 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
28130 {Src, PassThru, Mask});
28131 }
28132 case CVTPS2PH_MASK: {
28133 SDValue Src = Op.getOperand(1);
28134 SDValue Rnd = Op.getOperand(2);
28135 SDValue PassThru = Op.getOperand(3);
28136 SDValue Mask = Op.getOperand(4);
28137
28138 unsigned RC = 0;
28139 unsigned Opc = IntrData->Opc0;
28140 bool SAE = Src.getValueType().is512BitVector() &&
28141 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
28142 if (SAE) {
28143 Opc = X86ISD::CVTPS2PH_SAE;
28144 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
28145 }
28146
28147 if (isAllOnesConstant(Mask))
28148 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
28149
28150 if (SAE)
28151 Opc = X86ISD::MCVTPS2PH_SAE;
28152 else
28153 Opc = IntrData->Opc1;
28154 MVT SrcVT = Src.getSimpleValueType();
28155 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28156 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28157 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
28158 }
28159 case CVTNEPS2BF16_MASK: {
28160 SDValue Src = Op.getOperand(1);
28161 SDValue PassThru = Op.getOperand(2);
28162 SDValue Mask = Op.getOperand(3);
28163
28164 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28165 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28166
28167 // Break false dependency.
28168 if (PassThru.isUndef())
28169 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
28170
28171 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
28172 Mask);
28173 }
28174 default:
28175 break;
28176 }
28177 }
28178
28179 switch (IntNo) {
28180 default: return SDValue(); // Don't custom lower most intrinsics.
28181
28182 // ptest and testp intrinsics. The intrinsics these come from are designed to
28183 // return an integer value, not just an instruction, so lower them to the ptest
28184 // or testp pattern and a setcc for the result.
28185 case Intrinsic::x86_avx512_ktestc_b:
28186 case Intrinsic::x86_avx512_ktestc_w:
28187 case Intrinsic::x86_avx512_ktestc_d:
28188 case Intrinsic::x86_avx512_ktestc_q:
28189 case Intrinsic::x86_avx512_ktestz_b:
28190 case Intrinsic::x86_avx512_ktestz_w:
28191 case Intrinsic::x86_avx512_ktestz_d:
28192 case Intrinsic::x86_avx512_ktestz_q:
28193 case Intrinsic::x86_sse41_ptestz:
28194 case Intrinsic::x86_sse41_ptestc:
28195 case Intrinsic::x86_sse41_ptestnzc:
28196 case Intrinsic::x86_avx_ptestz_256:
28197 case Intrinsic::x86_avx_ptestc_256:
28198 case Intrinsic::x86_avx_ptestnzc_256:
28199 case Intrinsic::x86_avx_vtestz_ps:
28200 case Intrinsic::x86_avx_vtestc_ps:
28201 case Intrinsic::x86_avx_vtestnzc_ps:
28202 case Intrinsic::x86_avx_vtestz_pd:
28203 case Intrinsic::x86_avx_vtestc_pd:
28204 case Intrinsic::x86_avx_vtestnzc_pd:
28205 case Intrinsic::x86_avx_vtestz_ps_256:
28206 case Intrinsic::x86_avx_vtestc_ps_256:
28207 case Intrinsic::x86_avx_vtestnzc_ps_256:
28208 case Intrinsic::x86_avx_vtestz_pd_256:
28209 case Intrinsic::x86_avx_vtestc_pd_256:
28210 case Intrinsic::x86_avx_vtestnzc_pd_256: {
28211 unsigned TestOpc = X86ISD::PTEST;
28212 X86::CondCode X86CC;
28213 switch (IntNo) {
28214 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
28215 case Intrinsic::x86_avx512_ktestc_b:
28216 case Intrinsic::x86_avx512_ktestc_w:
28217 case Intrinsic::x86_avx512_ktestc_d:
28218 case Intrinsic::x86_avx512_ktestc_q:
28219 // CF = 1
28220 TestOpc = X86ISD::KTEST;
28221 X86CC = X86::COND_B;
28222 break;
28223 case Intrinsic::x86_avx512_ktestz_b:
28224 case Intrinsic::x86_avx512_ktestz_w:
28225 case Intrinsic::x86_avx512_ktestz_d:
28226 case Intrinsic::x86_avx512_ktestz_q:
28227 TestOpc = X86ISD::KTEST;
28228 X86CC = X86::COND_E;
28229 break;
28230 case Intrinsic::x86_avx_vtestz_ps:
28231 case Intrinsic::x86_avx_vtestz_pd:
28232 case Intrinsic::x86_avx_vtestz_ps_256:
28233 case Intrinsic::x86_avx_vtestz_pd_256:
28234 TestOpc = X86ISD::TESTP;
28235 [[fallthrough]];
28236 case Intrinsic::x86_sse41_ptestz:
28237 case Intrinsic::x86_avx_ptestz_256:
28238 // ZF = 1
28239 X86CC = X86::COND_E;
28240 break;
28241 case Intrinsic::x86_avx_vtestc_ps:
28242 case Intrinsic::x86_avx_vtestc_pd:
28243 case Intrinsic::x86_avx_vtestc_ps_256:
28244 case Intrinsic::x86_avx_vtestc_pd_256:
28245 TestOpc = X86ISD::TESTP;
28246 [[fallthrough]];
28247 case Intrinsic::x86_sse41_ptestc:
28248 case Intrinsic::x86_avx_ptestc_256:
28249 // CF = 1
28250 X86CC = X86::COND_B;
28251 break;
28252 case Intrinsic::x86_avx_vtestnzc_ps:
28253 case Intrinsic::x86_avx_vtestnzc_pd:
28254 case Intrinsic::x86_avx_vtestnzc_ps_256:
28255 case Intrinsic::x86_avx_vtestnzc_pd_256:
28256 TestOpc = X86ISD::TESTP;
28257 [[fallthrough]];
28258 case Intrinsic::x86_sse41_ptestnzc:
28259 case Intrinsic::x86_avx_ptestnzc_256:
28260 // ZF and CF = 0
28261 X86CC = X86::COND_A;
28262 break;
28263 }
28264
28265 SDValue LHS = Op.getOperand(1);
28266 SDValue RHS = Op.getOperand(2);
28267 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
28268 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
28269 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28270 }
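
These intrinsics return an integer, so the lowering emits the PTEST/TESTP/KTEST node and materializes the chosen flag with a SETCC plus zero-extend. A user-level sketch of the SSE4.1 ptestz flavor (the ZF = 1 case), assuming <immintrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i v    = _mm_set_epi32(0, 0, 0, 0x10);
  __m128i mask = _mm_set_epi32(0, 0, 0, 0x01);
  // (v & mask) == 0, so testz returns 1; the lowering is PTEST followed by SETE.
  std::printf("testz = %d\n", _mm_testz_si128(v, mask));
  return 0;
}
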
28271
28272 case Intrinsic::x86_sse42_pcmpistria128:
28273 case Intrinsic::x86_sse42_pcmpestria128:
28274 case Intrinsic::x86_sse42_pcmpistric128:
28275 case Intrinsic::x86_sse42_pcmpestric128:
28276 case Intrinsic::x86_sse42_pcmpistrio128:
28277 case Intrinsic::x86_sse42_pcmpestrio128:
28278 case Intrinsic::x86_sse42_pcmpistris128:
28279 case Intrinsic::x86_sse42_pcmpestris128:
28280 case Intrinsic::x86_sse42_pcmpistriz128:
28281 case Intrinsic::x86_sse42_pcmpestriz128: {
28282 unsigned Opcode;
28283 X86::CondCode X86CC;
28284 switch (IntNo) {
28285 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28286 case Intrinsic::x86_sse42_pcmpistria128:
28287 Opcode = X86ISD::PCMPISTR;
28288 X86CC = X86::COND_A;
28289 break;
28290 case Intrinsic::x86_sse42_pcmpestria128:
28291 Opcode = X86ISD::PCMPESTR;
28292 X86CC = X86::COND_A;
28293 break;
28294 case Intrinsic::x86_sse42_pcmpistric128:
28295 Opcode = X86ISD::PCMPISTR;
28296 X86CC = X86::COND_B;
28297 break;
28298 case Intrinsic::x86_sse42_pcmpestric128:
28299 Opcode = X86ISD::PCMPESTR;
28300 X86CC = X86::COND_B;
28301 break;
28302 case Intrinsic::x86_sse42_pcmpistrio128:
28303 Opcode = X86ISD::PCMPISTR;
28304 X86CC = X86::COND_O;
28305 break;
28306 case Intrinsic::x86_sse42_pcmpestrio128:
28307 Opcode = X86ISD::PCMPESTR;
28308 X86CC = X86::COND_O;
28309 break;
28310 case Intrinsic::x86_sse42_pcmpistris128:
28311 Opcode = X86ISD::PCMPISTR;
28312 X86CC = X86::COND_S;
28313 break;
28314 case Intrinsic::x86_sse42_pcmpestris128:
28315 Opcode = X86ISD::PCMPESTR;
28316 X86CC = X86::COND_S;
28317 break;
28318 case Intrinsic::x86_sse42_pcmpistriz128:
28319 Opcode = X86ISD::PCMPISTR;
28320 X86CC = X86::COND_E;
28321 break;
28322 case Intrinsic::x86_sse42_pcmpestriz128:
28323 Opcode = X86ISD::PCMPESTR;
28324 X86CC = X86::COND_E;
28325 break;
28326 }
28327 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28328 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28329 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
28330 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
28331 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28332 }
28333
28334 case Intrinsic::x86_sse42_pcmpistri128:
28335 case Intrinsic::x86_sse42_pcmpestri128: {
28336 unsigned Opcode;
28337 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
28338 Opcode = X86ISD::PCMPISTR;
28339 else
28340 Opcode = X86ISD::PCMPESTR;
28341
28342 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28343 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28344 return DAG.getNode(Opcode, dl, VTs, NewOps);
28345 }
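
The index-returning string compares forward their operands to the PCMPISTR/PCMPESTR node unchanged and use result value 0, the ECX index. A user-level sketch of the SSE4.2 intrinsic this serves, assuming <immintrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  char haystack[16] = "abcdefgh";
  char needles[16]  = "fg";
  __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(needles));
  __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
  // Index (the ECX result) of the first byte of haystack matching any needle.
  int idx = _mm_cmpistri(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
  std::printf("first match at index %d\n", idx); // expected: 5 ('f')
  return 0;
}
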
28346
28347 case Intrinsic::x86_sse42_pcmpistrm128:
28348 case Intrinsic::x86_sse42_pcmpestrm128: {
28349 unsigned Opcode;
28350 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
28351 Opcode = X86ISD::PCMPISTR;
28352 else
28353 Opcode = X86ISD::PCMPESTR;
28354
28355 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28356 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28357 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
28358 }
28359
28360 case Intrinsic::eh_sjlj_lsda: {
28361 MachineFunction &MF = DAG.getMachineFunction();
28362 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28363 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28364 auto &Context = MF.getMMI().getContext();
28365 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
28366 Twine(MF.getFunctionNumber()));
28367 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
28368 DAG.getMCSymbol(S, PtrVT));
28369 }
28370
28371 case Intrinsic::x86_seh_lsda: {
28372 // Compute the symbol for the LSDA. We know it'll get emitted later.
28373 MachineFunction &MF = DAG.getMachineFunction();
28374 SDValue Op1 = Op.getOperand(1);
28375 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
28376 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
28377 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
28378
28379 // Generate a simple absolute symbol reference. This intrinsic is only
28380 // supported on 32-bit Windows, which isn't PIC.
28381 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
28382 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
28383 }
28384
28385 case Intrinsic::eh_recoverfp: {
28386 SDValue FnOp = Op.getOperand(1);
28387 SDValue IncomingFPOp = Op.getOperand(2);
28388 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
28389 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
28390 if (!Fn)
28391 report_fatal_error(
28392 "llvm.eh.recoverfp must take a function as the first argument");
28393 return recoverFramePointer(DAG, Fn, IncomingFPOp);
28394 }
28395
28396 case Intrinsic::localaddress: {
28397 // Returns one of the stack, base, or frame pointer registers, depending on
28398 // which is used to reference local variables.
28399 MachineFunction &MF = DAG.getMachineFunction();
28400 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28401 unsigned Reg;
28402 if (RegInfo->hasBasePointer(MF))
28403 Reg = RegInfo->getBaseRegister();
28404 else { // Handles the SP or FP case.
28405 bool CantUseFP = RegInfo->hasStackRealignment(MF);
28406 if (CantUseFP)
28407 Reg = RegInfo->getPtrSizedStackRegister(MF);
28408 else
28409 Reg = RegInfo->getPtrSizedFrameRegister(MF);
28410 }
28411 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
28412 }
28413 case Intrinsic::x86_avx512_vp2intersect_q_512:
28414 case Intrinsic::x86_avx512_vp2intersect_q_256:
28415 case Intrinsic::x86_avx512_vp2intersect_q_128:
28416 case Intrinsic::x86_avx512_vp2intersect_d_512:
28417 case Intrinsic::x86_avx512_vp2intersect_d_256:
28418 case Intrinsic::x86_avx512_vp2intersect_d_128: {
28419 MVT MaskVT = Op.getSimpleValueType();
28420
28421 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
28422 SDLoc DL(Op);
28423
28424 SDValue Operation =
28425 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
28426 Op->getOperand(1), Op->getOperand(2));
28427
28428 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
28429 MaskVT, Operation);
28430 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
28431 MaskVT, Operation);
28432 return DAG.getMergeValues({Result0, Result1}, DL);
28433 }
28434 case Intrinsic::x86_mmx_pslli_w:
28435 case Intrinsic::x86_mmx_pslli_d:
28436 case Intrinsic::x86_mmx_pslli_q:
28437 case Intrinsic::x86_mmx_psrli_w:
28438 case Intrinsic::x86_mmx_psrli_d:
28439 case Intrinsic::x86_mmx_psrli_q:
28440 case Intrinsic::x86_mmx_psrai_w:
28441 case Intrinsic::x86_mmx_psrai_d: {
28442 SDLoc DL(Op);
28443 SDValue ShAmt = Op.getOperand(2);
28444 // If the argument is a constant, convert it to a target constant.
28445 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
28446 // Clamp out-of-bounds shift amounts since they will otherwise be masked
28447 // to 8 bits, which may make them no longer out of bounds.
28448 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
28449 if (ShiftAmount == 0)
28450 return Op.getOperand(1);
28451
28452 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28453 Op.getOperand(0), Op.getOperand(1),
28454 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
28455 }
28456
28457 unsigned NewIntrinsic;
28458 switch (IntNo) {
28459 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28460 case Intrinsic::x86_mmx_pslli_w:
28461 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
28462 break;
28463 case Intrinsic::x86_mmx_pslli_d:
28464 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
28465 break;
28466 case Intrinsic::x86_mmx_pslli_q:
28467 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
28468 break;
28469 case Intrinsic::x86_mmx_psrli_w:
28470 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
28471 break;
28472 case Intrinsic::x86_mmx_psrli_d:
28473 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28474 break;
28475 case Intrinsic::x86_mmx_psrli_q:
28476 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28477 break;
28478 case Intrinsic::x86_mmx_psrai_w:
28479 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28480 break;
28481 case Intrinsic::x86_mmx_psrai_d:
28482 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28483 break;
28484 }
28485
28486 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
28487 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
28488 // MMX register.
28489 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28490 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28491 DAG.getTargetConstant(NewIntrinsic, DL,
28492 getPointerTy(DAG.getDataLayout())),
28493 Op.getOperand(1), ShAmt);
28494 }
28495 case Intrinsic::thread_pointer: {
28496 if (Subtarget.isTargetELF()) {
28497 SDLoc dl(Op);
28498 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28499 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28500 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28501 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28502 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28503 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28504 }
28505 report_fatal_error(
28506 "Target OS doesn't support __builtin_thread_pointer() yet.");
28507 }
28508 }
28509}
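
The Intrinsic::thread_pointer case above lowers the thread-pointer read to a load from %fs:0 (64-bit) or %gs:0 (32-bit) on ELF targets. A minimal sketch of the builtin that produces this intrinsic, assuming a recent GCC/Clang on x86 ELF; illustrative only, not part of X86ISelLowering.cpp:

#include <cstdio>

int main() {
  // Lowered through the Intrinsic::thread_pointer path on ELF; other targets
  // hit the report_fatal_error branch above.
  void *tp = __builtin_thread_pointer();
  std::printf("thread pointer: %p\n", tp);
  return 0;
}
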
28510
28511static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28512 SDValue Src, SDValue Mask, SDValue Base,
28513 SDValue Index, SDValue ScaleOp, SDValue Chain,
28514 const X86Subtarget &Subtarget) {
28515 SDLoc dl(Op);
28516 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28517 // Scale must be constant.
28518 if (!C)
28519 return SDValue();
28520 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28521 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28522 TLI.getPointerTy(DAG.getDataLayout()));
28523 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28524 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28525 // If source is undef or we know it won't be used, use a zero vector
28526 // to break register dependency.
28527 // TODO: use undef instead and let BreakFalseDeps deal with it?
28528 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28529 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28530
28531 // Cast mask to an integer type.
28532 Mask = DAG.getBitcast(MaskVT, Mask);
28533
28534 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28535
28536 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28537 SDValue Res =
28538 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28539 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28540 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28541}
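
getAVX2GatherNode replaces an undef pass-through source with zeros to break a false output dependency; at the source level, lanes whose mask bit is clear keep the src operand. A sketch assuming AVX2 and <immintrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(32) int table[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  __m256i idx  = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i src  = _mm256_set1_epi32(-1);                        // pass-through
  __m256i mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); // low 4 lanes enabled
  __m256i g = _mm256_mask_i32gather_epi32(src, table, idx, mask, /*scale=*/4);

  alignas(32) int out[8];
  _mm256_store_si256(reinterpret_cast<__m256i *>(out), g);
  for (int i = 0; i < 8; ++i)
    std::printf("%d ", out[i]);   // 10 11 12 13 -1 -1 -1 -1
  std::printf("\n");
  return 0;
}
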
28542
28543static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28544 SDValue Src, SDValue Mask, SDValue Base,
28545 SDValue Index, SDValue ScaleOp, SDValue Chain,
28546 const X86Subtarget &Subtarget) {
28547 MVT VT = Op.getSimpleValueType();
28548 SDLoc dl(Op);
28549 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28550 // Scale must be constant.
28551 if (!C)
28552 return SDValue();
28553 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28554 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28555 TLI.getPointerTy(DAG.getDataLayout()));
28556 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28557 VT.getVectorNumElements());
28558 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28559
28560 // We support two versions of the gather intrinsics. One with scalar mask and
28561 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28562 if (Mask.getValueType() != MaskVT)
28563 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28564
28565 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28566 // If source is undef or we know it won't be used, use a zero vector
28567 // to break register dependency.
28568 // TODO: use undef instead and let BreakFalseDeps deal with it?
28569 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28570 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28571
28572 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28573
28574 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28575 SDValue Res =
28576 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28577 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28578 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28579}
28580
28581static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28582 SDValue Src, SDValue Mask, SDValue Base,
28583 SDValue Index, SDValue ScaleOp, SDValue Chain,
28584 const X86Subtarget &Subtarget) {
28585 SDLoc dl(Op);
28586 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28587 // Scale must be constant.
28588 if (!C)
28589 return SDValue();
28590 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28591 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28592 TLI.getPointerTy(DAG.getDataLayout()));
28593 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28594 Src.getSimpleValueType().getVectorNumElements());
28595 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28596
28597 // We support two versions of the scatter intrinsics. One with scalar mask and
28598 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28599 if (Mask.getValueType() != MaskVT)
28600 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28601
28602 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28603
28604 SDVTList VTs = DAG.getVTList(MVT::Other);
28605 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28606 SDValue Res =
28607 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28608 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28609 return Res;
28610}
28611
28612static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28613 SDValue Mask, SDValue Base, SDValue Index,
28614 SDValue ScaleOp, SDValue Chain,
28615 const X86Subtarget &Subtarget) {
28616 SDLoc dl(Op);
28617 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28618 // Scale must be constant.
28619 if (!C)
28620 return SDValue();
28621 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28622 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28623 TLI.getPointerTy(DAG.getDataLayout()));
28624 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28625 SDValue Segment = DAG.getRegister(0, MVT::i32);
28626 MVT MaskVT =
28627 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28628 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28629 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28630 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28631 return SDValue(Res, 0);
28632}
28633
28634/// Handles the lowering of builtin intrinsics with chain that return their
28635/// value into registers EDX:EAX.
28636 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
28637/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28638/// TargetOpcode.
28639/// Returns a Glue value which can be used to add extra copy-from-reg if the
28640 /// expanded intrinsics implicitly define extra registers (i.e. not just
28641/// EDX:EAX).
28642static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28643 SelectionDAG &DAG,
28644 unsigned TargetOpcode,
28645 unsigned SrcReg,
28646 const X86Subtarget &Subtarget,
28647 SmallVectorImpl<SDValue> &Results) {
28648 SDValue Chain = N->getOperand(0);
28649 SDValue Glue;
28650
28651 if (SrcReg) {
28652 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28653 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28654 Glue = Chain.getValue(1);
28655 }
28656
28657 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28658 SDValue N1Ops[] = {Chain, Glue};
28659 SDNode *N1 = DAG.getMachineNode(
28660 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28661 Chain = SDValue(N1, 0);
28662
28663 // Reads the content of XCR and returns it in registers EDX:EAX.
28664 SDValue LO, HI;
28665 if (Subtarget.is64Bit()) {
28666 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28667 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28668 LO.getValue(2));
28669 } else {
28670 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28671 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28672 LO.getValue(2));
28673 }
28674 Chain = HI.getValue(1);
28675 Glue = HI.getValue(2);
28676
28677 if (Subtarget.is64Bit()) {
28678 // Merge the two 32-bit values into a 64-bit one.
28679 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28680 DAG.getConstant(32, DL, MVT::i8));
28681 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28682 Results.push_back(Chain);
28683 return Glue;
28684 }
28685
28686 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28687 SDValue Ops[] = { LO, HI };
28688 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28689 Results.push_back(Pair);
28690 Results.push_back(Chain);
28691 return Glue;
28692}
28693
28694/// Handles the lowering of builtin intrinsics that read the time stamp counter
28695/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28696/// READCYCLECOUNTER nodes.
28697static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28698 SelectionDAG &DAG,
28699 const X86Subtarget &Subtarget,
28700 SmallVectorImpl<SDValue> &Results) {
28701 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28702 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28703 // and the EAX register is loaded with the low-order 32 bits.
28704 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28705 /* NoRegister */0, Subtarget,
28706 Results);
28707 if (Opcode != X86::RDTSCP)
28708 return;
28709
28710 SDValue Chain = Results[1];
28711 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
28712 // the ECX register. Add 'ecx' explicitly to the chain.
28713 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28714 Results[1] = ecx;
28715 Results.push_back(ecx.getValue(1));
28716}
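
At the source level the two flavors differ only in the IA32_TSC_AUX value that RDTSCP additionally returns, which is why the helper above appends the extra ECX copy for X86::RDTSCP. A sketch assuming <x86intrin.h>; illustrative only, not part of X86ISelLowering.cpp:

#include <x86intrin.h>
#include <cstdio>

int main() {
  unsigned long long t0 = __rdtsc();      // EDX:EAX merged into one 64-bit value
  unsigned aux = 0;
  unsigned long long t1 = __rdtscp(&aux); // also reads IA32_TSC_AUX into aux
  std::printf("delta=%llu tsc_aux=%u\n", t1 - t0, aux);
  return 0;
}
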
28717
28718static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28719 SelectionDAG &DAG) {
28720 SmallVector<SDValue, 3> Results;
28721 SDLoc DL(Op);
28722 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28723 Results);
28724 return DAG.getMergeValues(Results, DL);
28725}
28726
28727static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28728 MachineFunction &MF = DAG.getMachineFunction();
28729 SDValue Chain = Op.getOperand(0);
28730 SDValue RegNode = Op.getOperand(2);
28731 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28732 if (!EHInfo)
28733 report_fatal_error("EH registrations only live in functions using WinEH");
28734
28735 // Cast the operand to an alloca, and remember the frame index.
28736 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28737 if (!FINode)
28738 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28739 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28740
28741 // Return the chain operand without making any DAG nodes.
28742 return Chain;
28743}
28744
28745static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28746 MachineFunction &MF = DAG.getMachineFunction();
28747 SDValue Chain = Op.getOperand(0);
28748 SDValue EHGuard = Op.getOperand(2);
28749 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28750 if (!EHInfo)
28751 report_fatal_error("EHGuard only live in functions using WinEH");
28752
28753 // Cast the operand to an alloca, and remember the frame index.
28754 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28755 if (!FINode)
28756 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28757 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28758
28759 // Return the chain operand without making any DAG nodes.
28760 return Chain;
28761}
28762
28763/// Emit Truncating Store with signed or unsigned saturation.
28764static SDValue
28765EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28766 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28767 SelectionDAG &DAG) {
28768 SDVTList VTs = DAG.getVTList(MVT::Other);
28769 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28770 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28771 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28772 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28773}
28774
28775/// Emit Masked Truncating Store with signed or unsigned saturation.
28776static SDValue
28777EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28778 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28779 MachineMemOperand *MMO, SelectionDAG &DAG) {
28780 SDVTList VTs = DAG.getVTList(MVT::Other);
28781 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28782 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28783 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28784}
28785
28786static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28787 SelectionDAG &DAG) {
28788 unsigned IntNo = Op.getConstantOperandVal(1);
28789 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28790 if (!IntrData) {
28791 switch (IntNo) {
28792
28793 case Intrinsic::swift_async_context_addr: {
28794 SDLoc dl(Op);
28795 auto &MF = DAG.getMachineFunction();
28796 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28797 if (Subtarget.is64Bit()) {
28798 MF.getFrameInfo().setFrameAddressIsTaken(true);
28799 X86FI->setHasSwiftAsyncContext(true);
28800 SDValue Chain = Op->getOperand(0);
28801 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28802 SDValue Result =
28803 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28804 DAG.getTargetConstant(8, dl, MVT::i32)),
28805 0);
28806 // Return { result, chain }.
28807 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28808 CopyRBP.getValue(1));
28809 } else {
28810 // 32-bit so no special extended frame, create or reuse an existing
28811 // stack slot.
28812 if (!X86FI->getSwiftAsyncContextFrameIdx())
28813 X86FI->setSwiftAsyncContextFrameIdx(
28814 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28815 SDValue Result =
28816 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28817 // Return { result, chain }.
28818 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28819 Op->getOperand(0));
28820 }
28821 }
28822
28823 case llvm::Intrinsic::x86_seh_ehregnode:
28824 return MarkEHRegistrationNode(Op, DAG);
28825 case llvm::Intrinsic::x86_seh_ehguard:
28826 return MarkEHGuard(Op, DAG);
28827 case llvm::Intrinsic::x86_rdpkru: {
28828 SDLoc dl(Op);
28829 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28830 // Create a RDPKRU node and pass 0 to the ECX parameter.
28831 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28832 DAG.getConstant(0, dl, MVT::i32));
28833 }
28834 case llvm::Intrinsic::x86_wrpkru: {
28835 SDLoc dl(Op);
28836 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28837 // to the EDX and ECX parameters.
28838 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28839 Op.getOperand(0), Op.getOperand(2),
28840 DAG.getConstant(0, dl, MVT::i32),
28841 DAG.getConstant(0, dl, MVT::i32));
28842 }
28843 case llvm::Intrinsic::asan_check_memaccess: {
28844 // Mark this as adjustsStack because it will be lowered to a call.
28845 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28846 // Don't do anything here, we will expand these intrinsics out later.
28847 return Op;
28848 }
28849 case llvm::Intrinsic::x86_flags_read_u32:
28850 case llvm::Intrinsic::x86_flags_read_u64:
28851 case llvm::Intrinsic::x86_flags_write_u32:
28852 case llvm::Intrinsic::x86_flags_write_u64: {
28853 // We need a frame pointer because this will get lowered to a PUSH/POP
28854 // sequence.
28855 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28856 MFI.setHasCopyImplyingStackAdjustment(true);
28857 // Don't do anything here, we will expand these intrinsics out later
28858 // during FinalizeISel in EmitInstrWithCustomInserter.
28859 return Op;
28860 }
28861 case Intrinsic::x86_lwpins32:
28862 case Intrinsic::x86_lwpins64:
28863 case Intrinsic::x86_umwait:
28864 case Intrinsic::x86_tpause: {
28865 SDLoc dl(Op);
28866 SDValue Chain = Op->getOperand(0);
28867 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28868 unsigned Opcode;
28869
28870 switch (IntNo) {
28871 default: llvm_unreachable("Impossible intrinsic");
28872 case Intrinsic::x86_umwait:
28873 Opcode = X86ISD::UMWAIT;
28874 break;
28875 case Intrinsic::x86_tpause:
28876 Opcode = X86ISD::TPAUSE;
28877 break;
28878 case Intrinsic::x86_lwpins32:
28879 case Intrinsic::x86_lwpins64:
28880 Opcode = X86ISD::LWPINS;
28881 break;
28882 }
28883
28884 SDValue Operation =
28885 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28886 Op->getOperand(3), Op->getOperand(4));
28887 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28888 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28889 Operation.getValue(1));
28890 }
28891 case Intrinsic::x86_enqcmd:
28892 case Intrinsic::x86_enqcmds: {
28893 SDLoc dl(Op);
28894 SDValue Chain = Op.getOperand(0);
28895 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28896 unsigned Opcode;
28897 switch (IntNo) {
28898 default: llvm_unreachable("Impossible intrinsic!");
28899 case Intrinsic::x86_enqcmd:
28900 Opcode = X86ISD::ENQCMD;
28901 break;
28902 case Intrinsic::x86_enqcmds:
28903 Opcode = X86ISD::ENQCMDS;
28904 break;
28905 }
28906 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
28907 Op.getOperand(3));
28908 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
28909 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28910 Operation.getValue(1));
28911 }
28912 case Intrinsic::x86_aesenc128kl:
28913 case Intrinsic::x86_aesdec128kl:
28914 case Intrinsic::x86_aesenc256kl:
28915 case Intrinsic::x86_aesdec256kl: {
28916 SDLoc DL(Op);
28917 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
28918 SDValue Chain = Op.getOperand(0);
28919 unsigned Opcode;
28920
28921 switch (IntNo) {
28922 default: llvm_unreachable("Impossible intrinsic");
28923 case Intrinsic::x86_aesenc128kl:
28924 Opcode = X86ISD::AESENC128KL;
28925 break;
28926 case Intrinsic::x86_aesdec128kl:
28927 Opcode = X86ISD::AESDEC128KL;
28928 break;
28929 case Intrinsic::x86_aesenc256kl:
28930 Opcode = X86ISD::AESENC256KL;
28931 break;
28932 case Intrinsic::x86_aesdec256kl:
28933 Opcode = X86ISD::AESDEC256KL;
28934 break;
28935 }
28936
28937 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28938 MachineMemOperand *MMO = MemIntr->getMemOperand();
28939 EVT MemVT = MemIntr->getMemoryVT();
28940 SDValue Operation = DAG.getMemIntrinsicNode(
28941 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
28942 MMO);
28943 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
28944
28945 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28946 {ZF, Operation.getValue(0), Operation.getValue(2)});
28947 }
28948 case Intrinsic::x86_aesencwide128kl:
28949 case Intrinsic::x86_aesdecwide128kl:
28950 case Intrinsic::x86_aesencwide256kl:
28951 case Intrinsic::x86_aesdecwide256kl: {
28952 SDLoc DL(Op);
28953 SDVTList VTs = DAG.getVTList(
28954 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
28955 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
28956 SDValue Chain = Op.getOperand(0);
28957 unsigned Opcode;
28958
28959 switch (IntNo) {
28960 default: llvm_unreachable("Impossible intrinsic");
28961 case Intrinsic::x86_aesencwide128kl:
28962 Opcode = X86ISD::AESENCWIDE128KL;
28963 break;
28964 case Intrinsic::x86_aesdecwide128kl:
28965 Opcode = X86ISD::AESDECWIDE128KL;
28966 break;
28967 case Intrinsic::x86_aesencwide256kl:
28968 Opcode = X86ISD::AESENCWIDE256KL;
28969 break;
28970 case Intrinsic::x86_aesdecwide256kl:
28971 Opcode = X86ISD::AESDECWIDE256KL;
28972 break;
28973 }
28974
28975 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28976 MachineMemOperand *MMO = MemIntr->getMemOperand();
28977 EVT MemVT = MemIntr->getMemoryVT();
28978 SDValue Operation = DAG.getMemIntrinsicNode(
28979 Opcode, DL, VTs,
28980 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
28981 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
28982 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
28983 MemVT, MMO);
28984 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
28985
28986 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28987 {ZF, Operation.getValue(1), Operation.getValue(2),
28988 Operation.getValue(3), Operation.getValue(4),
28989 Operation.getValue(5), Operation.getValue(6),
28990 Operation.getValue(7), Operation.getValue(8),
28991 Operation.getValue(9)});
28992 }
28993 case Intrinsic::x86_testui: {
28994 SDLoc dl(Op);
28995 SDValue Chain = Op.getOperand(0);
28996 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28997 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
28998 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28999 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29000 Operation.getValue(1));
29001 }
29002 case Intrinsic::x86_atomic_bts_rm:
29003 case Intrinsic::x86_atomic_btc_rm:
29004 case Intrinsic::x86_atomic_btr_rm: {
29005 SDLoc DL(Op);
29006 MVT VT = Op.getSimpleValueType();
29007 SDValue Chain = Op.getOperand(0);
29008 SDValue Op1 = Op.getOperand(2);
29009 SDValue Op2 = Op.getOperand(3);
29010 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
29011 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
29012 : X86ISD::LBTR_RM;
29013 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29014 SDValue Res =
29015 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29016 {Chain, Op1, Op2}, VT, MMO);
29017 Chain = Res.getValue(1);
29018 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29019 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29020 }
29021 case Intrinsic::x86_atomic_bts:
29022 case Intrinsic::x86_atomic_btc:
29023 case Intrinsic::x86_atomic_btr: {
29024 SDLoc DL(Op);
29025 MVT VT = Op.getSimpleValueType();
29026 SDValue Chain = Op.getOperand(0);
29027 SDValue Op1 = Op.getOperand(2);
29028 SDValue Op2 = Op.getOperand(3);
29029 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
29030 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
29031 : X86ISD::LBTR;
29032 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
29033 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29034 SDValue Res =
29035 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29036 {Chain, Op1, Op2, Size}, VT, MMO);
29037 Chain = Res.getValue(1);
29038 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29039 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
29040 if (Imm)
29041 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
29042 DAG.getShiftAmountConstant(Imm, VT, DL));
29043 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29044 }
29045 case Intrinsic::x86_cmpccxadd32:
29046 case Intrinsic::x86_cmpccxadd64: {
29047 SDLoc DL(Op);
29048 SDValue Chain = Op.getOperand(0);
29049 SDValue Addr = Op.getOperand(2);
29050 SDValue Src1 = Op.getOperand(3);
29051 SDValue Src2 = Op.getOperand(4);
29052 SDValue CC = Op.getOperand(5);
29053 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29054 SDValue Operation = DAG.getMemIntrinsicNode(
29055 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
29056 MVT::i32, MMO);
29057 return Operation;
29058 }
29059 case Intrinsic::x86_aadd32:
29060 case Intrinsic::x86_aadd64:
29061 case Intrinsic::x86_aand32:
29062 case Intrinsic::x86_aand64:
29063 case Intrinsic::x86_aor32:
29064 case Intrinsic::x86_aor64:
29065 case Intrinsic::x86_axor32:
29066 case Intrinsic::x86_axor64: {
29067 SDLoc DL(Op);
29068 SDValue Chain = Op.getOperand(0);
29069 SDValue Op1 = Op.getOperand(2);
29070 SDValue Op2 = Op.getOperand(3);
29071 MVT VT = Op2.getSimpleValueType();
29072 unsigned Opc = 0;
29073 switch (IntNo) {
29074 default:
29075 llvm_unreachable("Unknown Intrinsic");
29076 case Intrinsic::x86_aadd32:
29077 case Intrinsic::x86_aadd64:
29078 Opc = X86ISD::AADD;
29079 break;
29080 case Intrinsic::x86_aand32:
29081 case Intrinsic::x86_aand64:
29082 Opc = X86ISD::AAND;
29083 break;
29084 case Intrinsic::x86_aor32:
29085 case Intrinsic::x86_aor64:
29086 Opc = X86ISD::AOR;
29087 break;
29088 case Intrinsic::x86_axor32:
29089 case Intrinsic::x86_axor64:
29090 Opc = X86ISD::AXOR;
29091 break;
29092 }
29093 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
29094 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
29095 {Chain, Op1, Op2}, VT, MMO);
29096 }
29097 case Intrinsic::x86_atomic_add_cc:
29098 case Intrinsic::x86_atomic_sub_cc:
29099 case Intrinsic::x86_atomic_or_cc:
29100 case Intrinsic::x86_atomic_and_cc:
29101 case Intrinsic::x86_atomic_xor_cc: {
29102 SDLoc DL(Op);
29103 SDValue Chain = Op.getOperand(0);
29104 SDValue Op1 = Op.getOperand(2);
29105 SDValue Op2 = Op.getOperand(3);
29106 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
29107 MVT VT = Op2.getSimpleValueType();
29108 unsigned Opc = 0;
29109 switch (IntNo) {
29110 default:
29111 llvm_unreachable("Unknown Intrinsic");
29112 case Intrinsic::x86_atomic_add_cc:
29113 Opc = X86ISD::LADD;
29114 break;
29115 case Intrinsic::x86_atomic_sub_cc:
29116 Opc = X86ISD::LSUB;
29117 break;
29118 case Intrinsic::x86_atomic_or_cc:
29119 Opc = X86ISD::LOR;
29120 break;
29121 case Intrinsic::x86_atomic_and_cc:
29122 Opc = X86ISD::LAND;
29123 break;
29124 case Intrinsic::x86_atomic_xor_cc:
29125 Opc = X86ISD::LXOR;
29126 break;
29127 }
29128 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29129 SDValue LockArith =
29130 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29131 {Chain, Op1, Op2}, VT, MMO);
29132 Chain = LockArith.getValue(1);
29133 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
29134 }
29135 }
29136 return SDValue();
29137 }
29138
29139 SDLoc dl(Op);
29140 switch(IntrData->Type) {
29141 default: llvm_unreachable("Unknown Intrinsic Type");
29142 case RDSEED:
29143 case RDRAND: {
29144 // Emit the node with the right value type.
29145 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
29146 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29147
29148 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
29150 // Otherwise return the value from Rand, which is always 0, cast to i32.
29150 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
29151 DAG.getConstant(1, dl, Op->getValueType(1)),
29152 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
29153 SDValue(Result.getNode(), 1)};
29154 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
29155
29156 // Return { result, isValid, chain }.
29157 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
29158 SDValue(Result.getNode(), 2));
29159 }
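
The CMOV sequence above is what makes the *_step intrinsics return 1 only when CF reported a valid random value. A usage sketch assuming RDRAND hardware support and <immintrin.h> (built with -mrdrnd); illustrative only, not part of X86ISelLowering.cpp:

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned value = 0;
  // _rdrand32_step returns 1 only when the hardware produced a valid value
  // (CF = 1); retry a few times on transient failure.
  for (int attempt = 0; attempt < 10; ++attempt) {
    if (_rdrand32_step(&value)) {
      std::printf("random value: %u\n", value);
      return 0;
    }
  }
  std::printf("RDRAND did not return a valid value\n");
  return 1;
}
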
29160 case GATHER_AVX2: {
29161 SDValue Chain = Op.getOperand(0);
29162 SDValue Src = Op.getOperand(2);
29163 SDValue Base = Op.getOperand(3);
29164 SDValue Index = Op.getOperand(4);
29165 SDValue Mask = Op.getOperand(5);
29166 SDValue Scale = Op.getOperand(6);
29167 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29168 Scale, Chain, Subtarget);
29169 }
29170 case GATHER: {
29171 //gather(v1, mask, index, base, scale);
29172 SDValue Chain = Op.getOperand(0);
29173 SDValue Src = Op.getOperand(2);
29174 SDValue Base = Op.getOperand(3);
29175 SDValue Index = Op.getOperand(4);
29176 SDValue Mask = Op.getOperand(5);
29177 SDValue Scale = Op.getOperand(6);
29178 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
29179 Chain, Subtarget);
29180 }
29181 case SCATTER: {
29182 //scatter(base, mask, index, v1, scale);
29183 SDValue Chain = Op.getOperand(0);
29184 SDValue Base = Op.getOperand(2);
29185 SDValue Mask = Op.getOperand(3);
29186 SDValue Index = Op.getOperand(4);
29187 SDValue Src = Op.getOperand(5);
29188 SDValue Scale = Op.getOperand(6);
29189 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29190 Scale, Chain, Subtarget);
29191 }
29192 case PREFETCH: {
29193 const APInt &HintVal = Op.getConstantOperandAPInt(6);
29194 assert((HintVal == 2 || HintVal == 3) &&
29195 "Wrong prefetch hint in intrinsic: should be 2 or 3");
29196 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
29197 SDValue Chain = Op.getOperand(0);
29198 SDValue Mask = Op.getOperand(2);
29199 SDValue Index = Op.getOperand(3);
29200 SDValue Base = Op.getOperand(4);
29201 SDValue Scale = Op.getOperand(5);
29202 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
29203 Subtarget);
29204 }
29205 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
29206 case RDTSC: {
29207 SmallVector<SDValue, 2> Results;
29208 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
29209 Results);
29210 return DAG.getMergeValues(Results, dl);
29211 }
29212 // Read Performance Monitoring Counters.
29213 case RDPMC:
29214 // Read Processor Register.
29215 case RDPRU:
29216 // GetExtended Control Register.
29217 case XGETBV: {
29218 SmallVector<SDValue, 2> Results;
29219
29220 // RDPMC uses ECX to select the index of the performance counter to read.
29221 // RDPRU uses ECX to select the processor register to read.
29222 // XGETBV uses ECX to select the index of the XCR register to return.
29223 // The result is stored into registers EDX:EAX.
29224 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
29225 Subtarget, Results);
29226 return DAG.getMergeValues(Results, dl);
29227 }
29228 // XTEST intrinsics.
29229 case XTEST: {
29230 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
29231 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29232
29233 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
29234 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
29235 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
29236 Ret, SDValue(InTrans.getNode(), 1));
29237 }
29238 case TRUNCATE_TO_MEM_VI8:
29239 case TRUNCATE_TO_MEM_VI16:
29240 case TRUNCATE_TO_MEM_VI32: {
29241 SDValue Mask = Op.getOperand(4);
29242 SDValue DataToTruncate = Op.getOperand(3);
29243 SDValue Addr = Op.getOperand(2);
29244 SDValue Chain = Op.getOperand(0);
29245
29246 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
29247 assert(MemIntr && "Expected MemIntrinsicSDNode!");
29248
29249 EVT MemVT = MemIntr->getMemoryVT();
29250
29251 uint16_t TruncationOp = IntrData->Opc0;
29252 switch (TruncationOp) {
29253 case X86ISD::VTRUNC: {
29254 if (isAllOnesConstant(Mask)) // return just a truncate store
29255 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
29256 MemIntr->getMemOperand());
29257
29258 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29259 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29260 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
29261
29262 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
29263 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
29264 true /* truncating */);
29265 }
29266 case X86ISD::VTRUNCUS:
29267 case X86ISD::VTRUNCS: {
29268 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
29269 if (isAllOnesConstant(Mask))
29270 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
29271 MemIntr->getMemOperand(), DAG);
29272
29273 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29274 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29275
29276 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
29277 VMask, MemVT, MemIntr->getMemOperand(), DAG);
29278 }
29279 default:
29280 llvm_unreachable("Unsupported truncstore intrinsic");
29281 }
29282 }
29283 }
29284}
29285
29286SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
29287 SelectionDAG &DAG) const {
29288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29289 MFI.setReturnAddressIsTaken(true);
29290
29291 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
29292 return SDValue();
29293
29294 unsigned Depth = Op.getConstantOperandVal(0);
29295 SDLoc dl(Op);
29296 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29297
29298 if (Depth > 0) {
29299 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
29300 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29301 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
29302 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
29303 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
29304 MachinePointerInfo());
29305 }
29306
29307 // Just load the return address.
29308 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
29309 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
29310 MachinePointerInfo());
29311}
29312
29313SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
29314 SelectionDAG &DAG) const {
29315 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
29316 return getReturnAddressFrameIndex(DAG);
29317}
29318
29319SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
29320 MachineFunction &MF = DAG.getMachineFunction();
29321 MachineFrameInfo &MFI = MF.getFrameInfo();
29322 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
29323 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29324 EVT VT = Op.getValueType();
29325
29326 MFI.setFrameAddressIsTaken(true);
29327
29328 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
29329 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
29330 // is not possible to crawl up the stack without looking at the unwind codes
29331 // simultaneously.
29332 int FrameAddrIndex = FuncInfo->getFAIndex();
29333 if (!FrameAddrIndex) {
29334 // Set up a frame object for the return address.
29335 unsigned SlotSize = RegInfo->getSlotSize();
29336 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
29337 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
29338 FuncInfo->setFAIndex(FrameAddrIndex);
29339 }
29340 return DAG.getFrameIndex(FrameAddrIndex, VT);
29341 }
29342
29343 unsigned FrameReg =
29344 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
29345 SDLoc dl(Op); // FIXME probably not meaningful
29346 unsigned Depth = Op.getConstantOperandVal(0);
29347 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
29348 (FrameReg == X86::EBP && VT == MVT::i32)) &&
29349 "Invalid Frame Register!");
29350 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
29351 while (Depth--)
29352 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
29353 MachinePointerInfo());
29354 return FrameAddr;
29355}
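
LowerRETURNADDR and LowerFRAMEADDR back the GCC-style builtins shown below; depths greater than zero walk the saved-frame-pointer chain and are only reliable when frame pointers are preserved. A minimal sketch; illustrative only, not part of X86ISelLowering.cpp:

#include <cstdio>

__attribute__((noinline)) static void show() {
  void *ret   = __builtin_return_address(0); // lowered via LowerRETURNADDR
  void *frame = __builtin_frame_address(0);  // lowered via LowerFRAMEADDR
  std::printf("return address %p, frame address %p\n", ret, frame);
}

int main() {
  show();
  return 0;
}
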
29356
29357// FIXME? Maybe this could be a TableGen attribute on some registers and
29358// this table could be generated automatically from RegInfo.
29359Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
29360 const MachineFunction &MF) const {
29361 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
29362
29363 Register Reg = StringSwitch<unsigned>(RegName)
29364 .Case("esp", X86::ESP)
29365 .Case("rsp", X86::RSP)
29366 .Case("ebp", X86::EBP)
29367 .Case("rbp", X86::RBP)
29368 .Default(0);
29369
29370 if (Reg == X86::EBP || Reg == X86::RBP) {
29371 if (!TFI.hasFP(MF))
29372 report_fatal_error("register " + StringRef(RegName) +
29373 " is allocatable: function has no frame pointer");
29374#ifndef NDEBUG
29375 else {
29376 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29377 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
29378 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
29379        "Invalid Frame Register!");
29380 }
29381#endif
29382 }
29383
29384 if (Reg)
29385 return Reg;
29386
29387 report_fatal_error("Invalid register name global variable");
29388}
29389
29390SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
29391 SelectionDAG &DAG) const {
29392 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29393 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
29394}
29395
29396Register X86TargetLowering::getExceptionPointerRegister(
29397 const Constant *PersonalityFn) const {
29398 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
29399 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29400
29401 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
29402}
29403
29404Register X86TargetLowering::getExceptionSelectorRegister(
29405 const Constant *PersonalityFn) const {
29406 // Funclet personalities don't use selectors (the runtime does the selection).
29407 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
29408 return X86::NoRegister;
29409 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29410}
29411
29412bool X86TargetLowering::needsFixedCatchObjects() const {
29413 return Subtarget.isTargetWin64();
29414}
29415
29416SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
29417 SDValue Chain = Op.getOperand(0);
29418 SDValue Offset = Op.getOperand(1);
29419 SDValue Handler = Op.getOperand(2);
29420 SDLoc dl (Op);
29421
29422 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29423 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29424 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
29425 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
29426         (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
29427        "Invalid Frame Register!");
29428 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
29429 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
29430
29431 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
29432 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
29433 dl));
29434 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
29435 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
29436 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
29437
29438 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
29439 DAG.getRegister(StoreAddrReg, PtrVT));
29440}
29441
29442SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
29443 SelectionDAG &DAG) const {
29444 SDLoc DL(Op);
29445 // If the subtarget is not 64bit, we may need the global base reg
29446 // after the isel expand-pseudo step, i.e., after the CGBR pass has run.
29447 // Therefore, ask for the GlobalBaseReg now, so that the pass
29448 // inserts the code for us in case we need it.
29449 // Otherwise, we will end up in a situation where we will
29450 // reference a virtual register that is not defined!
29451 if (!Subtarget.is64Bit()) {
29452 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29453 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
29454 }
29455 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
29456 DAG.getVTList(MVT::i32, MVT::Other),
29457 Op.getOperand(0), Op.getOperand(1));
29458}
29459
29460SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
29461 SelectionDAG &DAG) const {
29462 SDLoc DL(Op);
29463 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
29464 Op.getOperand(0), Op.getOperand(1));
29465}
29466
29467SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
29468 SelectionDAG &DAG) const {
29469 SDLoc DL(Op);
29470 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
29471 Op.getOperand(0));
29472}
29473
29474static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29475 return Op.getOperand(0);
29476}
29477
29478SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29479 SelectionDAG &DAG) const {
29480 SDValue Root = Op.getOperand(0);
29481 SDValue Trmp = Op.getOperand(1); // trampoline
29482 SDValue FPtr = Op.getOperand(2); // nested function
29483 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29484 SDLoc dl (Op);
29485
29486 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29487 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29488
29489 if (Subtarget.is64Bit()) {
29490 SDValue OutChains[6];
29491
29492 // Large code-model.
29493 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
29494 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29495
29496 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29497 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29498
29499 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
29500
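// Editor's note (illustrative summary derived from the opcode constants
// above, not part of the original source): the stores below assemble a
// 23-byte trampoline of the form
//   0:  49 BB <FPtr:8>   movabsq $FPtr, %r11
//   10: 49 BA <Nest:8>   movabsq $Nest, %r10
//   20: 49 FF E3         jmpq *%r11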
29501 // Load the pointer to the nested function into R11.
29502 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29503 SDValue Addr = Trmp;
29504 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29505 Addr, MachinePointerInfo(TrmpAddr));
29506
29507 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29508 DAG.getConstant(2, dl, MVT::i64));
29509 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29510 MachinePointerInfo(TrmpAddr, 2), Align(2));
29511
29512 // Load the 'nest' parameter value into R10.
29513 // R10 is specified in X86CallingConv.td
29514 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29515 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29516 DAG.getConstant(10, dl, MVT::i64));
29517 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29518 Addr, MachinePointerInfo(TrmpAddr, 10));
29519
29520 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29521 DAG.getConstant(12, dl, MVT::i64));
29522 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29523 MachinePointerInfo(TrmpAddr, 12), Align(2));
29524
29525 // Jump to the nested function.
29526 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29527 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29528 DAG.getConstant(20, dl, MVT::i64));
29529 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29530 Addr, MachinePointerInfo(TrmpAddr, 20));
29531
29532 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29533 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29534 DAG.getConstant(22, dl, MVT::i64));
29535 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29536 Addr, MachinePointerInfo(TrmpAddr, 22));
29537
29538 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29539 } else {
29540 const Function *Func =
29541 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29542 CallingConv::ID CC = Func->getCallingConv();
29543 unsigned NestReg;
29544
29545 switch (CC) {
29546 default:
29547 llvm_unreachable("Unsupported calling convention");
29548 case CallingConv::C:
29549 case CallingConv::X86_StdCall: {
29550 // Pass 'nest' parameter in ECX.
29551 // Must be kept in sync with X86CallingConv.td
29552 NestReg = X86::ECX;
29553
29554 // Check that ECX wasn't needed by an 'inreg' parameter.
29555 FunctionType *FTy = Func->getFunctionType();
29556 const AttributeList &Attrs = Func->getAttributes();
29557
29558 if (!Attrs.isEmpty() && !Func->isVarArg()) {
29559 unsigned InRegCount = 0;
29560 unsigned Idx = 0;
29561
29562 for (FunctionType::param_iterator I = FTy->param_begin(),
29563 E = FTy->param_end(); I != E; ++I, ++Idx)
29564 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29565 const DataLayout &DL = DAG.getDataLayout();
29566 // FIXME: should only count parameters that are lowered to integers.
29567 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29568 }
29569
29570 if (InRegCount > 2) {
29571 report_fatal_error("Nest register in use - reduce number of inreg"
29572 " parameters!");
29573 }
29574 }
29575 break;
29576 }
29577 case CallingConv::X86_FastCall:
29578 case CallingConv::X86_ThisCall:
29579 case CallingConv::Fast:
29580 case CallingConv::Tail:
29581 case CallingConv::SwiftTail:
29582 // Pass 'nest' parameter in EAX.
29583 // Must be kept in sync with X86CallingConv.td
29584 NestReg = X86::EAX;
29585 break;
29586 }
29587
29588 SDValue OutChains[4];
29589 SDValue Addr, Disp;
29590
29591 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29592 DAG.getConstant(10, dl, MVT::i32));
29593 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29594
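// Editor's note (illustrative summary derived from the constants below,
// not part of the original source): the 32-bit trampoline assembled here
// is 10 bytes:
//   0: B8+r <Nest:4>   movl $Nest, %NestReg
//   5: E9 <Disp:4>     jmp FPtr (rel32, relative to Trmp+10)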
29595 // This is storing the opcode for MOV32ri.
29596 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29597 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29598 OutChains[0] =
29599 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29600 Trmp, MachinePointerInfo(TrmpAddr));
29601
29602 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29603 DAG.getConstant(1, dl, MVT::i32));
29604 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29605 MachinePointerInfo(TrmpAddr, 1), Align(1));
29606
29607 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29608 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29609 DAG.getConstant(5, dl, MVT::i32));
29610 OutChains[2] =
29611 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29612 MachinePointerInfo(TrmpAddr, 5), Align(1));
29613
29614 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29615 DAG.getConstant(6, dl, MVT::i32));
29616 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29617 MachinePointerInfo(TrmpAddr, 6), Align(1));
29618
29619 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29620 }
29621}
29622
29623SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29624 SelectionDAG &DAG) const {
29625 /*
29626 The rounding mode is in bits 11:10 of FPSR, and has the following
29627 settings:
29628 00 Round to nearest
29629 01 Round to -inf
29630 10 Round to +inf
29631 11 Round to 0
29632
29633 GET_ROUNDING, on the other hand, expects the following:
29634 -1 Undefined
29635 0 Round to 0
29636 1 Round to nearest
29637 2 Round to +inf
29638 3 Round to -inf
29639
29640 To perform the conversion, we use a packed lookup table of the four 2-bit
29641 values that we can index by FPSR[11:10]
29642 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29643
29644 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
29645 */
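// Editor's note (worked example, not in the original source): with
// FPSR[11:10] = 10 (round to +inf), (FPSR & 0xc00) = 0x800, shifting right
// by 9 gives 4, and (0x2d >> 4) & 3 = 2, which is GET_ROUNDING's encoding
// for "Round to +inf".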
29646
29647 MachineFunction &MF = DAG.getMachineFunction();
29648 MVT VT = Op.getSimpleValueType();
29649 SDLoc DL(Op);
29650
29651 // Save FP Control Word to stack slot
29652 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29653 SDValue StackSlot =
29654 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29655
29656 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29657
29658 SDValue Chain = Op.getOperand(0);
29659 SDValue Ops[] = {Chain, StackSlot};
29660 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29661 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29662 Align(2), MachineMemOperand::MOStore);
29663
29664 // Load FP Control Word from stack slot
29665 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29666 Chain = CWD.getValue(1);
29667
29668 // Mask and turn the control bits into a shift for the lookup table.
29669 SDValue Shift =
29670 DAG.getNode(ISD::SRL, DL, MVT::i16,
29671 DAG.getNode(ISD::AND, DL, MVT::i16,
29672 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29673 DAG.getConstant(9, DL, MVT::i8));
29674 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29675
29676 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29677 SDValue RetVal =
29678 DAG.getNode(ISD::AND, DL, MVT::i32,
29679 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29680 DAG.getConstant(3, DL, MVT::i32));
29681
29682 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29683
29684 return DAG.getMergeValues({RetVal, Chain}, DL);
29685}
29686
29687SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29688 SelectionDAG &DAG) const {
29689 MachineFunction &MF = DAG.getMachineFunction();
29690 SDLoc DL(Op);
29691 SDValue Chain = Op.getNode()->getOperand(0);
29692
29693 // FP control word may be set only from data in memory. So we need to allocate
29694 // stack space to save/load FP control word.
29695 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29696 SDValue StackSlot =
29697 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29698 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29699 MachineMemOperand *MMO =
29700 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29701
29702 // Store FP control word into memory.
29703 SDValue Ops[] = {Chain, StackSlot};
29704 Chain = DAG.getMemIntrinsicNode(
29705 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29706
29707 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29708 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29709 Chain = CWD.getValue(1);
29710 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29711 DAG.getConstant(0xf3ff, DL, MVT::i16));
29712
29713 // Calculate new rounding mode.
29714 SDValue NewRM = Op.getNode()->getOperand(1);
29715 SDValue RMBits;
29716 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29717 uint64_t RM = CVal->getZExtValue();
29718 int FieldVal;
29719 switch (static_cast<RoundingMode>(RM)) {
29720 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29721 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29722 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29723 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29724 default:
29725 llvm_unreachable("rounding mode is not supported by X86 hardware");
29726 }
29727 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29728 } else {
29729 // Need to convert argument into bits of control word:
29730 // 0 Round to 0 -> 11
29731 // 1 Round to nearest -> 00
29732 // 2 Round to +inf -> 10
29733 // 3 Round to -inf -> 01
29734 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
29735 // To make the conversion, put all these values into a value 0xc9 and shift
29736 // it left depending on the rounding mode:
29737 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29738 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29739 // ...
29740 // (0xc9 << (2 * NewRM + 4)) & 0xc00
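// Editor's note (worked example, illustrative only): for NewRM = 3
// (round to -inf) the shift amount is 2*3+4 = 10, so
// (0xc9 << 10) & 0xc00 = 0x400, i.e. RM field 01, as required.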
29741 SDValue ShiftValue =
29742 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29743 DAG.getNode(ISD::ADD, DL, MVT::i32,
29744 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29745 DAG.getConstant(1, DL, MVT::i8)),
29746 DAG.getConstant(4, DL, MVT::i32)));
29747 SDValue Shifted =
29748 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29749 ShiftValue);
29750 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29751 DAG.getConstant(0xc00, DL, MVT::i16));
29752 }
29753
29754 // Update rounding mode bits and store the new FP Control Word into stack.
29755 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29756 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29757
29758 // Load FP control word from the slot.
29759 SDValue OpsLD[] = {Chain, StackSlot};
29760 MachineMemOperand *MMOL =
29761 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29762 Chain = DAG.getMemIntrinsicNode(
29763 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29764
29765 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29766 // same way but in bits 14:13.
29767 if (Subtarget.hasSSE1()) {
29768 // Store MXCSR into memory.
29769 Chain = DAG.getNode(
29770 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29771 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29772 StackSlot);
29773
29774 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29775 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29776 Chain = CWD.getValue(1);
29777 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29778 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29779
29780 // Shift X87 RM bits from 11:10 to 14:13.
29781 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29782 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29783 DAG.getConstant(3, DL, MVT::i8));
29784
29785 // Update rounding mode bits and store the new FP Control Word into stack.
29786 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29787 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29788
29789 // Load MXCSR from the slot.
29790 Chain = DAG.getNode(
29791 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29792 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29793 StackSlot);
29794 }
29795
29796 return Chain;
29797}
29798
29799 /// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
29800//
29801 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
29802 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29803 // split the vector, perform the operation on its Lo and Hi parts, and
29804 // concatenate the results.
29805static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29806 const X86Subtarget &Subtarget) {
29807 assert(Op.getOpcode() == ISD::CTLZ);
29808 SDLoc dl(Op);
29809 MVT VT = Op.getSimpleValueType();
29810 MVT EltVT = VT.getVectorElementType();
29811 unsigned NumElems = VT.getVectorNumElements();
29812
29813 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29814        "Unsupported element type");
29815
29816 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29817 if (NumElems > 16 ||
29818 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29819 return splitVectorIntUnary(Op, DAG);
29820
29821 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29822 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29823        "Unsupported value type for operation");
29824
29825 // Use the natively supported vector instruction vplzcntd.
29826 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29827 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29828 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29829 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
29830
29831 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29832}
29833
29834// Lower CTLZ using a PSHUFB lookup table implementation.
29835static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29836 const X86Subtarget &Subtarget,
29837 SelectionDAG &DAG) {
29838 MVT VT = Op.getSimpleValueType();
29839 int NumElts = VT.getVectorNumElements();
29840 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29841 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29842
29843 // Per-nibble leading zero PSHUFB lookup table.
29844 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29845 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29846 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29847 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
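// Editor's note (worked example, not in the original source): for the byte
// 0x1A the hi nibble is 0x1 (LUT -> 3) and the lo nibble is 0xA (LUT -> 0);
// since the hi nibble is non-zero only the hi result is kept, giving
// ctlz(0x1A) = 3.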
29848
29849 SmallVector<SDValue, 64> LUTVec;
29850 for (int i = 0; i < NumBytes; ++i)
29851 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29852 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29853
29854 // Begin by bitcasting the input to byte vector, then split those bytes
29855 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29856 // If the hi input nibble is zero then we add both results together, otherwise
29857 // we just take the hi result (by masking the lo result to zero before the
29858 // add).
29859 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29860 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29861
29862 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29863 SDValue Lo = Op0;
29864 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29865 SDValue HiZ;
29866 if (CurrVT.is512BitVector()) {
29867 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29868 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29869 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29870 } else {
29871 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29872 }
29873
29874 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29875 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29876 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29877 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29878
29879 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29880 // of the current vector width in the same way we did for the nibbles.
29881 // If the upper half of the input element is zero then add the halves'
29882 // leading zero counts together, otherwise just use the upper half's.
29883 // Double the width of the result until we are at target width.
29884 while (CurrVT != VT) {
29885 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29886 int CurrNumElts = CurrVT.getVectorNumElements();
29887 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29888 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29889 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29890
29891 // Check if the upper half of the input element is zero.
29892 if (CurrVT.is512BitVector()) {
29893 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29894 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29895 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29896 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29897 } else {
29898 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29899 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29900 }
29901 HiZ = DAG.getBitcast(NextVT, HiZ);
29902
29903 // Move the upper/lower halves to the lower bits as we'll be extending to
29904 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29905 // together.
29906 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29907 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29908 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29909 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29910 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29911 CurrVT = NextVT;
29912 }
29913
29914 return Res;
29915}
29916
29917static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29918 const X86Subtarget &Subtarget,
29919 SelectionDAG &DAG) {
29920 MVT VT = Op.getSimpleValueType();
29921
29922 if (Subtarget.hasCDI() &&
29923 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29924 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29925 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29926
29927 // Decompose 256-bit ops into smaller 128-bit ops.
29928 if (VT.is256BitVector() && !Subtarget.hasInt256())
29929 return splitVectorIntUnary(Op, DAG);
29930
29931 // Decompose 512-bit ops into smaller 256-bit ops.
29932 if (VT.is512BitVector() && !Subtarget.hasBWI())
29933 return splitVectorIntUnary(Op, DAG);
29934
29935 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29936 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29937}
29938
29939static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29940 SelectionDAG &DAG) {
29941 MVT VT = Op.getSimpleValueType();
29942 MVT OpVT = VT;
29943 unsigned NumBits = VT.getSizeInBits();
29944 SDLoc dl(Op);
29945 unsigned Opc = Op.getOpcode();
29946
29947 if (VT.isVector())
29948 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29949
29950 Op = Op.getOperand(0);
29951 if (VT == MVT::i8) {
29952 // Zero extend to i32 since there is no i8 bsr.
29953 OpVT = MVT::i32;
29954 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29955 }
29956
29957 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29958 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29959 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
29960
29961 if (Opc == ISD::CTLZ) {
29962 // If src is zero (i.e. bsr sets ZF), returns NumBits.
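// Editor's note: the CMOV constant is 2*NumBits-1 so that the final
// XOR with NumBits-1 below yields NumBits for a zero input.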
29963 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29964 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29965 Op.getValue(1)};
29966 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29967 }
29968
29969 // Finally xor with NumBits-1.
29970 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29971 DAG.getConstant(NumBits - 1, dl, OpVT));
29972
29973 if (VT == MVT::i8)
29974 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29975 return Op;
29976}
29977
29978static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29979 SelectionDAG &DAG) {
29980 MVT VT = Op.getSimpleValueType();
29981 unsigned NumBits = VT.getScalarSizeInBits();
29982 SDValue N0 = Op.getOperand(0);
29983 SDLoc dl(Op);
29984
29985 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29986        "Only scalar CTTZ requires custom lowering");
29987
29988 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29989 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29990 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
29991
29992 // If src is known never zero we can skip the CMOV.
29993 if (DAG.isKnownNeverZero(N0))
29994 return Op;
29995
29996 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29997 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29998 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29999 Op.getValue(1)};
30000 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
30001}
30002
30003static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
30004 const X86Subtarget &Subtarget) {
30005 MVT VT = Op.getSimpleValueType();
30006 if (VT == MVT::i16 || VT == MVT::i32)
30007 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
30008
30009 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30010 return splitVectorIntBinary(Op, DAG);
30011
30012 assert(Op.getSimpleValueType().is256BitVector() &&
30013        Op.getSimpleValueType().isInteger() &&
30014        "Only handle AVX 256-bit vector integer operation");
30015 return splitVectorIntBinary(Op, DAG);
30016}
30017
30018static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
30019 const X86Subtarget &Subtarget) {
30020 MVT VT = Op.getSimpleValueType();
30021 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
30022 unsigned Opcode = Op.getOpcode();
30023 SDLoc DL(Op);
30024
30025 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
30026 (VT.is256BitVector() && !Subtarget.hasInt256())) {
30027 assert(Op.getSimpleValueType().isInteger() &&
30028        "Only handle AVX vector integer operation");
30029 return splitVectorIntBinary(Op, DAG);
30030 }
30031
30032 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
30033 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30034 EVT SetCCResultType =
30035 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30036
30037 unsigned BitWidth = VT.getScalarSizeInBits();
30038 if (Opcode == ISD::USUBSAT) {
30039 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
30040 // Handle a special-case with a bit-hack instead of cmp+select:
30041 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
30042 // If the target can use VPTERNLOG, DAGToDAG will match this as
30043 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
30044 // "broadcast" constant load.
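// Editor's note (worked example for i8, illustrative only): with
// SMIN = 0x80, X = 0x90 gives (0x90 ^ 0x80) & (0x90 s>> 7) = 0x10 & 0xFF
// = 0x10 = usubsat(0x90, 0x80); X = 0x10 gives 0x90 & 0x00 = 0.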
30045 ConstantSDNode *C = isConstOrConstSplat(Y, true);
30046 if (C && C->getAPIntValue().isSignMask()) {
30047 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
30048 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
30049 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
30050 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
30051 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
30052 }
30053 }
30054 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
30055 // usubsat X, Y --> (X >u Y) ? X - Y : 0
30056 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
30057 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
30058 // TODO: Move this to DAGCombiner?
30059 if (SetCCResultType == VT &&
30060 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
30061 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
30062 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
30063 }
30064 }
30065
30066 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
30067 (!VT.isVector() || VT == MVT::v2i64)) {
30068 APInt MinVal = APInt::getSignedMinValue(BitWidth);
30069 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
30070 SDValue Zero = DAG.getConstant(0, DL, VT);
30071 SDValue Result =
30072 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
30073 DAG.getVTList(VT, SetCCResultType), X, Y);
30074 SDValue SumDiff = Result.getValue(0);
30075 SDValue Overflow = Result.getValue(1);
30076 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
30077 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
30078 SDValue SumNeg =
30079 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
30080 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
30081 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
30082 }
30083
30084 // Use default expansion.
30085 return SDValue();
30086}
30087
30088static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
30089 SelectionDAG &DAG) {
30090 MVT VT = Op.getSimpleValueType();
30091 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
30092 // Since X86 does not have CMOV for 8-bit integers, we don't convert
30093 // 8-bit integer abs to NEG and CMOV.
30094 SDLoc DL(Op);
30095 SDValue N0 = Op.getOperand(0);
30096 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30097 DAG.getConstant(0, DL, VT), N0);
30098 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
30099 SDValue(Neg.getNode(), 1)};
30100 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
30101 }
30102
30103 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
30104 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
30105 SDLoc DL(Op);
30106 SDValue Src = Op.getOperand(0);
30107 SDValue Sub =
30108 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
30109 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
30110 }
30111
30112 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
30113 assert(VT.isInteger() &&
30114        "Only handle AVX 256-bit vector integer operation");
30115 return splitVectorIntUnary(Op, DAG);
30116 }
30117
30118 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30119 return splitVectorIntUnary(Op, DAG);
30120
30121 // Default to expand.
30122 return SDValue();
30123}
30124
30125static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
30126 SelectionDAG &DAG) {
30127 MVT VT = Op.getSimpleValueType();
30128
30129 // For AVX1 cases, split to use legal ops.
30130 if (VT.is256BitVector() && !Subtarget.hasInt256())
30131 return splitVectorIntBinary(Op, DAG);
30132
30133 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30134 return splitVectorIntBinary(Op, DAG);
30135
30136 // Default to expand.
30137 return SDValue();
30138}
30139
30140static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
30141 SelectionDAG &DAG) {
30142 MVT VT = Op.getSimpleValueType();
30143
30144 // For AVX1 cases, split to use legal ops.
30145 if (VT.is256BitVector() && !Subtarget.hasInt256())
30146 return splitVectorIntBinary(Op, DAG);
30147
30148 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30149 return splitVectorIntBinary(Op, DAG);
30150
30151 // umax(x,1) --> sub(x,cmpeq(x,0))
30152 // TODO: Move this to expandIntMINMAX?
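// Editor's note: cmpeq(x,0) is all-ones exactly when x == 0, so the
// subtraction yields x - (-1) = 1 for zero inputs and x - 0 = x otherwise.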
30153 if (VT.isVector() && Op.getOpcode() == ISD::UMAX &&
30154 llvm::isOneOrOneSplat(Op.getOperand(1), true)) {
30155 SDLoc DL(Op);
30156 SDValue X = DAG.getFreeze(Op.getOperand(0));
30157 SDValue Zero = getZeroVector(VT, Subtarget, DAG, DL);
30158 return DAG.getNode(ISD::SUB, DL, VT, X,
30159 DAG.getSetCC(DL, VT, X, Zero, ISD::SETEQ));
30160 }
30161
30162 // Default to expand.
30163 return SDValue();
30164}
30165
30166static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
30167 SelectionDAG &DAG) {
30168 MVT VT = Op.getSimpleValueType();
30169
30170 // For AVX1 cases, split to use legal ops.
30171 if (VT.is256BitVector() && !Subtarget.hasInt256())
30172 return splitVectorIntBinary(Op, DAG);
30173
30174 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
30175 return splitVectorIntBinary(Op, DAG);
30176
30177 // TODO: Add TargetLowering expandABD() support.
30178 SDLoc dl(Op);
30179 bool IsSigned = Op.getOpcode() == ISD::ABDS;
30180 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30181 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30182 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30183
30184 // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
30185 // abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs))
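// Editor's note (illustrative): abds(3, -5) = smax(3,-5) - smin(3,-5)
// = 3 - (-5) = 8.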
30186 unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
30187 unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
30188 if (TLI.isOperationLegal(MaxOpc, VT) && TLI.isOperationLegal(MinOpc, VT)) {
30189 SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
30190 SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
30191 return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
30192 }
30193
30194 // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
30195 // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
30196 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30197 ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
30198 SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
30199 return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
30200 DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
30201}
30202
30203static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
30204 SelectionDAG &DAG) {
30205 SDLoc dl(Op);
30206 MVT VT = Op.getSimpleValueType();
30207
30208 // Decompose 256-bit ops into 128-bit ops.
30209 if (VT.is256BitVector() && !Subtarget.hasInt256())
30210 return splitVectorIntBinary(Op, DAG);
30211
30212 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30213 return splitVectorIntBinary(Op, DAG);
30214
30215 SDValue A = Op.getOperand(0);
30216 SDValue B = Op.getOperand(1);
30217
30218 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
30219 // vector pairs, multiply and truncate.
30220 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
30221 unsigned NumElts = VT.getVectorNumElements();
30222
30223 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30224 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30225 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30226 return DAG.getNode(
30227 ISD::TRUNCATE, dl, VT,
30228 DAG.getNode(ISD::MUL, dl, ExVT,
30229 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
30230 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
30231 }
30232
30233 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30234
30235 // Extract the lo/hi parts and any-extend them to i16.
30236 // We're going to mask off the low byte of each result element of the
30237 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
30238 // element.
30239 SDValue Undef = DAG.getUNDEF(VT);
30240 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
30241 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
30242
30243 SDValue BLo, BHi;
30244 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30245 // If the RHS is a constant, manually unpackl/unpackh.
30246 SmallVector<SDValue, 16> LoOps, HiOps;
30247 for (unsigned i = 0; i != NumElts; i += 16) {
30248 for (unsigned j = 0; j != 8; ++j) {
30249 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
30250 MVT::i16));
30251 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
30252 MVT::i16));
30253 }
30254 }
30255
30256 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30257 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30258 } else {
30259 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
30260 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
30261 }
30262
30263 // Multiply, mask the lower 8bits of the lo/hi results and pack.
30264 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
30265 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
30266 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30267 }
30268
30269 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
30270 if (VT == MVT::v4i32) {
30271 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
30272        "Should not custom lower when pmulld is available!");
30273
30274 // Extract the odd parts.
30275 static const int UnpackMask[] = { 1, -1, 3, -1 };
30276 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
30277 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
30278
30279 // Multiply the even parts.
30280 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30281 DAG.getBitcast(MVT::v2i64, A),
30282 DAG.getBitcast(MVT::v2i64, B));
30283 // Now multiply odd parts.
30284 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30285 DAG.getBitcast(MVT::v2i64, Aodds),
30286 DAG.getBitcast(MVT::v2i64, Bodds));
30287
30288 Evens = DAG.getBitcast(VT, Evens);
30289 Odds = DAG.getBitcast(VT, Odds);
30290
30291 // Merge the two vectors back together with a shuffle. This expands into 2
30292 // shuffles.
30293 static const int ShufMask[] = { 0, 4, 2, 6 };
30294 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
30295 }
30296
30297 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
30298        "Only know how to lower V2I64/V4I64/V8I64 multiply");
30299 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
30300
30301 // Ahi = psrlqi(a, 32);
30302 // Bhi = psrlqi(b, 32);
30303 //
30304 // AloBlo = pmuludq(a, b);
30305 // AloBhi = pmuludq(a, Bhi);
30306 // AhiBlo = pmuludq(Ahi, b);
30307 //
30308 // Hi = psllqi(AloBhi + AhiBlo, 32);
30309 // return AloBlo + Hi;
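// Editor's note: this is the schoolbook decomposition
// a*b mod 2^64 = aLo*bLo + ((aLo*bHi + aHi*bLo) << 32), with the aHi*bHi
// term dropped because it is shifted out of the low 64 bits.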
30310 KnownBits AKnown = DAG.computeKnownBits(A);
30311 KnownBits BKnown = DAG.computeKnownBits(B);
30312
30313 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
30314 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
30315 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
30316
30317 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
30318 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
30319 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
30320
30321 SDValue Zero = DAG.getConstant(0, dl, VT);
30322
30323 // Only multiply lo/hi halves that aren't known to be zero.
30324 SDValue AloBlo = Zero;
30325 if (!ALoIsZero && !BLoIsZero)
30326 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
30327
30328 SDValue AloBhi = Zero;
30329 if (!ALoIsZero && !BHiIsZero) {
30330 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
30331 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
30332 }
30333
30334 SDValue AhiBlo = Zero;
30335 if (!AHiIsZero && !BLoIsZero) {
30336 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
30337 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
30338 }
30339
30340 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
30341 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
30342
30343 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
30344}
30345
30346static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
30347 MVT VT, bool IsSigned,
30348 const X86Subtarget &Subtarget,
30349 SelectionDAG &DAG,
30350 SDValue *Low = nullptr) {
30351 unsigned NumElts = VT.getVectorNumElements();
30352
30353 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
30354 // to a vXi16 type. Do the multiplies, shift the results and pack the half
30355 // lane results back together.
30356
30357 // We'll take different approaches for signed and unsigned.
30358 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
30359 // and use pmullw to calculate the full 16-bit product.
30360 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
30361 // shift them left into the upper byte of each word. This allows us to use
30362 // pmulhw to calculate the full 16-bit product. This trick means we don't
30363 // need to sign extend the bytes to use pmullw.
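// Editor's note: with both inputs shifted into the high byte, pmulhw
// computes ((a << 8) * (b << 8)) >> 16 = a * b as a signed 16-bit value,
// so both product halves can be recovered by the packs below.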
30364
30365 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30366 SDValue Zero = DAG.getConstant(0, dl, VT);
30367
30368 SDValue ALo, AHi;
30369 if (IsSigned) {
30370 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
30371 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
30372 } else {
30373 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
30374 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
30375 }
30376
30377 SDValue BLo, BHi;
30378 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30379 // If the RHS is a constant, manually unpackl/unpackh and extend.
30380 SmallVector<SDValue, 16> LoOps, HiOps;
30381 for (unsigned i = 0; i != NumElts; i += 16) {
30382 for (unsigned j = 0; j != 8; ++j) {
30383 SDValue LoOp = B.getOperand(i + j);
30384 SDValue HiOp = B.getOperand(i + j + 8);
30385
30386 if (IsSigned) {
30387 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
30388 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
30389 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
30390 DAG.getConstant(8, dl, MVT::i16));
30391 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
30392 DAG.getConstant(8, dl, MVT::i16));
30393 } else {
30394 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
30395 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
30396 }
30397
30398 LoOps.push_back(LoOp);
30399 HiOps.push_back(HiOp);
30400 }
30401 }
30402
30403 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30404 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30405 } else if (IsSigned) {
30406 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
30407 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
30408 } else {
30409 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
30410 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
30411 }
30412
30413 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
30414 // pack back to vXi8.
30415 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
30416 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
30417 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
30418
30419 if (Low)
30420 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30421
30422 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
30423}
30424
30425static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
30426 SelectionDAG &DAG) {
30427 SDLoc dl(Op);
30428 MVT VT = Op.getSimpleValueType();
30429 bool IsSigned = Op->getOpcode() == ISD::MULHS;
30430 unsigned NumElts = VT.getVectorNumElements();
30431 SDValue A = Op.getOperand(0);
30432 SDValue B = Op.getOperand(1);
30433
30434 // Decompose 256-bit ops into 128-bit ops.
30435 if (VT.is256BitVector() && !Subtarget.hasInt256())
30436 return splitVectorIntBinary(Op, DAG);
30437
30438 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30439 return splitVectorIntBinary(Op, DAG);
30440
30441 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
30442 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
30443        (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
30444        (VT == MVT::v16i32 && Subtarget.hasAVX512()));
30445
30446 // PMULxD operations multiply each even value (starting at 0) of LHS with
30447 // the related value of RHS and produce a widened result.
30448 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30449 // => <2 x i64> <ae|cg>
30450 //
30451 // In other words, to have all the results, we need to perform two PMULxD:
30452 // 1. one with the even values.
30453 // 2. one with the odd values.
30454 // To achieve #2, we need to place the odd values at an even position.
30455 //
30456 // Place the odd value at an even position (basically, shift all values 1
30457 // step to the left):
30458 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
30459 9, -1, 11, -1, 13, -1, 15, -1};
30460 // <a|b|c|d> => <b|undef|d|undef>
30461 SDValue Odd0 =
30462 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
30463 // <e|f|g|h> => <f|undef|h|undef>
30464 SDValue Odd1 =
30465 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
30466
30467 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
30468 // ints.
30469 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
30470 unsigned Opcode =
30471 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
30472 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30473 // => <2 x i64> <ae|cg>
30474 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30475 DAG.getBitcast(MulVT, A),
30476 DAG.getBitcast(MulVT, B)));
30477 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
30478 // => <2 x i64> <bf|dh>
30479 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30480 DAG.getBitcast(MulVT, Odd0),
30481 DAG.getBitcast(MulVT, Odd1)));
30482
30483 // Shuffle it back into the right order.
30484 SmallVector<int, 16> ShufMask(NumElts);
30485 for (int i = 0; i != (int)NumElts; ++i)
30486 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
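// Editor's note (worked example, NumElts = 4): the mask becomes
// {1, 5, 3, 7}, i.e. the odd (high) 32-bit halves of Mul1 and Mul2
// interleaved back into <ae_hi|bf_hi|cg_hi|dh_hi>.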
30487
30488 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30489
30490 // If we have a signed multiply but no PMULDQ fix up the result of an
30491 // unsigned multiply.
30492 if (IsSigned && !Subtarget.hasSSE41()) {
30493 SDValue Zero = DAG.getConstant(0, dl, VT);
30494 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30495 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30496 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30497 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30498
30499 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30500 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30501 }
30502
30503 return Res;
30504 }
30505
30506 // Only i8 vectors should need custom lowering after this.
30507 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30508         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30509        "Unsupported vector type");
30510
30511 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30512 // logical shift down the upper half and pack back to i8.
30513
30514 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30515 // and then ashr/lshr the upper bits down to the lower bits before multiply.
30516
30517 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30518 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30519 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30520 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30521 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30522 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30523 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30524 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30525 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30526 }
30527
30528 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30529}
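
The signed fix-up above uses the identity mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0), evaluated modulo 2^32. Below is a minimal standalone scalar sketch of that identity; it is not part of X86ISelLowering.cpp, the helper names are made up for illustration, and it assumes ordinary two's-complement narrowing and arithmetic >> on signed values for the reference result.

#include <cassert>
#include <cstdint>

// High 32 bits of an unsigned 32x32->64 multiply: the scalar analogue of one
// PMULUDQ lane.
static uint32_t mulhu32(uint32_t a, uint32_t b) {
  return (uint32_t)(((uint64_t)a * b) >> 32);
}

// Signed multiply-high rebuilt from the unsigned one, mirroring the
// "fix up the result of an unsigned multiply" block above:
//   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)  (mod 2^32)
static int32_t mulhs32FromMulhu(int32_t a, int32_t b) {
  uint32_t res = mulhu32((uint32_t)a, (uint32_t)b);
  uint32_t t1 = (a < 0) ? (uint32_t)b : 0u;
  uint32_t t2 = (b < 0) ? (uint32_t)a : 0u;
  return (int32_t)(res - t1 - t2);
}

int main() {
  const int32_t vals[] = {0, 1, -1, 7, -7, 123456789, INT32_MIN, INT32_MAX};
  for (int32_t a : vals)
    for (int32_t b : vals)
      // Reference: high word of the full 64-bit signed product (arithmetic >>).
      assert(mulhs32FromMulhu(a, b) == (int32_t)(((int64_t)a * b) >> 32));
  return 0;
}
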
30530
30531// Custom lowering for SMULO/UMULO.
30532static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30533 SelectionDAG &DAG) {
30534 MVT VT = Op.getSimpleValueType();
30535
30536 // Scalars defer to LowerXALUO.
30537 if (!VT.isVector())
30538 return LowerXALUO(Op, DAG);
30539
30540 SDLoc dl(Op);
30541 bool IsSigned = Op->getOpcode() == ISD::SMULO;
30542 SDValue A = Op.getOperand(0);
30543 SDValue B = Op.getOperand(1);
30544 EVT OvfVT = Op->getValueType(1);
30545
30546 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30547 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30548 // Extract the LHS Lo/Hi vectors
30549 SDValue LHSLo, LHSHi;
30550 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30551
30552 // Extract the RHS Lo/Hi vectors
30553 SDValue RHSLo, RHSHi;
30554 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30555
30556 EVT LoOvfVT, HiOvfVT;
30557 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30558 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30559 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30560
30561 // Issue the split operations.
30562 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30563 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30564
30565 // Join the separate data results and the overflow results.
30566 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30567 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30568 Hi.getValue(1));
30569
30570 return DAG.getMergeValues({Res, Ovf}, dl);
30571 }
30572
30573 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30574 EVT SetccVT =
30575 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30576
30577 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30578 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30579 unsigned NumElts = VT.getVectorNumElements();
30580 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30581 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30582 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30583 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30584 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30585
30586 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30587
30588 SDValue Ovf;
30589 if (IsSigned) {
30590 SDValue High, LowSign;
30591 if (OvfVT.getVectorElementType() == MVT::i1 &&
30592 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30593 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30594 // Shift the high down filling with sign bits.
30595 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30596 // Fill all 16 bits with the sign bit from the low.
30597 LowSign =
30598 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30599 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30600 15, DAG);
30601 SetccVT = OvfVT;
30602 if (!Subtarget.hasBWI()) {
30603 // We can't do a vXi16 compare so sign extend to v16i32.
30604 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30605 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30606 }
30607 } else {
30608 // Otherwise do the compare at vXi8.
30609 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30610 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30611 LowSign =
30612 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30613 }
30614
30615 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30616 } else {
30617 SDValue High =
30618 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30619 if (OvfVT.getVectorElementType() == MVT::i1 &&
30620 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30621 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30622 SetccVT = OvfVT;
30623 if (!Subtarget.hasBWI()) {
30624 // We can't do a vXi16 compare so sign extend to v16i32.
30625 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30626 }
30627 } else {
30628 // Otherwise do the compare at vXi8.
30629 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30630 }
30631
30632 Ovf =
30633 DAG.getSetCC(dl, SetccVT, High,
30634 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30635 }
30636
30637 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30638
30639 return DAG.getMergeValues({Low, Ovf}, dl);
30640 }
30641
30642 SDValue Low;
30643 SDValue High =
30644 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30645
30646 SDValue Ovf;
30647 if (IsSigned) {
30648 // SMULO overflows if the high bits don't match the sign of the low.
30649 SDValue LowSign =
30650 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30651 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30652 } else {
30653 // UMULO overflows if the high bits are non-zero.
30654 Ovf =
30655 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30656 }
30657
30658 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30659
30660 return DAG.getMergeValues({Low, Ovf}, dl);
30661}
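
The overflow tests in LowerMULO above reduce to: multiply in a twice-as-wide type, keep the low half, and flag overflow when the high half is non-zero (UMULO) or differs from the sign of the low half (SMULO). A standalone scalar sketch of those two checks for 8-bit lanes follows; it is illustrative only, not part of the file, and assumes ordinary two's-complement narrowing.

#include <cassert>
#include <cstdint>

// Scalar model of the vXi8 SMULO/UMULO lowering above: multiply in a wider
// type, keep the low byte, and derive the overflow flag from the high byte.
struct MulResult { uint8_t Low; bool Overflow; };

static MulResult umulo8(uint8_t a, uint8_t b) {
  uint16_t mul = (uint16_t)((uint16_t)a * (uint16_t)b); // zero-extended multiply
  uint8_t low = (uint8_t)mul;
  uint8_t high = (uint8_t)(mul >> 8);
  return {low, high != 0};                              // UMULO: high byte non-zero
}

static MulResult smulo8(int8_t a, int8_t b) {
  int16_t mul = (int16_t)((int16_t)a * (int16_t)b);     // sign-extended multiply
  uint8_t low = (uint8_t)mul;
  int8_t high = (int8_t)(uint8_t)((uint16_t)mul >> 8);
  int8_t lowSign = ((int8_t)low < 0) ? -1 : 0;          // sign of the low byte
  return {low, high != lowSign};                        // SMULO: high != sign of low
}

int main() {
  for (int a = -128; a < 128; ++a)
    for (int b = -128; b < 128; ++b) {
      int p = a * b;
      assert(smulo8((int8_t)a, (int8_t)b).Overflow == (p < -128 || p > 127));
    }
  for (int a = 0; a < 256; ++a)
    for (int b = 0; b < 256; ++b) {
      int p = a * b;
      assert(umulo8((uint8_t)a, (uint8_t)b).Overflow == (p > 255));
    }
  return 0;
}
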
30662
30663SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30664   assert(Subtarget.isTargetWin64() && "Unexpected target");
30665 EVT VT = Op.getValueType();
30666   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30667          "Unexpected return type for lowering");
30668
30669 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30670 SmallVector<SDValue> Result;
30671 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30672 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30673 }
30674
30675 RTLIB::Libcall LC;
30676 bool isSigned;
30677 switch (Op->getOpcode()) {
30678   default: llvm_unreachable("Unexpected request for libcall!");
30679 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30680 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30681 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30682 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30683 }
30684
30685 SDLoc dl(Op);
30686 SDValue InChain = DAG.getEntryNode();
30687
30688 TargetLowering::ArgListTy Args;
30689 TargetLowering::ArgListEntry Entry;
30690 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30691 EVT ArgVT = Op->getOperand(i).getValueType();
30692     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30693            "Unexpected argument type for lowering");
30694 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30695 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30696 MachinePointerInfo MPI =
30697 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30698 Entry.Node = StackPtr;
30699 InChain =
30700 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30701 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30702 Entry.Ty = PointerType::get(ArgTy,0);
30703 Entry.IsSExt = false;
30704 Entry.IsZExt = false;
30705 Args.push_back(Entry);
30706 }
30707
30708 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30709 getPointerTy(DAG.getDataLayout()));
30710
30711 TargetLowering::CallLoweringInfo CLI(DAG);
30712 CLI.setDebugLoc(dl)
30713 .setChain(InChain)
30714 .setLibCallee(
30715 getLibcallCallingConv(LC),
30716 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30717 std::move(Args))
30718 .setInRegister()
30719 .setSExtResult(isSigned)
30720 .setZExtResult(!isSigned);
30721
30722 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30723 return DAG.getBitcast(VT, CallInfo.first);
30724}
30725
30726SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30727 SelectionDAG &DAG,
30728 SDValue &Chain) const {
30729   assert(Subtarget.isTargetWin64() && "Unexpected target");
30730 EVT VT = Op.getValueType();
30731 bool IsStrict = Op->isStrictFPOpcode();
30732
30733 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30734 EVT ArgVT = Arg.getValueType();
30735
30736   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30737          "Unexpected return type for lowering");
30738
30739 RTLIB::Libcall LC;
30740 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30741 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30742 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30743 else
30744 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30745   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30746
30747 SDLoc dl(Op);
30748 MakeLibCallOptions CallOptions;
30749 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30750
30751 SDValue Result;
30752 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30753 // expected VT (i128).
30754 std::tie(Result, Chain) =
30755 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30756 Result = DAG.getBitcast(VT, Result);
30757 return Result;
30758}
30759
30760SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30761 SelectionDAG &DAG) const {
30762   assert(Subtarget.isTargetWin64() && "Unexpected target");
30763 EVT VT = Op.getValueType();
30764 bool IsStrict = Op->isStrictFPOpcode();
30765
30766 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30767 EVT ArgVT = Arg.getValueType();
30768
30769   assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30770          "Unexpected argument type for lowering");
30771
30772 RTLIB::Libcall LC;
30773 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30774 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30775 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30776 else
30777 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30778   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30779
30780 SDLoc dl(Op);
30781 MakeLibCallOptions CallOptions;
30782 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30783
30784 // Pass the i128 argument as an indirect argument on the stack.
30785 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30786 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30787 MachinePointerInfo MPI =
30788 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30789 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30790
30791 SDValue Result;
30792 std::tie(Result, Chain) =
30793 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30794 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30795}
30796
30797// Return true if the required (according to Opcode) shift-imm form is natively
30798// supported by the Subtarget
30799static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
30800 unsigned Opcode) {
30801 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30802 return false;
30803
30804 if (VT.getScalarSizeInBits() < 16)
30805 return false;
30806
30807 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30808 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30809 return true;
30810
30811 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30812 (VT.is256BitVector() && Subtarget.hasInt256());
30813
30814 bool AShift = LShift && (Subtarget.hasAVX512() ||
30815 (VT != MVT::v2i64 && VT != MVT::v4i64));
30816 return (Opcode == ISD::SRA) ? AShift : LShift;
30817}
30818
30819// The shift amount is a variable, but it is the same for all vector lanes.
30820// These instructions are defined together with shift-immediate.
30821static
30822bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
30823 unsigned Opcode) {
30824 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30825}
30826
30827// Return true if the required (according to Opcode) variable-shift form is
30828// natively supported by the Subtarget
30829static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
30830 unsigned Opcode) {
30831 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30832 return false;
30833
30834 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30835 return false;
30836
30837 // vXi16 supported only on AVX-512, BWI
30838 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30839 return false;
30840
30841 if (Subtarget.hasAVX512() &&
30842 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30843 return true;
30844
30845 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30846 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30847 return (Opcode == ISD::SRA) ? AShift : LShift;
30848}
30849
30850static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30851 const X86Subtarget &Subtarget) {
30852 MVT VT = Op.getSimpleValueType();
30853 SDLoc dl(Op);
30854 SDValue R = Op.getOperand(0);
30855 SDValue Amt = Op.getOperand(1);
30856 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30857
30858 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30859     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30860 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30861 SDValue Ex = DAG.getBitcast(ExVT, R);
30862
30863 // ashr(R, 63) === cmp_slt(R, 0)
30864 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30865       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30866              "Unsupported PCMPGT op");
30867 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30868 }
30869
30870 if (ShiftAmt >= 32) {
30871 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30872 SDValue Upper =
30873 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30874 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30875 ShiftAmt - 32, DAG);
30876 if (VT == MVT::v2i64)
30877 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30878 if (VT == MVT::v4i64)
30879 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30880 {9, 1, 11, 3, 13, 5, 15, 7});
30881 } else {
30882 // SRA upper i32, SRL whole i64 and select lower i32.
30883 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30884 ShiftAmt, DAG);
30885 SDValue Lower =
30886 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30887 Lower = DAG.getBitcast(ExVT, Lower);
30888 if (VT == MVT::v2i64)
30889 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30890 if (VT == MVT::v4i64)
30891 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30892 {8, 1, 10, 3, 12, 5, 14, 7});
30893 }
30894 return DAG.getBitcast(VT, Ex);
30895 };
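
ArithmeticShiftRight64 above rebuilds a 64-bit arithmetic shift right from 32-bit VSRAI/VSRLI pieces plus a shuffle. The following standalone scalar sketch mirrors that recombination; it is illustrative only, not part of the file, and like the reference comparison it assumes arithmetic behaviour of >> on signed values, which mainstream compilers provide.

#include <cassert>
#include <cstdint>

// Scalar model of ArithmeticShiftRight64: rebuild a 64-bit arithmetic shift
// right from 32-bit operations, the way the lowering combines VSRAI/VSRLI
// results with a shuffle.
static int64_t ashr64Via32(int64_t v, unsigned amt) {
  int32_t hi = (int32_t)((uint64_t)v >> 32);
  uint32_t resLo, resHi;
  if (amt >= 32) {
    // Low half comes from the high half shifted by (amt - 32); the new high
    // half is just the replicated sign bit.
    resLo = (uint32_t)(hi >> (amt - 32));
    resHi = (uint32_t)(hi >> 31);
  } else {
    // High half: arithmetic shift of the original high half.  Low half:
    // logical shift of the whole value, which pulls bits down from the high
    // half, keeping only the low 32 bits.
    resHi = (uint32_t)(hi >> amt);
    resLo = (uint32_t)((uint64_t)v >> amt);
  }
  return (int64_t)(((uint64_t)resHi << 32) | resLo);
}

int main() {
  const int64_t vals[] = {0, 1, -1, -42, 0x123456789abcdef0LL,
                          INT64_MIN, INT64_MAX};
  for (int64_t v : vals)
    for (unsigned amt = 0; amt < 64; ++amt)
      assert(ashr64Via32(v, amt) == (v >> amt)); // reference: built-in ashr
  return 0;
}
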
30896
30897 // Optimize shl/srl/sra with constant shift amount.
30898 APInt APIntShiftAmt;
30899 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30900 return SDValue();
30901
30902 // If the shift amount is out of range, return undef.
30903 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
30904 return DAG.getUNDEF(VT);
30905
30906 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30907
30908 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30909 // Hardware support for vector shifts is sparse which makes us scalarize the
30910 // vector operations in many cases. Also, on sandybridge ADD is faster than
30911 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30912 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30913 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30914 // must be 0). (add undef, undef) however can be any value. To make this
30915 // safe, we must freeze R to ensure that register allocation uses the same
30916 // register for an undefined value. This ensures that the result will
30917 // still be even and preserves the original semantics.
30918 R = DAG.getFreeze(R);
30919 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30920 }
30921
30922 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30923 }
30924
30925 // i64 SRA needs to be performed as partial shifts.
30926 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30927 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30928 Op.getOpcode() == ISD::SRA)
30929 return ArithmeticShiftRight64(ShiftAmt);
30930
30931 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30932 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30933 unsigned NumElts = VT.getVectorNumElements();
30934 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30935
30936 // Simple i8 add case
30937 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30938 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30939 // must be 0). (add undef, undef) however can be any value. To make this
30940 // safe, we must freeze R to ensure that register allocation uses the same
30941 // register for an undefined value. This ensures that the result will
30942 // still be even and preserves the original semantics.
30943 R = DAG.getFreeze(R);
30944 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30945 }
30946
30947 // ashr(R, 7) === cmp_slt(R, 0)
30948 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30949 SDValue Zeros = DAG.getConstant(0, dl, VT);
30950 if (VT.is512BitVector()) {
30951         assert(VT == MVT::v64i8 && "Unexpected element type!");
30952 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30953 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30954 }
30955 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30956 }
30957
30958 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30959 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30960 return SDValue();
30961
30962 if (Op.getOpcode() == ISD::SHL) {
30963 // Make a large shift.
30964 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30965 ShiftAmt, DAG);
30966 SHL = DAG.getBitcast(VT, SHL);
30967 // Zero out the rightmost bits.
30968 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30969 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30970 }
30971 if (Op.getOpcode() == ISD::SRL) {
30972 // Make a large shift.
30973 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30974 ShiftAmt, DAG);
30975 SRL = DAG.getBitcast(VT, SRL);
30976 // Zero out the leftmost bits.
30977 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30978 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30979 }
30980 if (Op.getOpcode() == ISD::SRA) {
30981 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
30982 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30983
30984 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30985 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30986 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30987 return Res;
30988 }
30989     llvm_unreachable("Unknown shift opcode.");
30990 }
30991
30992 return SDValue();
30993}
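
The vXi8 immediate-shift paths above do the shift in 16-bit lanes, mask off the bits that leaked across the byte boundary, and derive the arithmetic shift from the logical one via sub(xor(lshr(R, Amt), Mask), Mask) with Mask = 128 >> ShiftAmt. A standalone scalar sketch of those three steps follows; the helper names are illustrative, the code is not part of the file, and the arithmetic-shift reference assumes the usual arithmetic >> on signed values.

#include <cassert>
#include <cstdint>

// Scalar model of the vXi8 shift-by-immediate path: shift two packed bytes as
// one 16-bit lane, mask away the bits that leaked across the byte boundary,
// and recover an arithmetic shift from the logical one with the
// (x ^ m) - m trick, where m = 128 >> amt.
static uint8_t shl8Via16(uint8_t lo, uint8_t hi, unsigned amt, bool highByte) {
  uint16_t lane = (uint16_t)((uint16_t)lo | ((uint16_t)hi << 8));
  uint16_t shifted = (uint16_t)(lane << amt);
  uint8_t mask = (uint8_t)(0xFFu << amt);   // zero out the leaked low bits
  uint8_t byte = highByte ? (uint8_t)(shifted >> 8) : (uint8_t)shifted;
  return (uint8_t)(byte & mask);
}

static uint8_t lshr8Via16(uint8_t lo, uint8_t hi, unsigned amt, bool highByte) {
  uint16_t lane = (uint16_t)((uint16_t)lo | ((uint16_t)hi << 8));
  uint16_t shifted = (uint16_t)(lane >> amt);
  uint8_t mask = (uint8_t)(0xFFu >> amt);   // zero out the leaked high bits
  uint8_t byte = highByte ? (uint8_t)(shifted >> 8) : (uint8_t)shifted;
  return (uint8_t)(byte & mask);
}

static int8_t ashr8FromLshr(uint8_t x, unsigned amt) {
  uint8_t res = (uint8_t)(x >> amt);        // logical shift
  uint8_t m = (uint8_t)(0x80u >> amt);      // where the sign bit landed
  return (int8_t)(uint8_t)((res ^ m) - m);  // sign-extend from that bit
}

int main() {
  for (unsigned amt = 0; amt < 8; ++amt)
    for (int x = 0; x < 256; ++x) {
      // The high byte receives leaked bits on a left shift; the low byte
      // receives them on a right shift.
      assert(shl8Via16(0xA5, (uint8_t)x, amt, true) == (uint8_t)(x << amt));
      assert(lshr8Via16((uint8_t)x, 0xA5, amt, false) == (uint8_t)(x >> amt));
      const int8_t sx = (int8_t)x;
      assert(ashr8FromLshr((uint8_t)x, amt) == (int8_t)(sx >> amt));
    }
  return 0;
}
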
30994
30995static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30996 const X86Subtarget &Subtarget) {
30997 MVT VT = Op.getSimpleValueType();
30998 SDLoc dl(Op);
30999 SDValue R = Op.getOperand(0);
31000 SDValue Amt = Op.getOperand(1);
31001 unsigned Opcode = Op.getOpcode();
31002 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
31003
31004 int BaseShAmtIdx = -1;
31005 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
31006 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
31007 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
31008 Subtarget, DAG);
31009
31010 // vXi8 shifts - shift as v8i16 + mask result.
31011 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
31012 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
31013 VT == MVT::v64i8) &&
31014 !Subtarget.hasXOP()) {
31015 unsigned NumElts = VT.getVectorNumElements();
31016 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31017 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
31018 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
31019 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
31020
31021 // Create the mask using vXi16 shifts. For shift-rights we need to move
31022 // the upper byte down before splatting the vXi8 mask.
31023 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
31024 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
31025 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
31026 if (Opcode != ISD::SHL)
31027 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
31028 8, DAG);
31029 BitMask = DAG.getBitcast(VT, BitMask);
31030 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
31031 SmallVector<int, 64>(NumElts, 0));
31032
31033 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
31034 DAG.getBitcast(ExtVT, R), BaseShAmt,
31035 BaseShAmtIdx, Subtarget, DAG);
31036 Res = DAG.getBitcast(VT, Res);
31037 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
31038
31039 if (Opcode == ISD::SRA) {
31040 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
31041 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
31042 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
31043 SignMask =
31044 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
31045 BaseShAmtIdx, Subtarget, DAG);
31046 SignMask = DAG.getBitcast(VT, SignMask);
31047 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
31048 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
31049 }
31050 return Res;
31051 }
31052 }
31053 }
31054
31055 return SDValue();
31056}
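
The mask construction in the uniform-variable vXi8 path above shifts an all-ones vXi16 value by the runtime amount, moves the upper byte down for right shifts, and splats byte 0 as the per-byte keep-mask. A small standalone sketch of that byte-mask computation (illustrative only, not part of the file):

#include <cassert>
#include <cstdint>

// Scalar model of the mask built in the uniform-variable vXi8 shift path:
// shift an all-ones 16-bit lane by the runtime amount, move the upper byte
// down for right shifts, and take byte 0 as the per-byte keep-mask that the
// vector code splats across all lanes.
static uint8_t byteMaskForShift(bool isLeftShift, unsigned amt) {
  uint16_t bits = 0xFFFF;
  bits = isLeftShift ? (uint16_t)(bits << amt) : (uint16_t)(bits >> amt);
  if (!isLeftShift)
    bits = (uint16_t)(bits >> 8);            // move the upper byte down
  return (uint8_t)bits;                      // byte 0 becomes the splat value
}

int main() {
  for (unsigned amt = 0; amt < 8; ++amt) {
    assert(byteMaskForShift(true, amt) == (uint8_t)(0xFFu << amt));
    assert(byteMaskForShift(false, amt) == (uint8_t)(0xFFu >> amt));
  }
  return 0;
}
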
31057
31058// Convert a shift/rotate left amount to a multiplication scale factor.
31059static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
31060 const X86Subtarget &Subtarget,
31061 SelectionDAG &DAG) {
31062 MVT VT = Amt.getSimpleValueType();
31063 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
31064 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
31065 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
31066 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
31067 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31068 (Subtarget.hasBWI() && VT == MVT::v64i8)))
31069 return SDValue();
31070
31071 MVT SVT = VT.getVectorElementType();
31072 unsigned SVTBits = SVT.getSizeInBits();
31073 unsigned NumElems = VT.getVectorNumElements();
31074
31075 APInt UndefElts;
31076 SmallVector<APInt> EltBits;
31077 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
31078 APInt One(SVTBits, 1);
31079 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
31080 for (unsigned I = 0; I != NumElems; ++I) {
31081 if (UndefElts[I] || EltBits[I].uge(SVTBits))
31082 continue;
31083 uint64_t ShAmt = EltBits[I].getZExtValue();
31084 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
31085 }
31086 return DAG.getBuildVector(VT, dl, Elts);
31087 }
31088
31089 // If the target doesn't support variable shifts, use either FP conversion
31090 // or integer multiplication to avoid shifting each element individually.
31091 if (VT == MVT::v4i32) {
31092 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
31093 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
31094 DAG.getConstant(0x3f800000U, dl, VT));
31095 Amt = DAG.getBitcast(MVT::v4f32, Amt);
31096 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
31097 }
31098
31099 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
31100 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
31101 SDValue Z = DAG.getConstant(0, dl, VT);
31102 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
31103 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
31104 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
31105 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
31106 if (Subtarget.hasSSE41())
31107 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31108 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
31109 }
31110
31111 return SDValue();
31112}
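
For v4i32, convertShiftLeftToScale synthesizes 2^amt by writing the biased exponent directly, i.e. (amt << 23) + 0x3f800000, bitcasting to float and converting back to integer, so that a later multiply performs the left shift. A standalone scalar sketch follows; it is illustrative only, assumes IEEE-754 binary32 floats, and uses an unsigned conversion where the DAG code uses FP_TO_SINT.

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of convertShiftLeftToScale for an i32 lane: place (amt + 127)
// in a float's exponent field, i.e. bits = (amt << 23) + 0x3f800000, read the
// value back as a float (the bitcast), and convert to integer to obtain
// 2^amt.  Multiplying by that scale is then the same as shifting left.
static uint32_t shiftScale32(uint32_t amt) {
  uint32_t bits = (amt << 23) + 0x3f800000u; // exponent = amt + bias(127)
  float f;
  std::memcpy(&f, &bits, sizeof(f));         // the "bitcast" step
  return (uint32_t)f;                        // exactly 2^amt for amt in [0,31]
}

int main() {
  const uint32_t x = 0x12345678u;
  for (uint32_t amt = 0; amt < 32; ++amt)
    assert(x * shiftScale32(amt) == (x << amt)); // multiply == shift (mod 2^32)
  return 0;
}
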
31113
31114static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
31115 SelectionDAG &DAG) {
31116 MVT VT = Op.getSimpleValueType();
31117 SDLoc dl(Op);
31118 SDValue R = Op.getOperand(0);
31119 SDValue Amt = Op.getOperand(1);
31120 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31121 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31122
31123 unsigned Opc = Op.getOpcode();
31124 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
31125 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
31126
31127   assert(VT.isVector() && "Custom lowering only for vector shifts!");
31128   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
31129
31130 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
31131 return V;
31132
31133 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
31134 return V;
31135
31136 if (supportedVectorVarShift(VT, Subtarget, Opc))
31137 return Op;
31138
31139 // i64 vector arithmetic shift can be emulated with the transform:
31140 // M = lshr(SIGN_MASK, Amt)
31141 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
31142 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
31143 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
31144 Opc == ISD::SRA) {
31145 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
31146 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
31147 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31148 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
31149 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
31150 return R;
31151 }
31152
31153 // XOP has 128-bit variable logical/arithmetic shifts.
31154 // +ve/-ve Amt = shift left/right.
31155 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
31156 VT == MVT::v8i16 || VT == MVT::v16i8)) {
31157 if (Opc == ISD::SRL || Opc == ISD::SRA) {
31158 SDValue Zero = DAG.getConstant(0, dl, VT);
31159 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
31160 }
31161 if (Opc == ISD::SHL || Opc == ISD::SRL)
31162 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
31163 if (Opc == ISD::SRA)
31164 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
31165 }
31166
31167 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
31168 // shifts per-lane and then shuffle the partial results back together.
31169 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
31170 // Splat the shift amounts so the scalar shifts above will catch it.
31171 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
31172 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
31173 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
31174 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
31175 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
31176 }
31177
31178 // If possible, lower this shift as a sequence of two shifts by
31179 // constant plus a BLENDing shuffle instead of scalarizing it.
31180 // Example:
31181 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
31182 //
31183 // Could be rewritten as:
31184 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
31185 //
31186 // The advantage is that the two shifts from the example would be
31187 // lowered as X86ISD::VSRLI nodes in parallel before blending.
31188 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
31189 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31190 SDValue Amt1, Amt2;
31191 unsigned NumElts = VT.getVectorNumElements();
31192 SmallVector<int, 8> ShuffleMask;
31193 for (unsigned i = 0; i != NumElts; ++i) {
31194 SDValue A = Amt->getOperand(i);
31195 if (A.isUndef()) {
31196 ShuffleMask.push_back(SM_SentinelUndef);
31197 continue;
31198 }
31199 if (!Amt1 || Amt1 == A) {
31200 ShuffleMask.push_back(i);
31201 Amt1 = A;
31202 continue;
31203 }
31204 if (!Amt2 || Amt2 == A) {
31205 ShuffleMask.push_back(i + NumElts);
31206 Amt2 = A;
31207 continue;
31208 }
31209 break;
31210 }
31211
31212 // Only perform this blend if we can perform it without loading a mask.
31213 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
31214 (VT != MVT::v16i16 ||
31215 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
31216 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
31217 canWidenShuffleElements(ShuffleMask))) {
31218 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
31219 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
31220 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
31221 Cst2->getAPIntValue().ult(EltSizeInBits)) {
31222 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31223 Cst1->getZExtValue(), DAG);
31224 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31225 Cst2->getZExtValue(), DAG);
31226 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
31227 }
31228 }
31229 }
31230
31231 // If possible, lower this packed shift into a vector multiply instead of
31232 // expanding it into a sequence of scalar shifts.
31233 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
31234 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
31235 Subtarget.canExtendTo512BW())))
31236 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
31237 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
31238
31239 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
31240 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
31241 if (Opc == ISD::SRL && ConstantAmt &&
31242 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31243 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31244 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31245 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31246 SDValue Zero = DAG.getConstant(0, dl, VT);
31247 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
31248 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
31249 return DAG.getSelect(dl, VT, ZAmt, R, Res);
31250 }
31251 }
31252
31253 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
31254 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
31255 // TODO: Special case handling for shift by 0/1, really we can afford either
31256 // of these cases in pre-SSE41/XOP/AVX512 but not both.
31257 if (Opc == ISD::SRA && ConstantAmt &&
31258 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
31259 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
31260 !Subtarget.hasAVX512()) ||
31261 DAG.isKnownNeverZero(Amt))) {
31262 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31263 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31264 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31265 SDValue Amt0 =
31266 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
31267 SDValue Amt1 =
31268 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
31269 SDValue Sra1 =
31270 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
31271 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
31272 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
31273 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
31274 }
31275 }
31276
31277 // v4i32 Non Uniform Shifts.
31278 // If the shift amount is constant we can shift each lane using the SSE2
31279 // immediate shifts, else we need to zero-extend each lane to the lower i64
31280 // and shift using the SSE2 variable shifts.
31281 // The separate results can then be blended together.
31282 if (VT == MVT::v4i32) {
31283 SDValue Amt0, Amt1, Amt2, Amt3;
31284 if (ConstantAmt) {
31285 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
31286 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
31287 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
31288 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
31289 } else {
31290 // The SSE2 shifts use the lower i64 as the same shift amount for
31291 // all lanes and the upper i64 is ignored. On AVX we're better off
31292 // just zero-extending, but for SSE just duplicating the top 16-bits is
31293 // cheaper and has the same effect for out of range values.
31294 if (Subtarget.hasAVX()) {
31295 SDValue Z = DAG.getConstant(0, dl, VT);
31296 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
31297 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
31298 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
31299 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
31300 } else {
31301 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
31302 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
31303 {4, 5, 6, 7, -1, -1, -1, -1});
31304 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
31305 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
31306 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
31307 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
31308 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
31309 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
31310 }
31311 }
31312
31313 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
31314 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
31315 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
31316 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
31317 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
31318
31319 // Merge the shifted lane results optimally with/without PBLENDW.
31320 // TODO - ideally shuffle combining would handle this.
31321 if (Subtarget.hasSSE41()) {
31322 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
31323 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
31324 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
31325 }
31326 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
31327 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
31328 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
31329 }
31330
31331 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
31332 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
31333 // make the existing SSE solution better.
31334 // NOTE: We honor preferred vector width before promoting to 512-bits.
31335 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
31336 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
31337 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
31338 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
31339 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
31340     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
31341            "Unexpected vector type");
31342 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
31343 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
31344 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31345 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
31346 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
31347 return DAG.getNode(ISD::TRUNCATE, dl, VT,
31348 DAG.getNode(Opc, dl, ExtVT, R, Amt));
31349 }
31350
31351 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
31352 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
31353 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
31354 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
31355 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
31356 !Subtarget.hasXOP()) {
31357 int NumElts = VT.getVectorNumElements();
31358 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
31359
31360 // Extend constant shift amount to vXi16 (it doesn't matter if the type
31361 // isn't legal).
31362 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
31363 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
31364 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
31365 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
31366     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
31367            "Constant build vector expected");
31368
31369 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
31370 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
31371 : DAG.getZExtOrTrunc(R, dl, ExVT);
31372 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
31373 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
31374 return DAG.getZExtOrTrunc(R, dl, VT);
31375 }
31376
31377 SmallVector<SDValue, 16> LoAmt, HiAmt;
31378 for (int i = 0; i != NumElts; i += 16) {
31379 for (int j = 0; j != 8; ++j) {
31380 LoAmt.push_back(Amt.getOperand(i + j));
31381 HiAmt.push_back(Amt.getOperand(i + j + 8));
31382 }
31383 }
31384
31385 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
31386 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
31387 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31388
31389 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31390 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31391 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31392 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31393 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31394 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31395 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31396 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31397 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31398 }
31399
31400 if (VT == MVT::v16i8 ||
31401 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31402 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31403 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
31404
31405 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31406 if (VT.is512BitVector()) {
31407 // On AVX512BW targets we make use of the fact that VSELECT lowers
31408 // to a masked blend which selects bytes based just on the sign bit
31409 // extracted to a mask.
31410 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
31411 V0 = DAG.getBitcast(VT, V0);
31412 V1 = DAG.getBitcast(VT, V1);
31413 Sel = DAG.getBitcast(VT, Sel);
31414 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31415 ISD::SETGT);
31416 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31417 } else if (Subtarget.hasSSE41()) {
31418 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31419 // on the sign bit.
31420 V0 = DAG.getBitcast(VT, V0);
31421 V1 = DAG.getBitcast(VT, V1);
31422 Sel = DAG.getBitcast(VT, Sel);
31423 return DAG.getBitcast(SelVT,
31424 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31425 }
31426 // On pre-SSE41 targets we test for the sign bit by comparing to
31427 // zero - a negative value will set all bits of the lanes to true
31428 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31429 SDValue Z = DAG.getConstant(0, dl, SelVT);
31430 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31431 return DAG.getSelect(dl, SelVT, C, V0, V1);
31432 };
31433
31434 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31435 // We can safely do this using i16 shifts as we're only interested in
31436 // the 3 lower bits of each byte.
31437 Amt = DAG.getBitcast(ExtVT, Amt);
31438 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31439 Amt = DAG.getBitcast(VT, Amt);
31440
31441 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31442 // r = VSELECT(r, shift(r, 4), a);
31443 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31444 R = SignBitSelect(VT, Amt, M, R);
31445
31446 // a += a
31447 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31448
31449 // r = VSELECT(r, shift(r, 2), a);
31450 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31451 R = SignBitSelect(VT, Amt, M, R);
31452
31453 // a += a
31454 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31455
31456 // return VSELECT(r, shift(r, 1), a);
31457 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31458 R = SignBitSelect(VT, Amt, M, R);
31459 return R;
31460 }
31461
31462 if (Opc == ISD::SRA) {
31463 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31464 // so we can correctly sign extend. We don't care what happens to the
31465 // lower byte.
31466 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31467 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31468 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31469 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31470 ALo = DAG.getBitcast(ExtVT, ALo);
31471 AHi = DAG.getBitcast(ExtVT, AHi);
31472 RLo = DAG.getBitcast(ExtVT, RLo);
31473 RHi = DAG.getBitcast(ExtVT, RHi);
31474
31475 // r = VSELECT(r, shift(r, 4), a);
31476 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31477 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31478 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31479 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31480
31481 // a += a
31482 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31483 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31484
31485 // r = VSELECT(r, shift(r, 2), a);
31486 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31487 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31488 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31489 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31490
31491 // a += a
31492 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31493 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31494
31495 // r = VSELECT(r, shift(r, 1), a);
31496 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31497 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31498 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31499 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31500
31501 // Logical shift the result back to the lower byte, leaving a zero upper
31502 // byte meaning that we can safely pack with PACKUSWB.
31503 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31504 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31505 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31506 }
31507 }
31508
31509 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31510 MVT ExtVT = MVT::v8i32;
31511 SDValue Z = DAG.getConstant(0, dl, VT);
31512 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31513 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31514 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31515 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31516 ALo = DAG.getBitcast(ExtVT, ALo);
31517 AHi = DAG.getBitcast(ExtVT, AHi);
31518 RLo = DAG.getBitcast(ExtVT, RLo);
31519 RHi = DAG.getBitcast(ExtVT, RHi);
31520 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31521 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31522 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31523 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31524 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31525 }
31526
31527 if (VT == MVT::v8i16) {
31528 // If we have a constant shift amount, the non-SSE41 path is best as
31529 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31530 bool UseSSE41 = Subtarget.hasSSE41() &&
31531 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31532
31533 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31534 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31535 // the sign bit.
31536 if (UseSSE41) {
31537 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31538 V0 = DAG.getBitcast(ExtVT, V0);
31539 V1 = DAG.getBitcast(ExtVT, V1);
31540 Sel = DAG.getBitcast(ExtVT, Sel);
31541 return DAG.getBitcast(
31542 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31543 }
31544 // On pre-SSE41 targets we splat the sign bit - a negative value will
31545 // set all bits of the lanes to true and VSELECT uses that in
31546 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31547 SDValue C =
31548 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31549 return DAG.getSelect(dl, VT, C, V0, V1);
31550 };
31551
31552 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31553 if (UseSSE41) {
31554 // On SSE41 targets we need to replicate the shift mask in both
31555 // bytes for PBLENDVB.
31556 Amt = DAG.getNode(
31557 ISD::OR, dl, VT,
31558 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31559 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31560 } else {
31561 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31562 }
31563
31564 // r = VSELECT(r, shift(r, 8), a);
31565 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31566 R = SignBitSelect(Amt, M, R);
31567
31568 // a += a
31569 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31570
31571 // r = VSELECT(r, shift(r, 4), a);
31572 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31573 R = SignBitSelect(Amt, M, R);
31574
31575 // a += a
31576 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31577
31578 // r = VSELECT(r, shift(r, 2), a);
31579 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31580 R = SignBitSelect(Amt, M, R);
31581
31582 // a += a
31583 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31584
31585 // return VSELECT(r, shift(r, 1), a);
31586 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31587 R = SignBitSelect(Amt, M, R);
31588 return R;
31589 }
31590
31591 // Decompose 256-bit shifts into 128-bit shifts.
31592 if (VT.is256BitVector())
31593 return splitVectorIntBinary(Op, DAG);
31594
31595 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31596 return splitVectorIntBinary(Op, DAG);
31597
31598 return SDValue();
31599}
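
The non-AVX512 vXi8/vXi16 fallback in LowerShift above positions the shift amount's low bits at the lane's sign bit and then applies conditional shifts by 4, 2 and 1 (or 8, 4, 2, 1 for i16), doubling the amount between steps so the next bit becomes the selector for the sign-bit blend. A standalone scalar model of that select ladder for one byte (illustrative only, not part of the file):

#include <cassert>
#include <cstdint>

// Scalar model of the per-byte variable shift: move the amount's three
// meaningful bits to the top, then conditionally shift by 4, 2 and 1,
// selecting on the current top bit (the vector code uses sign-bit blends
// such as PBLENDVB for this, and "a += a" to expose the next bit).
static uint8_t shl8SelectLadder(uint8_t r, uint8_t amt) {
  uint8_t a = (uint8_t)(amt << 5);          // bits [2:0] -> bits [7:5]
  const unsigned steps[] = {4, 2, 1};
  for (unsigned step : steps) {
    if (a & 0x80)                           // "sign bit" select
      r = (uint8_t)(r << step);
    a = (uint8_t)(a + a);                   // expose the next amount bit
  }
  return r;
}

int main() {
  for (int r = 0; r < 256; ++r)
    for (int amt = 0; amt < 8; ++amt)
      assert(shl8SelectLadder((uint8_t)r, (uint8_t)amt) == (uint8_t)(r << amt));
  return 0;
}
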
31600
31601static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31602 SelectionDAG &DAG) {
31603 MVT VT = Op.getSimpleValueType();
31604   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31605          "Unexpected funnel shift opcode!");
31606
31607 SDLoc DL(Op);
31608 SDValue Op0 = Op.getOperand(0);
31609 SDValue Op1 = Op.getOperand(1);
31610 SDValue Amt = Op.getOperand(2);
31611 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31612 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31613
31614 if (VT.isVector()) {
31615 APInt APIntShiftAmt;
31616 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31617
31618 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31619 if (IsFSHR)
31620 std::swap(Op0, Op1);
31621
31622 if (IsCstSplat) {
31623 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31624 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31625 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31626 {Op0, Op1, Imm}, DAG, Subtarget);
31627 }
31628 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31629 {Op0, Op1, Amt}, DAG, Subtarget);
31630 }
31631     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31632             VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31633             VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31634            "Unexpected funnel shift type!");
31635
31636 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31637 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31638 if (IsCstSplat)
31639 return SDValue();
31640
31641 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31642 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31643 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31644
31645 // Constant vXi16 funnel shifts can be efficiently handled by default.
31646 if (IsCst && EltSizeInBits == 16)
31647 return SDValue();
31648
31649 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31650 unsigned NumElts = VT.getVectorNumElements();
31651 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31652 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31653
31654 // Split 256-bit integers on XOP/pre-AVX2 targets.
31655 // Split 512-bit integers on non 512-bit BWI targets.
31656 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31657 !Subtarget.hasAVX2())) ||
31658 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31659 EltSizeInBits < 32)) {
31660 // Pre-mask the amount modulo using the wider vector.
31661 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31662 return splitVectorOp(Op, DAG);
31663 }
31664
31665 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31666 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31667 int ScalarAmtIdx = -1;
31668 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31669 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31670 if (EltSizeInBits == 16)
31671 return SDValue();
31672
31673 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31674 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31675 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31676 ScalarAmtIdx, Subtarget, DAG);
31677 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31678 ScalarAmtIdx, Subtarget, DAG);
31679 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31680 }
31681 }
31682
31683 MVT WideSVT = MVT::getIntegerVT(
31684 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31685 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31686
31687 // If per-element shifts are legal, fallback to generic expansion.
31688 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31689 return SDValue();
31690
31691 // Attempt to fold as:
31692 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31693 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31694 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31695 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31696 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31697 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31698 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31699 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31700 EltSizeInBits, DAG);
31701 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31702 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31703 if (!IsFSHR)
31704 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31705 EltSizeInBits, DAG);
31706 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31707 }
31708
31709 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31710 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31711 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31712 SDValue Z = DAG.getConstant(0, DL, VT);
31713 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31714 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31715 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31716 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31717 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31718 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31719 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31720 }
31721
31722 // Fallback to generic expansion.
31723 return SDValue();
31724 }
31725 assert(
31726 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31727 "Unexpected funnel shift type!");
31728
31729 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31730 bool OptForSize = DAG.shouldOptForSize();
31731 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31732
31733 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31734 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31735 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31736 !isa<ConstantSDNode>(Amt)) {
31737 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31738 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31739 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31740 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31741 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31742 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31743 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31744 if (IsFSHR) {
31745 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31746 } else {
31747 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31748 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31749 }
31750 return DAG.getZExtOrTrunc(Res, DL, VT);
31751 }
31752
31753 if (VT == MVT::i8 || ExpandFunnel)
31754 return SDValue();
31755
31756 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31757 if (VT == MVT::i16) {
31758 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31759 DAG.getConstant(15, DL, Amt.getValueType()));
31760 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31761 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31762 }
31763
31764 return Op;
31765}
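// Illustrative sketch: a minimal scalar model of the widening expansion used by
// the vector path above ((aext(x) << bw | zext(y)) shifted by z mod bw), shown
// for 8-bit elements widened into a wider integer; fshl8/fshr8 are hypothetical
// helper names.
#include <cstdint>

static uint8_t fshl8(uint8_t X, uint8_t Y, unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;     // (aext(x) << bw) | zext(y)
  return (uint8_t)((Wide << (Z & 7)) >> 8);   // << (z & (bw-1)), then take the high half
}

static uint8_t fshr8(uint8_t X, uint8_t Y, unsigned Z) {
  unsigned Wide = ((unsigned)X << 8) | Y;
  return (uint8_t)(Wide >> (Z & 7));          // >> (z & (bw-1)), low half is the result
}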
31766
31767static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31768 SelectionDAG &DAG) {
31769 MVT VT = Op.getSimpleValueType();
31770 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31771
31772 SDLoc DL(Op);
31773 SDValue R = Op.getOperand(0);
31774 SDValue Amt = Op.getOperand(1);
31775 unsigned Opcode = Op.getOpcode();
31776 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31777 int NumElts = VT.getVectorNumElements();
31778 bool IsROTL = Opcode == ISD::ROTL;
31779
31780 // Check for constant splat rotation amount.
31781 APInt CstSplatValue;
31782 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31783
31784 // Check for splat rotate by zero.
31785 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31786 return R;
31787
31788 // AVX512 implicitly uses modulo rotation amounts.
31789 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
31790 // Attempt to rotate by immediate.
31791 if (IsCstSplat) {
31792 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31793 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31794 return DAG.getNode(RotOpc, DL, VT, R,
31795 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31796 }
31797
31798 // Else, fall-back on VPROLV/VPRORV.
31799 return Op;
31800 }
31801
31802 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31803 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31804 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31805 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31806 }
31807
31808 SDValue Z = DAG.getConstant(0, DL, VT);
31809
31810 if (!IsROTL) {
31811 // If the ISD::ROTR amount is constant, it's always better to convert to
31812 // ISD::ROTL.
31813 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31814 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31815
31816 // XOP targets always prefer ISD::ROTL.
31817 if (Subtarget.hasXOP())
31818 return DAG.getNode(ISD::ROTL, DL, VT, R,
31819 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31820 }
31821
31822 // Split 256-bit integers on XOP/pre-AVX2 targets.
31823 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31824 return splitVectorIntBinary(Op, DAG);
31825
31826 // XOP has 128-bit vector variable + immediate rotates.
31827 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31828 // XOP implicitly uses modulo rotation amounts.
31829 if (Subtarget.hasXOP()) {
31830 assert(IsROTL && "Only ROTL expected");
31831 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31832
31833 // Attempt to rotate by immediate.
31834 if (IsCstSplat) {
31835 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31836 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31837 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31838 }
31839
31840 // Use general rotate by variable (per-element).
31841 return Op;
31842 }
31843
31844 // Rotate by a uniform constant - expand back to shifts.
31845 if (IsCstSplat)
31846 return SDValue();
31847
31848 // Split 512-bit integers on non 512-bit BWI targets.
31849 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31850 return splitVectorIntBinary(Op, DAG);
31851
31852 assert(
31853 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31854 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31855 Subtarget.hasAVX2()) ||
31856 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31857 "Only vXi32/vXi16/vXi8 vector rotates supported");
31858
31859 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31860 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31861
31862 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31863 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31864
31865 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31866 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31867 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31868 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31869 int BaseRotAmtIdx = -1;
31870 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31871 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31872 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31873 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31874 }
31875 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31876 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31877 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31878 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31879 BaseRotAmtIdx, Subtarget, DAG);
31880 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31881 BaseRotAmtIdx, Subtarget, DAG);
31882 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31883 }
31884 }
31885
31886 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31887 // the amount bit.
31888 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31889 if (EltSizeInBits == 8) {
31890 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31891 MVT WideVT =
31892 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31893 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31894
31895 // Attempt to fold as:
31896 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31897 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31898 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31899 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31901 // If we're rotating by a constant, just use default promotion.
31901 if (IsConstAmt)
31902 return SDValue();
31903 // See if we can perform this by widening to vXi16 or vXi32.
31904 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31905 R = DAG.getNode(
31906 ISD::OR, DL, WideVT, R,
31907 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31908 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31909 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31910 if (IsROTL)
31911 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31912 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31913 }
31914
31915 // Attempt to fold as unpack(x,x) << zext(y):
31916 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31917 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31918 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31919 // See if we can perform this by unpacking to lo/hi vXi16.
31920 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31921 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31922 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31923 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31924 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31925 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31926 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31927 }
31928 assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
31929
31930 // We don't need ModuloAmt here as we just peek at individual bits.
31931 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31932 if (Subtarget.hasSSE41()) {
31933 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31934 // on the sign bit.
31935 V0 = DAG.getBitcast(VT, V0);
31936 V1 = DAG.getBitcast(VT, V1);
31937 Sel = DAG.getBitcast(VT, Sel);
31938 return DAG.getBitcast(SelVT,
31939 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31940 }
31941 // On pre-SSE41 targets we test for the sign bit by comparing to
31942 // zero - a negative value will set all bits of the lanes to true
31943 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31944 SDValue Z = DAG.getConstant(0, DL, SelVT);
31945 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31946 return DAG.getSelect(DL, SelVT, C, V0, V1);
31947 };
31948
31949 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31950 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31951 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31952 IsROTL = true;
31953 }
31954
31955 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31956 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31957
31958 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31959 // We can safely do this using i16 shifts as we're only interested in
31960 // the 3 lower bits of each byte.
31961 Amt = DAG.getBitcast(ExtVT, Amt);
31962 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31963 Amt = DAG.getBitcast(VT, Amt);
31964
31965 // r = VSELECT(r, rot(r, 4), a);
31966 SDValue M;
31967 M = DAG.getNode(
31968 ISD::OR, DL, VT,
31969 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31970 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31971 R = SignBitSelect(VT, Amt, M, R);
31972
31973 // a += a
31974 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31975
31976 // r = VSELECT(r, rot(r, 2), a);
31977 M = DAG.getNode(
31978 ISD::OR, DL, VT,
31979 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31980 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31981 R = SignBitSelect(VT, Amt, M, R);
31982
31983 // a += a
31984 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31985
31986 // return VSELECT(r, rot(r, 1), a);
31987 M = DAG.getNode(
31988 ISD::OR, DL, VT,
31989 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31990 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31991 return SignBitSelect(VT, Amt, M, R);
31992 }
31993
31994 bool IsSplatAmt = DAG.isSplatValue(Amt);
31995 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31996 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31997 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31998
31999 // Fallback for splats + all supported variable shifts.
32000 // Fallback for non-constant amounts on AVX2 vXi16 as well.
32001 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
32002 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32003 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
32004 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
32005 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
32006 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
32007 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
32008 }
32009
32010 // Everything below assumes ISD::ROTL.
32011 if (!IsROTL) {
32012 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32013 IsROTL = true;
32014 }
32015
32016 // ISD::ROT* uses modulo rotate amounts.
32017 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32018
32019 assert(IsROTL && "Only ROTL supported");
32020
32021 // As with shifts, attempt to convert the rotation amount to a multiplication
32022 // factor; otherwise fall back to general expansion.
32023 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
32024 if (!Scale)
32025 return SDValue();
32026
32027 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
32028 if (EltSizeInBits == 16) {
32029 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
32030 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
32031 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32032 }
32033
32034 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
32035 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
32036 // that can then be OR'd with the lower 32-bits.
32037 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
32038 static const int OddMask[] = {1, -1, 3, -1};
32039 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
32040 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
32041
32042 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32043 DAG.getBitcast(MVT::v2i64, R),
32044 DAG.getBitcast(MVT::v2i64, Scale));
32045 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32046 DAG.getBitcast(MVT::v2i64, R13),
32047 DAG.getBitcast(MVT::v2i64, Scale13));
32048 Res02 = DAG.getBitcast(VT, Res02);
32049 Res13 = DAG.getBitcast(VT, Res13);
32050
32051 return DAG.getNode(ISD::OR, DL, VT,
32052 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
32053 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
32054}
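// Illustrative sketch: a minimal scalar model of the rotate-by-multiply trick the
// v4i32 tail above relies on (PMULUDQ yields the 64-bit product; its high half
// holds the wrapped-around bits). The helper name rotl32ByMul is hypothetical and
// it assumes the amount is taken modulo 32, as the AND with AmtMask guarantees.
#include <cstdint>

static uint32_t rotl32ByMul(uint32_t X, unsigned Amt) {
  uint64_t Product = (uint64_t)X << (Amt & 31);  // X * Scale with Scale = 1u << Amt
  uint32_t Lo = (uint32_t)Product;               // in-lane bits (low 32 bits of the product)
  uint32_t Hi = (uint32_t)(Product >> 32);       // wrapped bits (high 32 bits)
  return Lo | Hi;                                // OR the halves -> rotl(X, Amt)
}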
32055
32056/// Returns true if the operand type is exactly twice the native width, and
32057/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
32058/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
32059/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
32060bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
32061 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
32062
32063 if (OpWidth == 64)
32064 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
32065 if (OpWidth == 128)
32066 return Subtarget.canUseCMPXCHG16B();
32067
32068 return false;
32069}
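// Illustrative sketch: hedged examples of the atomic widths this predicate is
// concerned with; whether they actually expand to cmpxchg8b/cmpxchg16b also
// depends on the CX8/CX16 subtarget checks above. __uint128_t is a GCC/Clang
// extension type.
#include <atomic>
#include <cstdint>

std::atomic<uint64_t> A64;     // twice the native width on a 32-bit target
std::atomic<__uint128_t> A128; // twice the native width on x86-64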
32070
32071TargetLoweringBase::AtomicExpansionKind
32072X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
32073 Type *MemType = SI->getValueOperand()->getType();
32074
32075 bool NoImplicitFloatOps =
32076 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32077 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32078 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32079 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32080 return AtomicExpansionKind::None;
32081
32082 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
32083 : AtomicExpansionKind::None;
32084}
32085
32086// Note: this turns large loads into lock cmpxchg8b/16b.
32087// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
32088TargetLowering::AtomicExpansionKind
32089X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
32090 Type *MemType = LI->getType();
32091
32092 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
32093 // can use movq to do the load. If we have X87 we can load into an 80-bit
32094 // X87 register and store it to a stack temporary.
32095 bool NoImplicitFloatOps =
32096 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32097 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32098 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32099 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32100 return AtomicExpansionKind::None;
32101
32102 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32103 : AtomicExpansionKind::None;
32104}
32105
32106enum BitTestKind : unsigned {
32107 UndefBit,
32108 ConstantBit,
32109 NotConstantBit,
32110 ShiftBit,
32111 NotShiftBit
32112};
32113
32114static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
32115 using namespace llvm::PatternMatch;
32116 BitTestKind BTK = UndefBit;
32117 auto *C = dyn_cast<ConstantInt>(V);
32118 if (C) {
32119 // Check if V is a power of 2 or NOT power of 2.
32120 if (isPowerOf2_64(C->getZExtValue()))
32121 BTK = ConstantBit;
32122 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
32123 BTK = NotConstantBit;
32124 return {V, BTK};
32125 }
32126
32127 // Check if V is some power of 2 pattern known to be non-zero
32128 auto *I = dyn_cast<Instruction>(V);
32129 if (I) {
32130 bool Not = false;
32131 // Check if we have a NOT
32132 Value *PeekI;
32133 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
32134 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
32135 Not = true;
32136 I = dyn_cast<Instruction>(PeekI);
32137
32138 // If I is constant, it will fold and we can evaluate later. If it's an
32139 // argument or something of that nature, we can't analyze.
32140 if (I == nullptr)
32141 return {nullptr, UndefBit};
32142 }
32143 // We can only use 1 << X without more sophisticated analysis. C << X where
32144 // C is a power of 2 but not 1 can result in zero which cannot be translated
32145 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
32146 if (I->getOpcode() == Instruction::Shl) {
32147 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
32148 // -X` and some other provable power of 2 patterns that we can use CTZ on
32149 // may be profitable.
32150 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
32151 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
32152 // be provably a non-zero power of 2.
32153 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
32154 // transformable to bittest.
32155 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
32156 if (!ShiftVal)
32157 return {nullptr, UndefBit};
32158 if (ShiftVal->equalsInt(1))
32159 BTK = Not ? NotShiftBit : ShiftBit;
32160
32161 if (BTK == UndefBit)
32162 return {nullptr, UndefBit};
32163
32164 Value *BitV = I->getOperand(1);
32165
32166 Value *AndOp;
32167 const APInt *AndC;
32168 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
32169 // Read past a shift-mask instruction to find the count.
32170 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
32171 BitV = AndOp;
32172 }
32173 return {BitV, BTK};
32174 }
32175 }
32176 return {nullptr, UndefBit};
32177}
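// Illustrative sketch: source-level shapes whose atomicrmw value operands fall
// into the BitTestKind buckets classified above. This is only a model of the
// patterns; the classification itself runs on the IR, not on C++ source.
#include <atomic>
#include <cstdint>

void bitPatternExamples(std::atomic<uint32_t> &Flags, unsigned N) {
  Flags.fetch_or(0x10u);              // ConstantBit: a power-of-two constant
  Flags.fetch_and(~0x10u);            // NotConstantBit: ~(power of two)
  Flags.fetch_or(1u << (N & 31));     // ShiftBit: 1 << X (the & 31 shift mask is read past)
  Flags.fetch_and(~(1u << (N & 31))); // NotShiftBit: ~(1 << X)
}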
32178
32179TargetLowering::AtomicExpansionKind
32180X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
32181 // If the atomicrmw's result isn't actually used, we can just add a "lock"
32182 // prefix to a normal instruction for these operations.
32183 if (AI->use_empty())
32184 return AtomicExpansionKind::None;
32185
32186 // If the atomicrmw's result is used by a single-bit AND, we may use a
32187 // bts/btr/btc instruction for these operations.
32188 // Note: InstCombinePass can cause a de-optimization here. It replaces the
32189 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
32190 // (depending on CC). This pattern can only use bts/btr/btc but we don't
32191 // detect it.
32192 Instruction *I = AI->user_back();
32193 auto BitChange = FindSingleBitChange(AI->getValOperand());
32194 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
32195 I->getOpcode() != Instruction::And ||
32196 AI->getType()->getPrimitiveSizeInBits() == 8 ||
32197 AI->getParent() != I->getParent())
32198 return AtomicExpansionKind::CmpXChg;
32199
32200 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
32201
32202 // This is a redundant AND, it should get cleaned up elsewhere.
32203 if (AI == I->getOperand(OtherIdx))
32204 return AtomicExpansionKind::CmpXChg;
32205
32206 // The following instruction must be an AND with a single bit.
32207 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
32208 auto *C1 = cast<ConstantInt>(AI->getValOperand());
32209 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
32210 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
32211 return AtomicExpansionKind::CmpXChg;
32212 }
32213 if (AI->getOperation() == AtomicRMWInst::And) {
32214 return ~C1->getValue() == C2->getValue()
32215 ? AtomicExpansionKind::BitTestIntrinsic
32216 : AtomicExpansionKind::CmpXChg;
32217 }
32218 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
32219 : AtomicExpansionKind::CmpXChg;
32220 }
32221
32222 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
32223
32224 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
32225 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
32226 return AtomicExpansionKind::CmpXChg;
32227
32228 assert(BitChange.first != nullptr && BitTested.first != nullptr);
32229
32230 // If shift amounts are not the same we can't use BitTestIntrinsic.
32231 if (BitChange.first != BitTested.first)
32232 return AtomicExpansionKind::CmpXChg;
32233
32234 // For an atomic AND, the RMW value must mask off all but one bit and the user
32235 // must test the one bit that is unset in the mask.
32236 if (AI->getOperation() == AtomicRMWInst::And)
32237 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
32238 ? AtomicExpansionKind::BitTestIntrinsic
32239 : AtomicExpansionKind::CmpXChg;
32240
32241 // For an atomic XOR/OR, the RMW value must set and the user must test the same bit.
32242 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
32243 ? AtomicExpansionKind::BitTestIntrinsic
32244 : AtomicExpansionKind::CmpXChg;
32245}
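// Illustrative sketch: a source-level shape that satisfies the BitTestIntrinsic
// criteria above - the atomicrmw result has a single AND user testing the same
// single bit. A sketch of the shape only; the final instruction selection still
// depends on the rest of the pipeline.
#include <atomic>
#include <cstdint>

bool testAndSetBit(std::atomic<uint32_t> &Flags, unsigned N) {
  uint32_t Bit = 1u << (N & 31);
  return (Flags.fetch_or(Bit) & Bit) != 0;  // OR sets the bit, the AND tests the same bit
}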
32246
32247void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
32248 IRBuilder<> Builder(AI);
32249 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32250 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
32251 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
32252 switch (AI->getOperation()) {
32253 default:
32254 llvm_unreachable("Unknown atomic operation");
32255 case AtomicRMWInst::Or:
32256 IID_C = Intrinsic::x86_atomic_bts;
32257 IID_I = Intrinsic::x86_atomic_bts_rm;
32258 break;
32259 case AtomicRMWInst::Xor:
32260 IID_C = Intrinsic::x86_atomic_btc;
32261 IID_I = Intrinsic::x86_atomic_btc_rm;
32262 break;
32263 case AtomicRMWInst::And:
32264 IID_C = Intrinsic::x86_atomic_btr;
32265 IID_I = Intrinsic::x86_atomic_btr_rm;
32266 break;
32267 }
32268 Instruction *I = AI->user_back();
32269 LLVMContext &Ctx = AI->getContext();
32270 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32271 Type::getInt8PtrTy(Ctx));
32272 Function *BitTest = nullptr;
32273 Value *Result = nullptr;
32274 auto BitTested = FindSingleBitChange(AI->getValOperand());
32275 assert(BitTested.first != nullptr);
32276
32277 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
32278 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
32279
32280 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
32281
32282 unsigned Imm = llvm::countr_zero(C->getZExtValue());
32283 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
32284 } else {
32285 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
32286
32287 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
32288
32289 Value *SI = BitTested.first;
32290 assert(SI != nullptr);
32291
32292 // BT{S|R|C} on a memory operand doesn't take the bit position modulo the
32293 // operand width, so we need to mask it.
32294 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
32295 Value *BitPos =
32296 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
32297 // Todo(1): In many cases it may be provable that SI is less than
32298 // ShiftBits, in which case this mask is unnecessary.
32299 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
32300 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
32301 // favor of just a raw BT{S|R|C}.
32302
32303 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
32304 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
32305
32306 // If the result is only used for zero/non-zero status then we don't need to
32307 // shift the value back. Otherwise do so.
32308 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
32309 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
32310 if (ICmp->isEquality()) {
32311 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
32312 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
32313 if (C0 || C1) {
32314 assert(C0 == nullptr || C1 == nullptr);
32315 if ((C0 ? C0 : C1)->isZero())
32316 continue;
32317 }
32318 }
32319 }
32320 Result = Builder.CreateShl(Result, BitPos);
32321 break;
32322 }
32323 }
32324
32325 I->replaceAllUsesWith(Result);
32326 I->eraseFromParent();
32327 AI->eraseFromParent();
32328}
32329
32330static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32331 using namespace llvm::PatternMatch;
32332 if (!AI->hasOneUse())
32333 return false;
32334
32335 Value *Op = AI->getOperand(1);
32336 ICmpInst::Predicate Pred;
32337 Instruction *I = AI->user_back();
32338 AtomicRMWInst::BinOp Opc = AI->getOperation();
32339 if (Opc == AtomicRMWInst::Add) {
32340 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32341 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32342 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32343 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32344 return Pred == CmpInst::ICMP_SLT;
32345 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32346 return Pred == CmpInst::ICMP_SGT;
32347 }
32348 return false;
32349 }
32350 if (Opc == AtomicRMWInst::Sub) {
32351 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32352 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32353 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32354 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32355 return Pred == CmpInst::ICMP_SLT;
32356 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32357 return Pred == CmpInst::ICMP_SGT;
32358 }
32359 return false;
32360 }
32361 if ((Opc == AtomicRMWInst::Or &&
32362 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32363 (Opc == AtomicRMWInst::And &&
32364 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32365 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32366 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32367 Pred == CmpInst::ICMP_SLT;
32368 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32369 return Pred == CmpInst::ICMP_SGT;
32370 return false;
32371 }
32372 if (Opc == AtomicRMWInst::Xor) {
32373 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32374 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32375 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32376 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32377 return Pred == CmpInst::ICMP_SLT;
32378 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32379 return Pred == CmpInst::ICMP_SGT;
32380 }
32381 return false;
32382 }
32383
32384 return false;
32385}
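// Illustrative sketch: a source-level shape that matches the criteria above for
// the Sub case - the only use of the atomicrmw result is a compare whose answer
// is available from the flags of a lock-prefixed sub. A sketch of the shape only.
#include <atomic>
#include <cstdint>

bool releaseRef(std::atomic<uint32_t> &RefCount) {
  // fetch_sub(1) == 1 <=> the decremented value is zero.
  return RefCount.fetch_sub(1) == 1;
}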
32386
32387void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32388 AtomicRMWInst *AI) const {
32389 IRBuilder<> Builder(AI);
32390 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32391 Instruction *TempI = nullptr;
32392 LLVMContext &Ctx = AI->getContext();
32393 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32394 if (!ICI) {
32395 TempI = AI->user_back();
32396 assert(TempI->hasOneUse() && "Must have one use");
32397 ICI = cast<ICmpInst>(TempI->user_back());
32398 }
32399 X86::CondCode CC = X86::COND_INVALID;
32400 ICmpInst::Predicate Pred = ICI->getPredicate();
32401 switch (Pred) {
32402 default:
32403 llvm_unreachable("Not supported Pred");
32404 case CmpInst::ICMP_EQ:
32405 CC = X86::COND_E;
32406 break;
32407 case CmpInst::ICMP_NE:
32408 CC = X86::COND_NE;
32409 break;
32410 case CmpInst::ICMP_SLT:
32411 CC = X86::COND_S;
32412 break;
32413 case CmpInst::ICMP_SGT:
32414 CC = X86::COND_NS;
32415 break;
32416 }
32417 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32418 switch (AI->getOperation()) {
32419 default:
32420 llvm_unreachable("Unknown atomic operation");
32421 case AtomicRMWInst::Add:
32422 IID = Intrinsic::x86_atomic_add_cc;
32423 break;
32424 case AtomicRMWInst::Sub:
32425 IID = Intrinsic::x86_atomic_sub_cc;
32426 break;
32427 case AtomicRMWInst::Or:
32428 IID = Intrinsic::x86_atomic_or_cc;
32429 break;
32430 case AtomicRMWInst::And:
32431 IID = Intrinsic::x86_atomic_and_cc;
32432 break;
32433 case AtomicRMWInst::Xor:
32434 IID = Intrinsic::x86_atomic_xor_cc;
32435 break;
32436 }
32437 Function *CmpArith =
32438 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
32439 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32440 Type::getInt8PtrTy(Ctx));
32441 Value *Call = Builder.CreateCall(
32442 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32443 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32444 ICI->replaceAllUsesWith(Result);
32445 ICI->eraseFromParent();
32446 if (TempI)
32447 TempI->eraseFromParent();
32448 AI->eraseFromParent();
32449}
32450
32451TargetLowering::AtomicExpansionKind
32452X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32453 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32454 Type *MemType = AI->getType();
32455
32456 // If the operand is too big, we must see if cmpxchg8/16b is available
32457 // and default to library calls otherwise.
32458 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32459 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32460 : AtomicExpansionKind::None;
32461 }
32462
32463 AtomicRMWInst::BinOp Op = AI->getOperation();
32464 switch (Op) {
32465 case AtomicRMWInst::Xchg:
32466 return AtomicExpansionKind::None;
32467 case AtomicRMWInst::Add:
32468 case AtomicRMWInst::Sub:
32469 if (shouldExpandCmpArithRMWInIR(AI))
32470 return AtomicExpansionKind::CmpArithIntrinsic;
32471 // It's better to use xadd, xsub or xchg for these in other cases.
32472 return AtomicExpansionKind::None;
32473 case AtomicRMWInst::Or:
32474 case AtomicRMWInst::And:
32475 case AtomicRMWInst::Xor:
32476 if (shouldExpandCmpArithRMWInIR(AI))
32477 return AtomicExpansionKind::CmpArithIntrinsic;
32478 return shouldExpandLogicAtomicRMWInIR(AI);
32479 case AtomicRMWInst::Nand:
32480 case AtomicRMWInst::Max:
32481 case AtomicRMWInst::Min:
32482 case AtomicRMWInst::UMax:
32483 case AtomicRMWInst::UMin:
32484 case AtomicRMWInst::FAdd:
32485 case AtomicRMWInst::FSub:
32486 case AtomicRMWInst::FMax:
32487 case AtomicRMWInst::FMin:
32488 case AtomicRMWInst::UIncWrap:
32489 case AtomicRMWInst::UDecWrap:
32490 default:
32491 // These always require a non-trivial set of data operations on x86. We must
32492 // use a cmpxchg loop.
32493 return AtomicExpansionKind::CmpXChg;
32494 }
32495}
32496
32497LoadInst *
32498X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32499 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32500 Type *MemType = AI->getType();
32501 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32502 // there is no benefit in turning such RMWs into loads, and it is actually
32503 // harmful as it introduces an mfence.
32504 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32505 return nullptr;
32506
32507 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32508 // lowering available in lowerAtomicArith.
32509 // TODO: push more cases through this path.
32510 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32511 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32512 AI->use_empty())
32513 return nullptr;
32514
32515 IRBuilder<> Builder(AI);
32516 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32517 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32518 auto SSID = AI->getSyncScopeID();
32519 // We must restrict the ordering to avoid generating loads with Release or
32520 // ReleaseAcquire orderings.
32521 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32522
32523 // Before the load we need a fence. Here is an example lifted from
32524 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32525 // is required:
32526 // Thread 0:
32527 // x.store(1, relaxed);
32528 // r1 = y.fetch_add(0, release);
32529 // Thread 1:
32530 // y.fetch_add(42, acquire);
32531 // r2 = x.load(relaxed);
32532 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32533 // lowered to just a load without a fence. A mfence flushes the store buffer,
32534 // making the optimization clearly correct.
32535 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
32536 // clear otherwise; we might be able to be more aggressive on relaxed idempotent
32537 // rmw. In practice, they do not look useful, so we don't try to be
32538 // especially clever.
32539 if (SSID == SyncScope::SingleThread)
32540 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32541 // the IR level, so we must wrap it in an intrinsic.
32542 return nullptr;
32543
32544 if (!Subtarget.hasMFence())
32545 // FIXME: it might make sense to use a locked operation here but on a
32546 // different cache-line to prevent cache-line bouncing. In practice it
32547 // is probably a small win, and x86 processors without mfence are rare
32548 // enough that we do not bother.
32549 return nullptr;
32550
32551 Function *MFence =
32552 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32553 Builder.CreateCall(MFence, {});
32554
32555 // Finally we can emit the atomic load.
32556 LoadInst *Loaded = Builder.CreateAlignedLoad(
32557 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32558 Loaded->setAtomic(Order, SSID);
32559 AI->replaceAllUsesWith(Loaded);
32560 AI->eraseFromParent();
32561 return Loaded;
32562}
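// Illustrative sketch: the kind of idempotent atomicrmw this hook targets - the
// stored value is unchanged, so an mfence followed by a plain atomic load keeps
// the required ordering. A sketch of the shape only.
#include <atomic>
#include <cstdint>

uint32_t readWithRMWOrdering(std::atomic<uint32_t> &X) {
  return X.fetch_add(0, std::memory_order_seq_cst);  // idempotent rmw -> fence + load
}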
32563
32564bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
32565 if (!SI.isUnordered())
32566 return false;
32567 return ExperimentalUnorderedISEL;
32568}
32569bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
32570 if (!LI.isUnordered())
32571 return false;
32572 return ExperimentalUnorderedISEL;
32573}
32574
32575
32576/// Emit a locked operation on a stack location which does not change any
32577/// memory location, but does involve a lock prefix. Location is chosen to be
32578/// a) very likely accessed only by a single thread to minimize cache traffic,
32579/// and b) definitely dereferenceable. Returns the new Chain result.
32580static SDValue emitLockedStackOp(SelectionDAG &DAG,
32581 const X86Subtarget &Subtarget, SDValue Chain,
32582 const SDLoc &DL) {
32583 // Implementation notes:
32584 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32585 // operations issued by the current processor. As such, the location
32586 // referenced is not relevant for the ordering properties of the instruction.
32587 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32588 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32589 // 2) Using an immediate operand appears to be the best encoding choice
32590 // here since it doesn't require an extra register.
32591 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32592 // is small enough it might just be measurement noise.)
32593 // 4) When choosing offsets, there are several contributing factors:
32594 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32595 // line aligned stack object to improve this case.)
32596 // b) To minimize our chances of introducing a false dependence, we prefer
32597 // to offset the stack usage from TOS slightly.
32598 // c) To minimize concerns about cross thread stack usage - in particular,
32599 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32600 // captures state in the TOS frame and accesses it from many threads -
32601 // we want to use an offset such that the offset is in a distinct cache
32602 // line from the TOS frame.
32603 //
32604 // For a general discussion of the tradeoffs and benchmark results, see:
32605 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32606
32607 auto &MF = DAG.getMachineFunction();
32608 auto &TFL = *Subtarget.getFrameLowering();
32609 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32610
32611 if (Subtarget.is64Bit()) {
32612 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32613 SDValue Ops[] = {
32614 DAG.getRegister(X86::RSP, MVT::i64), // Base
32615 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32616 DAG.getRegister(0, MVT::i64), // Index
32617 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32618 DAG.getRegister(0, MVT::i16), // Segment.
32619 Zero,
32620 Chain};
32621 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32622 MVT::Other, Ops);
32623 return SDValue(Res, 1);
32624 }
32625
32626 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32627 SDValue Ops[] = {
32628 DAG.getRegister(X86::ESP, MVT::i32), // Base
32629 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32630 DAG.getRegister(0, MVT::i32), // Index
32631 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32632 DAG.getRegister(0, MVT::i16), // Segment.
32633 Zero,
32634 Chain
32635 };
32636 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32637 MVT::Other, Ops);
32638 return SDValue(Res, 1);
32639}
32640
32641static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32642 SelectionDAG &DAG) {
32643 SDLoc dl(Op);
32644 AtomicOrdering FenceOrdering =
32645 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32646 SyncScope::ID FenceSSID =
32647 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32648
32649 // The only fence that needs an instruction is a sequentially-consistent
32650 // cross-thread fence.
32651 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32652 FenceSSID == SyncScope::System) {
32653 if (Subtarget.hasMFence())
32654 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32655
32656 SDValue Chain = Op.getOperand(0);
32657 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32658 }
32659
32660 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32661 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32662}
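// Illustrative sketch: source-level fences and the branch of the lowering above
// each is expected to take; the exact instruction still depends on the subtarget.
#include <atomic>

void fenceExamples() {
  // Sequentially-consistent system fence: MFENCE, or the locked stack OR above
  // on subtargets without MFENCE.
  std::atomic_thread_fence(std::memory_order_seq_cst);
  // Acquire/release fences only constrain the compiler on x86 and take the
  // MEMBARRIER (no-op) path.
  std::atomic_thread_fence(std::memory_order_acquire);
}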
32663
32664static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32665 SelectionDAG &DAG) {
32666 MVT T = Op.getSimpleValueType();
32667 SDLoc DL(Op);
32668 unsigned Reg = 0;
32669 unsigned size = 0;
32670 switch(T.SimpleTy) {
32671 default: llvm_unreachable("Invalid value type!");
32672 case MVT::i8: Reg = X86::AL; size = 1; break;
32673 case MVT::i16: Reg = X86::AX; size = 2; break;
32674 case MVT::i32: Reg = X86::EAX; size = 4; break;
32675 case MVT::i64:
32676 assert(Subtarget.is64Bit() && "Node not type legal!");
32677 Reg = X86::RAX; size = 8;
32678 break;
32679 }
32680 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32681 Op.getOperand(2), SDValue());
32682 SDValue Ops[] = { cpIn.getValue(0),
32683 Op.getOperand(1),
32684 Op.getOperand(3),
32685 DAG.getTargetConstant(size, DL, MVT::i8),
32686 cpIn.getValue(1) };
32687 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32688 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32689 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32690 Ops, T, MMO);
32691
32692 SDValue cpOut =
32693 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32694 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32695 MVT::i32, cpOut.getValue(2));
32696 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32697
32698 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32699 cpOut, Success, EFLAGS.getValue(1));
32700}
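// Illustrative sketch: the kind of construct that reaches LowerCMP_SWAP - the
// expected value travels through the width's accumulator register (AL/AX/EAX/RAX)
// and success comes back as a flag. A sketch of the shape only.
#include <atomic>
#include <cstdint>

bool tryClaim(std::atomic<uint32_t> &Slot) {
  uint32_t Expected = 0;                              // copied into EAX for the i32 case
  return Slot.compare_exchange_strong(Expected, 1u);  // lock cmpxchg; ZF reports success
}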
32701
32702// Create MOVMSKB, taking into account whether we need to split for AVX1.
32703static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32704 const X86Subtarget &Subtarget) {
32705 MVT InVT = V.getSimpleValueType();
32706
32707 if (InVT == MVT::v64i8) {
32708 SDValue Lo, Hi;
32709 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32710 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32711 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32712 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32713 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32714 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32715 DAG.getConstant(32, DL, MVT::i8));
32716 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32717 }
32718 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32719 SDValue Lo, Hi;
32720 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32721 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32722 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32723 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32724 DAG.getConstant(16, DL, MVT::i8));
32725 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32726 }
32727
32728 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32729}
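// Worked example of the splitting above (illustration, assuming an AVX1-only
// target so v32i8 has no single MOVMSK): the low 16 bytes produce a 16-bit
// mask in bits [15:0] and the high 16 bytes produce a mask shifted into bits
// [31:16], so the OR yields the same i32 a native 256-bit MOVMSKB would give.
// The v64i8 case repeats this with a 32-bit shift into bits [63:32] of an i64.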
32730
32731static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32732 SelectionDAG &DAG) {
32733 SDValue Src = Op.getOperand(0);
32734 MVT SrcVT = Src.getSimpleValueType();
32735 MVT DstVT = Op.getSimpleValueType();
32736
32737 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32738 // half to v32i1 and concatenating the result.
32739 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32740 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32741 assert(Subtarget.hasBWI() && "Expected BWI target");
32742 SDLoc dl(Op);
32743 SDValue Lo, Hi;
32744 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32745 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32746 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32747 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32748 }
32749
32750 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32751 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32752 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32753 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32754 SDLoc DL(Op);
32755 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32756 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32757 return DAG.getZExtOrTrunc(V, DL, DstVT);
32758 }
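// Example of the MOVMSK path above (sketch): a v16i1 mask such as
// <1,0,1,1,...> is sign-extended to v16i8 lanes of 0xFF/0x00, PMOVMSKB then
// packs the sign bits into an i32 whose low 16 bits are the original mask
// bits, and the final zext/trunc produces the scalar integer destination.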
32759
32760 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32761 SrcVT == MVT::i64) && "Unexpected VT!");
32762
32763 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32764 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32765 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32766 // This conversion needs to be expanded.
32767 return SDValue();
32768
32769 SDLoc dl(Op);
32770 if (SrcVT.isVector()) {
32771 // Widen the input vector in the case of MVT::v2i32.
32772 // Example: from MVT::v2i32 to MVT::v4i32.
32773 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32774 SrcVT.getVectorNumElements() * 2);
32775 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32776 DAG.getUNDEF(SrcVT));
32777 } else {
32778 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32779 "Unexpected source type in LowerBITCAST");
32780 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32781 }
32782
32783 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32784 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32785
32786 if (DstVT == MVT::x86mmx)
32787 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32788
32789 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32790 DAG.getIntPtrConstant(0, dl));
32791}
32792
32793/// Compute the horizontal sum of bytes in V for the elements of VT.
32794///
32795/// Requires V to be a byte vector and VT to be an integer vector type with
32796/// wider elements than V's type. The width of the elements of VT determines
32797/// how many bytes of V are summed horizontally to produce each element of the
32798/// result.
32799static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32800 const X86Subtarget &Subtarget,
32801 SelectionDAG &DAG) {
32802 SDLoc DL(V);
32803 MVT ByteVecVT = V.getSimpleValueType();
32804 MVT EltVT = VT.getVectorElementType();
32805 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32806 "Expected value to have byte element type.");
32807 assert(EltVT != MVT::i8 &&
32808 "Horizontal byte sum only makes sense for wider elements!");
32809 unsigned VecSize = VT.getSizeInBits();
32810 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32811
32812 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32813 // i64 chunks, so it directly computes the pop count for v2i64 and v4i64.
32814 if (EltVT == MVT::i64) {
32815 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32816 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32817 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32818 return DAG.getBitcast(VT, V);
32819 }
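// Example for the i64 case above: each byte of V already holds a 0-8 pop
// count, and PSADBW against zero sums the 8 bytes of every 64-bit lane, so
// each i64 lane ends up holding the pop count of the corresponding original
// i64 element.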
32820
32821 if (EltVT == MVT::i32) {
32822 // We unpack the low half and high half into i32s interleaved with zeros so
32823 // that we can use PSADBW to horizontally sum them. The most useful part of
32824 // this is that it lines up the results of two PSADBW instructions to be
32825 // two v2i64 vectors which concatenated are the 4 population counts. We can
32826 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32827 SDValue Zeros = DAG.getConstant(0, DL, VT);
32828 SDValue V32 = DAG.getBitcast(VT, V);
32829 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32830 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32831
32832 // Do the horizontal sums into two v2i64s.
32833 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32834 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32835 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32836 DAG.getBitcast(ByteVecVT, Low), Zeros);
32837 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32838 DAG.getBitcast(ByteVecVT, High), Zeros);
32839
32840 // Merge them together.
32841 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32842 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32843 DAG.getBitcast(ShortVecVT, Low),
32844 DAG.getBitcast(ShortVecVT, High));
32845
32846 return DAG.getBitcast(VT, V);
32847 }
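// Data-flow sketch for the i32 case above, for a v4i32 input {e0,e1,e2,e3}
// whose bytes already hold per-byte pop counts: Low = {e0,0,e1,0} and
// High = {e2,0,e3,0}; PSADBW turns them into v2i64 {popcnt(e0),popcnt(e1)} and
// {popcnt(e2),popcnt(e3)}; PACKUS of the two (viewed as i16s) then packs the
// four small sums back into a single v4i32 {popcnt(e0)..popcnt(e3)}.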
32848
32849 // The only element type left is i16.
32850 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32851
32852 // To obtain the pop count for each i16 element, start from the pop counts of
32853 // the i8 elements: shift the i16s left by 8, sum as i8s, and then shift the
32854 // i16s right by 8. It is important to shift as i16s because an i8 vector
32855 // shift isn't directly supported.
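// Worked example (one i16 element whose low byte holds pop count pL and high
// byte holds pH): shifting left by 8 gives {low: 0, high: pL}; the byte-wise
// add gives {low: pL, high: pH + pL}; the final i16 shift right by 8 leaves
// pH + pL, the pop count of the whole 16-bit element. No byte-wise carries
// can occur because each count is at most 8.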
32856 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32857 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32858 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32859 DAG.getBitcast(ByteVecVT, V));
32860 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32861}
32862
32863static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32864 const X86Subtarget &Subtarget,
32865 SelectionDAG &DAG) {
32866 MVT VT = Op.getSimpleValueType();
32867 MVT EltVT = VT.getVectorElementType();
32868 int NumElts = VT.getVectorNumElements();
32869 (void)EltVT;
32870 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32871
32872 // Implement a lookup table in register by using an algorithm based on:
32873 // http://wm.ite.pl/articles/sse-popcount.html
32874 //
32875 // The general idea is that every nibble of each byte in the input vector is
32876 // an index into an in-register, pre-computed pop count table. We split the
32877 // input vector into two new ones: (1) a vector with only the shifted-right
32878 // higher nibbles of each byte and (2) a vector with the lower nibbles (the
32879 // higher ones masked out) of each byte. PSHUFB is used separately with both
32880 // to index the in-register table. Next, both are added and the result is an
32881 // i8 vector where each element contains the pop count for its input byte.
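// For instance, for input byte 0xE5 (binary 1110'0101): the high nibble 0xE
// indexes LUT[14] = 3 and the low nibble 0x5 indexes LUT[5] = 2, so the two
// PSHUFB results add up to 5, which is indeed popcount(0xE5).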
32882 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32883 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32884 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32885 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32886
32887 SmallVector<SDValue, 64> LUTVec;
32888 for (int i = 0; i < NumElts; ++i)
32889 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32890 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32891 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32892
32893 // High nibbles
32894 SDValue FourV = DAG.getConstant(4, DL, VT);
32895 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32896
32897 // Low nibbles
32898 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32899
32900 // The input vector is used as the shuffle mask that indexes elements into the
32901 // LUT. After counting the low and high nibbles, add the two vectors to obtain
32902 // the final pop count per i8 element.
32903 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32904 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32905 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32906}
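// A minimal scalar sketch of the same nibble-LUT technique, for reference only
// (this helper is illustrative and not part of the lowering; it assumes
// nothing beyond standard C++):
[[maybe_unused]] static unsigned popcount8ViaNibbleLUT(unsigned char Byte) {
  static const unsigned char LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                        1, 2, 2, 3, 2, 3, 3, 4};
  // Sum of the pre-computed counts of the high and low nibbles.
  return LUT[Byte >> 4] + LUT[Byte & 0x0F];
}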
32907
32908// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32909// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32910static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32911 SelectionDAG &DAG) {
32912 MVT VT = Op.getSimpleValueType();
32913 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32914 "Unknown CTPOP type to handle");
32915 SDLoc DL(Op.getNode());
32916 SDValue Op0 = Op.getOperand(0);
32917
32918 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32919 if (Subtarget.hasVPOPCNTDQ()) {
32920 unsigned NumElems = VT.getVectorNumElements();
32921 assert((VT.getVectorElementType() == MVT::i8 ||
32922 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32923 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32924 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32925 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32926 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32927 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32928 }
32929 }
32930
32931 // Decompose 256-bit ops into smaller 128-bit ops.
32932 if (VT.is256BitVector() && !Subtarget.hasInt256())
32933 return splitVectorIntUnary(Op, DAG);
32934
32935 // Decompose 512-bit ops into smaller 256-bit ops.
32936 if (VT.is512BitVector() && !Subtarget.hasBWI())
32937 return splitVectorIntUnary(Op, DAG);
32938
32939 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32940 if (VT.getScalarType() != MVT::i8) {
32941 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32942 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32943 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32944 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32945 }
32946
32947 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32948 if (!Subtarget.hasSSSE3())
32949 return SDValue();
32950
32951 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32952}
32953
32954static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32955 SelectionDAG &DAG) {
32956 assert(Op.getSimpleValueType().isVector() &&
32957 "We only do custom lowering for vector population count.");
32958 return LowerVectorCTPOP(Op, Subtarget, DAG);
32959}
32960
32961static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32962 MVT VT = Op.getSimpleValueType();
32963 SDValue In = Op.getOperand(0);
32964 SDLoc DL(Op);
32965
32966 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32967 // perform the BITREVERSE.
32968 if (!VT.isVector()) {
32969 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32970 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32971 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32972 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32973 DAG.getIntPtrConstant(0, DL));
32974 }
32975
32976 int NumElts = VT.getVectorNumElements();
32977 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32978
32979 // Decompose 256-bit ops into smaller 128-bit ops.
32980 if (VT.is256BitVector())
32981 return splitVectorIntUnary(Op, DAG);
32982
32983 assert(VT.is128BitVector() &&
32984 "Only 128-bit vector bitreverse lowering supported.");
32985
32986 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32987 // perform the BSWAP in the shuffle.
32988 // It's best to shuffle using the second operand, as this implicitly allows
32989 // memory folding for multiple vectors.
32990 SmallVector<SDValue, 16> MaskElts;
32991 for (int i = 0; i != NumElts; ++i) {
32992 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32993 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32994 int PermuteByte = SourceByte | (2 << 5);
32995 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32996 }
32997 }
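// Example of the mask built above (assuming a v4i32 input): for element 0 the
// pushed control bytes are 0x53, 0x52, 0x51, 0x50, i.e. the second operand's
// bytes 19..16 in reverse order with the 0x40 (2 << 5) "reverse bits" op set,
// so VPPERM both byte-swaps each element and bit-reverses every byte.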
32998
32999 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
33000 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
33001 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
33002 Res, Mask);
33003 return DAG.getBitcast(VT, Res);
33004}
33005
33006static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
33007 SelectionDAG &DAG) {
33008 MVT VT = Op.getSimpleValueType();
33009
33010 if (Subtarget.hasXOP() && !VT.is512BitVector())
33011 return LowerBITREVERSE_XOP(Op, DAG);
33012
33013 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
33014
33015 SDValue In = Op.getOperand(0);
33016 SDLoc DL(Op);
33017
33018 assert(VT.getScalarType() == MVT::i8 &&
33019 "Only byte vector BITREVERSE supported");
33020
33021 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
33022 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
33023 return splitVectorIntUnary(Op, DAG);
33024
33025 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
33026 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
33027 return splitVectorIntUnary(Op, DAG);
33028
33029 unsigned NumElts = VT.getVectorNumElements();
33030
33031 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
33032 if (Subtarget.hasGFNI()) {
33033 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
33034 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
33035 Matrix = DAG.getBitcast(VT, Matrix);
33036 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
33037 DAG.getTargetConstant(0, DL, MVT::i8));
33038 }
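// The 0x8040201008040201 constant above is the standard bit-reversal matrix
// for GF2P8AFFINEQB: with immediate 0 the GF(2) affine transform maps every
// byte to its bit-reversed value, e.g. 0x01 -> 0x80 and 0xB1 -> 0x8D.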
33039
33040 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
33041 // nibbles, and a PSHUFB lookup finds the bit-reverse of each 0-15 value
33042 // (moved to the other nibble).
33043 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
33044 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
33045 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
33046
33047 const int LoLUT[16] = {
33048 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
33049 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
33050 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
33051 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
33052 const int HiLUT[16] = {
33053 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
33054 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
33055 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
33056 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
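// Worked example: for input byte 0xB1 the low nibble 0x1 looks up
// LoLUT[1] = 0x80 (its reverse placed in the high nibble) and the high nibble
// 0xB looks up HiLUT[11] = 0x0D (its reverse placed in the low nibble); the
// final OR gives 0x8D, the bit-reverse of 0xB1.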
33057
33058 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
33059 for (unsigned i = 0; i < NumElts; ++i) {
33060 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
33061 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
33062 }
33063
33064 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
33065 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
33066 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
33067 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
33068 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
33069}
33070
33071static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
33072 SelectionDAG &DAG) {
33073 SDLoc DL(Op);
33074 SDValue X = Op.getOperand(0);
33075 MVT VT = Op.getSimpleValueType();
33076
33077 // Special case: if the input fits in 8 bits, we can use a single 8-bit TEST.
33078 if (VT == MVT::i8 ||
33079 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
33080 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33081 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
33082 DAG.getConstant(0, DL, MVT::i8));
33083 // Copy the inverse of the parity flag into a register with setcc.
33084 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33085 // Extend to the original type.
33086 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33087 }
33088
33089 // If we have POPCNT, use the default expansion.
33090 if (Subtarget.hasPOPCNT())
33091 return SDValue();
33092
33093 if (VT == MVT::i64) {
33094 // Xor the high and low 32 bits together using a 32-bit operation.
33095 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
33096 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
33097 DAG.getConstant(32, DL, MVT::i8)));
33098 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
33099 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
33100 }
33101
33102 if (VT != MVT::i16) {
33103 // Xor the high and low 16-bits together using a 32-bit operation.
33104 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
33105 DAG.getConstant(16, DL, MVT::i8));
33106 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
33107 } else {
33108 // If the input is 16-bits, we need to extend to use an i32 shift below.
33109 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
33110 }
33111
33112 // Finally, xor the low 2 bytes together and use an 8-bit flag-setting xor.
33113 // This should allow an h-reg to be used to save a shift.
33114 SDValue Hi = DAG.getNode(
33115 ISD::TRUNCATE, DL, MVT::i8,
33116 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
33117 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33118 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
33119 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
33120
33121 // Copy the inverse of the parity flag into a register with setcc.
33122 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33123 // Extend to the original type.
33124 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33125}
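// Scalar sketch of the same parity folding, for reference only (illustrative
// helper, not part of the lowering; the lowering stops at the 8-bit xor and
// reads the inverted parity flag with SETNP instead of folding further):
[[maybe_unused]] static unsigned parity32ByFolding(unsigned X) {
  X ^= X >> 16;                                 // fold 32 bits into 16
  X ^= X >> 8;                                  // fold 16 bits into the low byte
  unsigned char B = static_cast<unsigned char>(X);
  B = static_cast<unsigned char>(B ^ (B >> 4)); // keep folding in plain C++
  B = static_cast<unsigned char>(B ^ (B >> 2));
  B = static_cast<unsigned char>(B ^ (B >> 1));
  return B & 1u;                                // 1 if the original X had odd parity
}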
33126
33127static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
33128 const X86Subtarget &Subtarget) {
33129 unsigned NewOpc = 0;
33130 switch (N->getOpcode()) {
33131 case ISD::ATOMIC_LOAD_ADD:
33132 NewOpc = X86ISD::LADD;
33133 break;
33134 case ISD::ATOMIC_LOAD_SUB:
33135 NewOpc = X86ISD::LSUB;
33136 break;
33137 case ISD::ATOMIC_LOAD_OR:
33138 NewOpc = X86ISD::LOR;
33139 break;
33140 case ISD::ATOMIC_LOAD_XOR:
33141 NewOpc = X86ISD::LXOR;
33142 break;
33143 case ISD::ATOMIC_LOAD_AND:
33144 NewOpc = X86ISD::LAND;
33145 break;
33146 default:
33147 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
33148 }
33149
33150 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33151
33152 return DAG.getMemIntrinsicNode(
33153 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
33154 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
33155 /*MemVT=*/N->getSimpleValueType(0), MMO);
33156}
33157
33158/// Lower atomic_load_ops into LOCK-prefixed operations.
33159static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
33160 const X86Subtarget &Subtarget) {
33161 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
33162 SDValue Chain = N->getOperand(0);
33163 SDValue LHS = N->getOperand(1);
33164 SDValue RHS = N->getOperand(2);
33165 unsigned Opc = N->getOpcode();
33166 MVT VT = N->getSimpleValueType(0);
33167 SDLoc DL(N);
33168
33169 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
33170 // can only be lowered when the result is unused. They should have already
33171 // been transformed into a cmpxchg loop in AtomicExpand.
33172 if (N->hasAnyUseOfValue(0)) {
33173 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
33174 // select LXADD if LOCK_SUB can't be selected.
33175 if (Opc == ISD::ATOMIC_LOAD_SUB) {
33176 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
33177 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
33178 RHS, AN->getMemOperand());
33179 }
33180 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
33181 "Used AtomicRMW ops other than Add should have been expanded!");
33182 return N;
33183 }
33184
33185 // Specialized lowering for the canonical form of an idempotent atomicrmw.
33186 // The core idea here is that since the memory location isn't actually
33187 // changing, all we need is a lowering for the *ordering* impacts of the
33188 // atomicrmw. As such, we can choose a different operation and memory
33189 // location to minimize the impact on other code.
33190 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
33191 // On X86, the only ordering which actually requires an instruction is
33192 // seq_cst that isn't SingleThread; everything else just needs to be
33193 // preserved during codegen and then dropped. Note that we expect (but don't
33194 // assume) that orderings other than seq_cst and acq_rel have been
33195 // canonicalized to a store or load.
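// For example (sketch): "atomicrmw or ptr %p, i32 0 seq_cst" at system scope
// is expected to become a locked no-op against a stack slot (roughly
// "lock orl $0, <offset>(%rsp)"), never touching %p, while the same idiom with
// a weaker ordering collapses to a plain compiler barrier.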
33196 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
33197 AN->getSyncScopeID() == SyncScope::System) {
33198 // Prefer a locked operation against a stack location to minimize cache
33199 // traffic. This assumes that stack locations are very likely to be
33200 // accessed only by the owning thread.
33201 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
33202 assert(!N->hasAnyUseOfValue(0));
33203 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33204 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33205 DAG.getUNDEF(VT), NewChain);
33206 }
33207 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33208 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
33209 assert(!N->hasAnyUseOfValue(0));
33210 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33211 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33212 DAG.getUNDEF(VT), NewChain);
33213 }
33214
33215 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
33216 // RAUW the chain, but don't worry about the result, as it's unused.
33217 assert(!N->hasAnyUseOfValue(0));
33218 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33219 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33220 DAG.getUNDEF(VT), LockOp.getValue(1));
33221}
33222
33223static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
33224 const X86Subtarget &Subtarget) {
33225 auto *Node = cast<AtomicSDNode>(Op.getNode());
33226 SDLoc dl(Node);
33227 EVT VT = Node->getMemoryVT();
33228
33229 bool IsSeqCst =
33230 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33231 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33232
33233 // If this store is not sequentially consistent and the type is legal
33234 // we can just keep it.
33235 if (!IsSeqCst && IsTypeLegal)
33236 return Op;
33237
33238 if (VT == MVT::i64 && !IsTypeLegal) {
33239 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33240 // is enabled.
33241 bool NoImplicitFloatOps =
33242 DAG.getMachineFunction().getFunction().hasFnAttribute(
33243 Attribute::NoImplicitFloat);
33244 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33245 SDValue Chain;
33246 if (Subtarget.hasSSE1()) {
33247 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
33248 Node->getOperand(2));
33249 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33250 SclToVec = DAG.getBitcast(StVT, SclToVec);
33251 SDVTList Tys = DAG.getVTList(MVT::Other);
33252 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33253 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33254 MVT::i64, Node->getMemOperand());
33255 } else if (Subtarget.hasX87()) {
33256 // First load this into an 80-bit X87 register using a stack temporary.
33257 // This will put the whole integer into the significand.
33258 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33259 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33260 MachinePointerInfo MPI =
33261 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33262 Chain =
33263 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
33264 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33265 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33266 SDValue LdOps[] = {Chain, StackPtr};
33267 SDValue Value = DAG.getMemIntrinsicNode(
33268 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33269 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33270 Chain = Value.getValue(1);
33271
33272 // Now use an FIST to do the atomic store.
33273 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33274 Chain =
33275 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33276 StoreOps, MVT::i64, Node->getMemOperand());
33277 }
33278
33279 if (Chain) {
33280 // If this is a sequentially consistent store, also emit an appropriate
33281 // barrier.
33282 if (IsSeqCst)
33283 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33284
33285 return Chain;
33286 }
33287 }
33288 }
33289
33290 // Convert seq_cst store -> xchg
33291 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33292 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
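// For example (sketch): on x86-64 a type-legal "store atomic i64 ... seq_cst"
// reaches this point and becomes an XCHG with a memory operand, whose implicit
// LOCK semantics provide the required full barrier, so no separate MFENCE is
// emitted.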
33293 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
33294 Node->getMemoryVT(),
33295 Node->getOperand(0),
33296 Node->getOperand(1), Node->getOperand(2),
33297 Node->getMemOperand());
33298 return Swap.getValue(1);
33299}
33300
33301static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
33302 SDNode *N = Op.getNode();
33303 MVT VT = N->getSimpleValueType(0);
33304 unsigned Opc = Op.getOpcode();
33305
33306 // Let legalize expand this if it isn't a legal type yet.
33307 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33308 return SDValue();
33309
33310 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33311 SDLoc DL(N);
33312
33313 // Set the carry flag.
33314 SDValue Carry = Op.getOperand(2);
33315 EVT CarryVT = Carry.getValueType();
33316 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33317 Carry, DAG.getAllOnesConstant(DL, CarryVT));
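// Example of the trick above: if the incoming carry is 1, adding all-ones
// (e.g. 1 + 0xFF for an i8 carry) wraps to 0 and sets CF; if it is 0, no wrap
// occurs and CF stays clear, so CF now mirrors the boolean carry input.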
33318
33319 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
33320 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33321 Op.getOperand(0), Op.getOperand(1),
33322 Carry.getValue(1));
33323
33324 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33325 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33326 Sum.getValue(1), DL, DAG);
33327 if (N->getValueType(1) == MVT::i1)
33328 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33329
33330 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33331}
33332
33333static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33334 SelectionDAG &DAG) {
33335 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33336
33337 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33338 // which returns the values as { float, float } (in XMM0) or
33339 // { double, double } (which is returned in XMM0, XMM1).
33340 SDLoc dl(Op);
33341 SDValue Arg = Op.getOperand(0);
33342 EVT ArgVT = Arg.getValueType();
33343 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33344
33345 TargetLowering::ArgListTy Args;
33346 TargetLowering::ArgListEntry Entry;
33347
33348 Entry.Node = Arg;
33349 Entry.Ty = ArgTy;
33350 Entry.IsSExt = false;
33351 Entry.IsZExt = false;
33352 Args.push_back(Entry);
33353
33354 bool isF64 = ArgVT == MVT::f64;
33355 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33356 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33357 // the results are returned via SRet in memory.
33358 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33359 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33360 const char *LibcallName = TLI.getLibcallName(LC);
33361 SDValue Callee =
33362 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33363
33364 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33365 : (Type *)FixedVectorType::get(ArgTy, 4);
33366
33367 TargetLowering::CallLoweringInfo CLI(DAG);
33368 CLI.setDebugLoc(dl)
33369 .setChain(DAG.getEntryNode())
33370 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33371
33372 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33373
33374 if (isF64)
33375 // Returned in xmm0 and xmm1.
33376 return CallResult.first;
33377
33378 // Returned in bits 0:31 and 32:64 xmm0.
33379 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33380 CallResult.first, DAG.getIntPtrConstant(0, dl));
33381 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33382 CallResult.first, DAG.getIntPtrConstant(1, dl));
33383 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33384 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33385}
33386
33387/// Widen a vector input to a vector of NVT. The
33388/// input vector must have the same element type as NVT.
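/// For example, widening a v4i32 input to v16i32 places the input in the low
/// four lanes and fills the remaining lanes with undef (or zeroes when
/// FillWithZeroes is requested).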
33389static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33390 bool FillWithZeroes = false) {
33391 // Check if InOp already has the right width.
33392 MVT InVT = InOp.getSimpleValueType();
33393 if (InVT == NVT)
33394 return InOp;
33395
33396 if (InOp.isUndef())
33397 return DAG.getUNDEF(NVT);
33398
33399 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33400 "input and widen element type must match");
33401
33402 unsigned InNumElts = InVT.getVectorNumElements();
33403 unsigned WidenNumElts = NVT.getVectorNumElements();
33404 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33405 "Unexpected request for vector widening");
33406
33407 SDLoc dl(InOp);
33408 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
33409 InOp.getNumOperands() == 2) {
33410 SDValue N1 = InOp.getOperand(1);
33411 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33412 N1.isUndef()) {
33413 InOp = InOp.getOperand(0);
33414 InVT = InOp.getSimpleValueType();
33415 InNumElts = InVT.getVectorNumElements();
33416 }
33417 }
33418 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33419 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33420 SmallVector<SDValue, 16> Ops;
33421 for (unsigned i = 0; i < InNumElts; ++i)
33422 Ops.push_back(InOp.getOperand(i));
33423
33424 EVT EltVT = InOp.getOperand(0).getValueType();
33425
33426 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
33427 DAG.getUNDEF(EltVT);
33428 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
33429 Ops.push_back(FillVal);
33430 return DAG.getBuildVector(NVT, dl, Ops);
33431 }
33432 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
33433 DAG.getUNDEF(NVT);
33434 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
33435 InOp, DAG.getIntPtrConstant(0, dl));
33436}
33437
33438static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33439 SelectionDAG &DAG) {
33440 assert(Subtarget.hasAVX512() &&
33441 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33442
33443 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33444 SDValue Src = N->getValue();
33445 MVT VT = Src.getSimpleValueType();
33446 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33447 SDLoc dl(Op);
33448
33449 SDValue Scale = N->getScale();
33450 SDValue Index = N->getIndex();
33451 SDValue Mask = N->getMask();
33452 SDValue Chain = N->getChain();
33453 SDValue BasePtr = N->getBasePtr();
33454
33455 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33456 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33457 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33458 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33460 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33461 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33462 SDVTList VTs = DAG.getVTList(MVT::Other);
33463 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33464 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33465 N->getMemoryVT(), N->getMemOperand());
33466 }
33467 return SDValue();
33468 }
33469
33470 MVT IndexVT = Index.getSimpleValueType();
33471
33472 // If the index is v2i32, we're being called by type legalization and we
33473 // should just let the default handling take care of it.
33474 if (IndexVT == MVT::v2i32)
33475 return SDValue();
33476
33477 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33478 // we need to widen until one is.
33479 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33480 !Index.getSimpleValueType().is512BitVector()) {
33481 // Determine how much we need to widen by to get a 512-bit type.
33482 unsigned Factor = std::min(512/VT.getSizeInBits(),
33483 512/IndexVT.getSizeInBits());
33484 unsigned NumElts = VT.getVectorNumElements() * Factor;
33485
33486 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33487 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33488 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33489
33490 Src = ExtendToType(Src, VT, DAG);
33491 Index = ExtendToType(Index, IndexVT, DAG);
33492 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33493 }
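// Example of the widening above (assumed AVX-512F target without VLX): a
// v8f32 scatter with a v8i32 index computes Factor = 2 and is widened to
// v16f32 data, a v16i32 index and a v16i1 mask whose upper half is zero, so
// only the original 8 lanes are ever stored.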
33494
33495 SDVTList VTs = DAG.getVTList(MVT::Other);
33496 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33497 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33498 N->getMemoryVT(), N->getMemOperand());
33499}
33500
33501static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33502 SelectionDAG &DAG) {
33503
33504 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33505 MVT VT = Op.getSimpleValueType();
33506 MVT ScalarVT = VT.getScalarType();
33507 SDValue Mask = N->getMask();
33508 MVT MaskVT = Mask.getSimpleValueType();
33509 SDValue PassThru = N->getPassThru();
33510 SDLoc dl(Op);
33511
33512 // Handle AVX masked loads which don't support passthru other than 0.
33513 if (MaskVT.getVectorElementType() != MVT::i1) {
33514 // We also allow undef in the isel pattern.
33515 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33516 return Op;
33517
33518 SDValue NewLoad = DAG.getMaskedLoad(
33519 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33520 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33521 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33522 N->isExpandingLoad());
33523 // Emit a blend.
33524 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33525 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33526 }
33527
33528 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33529 "Expanding masked load is supported on AVX-512 target only!");
33530
33531 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33532 "Expanding masked load is supported for 32 and 64-bit types only!");
33533
33534 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33535 "Cannot lower masked load op.");
33536
33537 assert((ScalarVT.getSizeInBits() >= 32 ||
33538 (Subtarget.hasBWI() &&
33539 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33540 "Unsupported masked load op.");
33541
33542 // This operation is legal for targets with VLX; without VLX the vector
33543 // should be widened to 512 bits.
33544 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33545 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33546 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33547
33548 // Mask element has to be i1.
33549 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33550 "Unexpected mask type");
33551
33552 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33553
33554 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33555 SDValue NewLoad = DAG.getMaskedLoad(
33556 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33557 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33558 N->getExtensionType(), N->isExpandingLoad());
33559
33560 SDValue Extract =
33561 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33562 DAG.getIntPtrConstant(0, dl));
33563 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33564 return DAG.getMergeValues(RetOps, dl);
33565}
33566
33567static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33568 SelectionDAG &DAG) {
33569 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33570 SDValue DataToStore = N->getValue();
33571 MVT VT = DataToStore.getSimpleValueType();
33572 MVT ScalarVT = VT.getScalarType();
33573 SDValue Mask = N->getMask();
33574 SDLoc dl(Op);
33575
33576 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33577 "Compressing masked store is supported on AVX-512 target only!");
33578
33579 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33580 "Compressing masked store is supported for 32 and 64-bit types only!");
33581
33582 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33583 "Cannot lower masked store op.");
33584
33585 assert((ScalarVT.getSizeInBits() >= 32 ||
33586 (Subtarget.hasBWI() &&
33587 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33588 "Unsupported masked store op.");
33589
33590 // This operation is legal for targets with VLX, but without
33591 // VLX the vector should be widened to 512 bits.
33592 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33593 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33594
33595 // Mask element has to be i1.
33596 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33597 "Unexpected mask type");
33598
33599 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33600
33601 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33602 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33603 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33604 N->getOffset(), Mask, N->getMemoryVT(),
33605 N->getMemOperand(), N->getAddressingMode(),
33606 N->isTruncatingStore(), N->isCompressingStore());
33607}
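
The widening above computes how many lanes a 512-bit vector holds for the store's scalar type. A minimal standalone sketch of that arithmetic (hypothetical helper names, not the LLVM API):

#include <cassert>
#include <cstdio>

// Mirrors NumEltsInWideVec = 512 / VT.getScalarSizeInBits().
unsigned wideElementCount(unsigned ScalarSizeInBits) {
  assert(ScalarSizeInBits != 0 && 512 % ScalarSizeInBits == 0);
  return 512 / ScalarSizeInBits;
}

int main() {
  std::printf("%u\n", wideElementCount(32)); // 16 lanes: v16i32 / v16f32
  std::printf("%u\n", wideElementCount(64)); // 8 lanes:  v8i64  / v8f64
}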
33608
33609static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33610 SelectionDAG &DAG) {
33611 assert(Subtarget.hasAVX2() &&
33612 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33613
33614 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33615 SDLoc dl(Op);
33616 MVT VT = Op.getSimpleValueType();
33617 SDValue Index = N->getIndex();
33618 SDValue Mask = N->getMask();
33619 SDValue PassThru = N->getPassThru();
33620 MVT IndexVT = Index.getSimpleValueType();
33621
33622 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33623
33624 // If the index is v2i32, we're being called by type legalization.
33625 if (IndexVT == MVT::v2i32)
33626 return SDValue();
33627
33628 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33629 // we need to widen until one is.
33630 MVT OrigVT = VT;
33631 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33632 !IndexVT.is512BitVector()) {
33633 // Determine how much we need to widen by to get a 512-bit type.
33634 unsigned Factor = std::min(512/VT.getSizeInBits(),
33635 512/IndexVT.getSizeInBits());
33636
33637 unsigned NumElts = VT.getVectorNumElements() * Factor;
33638
33639 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33640 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33641 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33642
33643 PassThru = ExtendToType(PassThru, VT, DAG);
33644 Index = ExtendToType(Index, IndexVT, DAG);
33645 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33646 }
33647
33648 // Break dependency on the data register.
33649 if (PassThru.isUndef())
33650 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33651
33652 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33653 N->getScale() };
33654 SDValue NewGather = DAG.getMemIntrinsicNode(
33655 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33656 N->getMemOperand());
33657 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33658 NewGather, DAG.getIntPtrConstant(0, dl));
33659 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33660}
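
The factor computed above is the smaller of the two ratios 512/VT and 512/IndexVT, so whichever of the data or index vector reaches 512 bits first stops the widening. A small sketch of that computation (hypothetical helper, not the LLVM API):

#include <algorithm>
#include <cstdio>

// Mirrors: Factor = std::min(512/VT.getSizeInBits(), 512/IndexVT.getSizeInBits());
unsigned widenFactor(unsigned DataBits, unsigned IndexBits) {
  return std::min(512 / DataBits, 512 / IndexBits);
}

int main() {
  // 256-bit data with 512-bit indices: factor 1 (index is already 512 bits).
  // 128-bit data with 128-bit indices: factor 4 (both widen to 512 bits).
  std::printf("%u %u\n", widenFactor(256, 512), widenFactor(128, 128));
}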
33661
33662static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33663 SDLoc dl(Op);
33664 SDValue Src = Op.getOperand(0);
33665 MVT DstVT = Op.getSimpleValueType();
33666
33667 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33668 unsigned SrcAS = N->getSrcAddressSpace();
33669
33670 assert(SrcAS != N->getDestAddressSpace() &&
33671 "addrspacecast must be between different address spaces");
33672
33673 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33674 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33675 } else if (DstVT == MVT::i64) {
33676 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33677 } else if (DstVT == MVT::i32) {
33678 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33679 } else {
33680 report_fatal_error("Bad address space in addrspacecast");
33681 }
33682 return Op;
33683}
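
The three branches above map the x86 pointer address spaces onto zero-extension, sign-extension, or truncation. A scalar illustration of the same case split; the enum values and helper names below are illustrative assumptions, not the LLVM definitions:

#include <cstdint>

// Assumption: these match X86AS::PTR32_SPTR / PTR32_UPTR / PTR64 in LLVM.
enum AddrSpace : unsigned { PTR32_SPTR = 270, PTR32_UPTR = 271, PTR64 = 272 };

uint64_t castPtrTo64(uint32_t Ptr32, unsigned SrcAS) {
  if (SrcAS == PTR32_UPTR)
    return static_cast<uint64_t>(Ptr32);                 // zero-extend
  return static_cast<uint64_t>(
      static_cast<int64_t>(static_cast<int32_t>(Ptr32))); // sign-extend
}

uint32_t castPtrTo32(uint64_t Ptr64) {
  return static_cast<uint32_t>(Ptr64);                   // truncate
}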
33684
33685SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33686 SelectionDAG &DAG) const {
33687 // TODO: Eventually, the lowering of these nodes should be informed by or
33688 // deferred to the GC strategy for the function in which they appear. For
33689 // now, however, they must be lowered to something. Since they are logically
33690 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33691 // require special handling for these nodes), lower them as literal NOOPs for
33692 // the time being.
33693 SmallVector<SDValue, 2> Ops;
33694 Ops.push_back(Op.getOperand(0));
33695 if (Op->getGluedNode())
33696 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33697
33698 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33699 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33700}
33701
33702// Custom split CVTPS2PH with wide types.
33703static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33704 SDLoc dl(Op);
33705 EVT VT = Op.getValueType();
33706 SDValue Lo, Hi;
33707 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33708 EVT LoVT, HiVT;
33709 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33710 SDValue RC = Op.getOperand(1);
33711 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33712 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33713 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33714}
33715
33716static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33717 unsigned OpNo) {
33718 const APInt Operand(32, OpNo);
33719 std::string OpNoStr = llvm::toString(Operand, 10, false);
33720 std::string Str(" $");
33721
33722 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33723 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33724
33725 auto I = StringRef::npos;
33726 for (auto &AsmStr : AsmStrs) {
33727 // Match the OpNo string exactly so that we do not match a mere
33728 // sub-string, e.g. "$12" contains "$1".
33729 if (AsmStr.endswith(OpNoStr1))
33730 I = AsmStr.size() - OpNoStr1.size();
33731
33732 // Get the index of operand in AsmStr.
33733 if (I == StringRef::npos)
33734 I = AsmStr.find(OpNoStr1 + ",");
33735 if (I == StringRef::npos)
33736 I = AsmStr.find(OpNoStr2);
33737
33738 if (I == StringRef::npos)
33739 continue;
33740
33741 assert(I > 0 && "Unexpected inline asm string!");
33742 // Remove the operand string and label (if it exists).
33743 // For example:
33744 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33745 // ==>
33746 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33747 // ==>
33748 // "call dword ptr "
33749 auto TmpStr = AsmStr.substr(0, I);
33750 I = TmpStr.rfind(':');
33751 if (I == StringRef::npos)
33752 return TmpStr;
33753
33754 assert(I < TmpStr.size() && "Unexpected inline asm string!");
33755 auto Asm = TmpStr.drop_front(I + 1);
33756 return Asm;
33757 }
33758
33759 return StringRef();
33760}
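
The loop above trims an inline-asm string down to the mnemonic that consumes operand OpNo, e.g. ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}" becomes "call dword ptr". A rough standalone sketch of the same trimming using plain std::string rather than StringRef (a simplification, not the function above):

#include <string>

std::string instrStrForOperand(const std::string &Asm, unsigned OpNo) {
  std::string Plain = " $" + std::to_string(OpNo);        // e.g. " $1"
  std::string WithMod = " ${" + std::to_string(OpNo) + ":"; // e.g. " ${1:P}"

  size_t I = std::string::npos;
  if (Asm.size() >= Plain.size() &&
      Asm.compare(Asm.size() - Plain.size(), Plain.size(), Plain) == 0)
    I = Asm.size() - Plain.size();   // " $N" at the very end of the string
  if (I == std::string::npos)
    I = Asm.find(Plain + ",");       // " $N," mid-string (avoids "$12" for "$1")
  if (I == std::string::npos)
    I = Asm.find(WithMod);           // " ${N:" with a modifier
  if (I == std::string::npos)
    return "";

  std::string Tmp = Asm.substr(0, I); // drop the operand string
  size_t Colon = Tmp.rfind(':');      // drop a leading "<label>:" if present
  return Colon == std::string::npos ? Tmp : Tmp.substr(Colon + 1);
}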
33761
33762bool X86TargetLowering::isInlineAsmTargetBranch(
33763 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33764 StringRef InstrStr = getInstrStrFromOpNo(AsmStrs, OpNo);
33765
33766 if (InstrStr.contains("call"))
33767 return true;
33768
33769 return false;
33770}
33771
33772/// Provide custom lowering hooks for some operations.
33773SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33774 switch (Op.getOpcode()) {
33775 default: llvm_unreachable("Should not custom lower this!");
33776 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33777 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33778 return LowerCMP_SWAP(Op, Subtarget, DAG);
33779 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33780 case ISD::ATOMIC_LOAD_ADD:
33781 case ISD::ATOMIC_LOAD_SUB:
33782 case ISD::ATOMIC_LOAD_OR:
33783 case ISD::ATOMIC_LOAD_XOR:
33784 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33785 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33786 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33787 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33788 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33789 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33790 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33791 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33792 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33793 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33794 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33795 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33796 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33797 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33798 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33799 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33800 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33801 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33802 case ISD::SHL_PARTS:
33803 case ISD::SRA_PARTS:
33804 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33805 case ISD::FSHL:
33806 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33807 case ISD::STRICT_SINT_TO_FP:
33808 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33809 case ISD::STRICT_UINT_TO_FP:
33810 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33811 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33812 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33813 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33814 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33815 case ISD::ZERO_EXTEND_VECTOR_INREG:
33816 case ISD::SIGN_EXTEND_VECTOR_INREG:
33817 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33818 case ISD::FP_TO_SINT:
33819 case ISD::STRICT_FP_TO_SINT:
33820 case ISD::FP_TO_UINT:
33821 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33822 case ISD::FP_TO_SINT_SAT:
33823 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33824 case ISD::FP_EXTEND:
33825 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33826 case ISD::FP_ROUND:
33827 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33828 case ISD::FP16_TO_FP:
33829 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33830 case ISD::FP_TO_FP16:
33831 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33832 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33833 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33834 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33835 case ISD::FADD:
33836 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33837 case ISD::FROUND: return LowerFROUND(Op, DAG);
33838 case ISD::FABS:
33839 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33840 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33841 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33842 case ISD::LRINT:
33843 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33844 case ISD::SETCC:
33845 case ISD::STRICT_FSETCC:
33846 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33847 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33848 case ISD::SELECT: return LowerSELECT(Op, DAG);
33849 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33850 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33851 case ISD::VASTART: return LowerVASTART(Op, DAG);
33852 case ISD::VAARG: return LowerVAARG(Op, DAG);
33853 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33854 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33855 case ISD::INTRINSIC_VOID:
33856 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33857 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33858 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33859 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33860 case ISD::FRAME_TO_ARGS_OFFSET:
33861 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33862 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33863 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33864 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33865 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33866 case ISD::EH_SJLJ_SETUP_DISPATCH:
33867 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33868 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33869 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33870 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33871 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33872 case ISD::CTLZ:
33873 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33874 case ISD::CTTZ:
33875 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33876 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33877 case ISD::MULHS:
33878 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33879 case ISD::ROTL:
33880 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33881 case ISD::SRA:
33882 case ISD::SRL:
33883 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33884 case ISD::SADDO:
33885 case ISD::UADDO:
33886 case ISD::SSUBO:
33887 case ISD::USUBO: return LowerXALUO(Op, DAG);
33888 case ISD::SMULO:
33889 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33890 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33891 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33892 case ISD::SADDO_CARRY:
33893 case ISD::SSUBO_CARRY:
33894 case ISD::ADDCARRY:
33895 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
33896 case ISD::ADD:
33897 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33898 case ISD::UADDSAT:
33899 case ISD::SADDSAT:
33900 case ISD::USUBSAT:
33901 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33902 case ISD::SMAX:
33903 case ISD::SMIN:
33904 case ISD::UMAX:
33905 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33906 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33907 case ISD::ABDS:
33908 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33909 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33910 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33911 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33912 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33913 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33914 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33915 case ISD::GC_TRANSITION_START:
33916 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33917 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33918 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33919 }
33920}
33921
33922/// Replace a node with an illegal result type with a new node built out of
33923/// custom code.
33924void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33925 SmallVectorImpl<SDValue>&Results,
33926 SelectionDAG &DAG) const {
33927 SDLoc dl(N);
33928 switch (N->getOpcode()) {
33929 default:
33930#ifndef NDEBUG
33931 dbgs() << "ReplaceNodeResults: ";
33932 N->dump(&DAG);
33933#endif
33934 llvm_unreachable("Do not know how to custom type legalize this operation!");
33935 case X86ISD::CVTPH2PS: {
33936 EVT VT = N->getValueType(0);
33937 SDValue Lo, Hi;
33938 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33939 EVT LoVT, HiVT;
33940 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33941 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33942 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33943 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33944 Results.push_back(Res);
33945 return;
33946 }
33947 case X86ISD::STRICT_CVTPH2PS: {
33948 EVT VT = N->getValueType(0);
33949 SDValue Lo, Hi;
33950 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33951 EVT LoVT, HiVT;
33952 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33953 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33954 {N->getOperand(0), Lo});
33955 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33956 {N->getOperand(0), Hi});
33957 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33958 Lo.getValue(1), Hi.getValue(1));
33959 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33960 Results.push_back(Res);
33961 Results.push_back(Chain);
33962 return;
33963 }
33964 case X86ISD::CVTPS2PH:
33965 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33966 return;
33967 case ISD::CTPOP: {
33968 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33969 // Use a v2i64 if possible.
33970 bool NoImplicitFloatOps =
33971 DAG.getMachineFunction().getFunction().hasFnAttribute(
33972 Attribute::NoImplicitFloat);
33973 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33974 SDValue Wide =
33975 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33976 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33977 // The bit count fits in 32 bits, so extract it as i32 and then zero
33978 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33979 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33980 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33981 DAG.getIntPtrConstant(0, dl));
33982 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33983 Results.push_back(Wide);
33984 }
33985 return;
33986 }
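
The extract-as-i32-then-zero-extend step above relies on the fact that a 64-bit population count never exceeds 64 and therefore always fits in 32 bits. A scalar illustration of that invariant (C++20, not the DAG lowering itself):

#include <bit>
#include <cstdint>

// Popcount of a 64-bit value is at most 64, so keeping only the low 32 bits
// of the result and zero-extending back to 64 bits loses nothing.
uint64_t popcount64ViaLow32(uint64_t X) {
  uint32_t Low = static_cast<uint32_t>(std::popcount(X)); // <= 64
  return static_cast<uint64_t>(Low);                      // zero-extend
}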
33987 case ISD::MUL: {
33988 EVT VT = N->getValueType(0);
33989 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33990 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33991 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33992 // elements are needed.
33993 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33994 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33995 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33996 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33997 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33998 unsigned NumConcats = 16 / VT.getVectorNumElements();
33999 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34000 ConcatOps[0] = Res;
34001 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
34002 Results.push_back(Res);
34003 return;
34004 }
34005 case ISD::SMULO:
34006 case ISD::UMULO: {
34007 EVT VT = N->getValueType(0);
34008 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34009 VT == MVT::v2i32 && "Unexpected VT!");
34010 bool IsSigned = N->getOpcode() == ISD::SMULO;
34011 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
34012 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
34013 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
34014 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
34015 // Extract the high 32 bits from each result using PSHUFD.
34016 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
34017 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
34018 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
34019 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
34020 DAG.getIntPtrConstant(0, dl));
34021
34022 // Truncate the low bits of the result. This will become PSHUFD.
34023 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34024
34025 SDValue HiCmp;
34026 if (IsSigned) {
34027 // SMULO overflows if the high bits don't match the sign of the low.
34028 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
34029 } else {
34030 // UMULO overflows if the high bits are non-zero.
34031 HiCmp = DAG.getConstant(0, dl, VT);
34032 }
34033 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
34034
34035 // Widen the result by padding with undef.
34036 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34037 DAG.getUNDEF(VT));
34038 Results.push_back(Res);
34039 Results.push_back(Ovf);
34040 return;
34041 }
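
The overflow check above compares the high 32 bits of each 64-bit product against either zero (unsigned) or the broadcast sign of the low half (signed). A scalar analogue of the same test (hypothetical helpers, not the vector lowering):

#include <cstdint>

// Signed overflow: the high half must equal the sign extension of the low half.
bool smulo32(int32_t A, int32_t B, int32_t &Lo) {
  int64_t Full = static_cast<int64_t>(A) * B;
  Lo = static_cast<int32_t>(Full);
  int32_t Hi = static_cast<int32_t>(Full >> 32);
  return Hi != (Lo >> 31); // compare against the low half's sign, SRA by 31
}

// Unsigned overflow: any nonzero bit in the high half means the product
// did not fit in 32 bits.
bool umulo32(uint32_t A, uint32_t B, uint32_t &Lo) {
  uint64_t Full = static_cast<uint64_t>(A) * B;
  Lo = static_cast<uint32_t>(Full);
  return (Full >> 32) != 0;
}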
34042 case X86ISD::VPMADDWD: {
34043 // Legalize types for X86ISD::VPMADDWD by widening.
34044 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34045
34046 EVT VT = N->getValueType(0);
34047 EVT InVT = N->getOperand(0).getValueType();
34048 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
34049 "Expected a VT that divides into 128 bits.");
34050 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34051 "Unexpected type action!");
34052 unsigned NumConcat = 128 / InVT.getSizeInBits();
34053
34054 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
34055 InVT.getVectorElementType(),
34056 NumConcat * InVT.getVectorNumElements());
34057 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
34058 VT.getVectorElementType(),
34059 NumConcat * VT.getVectorNumElements());
34060
34061 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
34062 Ops[0] = N->getOperand(0);
34063 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34064 Ops[0] = N->getOperand(1);
34065 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34066
34067 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
34068 Results.push_back(Res);
34069 return;
34070 }
34071 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
34072 case X86ISD::FMINC:
34073 case X86ISD::FMIN:
34074 case X86ISD::FMAXC:
34075 case X86ISD::FMAX: {
34076 EVT VT = N->getValueType(0);
34077 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
34078 SDValue UNDEF = DAG.getUNDEF(VT);
34079 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34080 N->getOperand(0), UNDEF);
34081 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34082 N->getOperand(1), UNDEF);
34083 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
34084 return;
34085 }
34086 case ISD::SDIV:
34087 case ISD::UDIV:
34088 case ISD::SREM:
34089 case ISD::UREM: {
34090 EVT VT = N->getValueType(0);
34091 if (VT.isVector()) {
34092 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34093 "Unexpected type action!");
34094 // If this RHS is a constant splat vector we can widen this and let
34095 // division/remainder by constant optimize it.
34096 // TODO: Can we do something for non-splat?
34097 APInt SplatVal;
34098 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
34099 unsigned NumConcats = 128 / VT.getSizeInBits();
34100 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
34101 Ops0[0] = N->getOperand(0);
34102 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
34103 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
34104 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
34105 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
34106 Results.push_back(Res);
34107 }
34108 return;
34109 }
34110
34111 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
34112 Results.push_back(V);
34113 return;
34114 }
34115 case ISD::TRUNCATE: {
34116 MVT VT = N->getSimpleValueType(0);
34117 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
34118 return;
34119
34120 // The generic legalizer will try to widen the input type to the same
34121 // number of elements as the widened result type. But this isn't always
34122 // the best thing so do some custom legalization to avoid some cases.
34123 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34124 SDValue In = N->getOperand(0);
34125 EVT InVT = In.getValueType();
34126
34127 unsigned InBits = InVT.getSizeInBits();
34128 if (128 % InBits == 0) {
34129 // 128-bit and smaller inputs should avoid the truncate altogether and
34130 // just use a build_vector that will become a shuffle.
34131 // TODO: Widen and use a shuffle directly?
34132 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
34133 EVT EltVT = VT.getVectorElementType();
34134 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34135 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
34136 // Use the original element count so we don't do more scalar opts than
34137 // necessary.
34138 unsigned MinElts = VT.getVectorNumElements();
34139 for (unsigned i=0; i < MinElts; ++i) {
34140 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
34141 DAG.getIntPtrConstant(i, dl));
34142 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
34143 }
34144 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
34145 return;
34146 }
34147 // With AVX512 there are some cases that can use a target specific
34148 // truncate node to go from 256/512 to less than 128 with zeros in the
34149 // upper elements of the 128 bit result.
34150 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34151 // We can use VTRUNC directly for 256 bits with VLX or for any 512 bits.
34152 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34153 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34154 return;
34155 }
34156 // There's one case we can widen to 512 bits and use VTRUNC.
34157 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34158 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34159 DAG.getUNDEF(MVT::v4i64));
34160 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34161 return;
34162 }
34163 }
34164 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34165 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34166 isTypeLegal(MVT::v4i64)) {
34167 // The input needs to be split and the output needs to be widened. Let's
34168 // use two VTRUNCs, and shuffle their results together into the wider type.
34169 SDValue Lo, Hi;
34170 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34171
34172 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34173 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34174 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34175 { 0, 1, 2, 3, 16, 17, 18, 19,
34176 -1, -1, -1, -1, -1, -1, -1, -1 });
34177 Results.push_back(Res);
34178 return;
34179 }
34180
34181 return;
34182 }
34183 case ISD::ANY_EXTEND:
34184 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34185 // It's intended to custom handle the input type.
34186 assert(N->getValueType(0) == MVT::v8i8 &&
34187 "Do not know how to legalize this Node");
34188 return;
34189 case ISD::SIGN_EXTEND:
34190 case ISD::ZERO_EXTEND: {
34191 EVT VT = N->getValueType(0);
34192 SDValue In = N->getOperand(0);
34193 EVT InVT = In.getValueType();
34194 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34195 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34196 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34197 "Unexpected type action!");
34198 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
34199 // Custom split this so we can extend i8/i16->i32 invec. This is better
34200 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34201 // sra, then an extend from i32 to i64 using pcmpgt. By custom splitting
34202 // we allow the sra from the extend to i32 to be shared by the split.
34203 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34204
34205 // Fill a vector with sign bits for each element.
34206 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34207 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34208
34209 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34210 // to v2i64.
34211 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34212 {0, 4, 1, 5});
34213 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34214 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34215 {2, 6, 3, 7});
34216 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34217
34218 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34219 Results.push_back(Res);
34220 return;
34221 }
34222
34223 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34224 if (!InVT.is128BitVector()) {
34225 // Not a 128 bit vector, but maybe type legalization will promote
34226 // it to 128 bits.
34227 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34228 return;
34229 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34230 if (!InVT.is128BitVector())
34231 return;
34232
34233 // Promote the input to 128 bits. Type legalization will turn this into
34234 // zext_inreg/sext_inreg.
34235 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
34236 }
34237
34238 // Perform custom splitting instead of the two stage extend we would get
34239 // by default.
34240 EVT LoVT, HiVT;
34241 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34242 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34243
34244 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
34245
34246 // We need to shift the input over by half the number of elements.
34247 unsigned NumElts = InVT.getVectorNumElements();
34248 unsigned HalfNumElts = NumElts / 2;
34249 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34250 for (unsigned i = 0; i != HalfNumElts; ++i)
34251 ShufMask[i] = i + HalfNumElts;
34252
34253 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34254 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
34255
34256 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34257 Results.push_back(Res);
34258 }
34259 return;
34260 }
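
In the sign-extend split above, the upper 32 bits of each i64 lane come from a compare-against-zero mask, which is exactly the lane's sign extension. A per-lane scalar analogue (assuming little-endian lane layout; not LLVM code):

#include <cstdint>

// The mask is all-ones exactly when the value is negative, i.e. it is the
// sign extension of the low half.
int64_t signExtendViaMask(int32_t V) {
  uint32_t SignMask = (0 > V) ? 0xFFFFFFFFu : 0u;   // SETGT(0, V) per lane
  uint64_t Wide = (static_cast<uint64_t>(SignMask) << 32) |
                  static_cast<uint32_t>(V);
  return static_cast<int64_t>(Wide);
}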
34261 case ISD::FP_TO_SINT:
34262 case ISD::STRICT_FP_TO_SINT:
34263 case ISD::FP_TO_UINT:
34264 case ISD::STRICT_FP_TO_UINT: {
34265 bool IsStrict = N->isStrictFPOpcode();
34266 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
34267 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
34268 EVT VT = N->getValueType(0);
34269 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34270 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34271 EVT SrcVT = Src.getValueType();
34272
34273 SDValue Res;
34274 if (isSoftFP16(SrcVT)) {
34275 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34276 if (IsStrict) {
34277 Res =
34278 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
34279 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34280 {NVT, MVT::Other}, {Chain, Src})});
34281 Chain = Res.getValue(1);
34282 } else {
34283 Res = DAG.getNode(N->getOpcode(), dl, VT,
34284 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34285 }
34286 Results.push_back(Res);
34287 if (IsStrict)
34288 Results.push_back(Chain);
34289
34290 return;
34291 }
34292
34293 if (VT.isVector() && Subtarget.hasFP16() &&
34294 SrcVT.getVectorElementType() == MVT::f16) {
34295 EVT EleVT = VT.getVectorElementType();
34296 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34297
34298 if (SrcVT != MVT::v8f16) {
34299 SDValue Tmp =
34300 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34301 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34302 Ops[0] = Src;
34303 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34304 }
34305
34306 if (IsStrict) {
34307 unsigned Opc =
34308 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34309 Res =
34310 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34311 Chain = Res.getValue(1);
34312 } else {
34313 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34314 Res = DAG.getNode(Opc, dl, ResVT, Src);
34315 }
34316
34317 // TODO: Need to add exception check code for strict FP.
34318 if (EleVT.getSizeInBits() < 16) {
34319 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34320 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34321
34322 // Now widen to 128 bits.
34323 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34324 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34325 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34326 ConcatOps[0] = Res;
34327 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34328 }
34329
34330 Results.push_back(Res);
34331 if (IsStrict)
34332 Results.push_back(Chain);
34333
34334 return;
34335 }
34336
34337 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34338 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34339 "Unexpected type action!");
34340
34341 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34342 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34343 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34344 VT.getVectorNumElements());
34345 SDValue Res;
34346 SDValue Chain;
34347 if (IsStrict) {
34348 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34349 {N->getOperand(0), Src});
34350 Chain = Res.getValue(1);
34351 } else
34352 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34353
34354 // Preserve what we know about the size of the original result. If the
34355 // result is v2i32, we have to manually widen the assert.
34356 if (PromoteVT == MVT::v2i32)
34357 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34358 DAG.getUNDEF(MVT::v2i32));
34359
34360 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34361 Res.getValueType(), Res,
34362 DAG.getValueType(VT.getVectorElementType()));
34363
34364 if (PromoteVT == MVT::v2i32)
34365 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34366 DAG.getIntPtrConstant(0, dl));
34367
34368 // Truncate back to the original width.
34369 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34370
34371 // Now widen to 128 bits.
34372 unsigned NumConcats = 128 / VT.getSizeInBits();
34373 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34374 VT.getVectorNumElements() * NumConcats);
34375 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34376 ConcatOps[0] = Res;
34377 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34378 Results.push_back(Res);
34379 if (IsStrict)
34380 Results.push_back(Chain);
34381 return;
34382 }
34383
34384
34385 if (VT == MVT::v2i32) {
34386 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34387 "Strict unsigned conversion requires AVX512");
34388 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34389 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34390 "Unexpected type action!");
34391 if (Src.getValueType() == MVT::v2f64) {
34392 if (!IsSigned && !Subtarget.hasAVX512()) {
34393 SDValue Res =
34394 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34395 Results.push_back(Res);
34396 return;
34397 }
34398
34399 unsigned Opc;
34400 if (IsStrict)
34401 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34402 else
34403 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34404
34405 // If we have VLX we can emit a target specific FP_TO_UINT node.
34406 if (!IsSigned && !Subtarget.hasVLX()) {
34407 // Otherwise we can defer to the generic legalizer which will widen
34408 // the input as well. This will be further widened during op
34409 // legalization to v8i32<-v8f64.
34410 // For strict nodes we'll need to widen ourselves.
34411 // FIXME: Fix the type legalizer to safely widen strict nodes?
34412 if (!IsStrict)
34413 return;
34414 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34415 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34416 Opc = N->getOpcode();
34417 }
34418 SDValue Res;
34419 SDValue Chain;
34420 if (IsStrict) {
34421 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34422 {N->getOperand(0), Src});
34423 Chain = Res.getValue(1);
34424 } else {
34425 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34426 }
34427 Results.push_back(Res);
34428 if (IsStrict)
34429 Results.push_back(Chain);
34430 return;
34431 }
34432
34433 // Custom widen strict v2f32->v2i32 by padding with zeros.
34434 // FIXME: Should generic type legalizer do this?
34435 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34436 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34437 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34438 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
34439 {N->getOperand(0), Src});
34440 Results.push_back(Res);
34441 Results.push_back(Res.getValue(1));
34442 return;
34443 }
34444
34445 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34446 // so early out here.
34447 return;
34448 }
34449
34450 assert(!VT.isVector() && "Vectors should have been handled above!");
34451
34452 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34453 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34454 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34455 assert(!Subtarget.is64Bit() && "i64 should be legal");
34456 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34457 // If we use a 128-bit result we might need to use a target specific node.
34458 unsigned SrcElts =
34459 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34460 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34461 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34462 unsigned Opc = N->getOpcode();
34463 if (NumElts != SrcElts) {
34464 if (IsStrict)
34465 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34466 else
34467 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34468 }
34469
34470 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
34471 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34472 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34473 ZeroIdx);
34474 SDValue Chain;
34475 if (IsStrict) {
34476 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34477 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34478 Chain = Res.getValue(1);
34479 } else
34480 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34481 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34482 Results.push_back(Res);
34483 if (IsStrict)
34484 Results.push_back(Chain);
34485 return;
34486 }
34487
34488 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34489 SDValue Chain;
34490 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34491 Results.push_back(V);
34492 if (IsStrict)
34493 Results.push_back(Chain);
34494 return;
34495 }
34496
34497 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34498 Results.push_back(V);
34499 if (IsStrict)
34500 Results.push_back(Chain);
34501 }
34502 return;
34503 }
34504 case ISD::LRINT:
34505 case ISD::LLRINT: {
34506 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34507 Results.push_back(V);
34508 return;
34509 }
34510
34511 case ISD::SINT_TO_FP:
34512 case ISD::STRICT_SINT_TO_FP:
34513 case ISD::UINT_TO_FP:
34514 case ISD::STRICT_UINT_TO_FP: {
34515 bool IsStrict = N->isStrictFPOpcode();
34516 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34517 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34518 EVT VT = N->getValueType(0);
34519 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34520 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34521 Subtarget.hasVLX()) {
34522 if (Src.getValueType().getVectorElementType() == MVT::i16)
34523 return;
34524
34525 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34526 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34527 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34528 : DAG.getUNDEF(MVT::v2i32));
34529 if (IsStrict) {
34530 unsigned Opc =
34531 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34532 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34533 {N->getOperand(0), Src});
34534 Results.push_back(Res);
34535 Results.push_back(Res.getValue(1));
34536 } else {
34537 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34538 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34539 }
34540 return;
34541 }
34542 if (VT != MVT::v2f32)
34543 return;
34544 EVT SrcVT = Src.getValueType();
34545 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34546 if (IsStrict) {
34547 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34548 : X86ISD::STRICT_CVTUI2P;
34549 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34550 {N->getOperand(0), Src});
34551 Results.push_back(Res);
34552 Results.push_back(Res.getValue(1));
34553 } else {
34554 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34555 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34556 }
34557 return;
34558 }
34559 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34560 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34561 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34562 SDValue One = DAG.getConstant(1, dl, SrcVT);
34563 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34564 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34565 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34566 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34567 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34568 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34569 for (int i = 0; i != 2; ++i) {
34570 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34571 SignSrc, DAG.getIntPtrConstant(i, dl));
34572 if (IsStrict)
34573 SignCvts[i] =
34574 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34575 {N->getOperand(0), Elt});
34576 else
34577 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34578 };
34579 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34580 SDValue Slow, Chain;
34581 if (IsStrict) {
34582 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34583 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34584 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34585 {Chain, SignCvt, SignCvt});
34586 Chain = Slow.getValue(1);
34587 } else {
34588 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34589 }
34590 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34591 IsNeg =
34592 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34593 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34594 Results.push_back(Cvt);
34595 if (IsStrict)
34596 Results.push_back(Chain);
34597 return;
34598 }
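
Without AVX-512 there is no unsigned v2i64 conversion, so the block above halves inputs whose sign bit is set (ORing the lost bit back in so rounding stays correct), converts them as signed, and doubles the result. A scalar sketch of that path (an illustration under those assumptions, not the DAG code):

#include <cstdint>

float u64ToF32(uint64_t X) {
  if (static_cast<int64_t>(X) >= 0)
    return static_cast<float>(static_cast<int64_t>(X)); // fast signed path
  uint64_t Halved = (X >> 1) | (X & 1);                 // keep the rounding bit
  float F = static_cast<float>(static_cast<int64_t>(Halved));
  return F + F;                                         // double back up
}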
34599
34600 if (SrcVT != MVT::v2i32)
34601 return;
34602
34603 if (IsSigned || Subtarget.hasAVX512()) {
34604 if (!IsStrict)
34605 return;
34606
34607 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34608 // FIXME: Should generic type legalizer do this?
34609 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34610 DAG.getConstant(0, dl, MVT::v2i32));
34611 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34612 {N->getOperand(0), Src});
34613 Results.push_back(Res);
34614 Results.push_back(Res.getValue(1));
34615 return;
34616 }
34617
34618 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34619 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34620 SDValue VBias = DAG.getConstantFP(
34621 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34622 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34623 DAG.getBitcast(MVT::v2i64, VBias));
34624 Or = DAG.getBitcast(MVT::v2f64, Or);
34625 if (IsStrict) {
34626 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34627 {N->getOperand(0), Or, VBias});
34628 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34629 {MVT::v4f32, MVT::Other},
34630 {Sub.getValue(1), Sub});
34631 Results.push_back(Res);
34632 Results.push_back(Res.getValue(1));
34633 } else {
34634 // TODO: Are there any fast-math-flags to propagate here?
34635 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34636 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34637 }
34638 return;
34639 }
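
The 0x4330000000000000 constant used above is the double 2^52: ORing a 32-bit unsigned value into its low mantissa bits and then subtracting 2^52 yields that value exactly as a double, which is then narrowed to f32. A scalar illustration of the bias trick (not LLVM code):

#include <cstdint>
#include <cstring>

float u32ToF32ViaBias(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // 2^52 with X in the mantissa
  double D;
  std::memcpy(&D, &Bits, sizeof(D));         // reinterpret as double
  return static_cast<float>(D - 0x1p52);     // subtract 2^52, round to f32
}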
34640 case ISD::STRICT_FP_ROUND:
34641 case ISD::FP_ROUND: {
34642 bool IsStrict = N->isStrictFPOpcode();
34643 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34644 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34645 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34646 EVT SrcVT = Src.getValueType();
34647 EVT VT = N->getValueType(0);
34648 SDValue V;
34649 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34650 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34651 : DAG.getUNDEF(MVT::v2f32);
34652 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34653 }
34654 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34655      assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34656 if (SrcVT.getVectorElementType() != MVT::f32)
34657 return;
34658
34659 if (IsStrict)
34660 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34661 {Chain, Src, Rnd});
34662 else
34663 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34664
34665 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34666 if (IsStrict)
34667 Results.push_back(V.getValue(1));
34668 return;
34669 }
34670 if (!isTypeLegal(Src.getValueType()))
34671 return;
34672 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34673 if (IsStrict)
34674 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34675 {Chain, Src});
34676 else
34677 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34678 Results.push_back(V);
34679 if (IsStrict)
34680 Results.push_back(V.getValue(1));
34681 return;
34682 }
34683 case ISD::FP_EXTEND:
34684 case ISD::STRICT_FP_EXTEND: {
34685 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34686 // No other ValueType for FP_EXTEND should reach this point.
34687    assert(N->getValueType(0) == MVT::v2f32 &&
34688           "Do not know how to legalize this Node");
34689 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34690 return;
34691 bool IsStrict = N->isStrictFPOpcode();
34692 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34693 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34694 : DAG.getUNDEF(MVT::v2f16);
34695 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34696 if (IsStrict)
34697 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34698 {N->getOperand(0), V});
34699 else
34700 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34701 Results.push_back(V);
34702 if (IsStrict)
34703 Results.push_back(V.getValue(1));
34704 return;
34705 }
34706 case ISD::INTRINSIC_W_CHAIN: {
34707 unsigned IntNo = N->getConstantOperandVal(1);
34708 switch (IntNo) {
34709    default : llvm_unreachable("Do not know how to custom type "
34710                               "legalize this intrinsic operation!");
34711 case Intrinsic::x86_rdtsc:
34712 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34713 Results);
34714 case Intrinsic::x86_rdtscp:
34715 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34716 Results);
34717 case Intrinsic::x86_rdpmc:
34718 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34719 Results);
34720 return;
34721 case Intrinsic::x86_rdpru:
34722 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34723 Results);
34724 return;
34725 case Intrinsic::x86_xgetbv:
34726 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34727 Results);
34728 return;
34729 }
34730 }
34731 case ISD::READCYCLECOUNTER: {
34732 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34733 }
34734 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34735 EVT T = N->getValueType(0);
34736    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34737 bool Regs64bit = T == MVT::i128;
34738    assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34739           "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34740 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34741 SDValue cpInL, cpInH;
34742 std::tie(cpInL, cpInH) =
34743 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34744 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34745 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34746 cpInH =
34747 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34748 cpInH, cpInL.getValue(1));
34749 SDValue swapInL, swapInH;
34750 std::tie(swapInL, swapInH) =
34751 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34752 swapInH =
34753 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34754 swapInH, cpInH.getValue(1));
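    // Register convention used by CMPXCHG8B/CMPXCHG16B: the expected value
    // goes in EDX:EAX (RDX:RAX) and the replacement in ECX:EBX (RCX:RBX); on
    // success ZF is set, otherwise the current memory value is returned in
    // EDX:EAX (RDX:RAX). The copies above follow that layout.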
34755
34756 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34757 // until later. So we keep the RBX input in a vreg and use a custom
34758 // inserter.
34759    // Since RBX will be a reserved register, the register allocator will not
34760    // ensure that its value is properly saved and restored around this
34761    // live range.
34762 SDValue Result;
34763 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34764 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34765 if (Regs64bit) {
34766 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34767 swapInH.getValue(1)};
34768 Result =
34769 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34770 } else {
34771 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34772 swapInH.getValue(1));
34773 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34774 swapInL.getValue(1)};
34775 Result =
34776 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34777 }
34778
34779 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34780 Regs64bit ? X86::RAX : X86::EAX,
34781 HalfT, Result.getValue(1));
34782 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34783 Regs64bit ? X86::RDX : X86::EDX,
34784 HalfT, cpOutL.getValue(2));
34785 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34786
34787 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34788 MVT::i32, cpOutH.getValue(2));
34789 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34790 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34791
34792 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34793 Results.push_back(Success);
34794 Results.push_back(EFLAGS.getValue(1));
34795 return;
34796 }
34797 case ISD::ATOMIC_LOAD: {
34798    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34799 bool NoImplicitFloatOps =
34800 DAG.getMachineFunction().getFunction().hasFnAttribute(
34801 Attribute::NoImplicitFloat);
34802 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34803 auto *Node = cast<AtomicSDNode>(N);
34804 if (Subtarget.hasSSE1()) {
34805 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34806 // Then extract the lower 64-bits.
34807 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34808 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34809 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34810 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34811 MVT::i64, Node->getMemOperand());
34812 if (Subtarget.hasSSE2()) {
34813 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34814 DAG.getIntPtrConstant(0, dl));
34815 Results.push_back(Res);
34816 Results.push_back(Ld.getValue(1));
34817 return;
34818 }
34819 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34820 // then casts to i64. This avoids a 128-bit stack temporary being
34821 // created by type legalization if we were to cast v4f32->v2i64.
34822 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34823 DAG.getIntPtrConstant(0, dl));
34824 Res = DAG.getBitcast(MVT::i64, Res);
34825 Results.push_back(Res);
34826 Results.push_back(Ld.getValue(1));
34827 return;
34828 }
34829 if (Subtarget.hasX87()) {
34830 // First load this into an 80-bit X87 register. This will put the whole
34831 // integer into the significand.
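        // The x87 fp80 format has a 64-bit explicit significand, so every
        // i64 value is representable exactly and the FILD/FIST round trip
        // below is lossless.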
34832 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34833 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34834 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34835 dl, Tys, Ops, MVT::i64,
34836 Node->getMemOperand());
34837 SDValue Chain = Result.getValue(1);
34838
34839 // Now store the X87 register to a stack temporary and convert to i64.
34840 // This store is not atomic and doesn't need to be.
34841 // FIXME: We don't need a stack temporary if the result of the load
34842 // is already being stored. We could just directly store there.
34843 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34844 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34845 MachinePointerInfo MPI =
34846 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34847 SDValue StoreOps[] = { Chain, Result, StackPtr };
34848 Chain = DAG.getMemIntrinsicNode(
34849 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34850 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34851
34852 // Finally load the value back from the stack temporary and return it.
34853 // This load is not atomic and doesn't need to be.
34854 // This load will be further type legalized.
34855 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34856 Results.push_back(Result);
34857 Results.push_back(Result.getValue(1));
34858 return;
34859 }
34860 }
34861 // TODO: Use MOVLPS when SSE1 is available?
34862 // Delegate to generic TypeLegalization. Situations we can really handle
34863 // should have already been dealt with by AtomicExpandPass.cpp.
34864 break;
34865 }
34866 case ISD::ATOMIC_SWAP:
34867 case ISD::ATOMIC_LOAD_ADD:
34868 case ISD::ATOMIC_LOAD_SUB:
34869 case ISD::ATOMIC_LOAD_AND:
34870 case ISD::ATOMIC_LOAD_OR:
34871 case ISD::ATOMIC_LOAD_XOR:
34872 case ISD::ATOMIC_LOAD_NAND:
34873 case ISD::ATOMIC_LOAD_MIN:
34874 case ISD::ATOMIC_LOAD_MAX:
34875 case ISD::ATOMIC_LOAD_UMIN:
34876 case ISD::ATOMIC_LOAD_UMAX:
34877 // Delegate to generic TypeLegalization. Situations we can really handle
34878 // should have already been dealt with by AtomicExpandPass.cpp.
34879 break;
34880
34881 case ISD::BITCAST: {
34882    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34883 EVT DstVT = N->getValueType(0);
34884 EVT SrcVT = N->getOperand(0).getValueType();
34885
34886    // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
34887 // we can split using the k-register rather than memory.
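    // With AVX512BW each v32i1 half lives in a mask register and is typically
    // selected as a KMOVD to a 32-bit GPR, so no stack temporary is needed.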
34888 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34889      assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34890 SDValue Lo, Hi;
34891 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34892 Lo = DAG.getBitcast(MVT::i32, Lo);
34893 Hi = DAG.getBitcast(MVT::i32, Hi);
34894 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34895 Results.push_back(Res);
34896 return;
34897 }
34898
34899 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34900 // FIXME: Use v4f32 for SSE1?
34901      assert(Subtarget.hasSSE2() && "Requires SSE2");
34902      assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34903             "Unexpected type action!");
34904 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34905 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34906 N->getOperand(0));
34907 Res = DAG.getBitcast(WideVT, Res);
34908 Results.push_back(Res);
34909 return;
34910 }
34911
34912 return;
34913 }
34914 case ISD::MGATHER: {
34915 EVT VT = N->getValueType(0);
34916 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34917 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34918 auto *Gather = cast<MaskedGatherSDNode>(N);
34919 SDValue Index = Gather->getIndex();
34920 if (Index.getValueType() != MVT::v2i64)
34921 return;
34922      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34923             "Unexpected type action!");
34924 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34925 SDValue Mask = Gather->getMask();
34926      assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34927 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34928 Gather->getPassThru(),
34929 DAG.getUNDEF(VT));
34930 if (!Subtarget.hasVLX()) {
34931 // We need to widen the mask, but the instruction will only use 2
34932 // of its elements. So we can use undef.
34933 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34934 DAG.getUNDEF(MVT::v2i1));
34935 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34936 }
34937 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34938 Gather->getBasePtr(), Index, Gather->getScale() };
34939 SDValue Res = DAG.getMemIntrinsicNode(
34940 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34941 Gather->getMemoryVT(), Gather->getMemOperand());
34942 Results.push_back(Res);
34943 Results.push_back(Res.getValue(1));
34944 return;
34945 }
34946 return;
34947 }
34948 case ISD::LOAD: {
34949 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34950    // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34951 // cast since type legalization will try to use an i64 load.
34952 MVT VT = N->getSimpleValueType(0);
34953    assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34954    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34955           "Unexpected type action!");
34956 if (!ISD::isNON_EXTLoad(N))
34957 return;
34958 auto *Ld = cast<LoadSDNode>(N);
34959 if (Subtarget.hasSSE2()) {
34960 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34961 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34962 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34963 Ld->getMemOperand()->getFlags());
34964 SDValue Chain = Res.getValue(1);
34965 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34966 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34967 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34968 Res = DAG.getBitcast(WideVT, Res);
34969 Results.push_back(Res);
34970 Results.push_back(Chain);
34971 return;
34972 }
34973    assert(Subtarget.hasSSE1() && "Expected SSE");
34974 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34975 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34976 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34977 MVT::i64, Ld->getMemOperand());
34978 Results.push_back(Res);
34979 Results.push_back(Res.getValue(1));
34980 return;
34981 }
34982 case ISD::ADDRSPACECAST: {
34983 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34984 Results.push_back(V);
34985 return;
34986 }
34987 case ISD::BITREVERSE: {
34988    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34989    assert(Subtarget.hasXOP() && "Expected XOP");
34990 // We can use VPPERM by copying to a vector register and back. We'll need
34991 // to move the scalar in two i32 pieces.
34992 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34993 return;
34994 }
34995 case ISD::EXTRACT_VECTOR_ELT: {
34996 // f16 = extract vXf16 %vec, i64 %idx
34997    assert(N->getSimpleValueType(0) == MVT::f16 &&
34998           "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34999    assert(Subtarget.hasFP16() && "Expected FP16");
35000 SDValue VecOp = N->getOperand(0);
35001 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
35002 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
35003 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
35004 N->getOperand(1));
35005 Split = DAG.getBitcast(MVT::f16, Split);
35006 Results.push_back(Split);
35007 return;
35008 }
35009 }
35010}
35011
35012const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
35013 switch ((X86ISD::NodeType)Opcode) {
35014 case X86ISD::FIRST_NUMBER: break;
35015#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
35016 NODE_NAME_CASE(BSF)
35017 NODE_NAME_CASE(BSR)
35018 NODE_NAME_CASE(FSHL)
35019 NODE_NAME_CASE(FSHR)
35020 NODE_NAME_CASE(FAND)
35021 NODE_NAME_CASE(FANDN)
35022 NODE_NAME_CASE(FOR)
35023 NODE_NAME_CASE(FXOR)
35024 NODE_NAME_CASE(FILD)
35025 NODE_NAME_CASE(FIST)
35026 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
35027 NODE_NAME_CASE(FLD)
35028 NODE_NAME_CASE(FST)
35029 NODE_NAME_CASE(CALL)
35030 NODE_NAME_CASE(CALL_RVMARKER)
35031 NODE_NAME_CASE(BT)
35032 NODE_NAME_CASE(CMP)
35033 NODE_NAME_CASE(FCMP)
35034 NODE_NAME_CASE(STRICT_FCMP)
35035 NODE_NAME_CASE(STRICT_FCMPS)
35036 NODE_NAME_CASE(COMI)
35037 NODE_NAME_CASE(UCOMI)
35038 NODE_NAME_CASE(CMPM)
35039 NODE_NAME_CASE(CMPMM)
35040 NODE_NAME_CASE(STRICT_CMPM)
35041 NODE_NAME_CASE(CMPMM_SAE)
35042 NODE_NAME_CASE(SETCC)
35043 NODE_NAME_CASE(SETCC_CARRY)
35044 NODE_NAME_CASE(FSETCC)
35045 NODE_NAME_CASE(FSETCCM)
35046 NODE_NAME_CASE(FSETCCM_SAE)
35047 NODE_NAME_CASE(CMOV)
35048 NODE_NAME_CASE(BRCOND)
35049 NODE_NAME_CASE(RET_GLUE)
35050 NODE_NAME_CASE(IRET)
35051 NODE_NAME_CASE(REP_STOS)
35052 NODE_NAME_CASE(REP_MOVS)
35053 NODE_NAME_CASE(GlobalBaseReg)
35054 NODE_NAME_CASE(Wrapper)
35055 NODE_NAME_CASE(WrapperRIP)
35056 NODE_NAME_CASE(MOVQ2DQ)
35057 NODE_NAME_CASE(MOVDQ2Q)
35058 NODE_NAME_CASE(MMX_MOVD2W)
35059 NODE_NAME_CASE(MMX_MOVW2D)
35060 NODE_NAME_CASE(PEXTRB)
35061 NODE_NAME_CASE(PEXTRW)
35062 NODE_NAME_CASE(INSERTPS)
35063 NODE_NAME_CASE(PINSRB)
35064 NODE_NAME_CASE(PINSRW)
35065 NODE_NAME_CASE(PSHUFB)
35066 NODE_NAME_CASE(ANDNP)
35067 NODE_NAME_CASE(BLENDI)
35068 NODE_NAME_CASE(BLENDV)
35069 NODE_NAME_CASE(HADD)
35070 NODE_NAME_CASE(HSUB)
35071 NODE_NAME_CASE(FHADD)
35072 NODE_NAME_CASE(FHSUB)
35073 NODE_NAME_CASE(CONFLICT)
35074 NODE_NAME_CASE(FMAX)
35075 NODE_NAME_CASE(FMAXS)
35076 NODE_NAME_CASE(FMAX_SAE)
35077 NODE_NAME_CASE(FMAXS_SAE)
35078 NODE_NAME_CASE(FMIN)
35079 NODE_NAME_CASE(FMINS)
35080 NODE_NAME_CASE(FMIN_SAE)
35081 NODE_NAME_CASE(FMINS_SAE)
35082 NODE_NAME_CASE(FMAXC)
35083 NODE_NAME_CASE(FMINC)
35084 NODE_NAME_CASE(FRSQRT)
35085 NODE_NAME_CASE(FRCP)
35086 NODE_NAME_CASE(EXTRQI)
35087 NODE_NAME_CASE(INSERTQI)
35088 NODE_NAME_CASE(TLSADDR)
35089 NODE_NAME_CASE(TLSBASEADDR)
35090 NODE_NAME_CASE(TLSCALL)
35091 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35092 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35093 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35094 NODE_NAME_CASE(EH_RETURN)
35095 NODE_NAME_CASE(TC_RETURN)
35096 NODE_NAME_CASE(FNSTCW16m)
35097 NODE_NAME_CASE(FLDCW16m)
35098 NODE_NAME_CASE(LCMPXCHG_DAG)
35099 NODE_NAME_CASE(LCMPXCHG8_DAG)
35100 NODE_NAME_CASE(LCMPXCHG16_DAG)
35101 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35102 NODE_NAME_CASE(LADD)
35103 NODE_NAME_CASE(LSUB)
35104 NODE_NAME_CASE(LOR)
35105 NODE_NAME_CASE(LXOR)
35106 NODE_NAME_CASE(LAND)
35107 NODE_NAME_CASE(LBTS)
35108 NODE_NAME_CASE(LBTC)
35109 NODE_NAME_CASE(LBTR)
35110 NODE_NAME_CASE(LBTS_RM)
35111 NODE_NAME_CASE(LBTC_RM)
35112 NODE_NAME_CASE(LBTR_RM)
35113 NODE_NAME_CASE(AADD)
35114 NODE_NAME_CASE(AOR)
35115 NODE_NAME_CASE(AXOR)
35116 NODE_NAME_CASE(AAND)
35117 NODE_NAME_CASE(VZEXT_MOVL)
35118 NODE_NAME_CASE(VZEXT_LOAD)
35119 NODE_NAME_CASE(VEXTRACT_STORE)
35120 NODE_NAME_CASE(VTRUNC)
35121 NODE_NAME_CASE(VTRUNCS)
35122 NODE_NAME_CASE(VTRUNCUS)
35123 NODE_NAME_CASE(VMTRUNC)
35124 NODE_NAME_CASE(VMTRUNCS)
35125 NODE_NAME_CASE(VMTRUNCUS)
35126 NODE_NAME_CASE(VTRUNCSTORES)
35127 NODE_NAME_CASE(VTRUNCSTOREUS)
35128 NODE_NAME_CASE(VMTRUNCSTORES)
35129 NODE_NAME_CASE(VMTRUNCSTOREUS)
35130 NODE_NAME_CASE(VFPEXT)
35131 NODE_NAME_CASE(STRICT_VFPEXT)
35132 NODE_NAME_CASE(VFPEXT_SAE)
35133 NODE_NAME_CASE(VFPEXTS)
35134 NODE_NAME_CASE(VFPEXTS_SAE)
35135 NODE_NAME_CASE(VFPROUND)
35136 NODE_NAME_CASE(STRICT_VFPROUND)
35137 NODE_NAME_CASE(VMFPROUND)
35138 NODE_NAME_CASE(VFPROUND_RND)
35139 NODE_NAME_CASE(VFPROUNDS)
35140 NODE_NAME_CASE(VFPROUNDS_RND)
35141 NODE_NAME_CASE(VSHLDQ)
35142 NODE_NAME_CASE(VSRLDQ)
35143 NODE_NAME_CASE(VSHL)
35144 NODE_NAME_CASE(VSRL)
35145 NODE_NAME_CASE(VSRA)
35146 NODE_NAME_CASE(VSHLI)
35147 NODE_NAME_CASE(VSRLI)
35148 NODE_NAME_CASE(VSRAI)
35149 NODE_NAME_CASE(VSHLV)
35150 NODE_NAME_CASE(VSRLV)
35151 NODE_NAME_CASE(VSRAV)
35152 NODE_NAME_CASE(VROTLI)
35153 NODE_NAME_CASE(VROTRI)
35154 NODE_NAME_CASE(VPPERM)
35155 NODE_NAME_CASE(CMPP)
35156 NODE_NAME_CASE(STRICT_CMPP)
35157 NODE_NAME_CASE(PCMPEQ)
35158 NODE_NAME_CASE(PCMPGT)
35159 NODE_NAME_CASE(PHMINPOS)
35160 NODE_NAME_CASE(ADD)
35161 NODE_NAME_CASE(SUB)
35162 NODE_NAME_CASE(ADC)
35163 NODE_NAME_CASE(SBB)
35164 NODE_NAME_CASE(SMUL)
35165 NODE_NAME_CASE(UMUL)
35166 NODE_NAME_CASE(OR)
35167 NODE_NAME_CASE(XOR)
35168 NODE_NAME_CASE(AND)
35169 NODE_NAME_CASE(BEXTR)
35170 NODE_NAME_CASE(BEXTRI)
35171 NODE_NAME_CASE(BZHI)
35172 NODE_NAME_CASE(PDEP)
35173 NODE_NAME_CASE(PEXT)
35174 NODE_NAME_CASE(MUL_IMM)
35175 NODE_NAME_CASE(MOVMSK)
35176 NODE_NAME_CASE(PTEST)
35177 NODE_NAME_CASE(TESTP)
35178 NODE_NAME_CASE(KORTEST)
35179 NODE_NAME_CASE(KTEST)
35180 NODE_NAME_CASE(KADD)
35181 NODE_NAME_CASE(KSHIFTL)
35182 NODE_NAME_CASE(KSHIFTR)
35183 NODE_NAME_CASE(PACKSS)
35184 NODE_NAME_CASE(PACKUS)
35185 NODE_NAME_CASE(PALIGNR)
35186 NODE_NAME_CASE(VALIGN)
35187 NODE_NAME_CASE(VSHLD)
35188 NODE_NAME_CASE(VSHRD)
35189 NODE_NAME_CASE(VSHLDV)
35190 NODE_NAME_CASE(VSHRDV)
35191 NODE_NAME_CASE(PSHUFD)
35192 NODE_NAME_CASE(PSHUFHW)
35193 NODE_NAME_CASE(PSHUFLW)
35194 NODE_NAME_CASE(SHUFP)
35195 NODE_NAME_CASE(SHUF128)
35196 NODE_NAME_CASE(MOVLHPS)
35197 NODE_NAME_CASE(MOVHLPS)
35198 NODE_NAME_CASE(MOVDDUP)
35199 NODE_NAME_CASE(MOVSHDUP)
35200 NODE_NAME_CASE(MOVSLDUP)
35201 NODE_NAME_CASE(MOVSD)
35202 NODE_NAME_CASE(MOVSS)
35203 NODE_NAME_CASE(MOVSH)
35204 NODE_NAME_CASE(UNPCKL)
35205 NODE_NAME_CASE(UNPCKH)
35206 NODE_NAME_CASE(VBROADCAST)
35207 NODE_NAME_CASE(VBROADCAST_LOAD)
35208 NODE_NAME_CASE(VBROADCASTM)
35209 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35210 NODE_NAME_CASE(VPERMILPV)
35211 NODE_NAME_CASE(VPERMILPI)
35212 NODE_NAME_CASE(VPERM2X128)
35213 NODE_NAME_CASE(VPERMV)
35214 NODE_NAME_CASE(VPERMV3)
35215 NODE_NAME_CASE(VPERMI)
35216 NODE_NAME_CASE(VPTERNLOG)
35217 NODE_NAME_CASE(VFIXUPIMM)
35218 NODE_NAME_CASE(VFIXUPIMM_SAE)
35219 NODE_NAME_CASE(VFIXUPIMMS)
35220 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35221 NODE_NAME_CASE(VRANGE)
35222 NODE_NAME_CASE(VRANGE_SAE)
35223 NODE_NAME_CASE(VRANGES)
35224 NODE_NAME_CASE(VRANGES_SAE)
35225 NODE_NAME_CASE(PMULUDQ)
35226 NODE_NAME_CASE(PMULDQ)
35227 NODE_NAME_CASE(PSADBW)
35228 NODE_NAME_CASE(DBPSADBW)
35229 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35230 NODE_NAME_CASE(VAARG_64)
35231 NODE_NAME_CASE(VAARG_X32)
35232 NODE_NAME_CASE(DYN_ALLOCA)
35233 NODE_NAME_CASE(MFENCE)
35234 NODE_NAME_CASE(SEG_ALLOCA)
35235 NODE_NAME_CASE(PROBED_ALLOCA)
35236 NODE_NAME_CASE(RDRAND)
35237 NODE_NAME_CASE(RDSEED)
35238 NODE_NAME_CASE(RDPKRU)
35239 NODE_NAME_CASE(WRPKRU)
35240 NODE_NAME_CASE(VPMADDUBSW)
35241 NODE_NAME_CASE(VPMADDWD)
35242 NODE_NAME_CASE(VPSHA)
35243 NODE_NAME_CASE(VPSHL)
35244 NODE_NAME_CASE(VPCOM)
35245 NODE_NAME_CASE(VPCOMU)
35246 NODE_NAME_CASE(VPERMIL2)
35247 NODE_NAME_CASE(FMSUB)
35248 NODE_NAME_CASE(STRICT_FMSUB)
35249 NODE_NAME_CASE(FNMADD)
35250 NODE_NAME_CASE(STRICT_FNMADD)
35251 NODE_NAME_CASE(FNMSUB)
35252 NODE_NAME_CASE(STRICT_FNMSUB)
35253 NODE_NAME_CASE(FMADDSUB)
35254 NODE_NAME_CASE(FMSUBADD)
35255 NODE_NAME_CASE(FMADD_RND)
35256 NODE_NAME_CASE(FNMADD_RND)
35257 NODE_NAME_CASE(FMSUB_RND)
35258 NODE_NAME_CASE(FNMSUB_RND)
35259 NODE_NAME_CASE(FMADDSUB_RND)
35260 NODE_NAME_CASE(FMSUBADD_RND)
35261 NODE_NAME_CASE(VFMADDC)
35262 NODE_NAME_CASE(VFMADDC_RND)
35263 NODE_NAME_CASE(VFCMADDC)
35264 NODE_NAME_CASE(VFCMADDC_RND)
35265 NODE_NAME_CASE(VFMULC)
35266 NODE_NAME_CASE(VFMULC_RND)
35267 NODE_NAME_CASE(VFCMULC)
35268 NODE_NAME_CASE(VFCMULC_RND)
35269 NODE_NAME_CASE(VFMULCSH)
35270 NODE_NAME_CASE(VFMULCSH_RND)
35271 NODE_NAME_CASE(VFCMULCSH)
35272 NODE_NAME_CASE(VFCMULCSH_RND)
35273 NODE_NAME_CASE(VFMADDCSH)
35274 NODE_NAME_CASE(VFMADDCSH_RND)
35275 NODE_NAME_CASE(VFCMADDCSH)
35276 NODE_NAME_CASE(VFCMADDCSH_RND)
35277 NODE_NAME_CASE(VPMADD52H)
35278 NODE_NAME_CASE(VPMADD52L)
35279 NODE_NAME_CASE(VRNDSCALE)
35280 NODE_NAME_CASE(STRICT_VRNDSCALE)
35281 NODE_NAME_CASE(VRNDSCALE_SAE)
35282 NODE_NAME_CASE(VRNDSCALES)
35283 NODE_NAME_CASE(VRNDSCALES_SAE)
35284 NODE_NAME_CASE(VREDUCE)
35285 NODE_NAME_CASE(VREDUCE_SAE)
35286 NODE_NAME_CASE(VREDUCES)
35287 NODE_NAME_CASE(VREDUCES_SAE)
35288 NODE_NAME_CASE(VGETMANT)
35289 NODE_NAME_CASE(VGETMANT_SAE)
35290 NODE_NAME_CASE(VGETMANTS)
35291 NODE_NAME_CASE(VGETMANTS_SAE)
35292 NODE_NAME_CASE(PCMPESTR)
35293 NODE_NAME_CASE(PCMPISTR)
35294 NODE_NAME_CASE(XTEST)
35295 NODE_NAME_CASE(COMPRESS)
35296 NODE_NAME_CASE(EXPAND)
35297 NODE_NAME_CASE(SELECTS)
35298 NODE_NAME_CASE(ADDSUB)
35299 NODE_NAME_CASE(RCP14)
35300 NODE_NAME_CASE(RCP14S)
35301 NODE_NAME_CASE(RCP28)
35302 NODE_NAME_CASE(RCP28_SAE)
35303 NODE_NAME_CASE(RCP28S)
35304 NODE_NAME_CASE(RCP28S_SAE)
35305 NODE_NAME_CASE(EXP2)
35306 NODE_NAME_CASE(EXP2_SAE)
35307 NODE_NAME_CASE(RSQRT14)
35308 NODE_NAME_CASE(RSQRT14S)
35309 NODE_NAME_CASE(RSQRT28)
35310 NODE_NAME_CASE(RSQRT28_SAE)
35311 NODE_NAME_CASE(RSQRT28S)
35312 NODE_NAME_CASE(RSQRT28S_SAE)
35313 NODE_NAME_CASE(FADD_RND)
35314 NODE_NAME_CASE(FADDS)
35315 NODE_NAME_CASE(FADDS_RND)
35316 NODE_NAME_CASE(FSUB_RND)
35317 NODE_NAME_CASE(FSUBS)
35318 NODE_NAME_CASE(FSUBS_RND)
35319 NODE_NAME_CASE(FMUL_RND)
35320 NODE_NAME_CASE(FMULS)
35321 NODE_NAME_CASE(FMULS_RND)
35322 NODE_NAME_CASE(FDIV_RND)
35323 NODE_NAME_CASE(FDIVS)
35324 NODE_NAME_CASE(FDIVS_RND)
35325 NODE_NAME_CASE(FSQRT_RND)
35326 NODE_NAME_CASE(FSQRTS)
35327 NODE_NAME_CASE(FSQRTS_RND)
35328 NODE_NAME_CASE(FGETEXP)
35329 NODE_NAME_CASE(FGETEXP_SAE)
35330 NODE_NAME_CASE(FGETEXPS)
35331 NODE_NAME_CASE(FGETEXPS_SAE)
35332 NODE_NAME_CASE(SCALEF)
35333 NODE_NAME_CASE(SCALEF_RND)
35334 NODE_NAME_CASE(SCALEFS)
35335 NODE_NAME_CASE(SCALEFS_RND)
35336 NODE_NAME_CASE(MULHRS)
35337 NODE_NAME_CASE(SINT_TO_FP_RND)
35338 NODE_NAME_CASE(UINT_TO_FP_RND)
35339 NODE_NAME_CASE(CVTTP2SI)
35340 NODE_NAME_CASE(CVTTP2UI)
35341 NODE_NAME_CASE(STRICT_CVTTP2SI)
35342 NODE_NAME_CASE(STRICT_CVTTP2UI)
35343 NODE_NAME_CASE(MCVTTP2SI)
35344 NODE_NAME_CASE(MCVTTP2UI)
35345 NODE_NAME_CASE(CVTTP2SI_SAE)
35346 NODE_NAME_CASE(CVTTP2UI_SAE)
35347 NODE_NAME_CASE(CVTTS2SI)
35348 NODE_NAME_CASE(CVTTS2UI)
35349 NODE_NAME_CASE(CVTTS2SI_SAE)
35350 NODE_NAME_CASE(CVTTS2UI_SAE)
35351 NODE_NAME_CASE(CVTSI2P)
35352 NODE_NAME_CASE(CVTUI2P)
35353 NODE_NAME_CASE(STRICT_CVTSI2P)
35354 NODE_NAME_CASE(STRICT_CVTUI2P)
35355 NODE_NAME_CASE(MCVTSI2P)
35356 NODE_NAME_CASE(MCVTUI2P)
35357 NODE_NAME_CASE(VFPCLASS)
35358 NODE_NAME_CASE(VFPCLASSS)
35359 NODE_NAME_CASE(MULTISHIFT)
35360 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35361 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35362 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35363 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35364 NODE_NAME_CASE(CVTPS2PH)
35365 NODE_NAME_CASE(STRICT_CVTPS2PH)
35366 NODE_NAME_CASE(CVTPS2PH_SAE)
35367 NODE_NAME_CASE(MCVTPS2PH)
35368 NODE_NAME_CASE(MCVTPS2PH_SAE)
35369 NODE_NAME_CASE(CVTPH2PS)
35370 NODE_NAME_CASE(STRICT_CVTPH2PS)
35371 NODE_NAME_CASE(CVTPH2PS_SAE)
35372 NODE_NAME_CASE(CVTP2SI)
35373 NODE_NAME_CASE(CVTP2UI)
35374 NODE_NAME_CASE(MCVTP2SI)
35375 NODE_NAME_CASE(MCVTP2UI)
35376 NODE_NAME_CASE(CVTP2SI_RND)
35377 NODE_NAME_CASE(CVTP2UI_RND)
35378 NODE_NAME_CASE(CVTS2SI)
35379 NODE_NAME_CASE(CVTS2UI)
35380 NODE_NAME_CASE(CVTS2SI_RND)
35381 NODE_NAME_CASE(CVTS2UI_RND)
35382 NODE_NAME_CASE(CVTNE2PS2BF16)
35383 NODE_NAME_CASE(CVTNEPS2BF16)
35384 NODE_NAME_CASE(MCVTNEPS2BF16)
35385 NODE_NAME_CASE(DPBF16PS)
35386 NODE_NAME_CASE(LWPINS)
35387 NODE_NAME_CASE(MGATHER)
35388 NODE_NAME_CASE(MSCATTER)
35389 NODE_NAME_CASE(VPDPBUSD)
35390 NODE_NAME_CASE(VPDPBUSDS)
35391 NODE_NAME_CASE(VPDPWSSD)
35392 NODE_NAME_CASE(VPDPWSSDS)
35393 NODE_NAME_CASE(VPSHUFBITQMB)
35394 NODE_NAME_CASE(GF2P8MULB)
35395 NODE_NAME_CASE(GF2P8AFFINEQB)
35396 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35397 NODE_NAME_CASE(NT_CALL)
35398 NODE_NAME_CASE(NT_BRIND)
35399 NODE_NAME_CASE(UMWAIT)
35400 NODE_NAME_CASE(TPAUSE)
35401 NODE_NAME_CASE(ENQCMD)
35402 NODE_NAME_CASE(ENQCMDS)
35403 NODE_NAME_CASE(VP2INTERSECT)
35404 NODE_NAME_CASE(VPDPBSUD)
35405 NODE_NAME_CASE(VPDPBSUDS)
35406 NODE_NAME_CASE(VPDPBUUD)
35407 NODE_NAME_CASE(VPDPBUUDS)
35408 NODE_NAME_CASE(VPDPBSSD)
35409 NODE_NAME_CASE(VPDPBSSDS)
35410 NODE_NAME_CASE(AESENC128KL)
35411 NODE_NAME_CASE(AESDEC128KL)
35412 NODE_NAME_CASE(AESENC256KL)
35413 NODE_NAME_CASE(AESDEC256KL)
35414 NODE_NAME_CASE(AESENCWIDE128KL)
35415 NODE_NAME_CASE(AESDECWIDE128KL)
35416 NODE_NAME_CASE(AESENCWIDE256KL)
35417 NODE_NAME_CASE(AESDECWIDE256KL)
35418 NODE_NAME_CASE(CMPCCXADD)
35419 NODE_NAME_CASE(TESTUI)
35420 NODE_NAME_CASE(FP80_ADD)
35421 NODE_NAME_CASE(STRICT_FP80_ADD)
35422 }
35423 return nullptr;
35424#undef NODE_NAME_CASE
35425}
35426
35427/// Return true if the addressing mode represented by AM is legal for this
35428/// target, for a load/store of the specified type.
35429bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35430 const AddrMode &AM, Type *Ty,
35431 unsigned AS,
35432 Instruction *I) const {
35433 // X86 supports extremely general addressing modes.
35434 CodeModel::Model M = getTargetMachine().getCodeModel();
35435
35436 // X86 allows a sign-extended 32-bit immediate field as a displacement.
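  // (For example, [reg + reg*scale + imm32] is directly encodable, but a full
  // 64-bit displacement is not, outside of a few special absolute-move forms.)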
35437 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35438 return false;
35439
35440 if (AM.BaseGV) {
35441 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35442
35443 // If a reference to this global requires an extra load, we can't fold it.
35444 if (isGlobalStubReference(GVFlags))
35445 return false;
35446
35447 // If BaseGV requires a register for the PIC base, we cannot also have a
35448 // BaseReg specified.
35449 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35450 return false;
35451
35452 // If lower 4G is not available, then we must use rip-relative addressing.
35453 if ((M != CodeModel::Small || isPositionIndependent()) &&
35454 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35455 return false;
35456 }
35457
35458 switch (AM.Scale) {
35459 case 0:
35460 case 1:
35461 case 2:
35462 case 4:
35463 case 8:
35464 // These scales always work.
35465 break;
35466 case 3:
35467 case 5:
35468 case 9:
35469 // These scales are formed with basereg+scalereg. Only accept if there is
35470 // no basereg yet.
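    // For example, x*9 can be encoded as "lea (%rax,%rax,8), %rcx", which
    // uses the base-register slot for the second copy of x.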
35471 if (AM.HasBaseReg)
35472 return false;
35473 break;
35474 default: // Other stuff never works.
35475 return false;
35476 }
35477
35478 return true;
35479}
35480
35481bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35482 unsigned Bits = Ty->getScalarSizeInBits();
35483
35484 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35485 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35486 if (Subtarget.hasXOP() &&
35487 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35488 return false;
35489
35490 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35491 // shifts just as cheap as scalar ones.
35492 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35493 return false;
35494
35495 // AVX512BW has shifts such as vpsllvw.
35496 if (Subtarget.hasBWI() && Bits == 16)
35497 return false;
35498
35499 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35500 // fully general vector.
35501 return true;
35502}
35503
35504bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35505 switch (Opcode) {
35506 // These are non-commutative binops.
35507 // TODO: Add more X86ISD opcodes once we have test coverage.
35508 case X86ISD::ANDNP:
35509 case X86ISD::PCMPGT:
35510 case X86ISD::FMAX:
35511 case X86ISD::FMIN:
35512 case X86ISD::FANDN:
35513 case X86ISD::VPSHA:
35514 case X86ISD::VPSHL:
35515 case X86ISD::VSHLV:
35516 case X86ISD::VSRLV:
35517 case X86ISD::VSRAV:
35518 return true;
35519 }
35520
35521 return TargetLoweringBase::isBinOp(Opcode);
35522}
35523
35524bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35525 switch (Opcode) {
35526 // TODO: Add more X86ISD opcodes once we have test coverage.
35527 case X86ISD::PCMPEQ:
35528 case X86ISD::PMULDQ:
35529 case X86ISD::PMULUDQ:
35530 case X86ISD::FMAXC:
35531 case X86ISD::FMINC:
35532 case X86ISD::FAND:
35533 case X86ISD::FOR:
35534 case X86ISD::FXOR:
35535 return true;
35536 }
35537
35538 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35539}
35540
35541bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35542 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35543 return false;
35544 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35545 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35546 return NumBits1 > NumBits2;
35547}
35548
35549bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35550 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35551 return false;
35552
35553 if (!isTypeLegal(EVT::getEVT(Ty1)))
35554 return false;
35555
35556  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35557
35558 // Assuming the caller doesn't have a zeroext or signext return parameter,
35559 // truncation all the way down to i1 is valid.
35560 return true;
35561}
35562
35563bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35564 return isInt<32>(Imm);
35565}
35566
35567bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35568 // Can also use sub to handle negated immediates.
35569 return isInt<32>(Imm);
35570}
35571
35572bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35573 return isInt<32>(Imm);
35574}
35575
35576bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35577 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35578 return false;
35579 unsigned NumBits1 = VT1.getSizeInBits();
35580 unsigned NumBits2 = VT2.getSizeInBits();
35581 return NumBits1 > NumBits2;
35582}
35583
35584bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35585 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
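  // For example, "movl %esi, %eax" already clears bits 63:32 of RAX, so a
  // following i32->i64 zext needs no extra instruction.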
35586 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35587}
35588
35589bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35590 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35591 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35592}
35593
35594bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35595 EVT VT1 = Val.getValueType();
35596 if (isZExtFree(VT1, VT2))
35597 return true;
35598
35599 if (Val.getOpcode() != ISD::LOAD)
35600 return false;
35601
35602 if (!VT1.isSimple() || !VT1.isInteger() ||
35603 !VT2.isSimple() || !VT2.isInteger())
35604 return false;
35605
35606 switch (VT1.getSimpleVT().SimpleTy) {
35607 default: break;
35608 case MVT::i8:
35609 case MVT::i16:
35610 case MVT::i32:
35611 // X86 has 8, 16, and 32-bit zero-extending loads.
35612 return true;
35613 }
35614
35615 return false;
35616}
35617
35618bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35619 SmallVectorImpl<Use *> &Ops) const {
35620 using namespace llvm::PatternMatch;
35621
35622 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35623 if (!VTy)
35624 return false;
35625
35626 if (I->getOpcode() == Instruction::Mul &&
35627 VTy->getElementType()->isIntegerTy(64)) {
35628 for (auto &Op : I->operands()) {
35629 // Make sure we are not already sinking this operand
35630 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35631 continue;
35632
35633 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35634 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
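      // Illustrative element-wise IR shapes (value names hypothetical):
      //   sext_inreg: ashr (shl %x, 32), 32 -> forms PMULDQ (SSE4.1)
      //   zext_inreg: and %x, 0xffffffff    -> forms PMULUDQ (SSE2)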
35635 if (Subtarget.hasSSE41() &&
35636 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35637 m_SpecificInt(32)))) {
35638 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35639 Ops.push_back(&Op);
35640 } else if (Subtarget.hasSSE2() &&
35641 match(Op.get(),
35642                       m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35643 Ops.push_back(&Op);
35644 }
35645 }
35646
35647 return !Ops.empty();
35648 }
35649
35650 // A uniform shift amount in a vector shift or funnel shift may be much
35651 // cheaper than a generic variable vector shift, so make that pattern visible
35652 // to SDAG by sinking the shuffle instruction next to the shift.
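  // For example, a shift of <4 x i32> by a splatted amount can be selected as
  // PSLLD with the scalar count in an XMM register rather than a fully
  // variable VPSLLVD.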
35653 int ShiftAmountOpNum = -1;
35654 if (I->isShift())
35655 ShiftAmountOpNum = 1;
35656 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35657 if (II->getIntrinsicID() == Intrinsic::fshl ||
35658 II->getIntrinsicID() == Intrinsic::fshr)
35659 ShiftAmountOpNum = 2;
35660 }
35661
35662 if (ShiftAmountOpNum == -1)
35663 return false;
35664
35665 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35666 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35667 isVectorShiftByScalarCheap(I->getType())) {
35668 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35669 return true;
35670 }
35671
35672 return false;
35673}
35674
35675bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35676 if (!Subtarget.is64Bit())
35677 return false;
35678 return TargetLowering::shouldConvertPhiType(From, To);
35679}
35680
35681bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35682 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35683 return false;
35684
35685 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35686
35687 // There is no extending load for vXi1.
35688 if (SrcVT.getScalarType() == MVT::i1)
35689 return false;
35690
35691 return true;
35692}
35693
35694bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35695 EVT VT) const {
35696 if (!Subtarget.hasAnyFMA())
35697 return false;
35698
35699 VT = VT.getScalarType();
35700
35701 if (!VT.isSimple())
35702 return false;
35703
35704 switch (VT.getSimpleVT().SimpleTy) {
35705 case MVT::f16:
35706 return Subtarget.hasFP16();
35707 case MVT::f32:
35708 case MVT::f64:
35709 return true;
35710 default:
35711 break;
35712 }
35713
35714 return false;
35715}
35716
35717bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
35718 // i16 instructions are longer (0x66 prefix) and potentially slower.
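  // For example, "add $1, %ax" needs a 0x66 operand-size prefix and its 16-bit
  // immediate can trigger length-changing-prefix stalls on some Intel cores.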
35719 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35720}
35721
35722bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35723 EVT VT) const {
35724 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35725 // benefit. The transform may also be profitable for scalar code.
35726 if (!Subtarget.hasAVX512())
35727 return false;
35728 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35729 return false;
35730 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35731 return false;
35732
35733 return true;
35734}
35735
35736/// Targets can use this to indicate that they only support *some*
35737/// VECTOR_SHUFFLE operations, those with specific masks.
35738/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35739/// are assumed to be legal.
35740bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35741 if (!VT.isSimple())
35742 return false;
35743
35744 // Not for i1 vectors
35745 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35746 return false;
35747
35748 // Very little shuffling can be done for 64-bit vectors right now.
35749 if (VT.getSimpleVT().getSizeInBits() == 64)
35750 return false;
35751
35752 // We only care that the types being shuffled are legal. The lowering can
35753 // handle any possible shuffle mask that results.
35754 return isTypeLegal(VT.getSimpleVT());
35755}
35756
35757bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35758 EVT VT) const {
35759 // Don't convert an 'and' into a shuffle that we don't directly support.
35760 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35761 if (!Subtarget.hasAVX2())
35762 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35763 return false;
35764
35765 // Just delegate to the generic legality, clear masks aren't special.
35766 return isShuffleMaskLegal(Mask, VT);
35767}
35768
35769bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35770 // If the subtarget is using thunks, we need to not generate jump tables.
35771 if (Subtarget.useIndirectThunkBranches())
35772 return false;
35773
35774  // Otherwise, fall back on the generic logic.
35775 return TargetLowering::areJTsAllowed(Fn);
35776}
35777
35778MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35779 EVT ConditionVT) const {
35780 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35781 // zero-extensions.
35782 if (ConditionVT.getSizeInBits() < 32)
35783 return MVT::i32;
35784 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35785 ConditionVT);
35786}
35787
35788//===----------------------------------------------------------------------===//
35789// X86 Scheduler Hooks
35790//===----------------------------------------------------------------------===//
35791
35792// Returns true if EFLAGS is consumed after this iterator in the rest of the
35793// basic block or any successors of the basic block.
35794static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35795 MachineBasicBlock *BB) {
35796 // Scan forward through BB for a use/def of EFLAGS.
35797 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35798 if (mi.readsRegister(X86::EFLAGS))
35799 return true;
35800 // If we found a def, we can stop searching.
35801 if (mi.definesRegister(X86::EFLAGS))
35802 return false;
35803 }
35804
35805 // If we hit the end of the block, check whether EFLAGS is live into a
35806 // successor.
35807 for (MachineBasicBlock *Succ : BB->successors())
35808 if (Succ->isLiveIn(X86::EFLAGS))
35809 return true;
35810
35811 return false;
35812}
35813
35814/// Utility function to emit xbegin specifying the start of an RTM region.
35815static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35816 const TargetInstrInfo *TII) {
35817 const DebugLoc &DL = MI.getDebugLoc();
35818
35819 const BasicBlock *BB = MBB->getBasicBlock();
35820 MachineFunction::iterator I = ++MBB->getIterator();
35821
35822 // For the v = xbegin(), we generate
35823 //
35824 // thisMBB:
35825 // xbegin sinkMBB
35826 //
35827 // mainMBB:
35828 // s0 = -1
35829 //
35830 // fallBB:
35831 // eax = # XABORT_DEF
35832 // s1 = eax
35833 //
35834 // sinkMBB:
35835 // v = phi(s0/mainBB, s1/fallBB)
35836
35837 MachineBasicBlock *thisMBB = MBB;
35838 MachineFunction *MF = MBB->getParent();
35839 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35840 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35841 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35842 MF->insert(I, mainMBB);
35843 MF->insert(I, fallMBB);
35844 MF->insert(I, sinkMBB);
35845
35846 if (isEFLAGSLiveAfter(MI, MBB)) {
35847 mainMBB->addLiveIn(X86::EFLAGS);
35848 fallMBB->addLiveIn(X86::EFLAGS);
35849 sinkMBB->addLiveIn(X86::EFLAGS);
35850 }
35851
35852 // Transfer the remainder of BB and its successor edges to sinkMBB.
35853 sinkMBB->splice(sinkMBB->begin(), MBB,
35854 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35855 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35856
35857 MachineRegisterInfo &MRI = MF->getRegInfo();
35858 Register DstReg = MI.getOperand(0).getReg();
35859 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35860 Register mainDstReg = MRI.createVirtualRegister(RC);
35861 Register fallDstReg = MRI.createVirtualRegister(RC);
35862
35863 // thisMBB:
35864 // xbegin fallMBB
35865 // # fallthrough to mainMBB
35866  // # abort goes to fallMBB
35867 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35868 thisMBB->addSuccessor(mainMBB);
35869 thisMBB->addSuccessor(fallMBB);
35870
35871 // mainMBB:
35872 // mainDstReg := -1
35873 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35874 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35875 mainMBB->addSuccessor(sinkMBB);
35876
35877 // fallMBB:
35878 // ; pseudo instruction to model hardware's definition from XABORT
35879 // EAX := XABORT_DEF
35880 // fallDstReg := EAX
35881 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
35882 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
35883 .addReg(X86::EAX);
35884 fallMBB->addSuccessor(sinkMBB);
35885
35886 // sinkMBB:
35887 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35888 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
35889 .addReg(mainDstReg).addMBB(mainMBB)
35890 .addReg(fallDstReg).addMBB(fallMBB);
35891
35892 MI.eraseFromParent();
35893 return sinkMBB;
35894}
35895
35896MachineBasicBlock *
35897X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35898 MachineBasicBlock *MBB) const {
35899 // Emit va_arg instruction on X86-64.
35900
35901 // Operands to this pseudo-instruction:
35902 // 0 ) Output : destination address (reg)
35903 // 1-5) Input : va_list address (addr, i64mem)
35904 // 6 ) ArgSize : Size (in bytes) of vararg type
35905 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35906 // 8 ) Align : Alignment of type
35907 // 9 ) EFLAGS (implicit-def)
35908
35909  assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35910 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35911
35912 Register DestReg = MI.getOperand(0).getReg();
35913 MachineOperand &Base = MI.getOperand(1);
35914 MachineOperand &Scale = MI.getOperand(2);
35915 MachineOperand &Index = MI.getOperand(3);
35916 MachineOperand &Disp = MI.getOperand(4);
35917 MachineOperand &Segment = MI.getOperand(5);
35918 unsigned ArgSize = MI.getOperand(6).getImm();
35919 unsigned ArgMode = MI.getOperand(7).getImm();
35920 Align Alignment = Align(MI.getOperand(8).getImm());
35921
35922 MachineFunction *MF = MBB->getParent();
35923
35924 // Memory Reference
35925  assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35926
35927 MachineMemOperand *OldMMO = MI.memoperands().front();
35928
35929 // Clone the MMO into two separate MMOs for loading and storing
35930 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35931 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35932 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35933 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35934
35935 // Machine Information
35936 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35937 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35938 const TargetRegisterClass *AddrRegClass =
35939 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35940 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35941 const DebugLoc &DL = MI.getDebugLoc();
35942
35943 // struct va_list {
35944 // i32 gp_offset
35945 // i32 fp_offset
35946 // i64 overflow_area (address)
35947 // i64 reg_save_area (address)
35948 // }
35949 // sizeof(va_list) = 24
35950 // alignment(va_list) = 8
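  // In the SysV x86-64 ABI, gp_offset walks the six 8-byte GP register slots
  // (0..48) and fp_offset the eight 16-byte XMM slots (48..176); that is
  // where MaxOffset below comes from.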
35951
35952 unsigned TotalNumIntRegs = 6;
35953 unsigned TotalNumXMMRegs = 8;
35954 bool UseGPOffset = (ArgMode == 1);
35955 bool UseFPOffset = (ArgMode == 2);
35956 unsigned MaxOffset = TotalNumIntRegs * 8 +
35957 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35958
35959 /* Align ArgSize to a multiple of 8 */
35960 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
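  // e.g. ArgSize 12 rounds up to ArgSizeA8 16, ArgSize 8 stays 8.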
35961 bool NeedsAlign = (Alignment > 8);
35962
35963 MachineBasicBlock *thisMBB = MBB;
35964 MachineBasicBlock *overflowMBB;
35965 MachineBasicBlock *offsetMBB;
35966 MachineBasicBlock *endMBB;
35967
35968 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
35969 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
35970 unsigned OffsetReg = 0;
35971
35972 if (!UseGPOffset && !UseFPOffset) {
35973 // If we only pull from the overflow region, we don't create a branch.
35974 // We don't need to alter control flow.
35975 OffsetDestReg = 0; // unused
35976 OverflowDestReg = DestReg;
35977
35978 offsetMBB = nullptr;
35979 overflowMBB = thisMBB;
35980 endMBB = thisMBB;
35981 } else {
35982 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35983 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35984 // If not, pull from overflow_area. (branch to overflowMBB)
35985 //
35986 // thisMBB
35987 // | .
35988 // | .
35989 // offsetMBB overflowMBB
35990 // | .
35991 // | .
35992 // endMBB
35993
35994 // Registers for the PHI in endMBB
35995 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35996 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35997
35998 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35999 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36000 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36001 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36002
36003 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36004
36005 // Insert the new basic blocks
36006 MF->insert(MBBIter, offsetMBB);
36007 MF->insert(MBBIter, overflowMBB);
36008 MF->insert(MBBIter, endMBB);
36009
36010 // Transfer the remainder of MBB and its successor edges to endMBB.
36011 endMBB->splice(endMBB->begin(), thisMBB,
36012 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
36013 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
36014
36015 // Make offsetMBB and overflowMBB successors of thisMBB
36016 thisMBB->addSuccessor(offsetMBB);
36017 thisMBB->addSuccessor(overflowMBB);
36018
36019 // endMBB is a successor of both offsetMBB and overflowMBB
36020 offsetMBB->addSuccessor(endMBB);
36021 overflowMBB->addSuccessor(endMBB);
36022
36023 // Load the offset value into a register
36024 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36025 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
36026 .add(Base)
36027 .add(Scale)
36028 .add(Index)
36029 .addDisp(Disp, UseFPOffset ? 4 : 0)
36030 .add(Segment)
36031 .setMemRefs(LoadOnlyMMO);
36032
36033 // Check if there is enough room left to pull this argument.
36034 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
36035 .addReg(OffsetReg)
36036 .addImm(MaxOffset + 8 - ArgSizeA8);
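// Because OffsetReg, ArgSizeA8 and MaxOffset are all multiples of 8, the
// check "OffsetReg >= MaxOffset + 8 - ArgSizeA8" used here is equivalent to
// "OffsetReg + ArgSizeA8 > MaxOffset", i.e. the argument would no longer fit
// in the register save area.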
36037
36038 // Branch to "overflowMBB" if offset >= max
36039 // Fall through to "offsetMBB" otherwise
36040 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
36041 .addMBB(overflowMBB).addImm(X86::COND_AE);
36042 }
36043
36044 // In offsetMBB, emit code to use the reg_save_area.
36045 if (offsetMBB) {
36046 assert(OffsetReg != 0);
36047
36048 // Read the reg_save_area address.
36049 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
36050 BuildMI(
36051 offsetMBB, DL,
36052 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36053 RegSaveReg)
36054 .add(Base)
36055 .add(Scale)
36056 .add(Index)
36057 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
36058 .add(Segment)
36059 .setMemRefs(LoadOnlyMMO);
36060
36061 if (Subtarget.isTarget64BitLP64()) {
36062 // Zero-extend the offset
36063 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36064 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36065 .addImm(0)
36066 .addReg(OffsetReg)
36067 .addImm(X86::sub_32bit);
36068
36069 // Add the offset to the reg_save_area to get the final address.
36070 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
36071 .addReg(OffsetReg64)
36072 .addReg(RegSaveReg);
36073 } else {
36074 // Add the offset to the reg_save_area to get the final address.
36075 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
36076 .addReg(OffsetReg)
36077 .addReg(RegSaveReg);
36078 }
36079
36080 // Compute the offset for the next argument
36081 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36082 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
36083 .addReg(OffsetReg)
36084 .addImm(UseFPOffset ? 16 : 8);
36085
36086 // Store it back into the va_list.
36087 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
36088 .add(Base)
36089 .add(Scale)
36090 .add(Index)
36091 .addDisp(Disp, UseFPOffset ? 4 : 0)
36092 .add(Segment)
36093 .addReg(NextOffsetReg)
36094 .setMemRefs(StoreOnlyMMO);
36095
36096 // Jump to endMBB
36097 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
36098 .addMBB(endMBB);
36099 }
36100
36101 //
36102 // Emit code to use overflow area
36103 //
36104
36105 // Load the overflow_area address into a register.
36106 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36107 BuildMI(overflowMBB, DL,
36108 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36109 OverflowAddrReg)
36110 .add(Base)
36111 .add(Scale)
36112 .add(Index)
36113 .addDisp(Disp, 8)
36114 .add(Segment)
36115 .setMemRefs(LoadOnlyMMO);
36116
36117 // If we need to align it, do so. Otherwise, just copy the address
36118 // to OverflowDestReg.
36119 if (NeedsAlign) {
36120 // Align the overflow address
36121 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36122
36123 // aligned_addr = (addr + (align-1)) & ~(align-1)
36124 BuildMI(
36125 overflowMBB, DL,
36126 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36127 TmpReg)
36128 .addReg(OverflowAddrReg)
36129 .addImm(Alignment.value() - 1);
36130
36131 BuildMI(
36132 overflowMBB, DL,
36133 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36134 OverflowDestReg)
36135 .addReg(TmpReg)
36136 .addImm(~(uint64_t)(Alignment.value() - 1));
36137 } else {
36138 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
36139 .addReg(OverflowAddrReg);
36140 }
36141
36142 // Compute the next overflow address after this argument.
36143 // (the overflow address should be kept 8-byte aligned)
36144 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36145 BuildMI(
36146 overflowMBB, DL,
36147 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36148 NextAddrReg)
36149 .addReg(OverflowDestReg)
36150 .addImm(ArgSizeA8);
36151
36152 // Store the new overflow address.
36153 BuildMI(overflowMBB, DL,
36154 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36155 .add(Base)
36156 .add(Scale)
36157 .add(Index)
36158 .addDisp(Disp, 8)
36159 .add(Segment)
36160 .addReg(NextAddrReg)
36161 .setMemRefs(StoreOnlyMMO);
36162
36163 // If we branched, emit the PHI to the front of endMBB.
36164 if (offsetMBB) {
36165 BuildMI(*endMBB, endMBB->begin(), DL,
36166 TII->get(X86::PHI), DestReg)
36167 .addReg(OffsetDestReg).addMBB(offsetMBB)
36168 .addReg(OverflowDestReg).addMBB(overflowMBB);
36169 }
36170
36171 // Erase the pseudo instruction
36172 MI.eraseFromParent();
36173
36174 return endMBB;
36175}
36176
36177// The EFLAGS operand of SelectItr might be missing a kill marker
36178// because there were multiple uses of EFLAGS, and ISel didn't know
36179// which to mark. Figure out whether SelectItr should have had a
36180// kill marker, and set it if it should. Returns the correct kill
36181// marker value.
36182static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36183 MachineBasicBlock* BB,
36184 const TargetRegisterInfo* TRI) {
36185 if (isEFLAGSLiveAfter(SelectItr, BB))
36186 return false;
36187
36188 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36189 // out. SelectMI should have a kill flag on EFLAGS.
36190 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36191 return true;
36192}
36193
36194// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36195 // together with other CMOV pseudo-opcodes into a single basic block with a
36196 // conditional jump around it.
36197static bool isCMOVPseudo(MachineInstr &MI) {
36198 switch (MI.getOpcode()) {
36199 case X86::CMOV_FR16:
36200 case X86::CMOV_FR16X:
36201 case X86::CMOV_FR32:
36202 case X86::CMOV_FR32X:
36203 case X86::CMOV_FR64:
36204 case X86::CMOV_FR64X:
36205 case X86::CMOV_GR8:
36206 case X86::CMOV_GR16:
36207 case X86::CMOV_GR32:
36208 case X86::CMOV_RFP32:
36209 case X86::CMOV_RFP64:
36210 case X86::CMOV_RFP80:
36211 case X86::CMOV_VR64:
36212 case X86::CMOV_VR128:
36213 case X86::CMOV_VR128X:
36214 case X86::CMOV_VR256:
36215 case X86::CMOV_VR256X:
36216 case X86::CMOV_VR512:
36217 case X86::CMOV_VK1:
36218 case X86::CMOV_VK2:
36219 case X86::CMOV_VK4:
36220 case X86::CMOV_VK8:
36221 case X86::CMOV_VK16:
36222 case X86::CMOV_VK32:
36223 case X86::CMOV_VK64:
36224 return true;
36225
36226 default:
36227 return false;
36228 }
36229}
36230
36231// Helper function, which inserts PHI functions into SinkMBB:
36232// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36233 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
36234 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
36235 // the last PHI inserted.
36236static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36237 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36238 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36239 MachineBasicBlock *SinkMBB) {
36240 MachineFunction *MF = TrueMBB->getParent();
36241 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36242 const DebugLoc &DL = MIItBegin->getDebugLoc();
36243
36244 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36245 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36246
36247 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36248
36249 // As we are creating the PHIs, we have to be careful if there is more than
36250 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36251 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36252 // That also means that PHI construction must work forward from earlier to
36254 // later, and that the code must maintain a mapping from each earlier PHI's
36255 // destination register to the registers that went into that PHI.
36255 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
36256 MachineInstrBuilder MIB;
36257
36258 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36259 Register DestReg = MIIt->getOperand(0).getReg();
36260 Register Op1Reg = MIIt->getOperand(1).getReg();
36261 Register Op2Reg = MIIt->getOperand(2).getReg();
36262
36263 // If this CMOV we are generating is the opposite condition from
36264 // the jump we generated, then we have to swap the operands for the
36265 // PHI that is going to be generated.
36266 if (MIIt->getOperand(3).getImm() == OppCC)
36267 std::swap(Op1Reg, Op2Reg);
36268
36269 if (RegRewriteTable.contains(Op1Reg))
36270 Op1Reg = RegRewriteTable[Op1Reg].first;
36271
36272 if (RegRewriteTable.contains(Op2Reg))
36273 Op2Reg = RegRewriteTable[Op2Reg].second;
36274
36275 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
36276 .addReg(Op1Reg)
36277 .addMBB(FalseMBB)
36278 .addReg(Op2Reg)
36279 .addMBB(TrueMBB);
36280
36281 // Add this PHI to the rewrite table.
36282 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36283 }
36284
36285 return MIB;
36286}
36287
36288 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
36289MachineBasicBlock *
36290X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36291 MachineInstr &SecondCascadedCMOV,
36292 MachineBasicBlock *ThisMBB) const {
36293 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36294 const DebugLoc &DL = FirstCMOV.getDebugLoc();
36295
36296 // We lower cascaded CMOVs such as
36297 //
36298 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36299 //
36300 // to two successive branches.
36301 //
36302 // Without this, we would add a PHI between the two jumps, which ends up
36303 // creating a few copies all around. For instance, for
36304 //
36305 // (sitofp (zext (fcmp une)))
36306 //
36307 // we would generate:
36308 //
36309 // ucomiss %xmm1, %xmm0
36310 // movss <1.0f>, %xmm0
36311 // movaps %xmm0, %xmm1
36312 // jne .LBB5_2
36313 // xorps %xmm1, %xmm1
36314 // .LBB5_2:
36315 // jp .LBB5_4
36316 // movaps %xmm1, %xmm0
36317 // .LBB5_4:
36318 // retq
36319 //
36320 // because this custom-inserter would have generated:
36321 //
36322 // A
36323 // | \
36324 // | B
36325 // | /
36326 // C
36327 // | \
36328 // | D
36329 // | /
36330 // E
36331 //
36332 // A: X = ...; Y = ...
36333 // B: empty
36334 // C: Z = PHI [X, A], [Y, B]
36335 // D: empty
36336 // E: PHI [X, C], [Z, D]
36337 //
36338 // If we lower both CMOVs in a single step, we can instead generate:
36339 //
36340 // A
36341 // | \
36342 // | C
36343 // | /|
36344 // |/ |
36345 // | |
36346 // | D
36347 // | /
36348 // E
36349 //
36350 // A: X = ...; Y = ...
36351 // D: empty
36352 // E: PHI [X, A], [X, C], [Y, D]
36353 //
36354 // Which, in our sitofp/fcmp example, gives us something like:
36355 //
36356 // ucomiss %xmm1, %xmm0
36357 // movss <1.0f>, %xmm0
36358 // jne .LBB5_4
36359 // jp .LBB5_4
36360 // xorps %xmm0, %xmm0
36361 // .LBB5_4:
36362 // retq
36363 //
36364
36365 // We lower cascaded CMOV into two successive branches to the same block.
36366 // EFLAGS is used by both, so mark it as live in the second.
36367 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36368 MachineFunction *F = ThisMBB->getParent();
36369 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36370 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36371 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36372
36373 MachineFunction::iterator It = ++ThisMBB->getIterator();
36374 F->insert(It, FirstInsertedMBB);
36375 F->insert(It, SecondInsertedMBB);
36376 F->insert(It, SinkMBB);
36377
36378 // For a cascaded CMOV, we lower it to two successive branches to
36379 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36380 // the FirstInsertedMBB.
36381 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36382
36383 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36384 // live into the sink and copy blocks.
36385 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36386 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
36387 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36388 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36389 SinkMBB->addLiveIn(X86::EFLAGS);
36390 }
36391
36392 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36393 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36394 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36395 ThisMBB->end());
36396 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36397
36398 // Fallthrough block for ThisMBB.
36399 ThisMBB->addSuccessor(FirstInsertedMBB);
36400 // The true block target of the first branch is always SinkMBB.
36401 ThisMBB->addSuccessor(SinkMBB);
36402 // Fallthrough block for FirstInsertedMBB.
36403 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36404 // The true block for the branch of FirstInsertedMBB.
36405 FirstInsertedMBB->addSuccessor(SinkMBB);
36406 // This is fallthrough.
36407 SecondInsertedMBB->addSuccessor(SinkMBB);
36408
36409 // Create the conditional branch instructions.
36410 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36411 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36412
36413 X86::CondCode SecondCC =
36414 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36415 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
36416
36417 // SinkMBB:
36418 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36419 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36420 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36421 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36422 MachineInstrBuilder MIB =
36423 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
36424 .addReg(Op1Reg)
36425 .addMBB(SecondInsertedMBB)
36426 .addReg(Op2Reg)
36427 .addMBB(ThisMBB);
36428
36429 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36430 // (the True operand of the SELECT_CC/CMOV nodes).
36431 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36432
36433 // Now remove the CMOVs.
36434 FirstCMOV.eraseFromParent();
36435 SecondCascadedCMOV.eraseFromParent();
36436
36437 return SinkMBB;
36438}
36439
36440MachineBasicBlock *
36441X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36442 MachineBasicBlock *ThisMBB) const {
36443 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36444 const DebugLoc &DL = MI.getDebugLoc();
36445
36446 // To "insert" a SELECT_CC instruction, we actually have to insert the
36447 // diamond control-flow pattern. The incoming instruction knows the
36448 // destination vreg to set, the condition code register to branch on, the
36449 // true/false values to select between and a branch opcode to use.
36450
36451 // ThisMBB:
36452 // ...
36453 // TrueVal = ...
36454 // cmpTY ccX, r1, r2
36455 // bCC copy1MBB
36456 // fallthrough --> FalseMBB
36457
36458 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36459 // as described above, by inserting a BB, and then making a PHI at the join
36460 // point to select the true and false operands of the CMOV in the PHI.
36461 //
36462 // The code also handles two different cases of multiple CMOV opcodes
36463 // in a row.
36464 //
36465 // Case 1:
36466 // In this case, there are multiple CMOVs in a row, all of which are based on
36467 // the same condition setting (or the exact opposite condition setting).
36468 // In this case we can lower all the CMOVs using a single inserted BB, and
36469 // then make a number of PHIs at the join point to model the CMOVs. The only
36470 // trickiness here is that in a case like:
36471 //
36472 // t2 = CMOV cond1 t1, f1
36473 // t3 = CMOV cond1 t2, f2
36474 //
36475 // when rewriting this into PHIs, we have to perform some renaming on the
36476 // temps since you cannot have a PHI operand refer to a PHI result earlier
36477 // in the same block. The "simple" but wrong lowering would be:
36478 //
36479 // t2 = PHI t1(BB1), f1(BB2)
36480 // t3 = PHI t2(BB1), f2(BB2)
36481 //
36482 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36483 // renaming is to note that on the path through BB1, t2 is really just a
36484 // copy of t1, and do that renaming, properly generating:
36485 //
36486 // t2 = PHI t1(BB1), f1(BB2)
36487 // t3 = PHI t1(BB1), f2(BB2)
36488 //
36489 // Case 2:
36490 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36491 // function - EmitLoweredCascadedSelect.
36492
36493 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36494 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36495 MachineInstr *LastCMOV = &MI;
36496 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36497
36498 // Check first for case 1, where there are multiple CMOVs with the same
36499 // condition. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36500 // number of jumps the most.
36501
36502 if (isCMOVPseudo(MI)) {
36503 // See if we have a string of CMOVs with the same condition. Skip over
36504 // intervening debug insts.
36505 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36506 (NextMIIt->getOperand(3).getImm() == CC ||
36507 NextMIIt->getOperand(3).getImm() == OppCC)) {
36508 LastCMOV = &*NextMIIt;
36509 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36510 }
36511 }
36512
36513 // This checks for case 2, but only if we didn't already find case 1,
36514 // as indicated by LastCMOV == MI.
36515 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36516 NextMIIt->getOpcode() == MI.getOpcode() &&
36517 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36518 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36519 NextMIIt->getOperand(1).isKill()) {
36520 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36521 }
36522
36523 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36524 MachineFunction *F = ThisMBB->getParent();
36525 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36526 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36527
36528 MachineFunction::iterator It = ++ThisMBB->getIterator();
36529 F->insert(It, FalseMBB);
36530 F->insert(It, SinkMBB);
36531
36532 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36533 // live into the sink and copy blocks.
36534 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36535 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36536 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36537 FalseMBB->addLiveIn(X86::EFLAGS);
36538 SinkMBB->addLiveIn(X86::EFLAGS);
36539 }
36540
36541 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36542 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36543 MachineBasicBlock::iterator(LastCMOV));
36544 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36545 if (MI.isDebugInstr())
36546 SinkMBB->push_back(MI.removeFromParent());
36547
36548 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36549 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36550 std::next(MachineBasicBlock::iterator(LastCMOV)),
36551 ThisMBB->end());
36552 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36553
36554 // Fallthrough block for ThisMBB.
36555 ThisMBB->addSuccessor(FalseMBB);
36556 // The true block target of the first (or only) branch is always SinkMBB.
36557 ThisMBB->addSuccessor(SinkMBB);
36558 // Fallthrough block for FalseMBB.
36559 FalseMBB->addSuccessor(SinkMBB);
36560
36561 // Create the conditional branch instruction.
36562 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36563
36564 // SinkMBB:
36565 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36566 // ...
36567 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36568 MachineBasicBlock::iterator MIItEnd =
36569 std::next(MachineBasicBlock::iterator(LastCMOV));
36570 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36571
36572 // Now remove the CMOV(s).
36573 ThisMBB->erase(MIItBegin, MIItEnd);
36574
36575 return SinkMBB;
36576}
36577
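// Pick the smallest-encoding SUB-immediate opcode for the given immediate.
// For example, getSUBriOpcode(/*IsLP64=*/true, 16) yields X86::SUB64ri8,
// while an immediate of 4096 selects X86::SUB64ri32.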
36578static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
36579 if (IsLP64) {
36580 if (isInt<8>(Imm))
36581 return X86::SUB64ri8;
36582 return X86::SUB64ri32;
36583 } else {
36584 if (isInt<8>(Imm))
36585 return X86::SUB32ri8;
36586 return X86::SUB32ri;
36587 }
36588}
36589
36590MachineBasicBlock *
36591X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36592 MachineBasicBlock *MBB) const {
36593 MachineFunction *MF = MBB->getParent();
36594 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36595 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36596 const DebugLoc &DL = MI.getDebugLoc();
36597 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36598
36599 const unsigned ProbeSize = getStackProbeSize(*MF);
36600
36601 MachineRegisterInfo &MRI = MF->getRegInfo();
36602 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36603 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36604 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36605
36606 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36607 MF->insert(MBBIter, testMBB);
36608 MF->insert(MBBIter, blockMBB);
36609 MF->insert(MBBIter, tailMBB);
36610
36611 Register sizeVReg = MI.getOperand(1).getReg();
36612
36613 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36614
36615 Register TmpStackPtr = MRI.createVirtualRegister(
36616 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36617 Register FinalStackPtr = MRI.createVirtualRegister(
36618 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36619
36620 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36621 .addReg(physSPReg);
36622 {
36623 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36624 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36625 .addReg(TmpStackPtr)
36626 .addReg(sizeVReg);
36627 }
36628
36629 // test rsp size
36630
36631 BuildMI(testMBB, DL,
36632 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36633 .addReg(FinalStackPtr)
36634 .addReg(physSPReg);
36635
36636 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36637 .addMBB(tailMBB)
36638 .addImm(X86::COND_GE);
36639 testMBB->addSuccessor(blockMBB);
36640 testMBB->addSuccessor(tailMBB);
36641
36642 // Touch the block, then extend it. This is the opposite order from a static
36643 // probe, where we allocate then touch; doing it this way avoids having to
36644 // probe the tail of the static alloca. Possible scenarios are:
36645 //
36646 // + ---- <- ------------ <- ------------- <- ------------ +
36647 // | |
36648 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36649 // | |
36650 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36651 //
36652 // The property we want to enforce is to never have more than [page alloc] between two probes.
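// A rough C-level sketch of the loop built below (illustrative; the
// comparison is signed, matching the CMP/JCC pair in testMBB):
//
//   char *FinalSP = SP - Size;            // computed above, before testMBB
//   while (FinalSP < SP) {                // testMBB
//     *(volatile char *)SP ^= 0;          // blockMBB: touch the current page
//     SP -= ProbeSize;                    // blockMBB: extend by one probe
//   }
//   Result = FinalSP;                     // tailMBB copies FinalStackPtr out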
36653
36654 const unsigned XORMIOpc =
36655 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36656 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36657 .addImm(0);
36658
36659 BuildMI(blockMBB, DL,
36660 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36661 .addReg(physSPReg)
36662 .addImm(ProbeSize);
36663
36664
36665 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36666 blockMBB->addSuccessor(testMBB);
36667
36668 // Replace original instruction by the expected stack ptr
36669 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36670 .addReg(FinalStackPtr);
36671
36672 tailMBB->splice(tailMBB->end(), MBB,
36673 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36674 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36675 MBB->addSuccessor(testMBB);
36676
36677 // Delete the original pseudo instruction.
36678 MI.eraseFromParent();
36679
36680 // And we're done.
36681 return tailMBB;
36682}
36683
36684MachineBasicBlock *
36685X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36686 MachineBasicBlock *BB) const {
36687 MachineFunction *MF = BB->getParent();
36688 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36689 const DebugLoc &DL = MI.getDebugLoc();
36690 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36691
36692 assert(MF->shouldSplitStack());
36693
36694 const bool Is64Bit = Subtarget.is64Bit();
36695 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36696
36697 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36698 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36699
36700 // BB:
36701 // ... [Till the alloca]
36702 // If stacklet is not large enough, jump to mallocMBB
36703 //
36704 // bumpMBB:
36705 // Allocate by subtracting from RSP
36706 // Jump to continueMBB
36707 //
36708 // mallocMBB:
36709 // Allocate by call to runtime
36710 //
36711 // continueMBB:
36712 // ...
36713 // [rest of original BB]
36714 //
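// A rough C-level sketch of the check (illustrative; TlsOffset is the
// per-target stack-limit slot read through %fs/%gs):
//
//   char *Limit = *(char **)(TLS + TlsOffset);   // stacklet limit
//   char *NewSP = SP - Size;
//   if (Limit > NewSP)                           // not enough room: mallocMBB
//     Ptr = __morestack_allocate_stack_space(Size);
//   else {                                       // enough room: bumpMBB
//     SP = NewSP;
//     Ptr = NewSP;
//   }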
36715
36716 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36717 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36718 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36719
36720 MachineRegisterInfo &MRI = MF->getRegInfo();
36721 const TargetRegisterClass *AddrRegClass =
36722 getRegClassFor(getPointerTy(MF->getDataLayout()));
36723
36724 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36725 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36726 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36727 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36728 sizeVReg = MI.getOperand(1).getReg(),
36729 physSPReg =
36730 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36731
36732 MachineFunction::iterator MBBIter = ++BB->getIterator();
36733
36734 MF->insert(MBBIter, bumpMBB);
36735 MF->insert(MBBIter, mallocMBB);
36736 MF->insert(MBBIter, continueMBB);
36737
36738 continueMBB->splice(continueMBB->begin(), BB,
36739 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36740 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36741
36742 // Add code to the main basic block to check if the stack limit has been hit,
36743 // and if so, jump to mallocMBB otherwise to bumpMBB.
36744 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36745 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36746 .addReg(tmpSPVReg).addReg(sizeVReg);
36747 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36748 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36749 .addReg(SPLimitVReg);
36750 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36751
36752 // bumpMBB simply decreases the stack pointer, since we know the current
36753 // stacklet has enough space.
36754 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36755 .addReg(SPLimitVReg);
36756 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36757 .addReg(SPLimitVReg);
36758 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36759
36760 // Calls into a routine in libgcc to allocate more space from the heap.
36761 const uint32_t *RegMask =
36762 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36763 if (IsLP64) {
36764 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36765 .addReg(sizeVReg);
36766 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36767 .addExternalSymbol("__morestack_allocate_stack_space")
36768 .addRegMask(RegMask)
36769 .addReg(X86::RDI, RegState::Implicit)
36770 .addReg(X86::RAX, RegState::ImplicitDefine);
36771 } else if (Is64Bit) {
36772 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
36773 .addReg(sizeVReg);
36774 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36775 .addExternalSymbol("__morestack_allocate_stack_space")
36776 .addRegMask(RegMask)
36777 .addReg(X86::EDI, RegState::Implicit)
36778 .addReg(X86::EAX, RegState::ImplicitDefine);
36779 } else {
36780 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36781 .addImm(12);
36782 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36783 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
36784 .addExternalSymbol("__morestack_allocate_stack_space")
36785 .addRegMask(RegMask)
36786 .addReg(X86::EAX, RegState::ImplicitDefine);
36787 }
36788
36789 if (!Is64Bit)
36790 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36791 .addImm(16);
36792
36793 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36794 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36795 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36796
36797 // Set up the CFG correctly.
36798 BB->addSuccessor(bumpMBB);
36799 BB->addSuccessor(mallocMBB);
36800 mallocMBB->addSuccessor(continueMBB);
36801 bumpMBB->addSuccessor(continueMBB);
36802
36803 // Take care of the PHI nodes.
36804 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
36805 MI.getOperand(0).getReg())
36806 .addReg(mallocPtrVReg)
36807 .addMBB(mallocMBB)
36808 .addReg(bumpSPPtrVReg)
36809 .addMBB(bumpMBB);
36810
36811 // Delete the original pseudo instruction.
36812 MI.eraseFromParent();
36813
36814 // And we're done.
36815 return continueMBB;
36816}
36817
36818MachineBasicBlock *
36819X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36820 MachineBasicBlock *BB) const {
36821 MachineFunction *MF = BB->getParent();
36822 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36823 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36824 const DebugLoc &DL = MI.getDebugLoc();
36825
36826 assert(!isAsynchronousEHPersonality(
36827 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36828 "SEH does not use catchret!");
36829
36830 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36831 if (!Subtarget.is32Bit())
36832 return BB;
36833
36834 // C++ EH creates a new target block to hold the restore code, and wires up
36835 // the new block to the return destination with a normal JMP_4.
36836 MachineBasicBlock *RestoreMBB =
36837 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36838 assert(BB->succ_size() == 1);
36839 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36840 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36841 BB->addSuccessor(RestoreMBB);
36842 MI.getOperand(0).setMBB(RestoreMBB);
36843
36844 // Marking this as an EH pad but not a funclet entry block causes PEI to
36845 // restore stack pointers in the block.
36846 RestoreMBB->setIsEHPad(true);
36847
36848 auto RestoreMBBI = RestoreMBB->begin();
36849 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36850 return BB;
36851}
36852
36853MachineBasicBlock *
36854X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
36855 MachineBasicBlock *BB) const {
36856 // So, here we replace TLSADDR with the sequence:
36857 // adjust_stackdown -> TLSADDR -> adjust_stackup.
36858 // We need this because TLSADDR is lowered into calls
36859 // inside MC; therefore, without the two markers, shrink-wrapping
36860 // may push the prologue/epilogue past them.
36861 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36862 const DebugLoc &DL = MI.getDebugLoc();
36863 MachineFunction &MF = *BB->getParent();
36864
36865 // Emit CALLSEQ_START right before the instruction.
36866 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36867 MachineInstrBuilder CallseqStart =
36868 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36869 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36870
36871 // Emit CALLSEQ_END right after the instruction.
36872 // We don't call erase from parent because we want to keep the
36873 // original instruction around.
36874 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36875 MachineInstrBuilder CallseqEnd =
36876 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
36877 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36878
36879 return BB;
36880}
36881
36882MachineBasicBlock *
36883X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36884 MachineBasicBlock *BB) const {
36885 // This is pretty easy. We're taking the value that we received from
36886 // our load from the relocation, sticking it in either RDI (x86-64)
36887 // or EAX and doing an indirect call. The return value will then
36888 // be in the normal return register.
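// For the 64-bit case this amounts to roughly (illustrative AT&T syntax; the
// 32-bit variants use EAX and CALL32m instead):
//
//   movq  _var@TLVP(%rip), %rdi      # MOV64rm of the TLS descriptor into RDI
//   callq *(%rdi)                    # CALL64m through the descriptor's getter
//
// with the result coming back in the usual return register.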
36889 MachineFunction *F = BB->getParent();
36890 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36891 const DebugLoc &DL = MI.getDebugLoc();
36892
36893 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36894 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36895
36896 // Get a register mask for the lowered call.
36897 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36898 // proper register mask.
36899 const uint32_t *RegMask =
36900 Subtarget.is64Bit() ?
36901 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36902 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36903 if (Subtarget.is64Bit()) {
36904 MachineInstrBuilder MIB =
36905 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
36906 .addReg(X86::RIP)
36907 .addImm(0)
36908 .addReg(0)
36909 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36910 MI.getOperand(3).getTargetFlags())
36911 .addReg(0);
36912 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
36913 addDirectMem(MIB, X86::RDI);
36914 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36915 } else if (!isPositionIndependent()) {
36916 MachineInstrBuilder MIB =
36917 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36918 .addReg(0)
36919 .addImm(0)
36920 .addReg(0)
36921 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36922 MI.getOperand(3).getTargetFlags())
36923 .addReg(0);
36924 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36925 addDirectMem(MIB, X86::EAX);
36926 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36927 } else {
36928 MachineInstrBuilder MIB =
36929 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36930 .addReg(TII->getGlobalBaseReg(F))
36931 .addImm(0)
36932 .addReg(0)
36933 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36934 MI.getOperand(3).getTargetFlags())
36935 .addReg(0);
36936 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36937 addDirectMem(MIB, X86::EAX);
36938 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36939 }
36940
36941 MI.eraseFromParent(); // The pseudo instruction is gone now.
36942 return BB;
36943}
36944
36945static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36946 switch (RPOpc) {
36947 case X86::INDIRECT_THUNK_CALL32:
36948 return X86::CALLpcrel32;
36949 case X86::INDIRECT_THUNK_CALL64:
36950 return X86::CALL64pcrel32;
36951 case X86::INDIRECT_THUNK_TCRETURN32:
36952 return X86::TCRETURNdi;
36953 case X86::INDIRECT_THUNK_TCRETURN64:
36954 return X86::TCRETURNdi64;
36955 }
36956 llvm_unreachable("not indirect thunk opcode")::llvm::llvm_unreachable_internal("not indirect thunk opcode"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36956)
;
36957}
36958
36959static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36960 unsigned Reg) {
36961 if (Subtarget.useRetpolineExternalThunk()) {
36962 // When using an external thunk for retpolines, we pick names that match the
36963 // names GCC happens to use as well. This helps simplify the implementation
36964 // of the thunks for kernels where they have no easy ability to create
36965 // aliases and are doing non-trivial configuration of the thunk's body. For
36966 // example, the Linux kernel will do boot-time hot patching of the thunk
36967 // bodies and cannot easily export aliases of these to loaded modules.
36968 //
36969 // Note that at any point in the future, we may need to change the semantics
36970 // of how we implement retpolines and at that time will likely change the
36971 // name of the called thunk. Essentially, there is no hard guarantee that
36972 // LLVM will generate calls to specific thunks, we merely make a best-effort
36973 // attempt to help out kernels and other systems where duplicating the
36974 // thunks is costly.
36975 switch (Reg) {
36976 case X86::EAX:
36977 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36978 return "__x86_indirect_thunk_eax";
36979 case X86::ECX:
36980 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36981 return "__x86_indirect_thunk_ecx";
36982 case X86::EDX:
36983 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36984 return "__x86_indirect_thunk_edx";
36985 case X86::EDI:
36986 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36987 return "__x86_indirect_thunk_edi";
36988 case X86::R11:
36989 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36990 return "__x86_indirect_thunk_r11";
36991 }
36992 llvm_unreachable("unexpected reg for external indirect thunk")::llvm::llvm_unreachable_internal("unexpected reg for external indirect thunk"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36992)
;
36993 }
36994
36995 if (Subtarget.useRetpolineIndirectCalls() ||
36996 Subtarget.useRetpolineIndirectBranches()) {
36997 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36998 switch (Reg) {
36999 case X86::EAX:
37000 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37001 return "__llvm_retpoline_eax";
37002 case X86::ECX:
37003 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37004 return "__llvm_retpoline_ecx";
37005 case X86::EDX:
37006 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37007 return "__llvm_retpoline_edx";
37008 case X86::EDI:
37009 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37010 return "__llvm_retpoline_edi";
37011 case X86::R11:
37012 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37013 return "__llvm_retpoline_r11";
37014 }
37015 llvm_unreachable("unexpected reg for retpoline")::llvm::llvm_unreachable_internal("unexpected reg for retpoline"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37015)
;
37016 }
37017
37018 if (Subtarget.useLVIControlFlowIntegrity()) {
37019 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37020 return "__llvm_lvi_thunk_r11";
37021 }
37022 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")::llvm::llvm_unreachable_internal("getIndirectThunkSymbol() invoked without thunk feature"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37022)
;
37023}
37024
37025MachineBasicBlock *
37026X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
37027 MachineBasicBlock *BB) const {
37028 // Copy the virtual register into the R11 physical register and
37029 // call the retpoline thunk.
37030 const DebugLoc &DL = MI.getDebugLoc();
37031 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37032 Register CalleeVReg = MI.getOperand(0).getReg();
37033 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
37034
37035 // Find an available scratch register to hold the callee. On 64-bit, we can
37036 // just use R11, but we scan for uses anyway to ensure we don't generate
37037 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
37038 // already a register use operand to the call to hold the callee. If none
37039 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
37040 // register and ESI is the base pointer to realigned stack frames with VLAs.
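// The end result for the common 64-bit call case looks roughly like
// (illustrative):
//
//   %r11 = COPY %callee_vreg
//   CALL64pcrel32 @__x86_indirect_thunk_r11, implicit killed $r11
//
// i.e. the indirect-call operand is rewritten to an external thunk symbol and
// the chosen scratch register is appended as an implicit killed use.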
37041 SmallVector<unsigned, 3> AvailableRegs;
37042 if (Subtarget.is64Bit())
37043 AvailableRegs.push_back(X86::R11);
37044 else
37045 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
37046
37047 // Zero out any registers that are already used.
37048 for (const auto &MO : MI.operands()) {
37049 if (MO.isReg() && MO.isUse())
37050 for (unsigned &Reg : AvailableRegs)
37051 if (Reg == MO.getReg())
37052 Reg = 0;
37053 }
37054
37055 // Choose the first remaining non-zero available register.
37056 unsigned AvailableReg = 0;
37057 for (unsigned MaybeReg : AvailableRegs) {
37058 if (MaybeReg) {
37059 AvailableReg = MaybeReg;
37060 break;
37061 }
37062 }
37063 if (!AvailableReg)
37064 report_fatal_error("calling convention incompatible with retpoline, no "
37065 "available registers");
37066
37067 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
37068
37069 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
37070 .addReg(CalleeVReg);
37071 MI.getOperand(0).ChangeToES(Symbol);
37072 MI.setDesc(TII->get(Opc));
37073 MachineInstrBuilder(*BB->getParent(), &MI)
37074 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
37075 return BB;
37076}
37077
37078 /// SetJmp implies a future control-flow change upon calling the corresponding
37079 /// LongJmp.
37080/// Instead of using the 'return' instruction, the long jump fixes the stack and
37081/// performs an indirect branch. To do so it uses the registers that were stored
37082/// in the jump buffer (when calling SetJmp).
37083/// In case the shadow stack is enabled we need to fix it as well, because some
37084/// return addresses will be skipped.
37085/// The function will save the SSP for future fixing in the function
37086/// emitLongJmpShadowStackFix.
37087/// \sa emitLongJmpShadowStackFix
37088/// \param [in] MI The temporary Machine Instruction for the builtin.
37089/// \param [in] MBB The Machine Basic Block that will be modified.
37090void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37091 MachineBasicBlock *MBB) const {
37092 const DebugLoc &DL = MI.getDebugLoc();
37093 MachineFunction *MF = MBB->getParent();
37094 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37095 MachineRegisterInfo &MRI = MF->getRegInfo();
37096 MachineInstrBuilder MIB;
37097
37098 // Memory Reference.
37099 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37100 MI.memoperands_end());
37101
37102 // Initialize a register with zero.
37103 MVT PVT = getPointerTy(MF->getDataLayout());
37104 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37105 Register ZReg = MRI.createVirtualRegister(PtrRC);
37106 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37107 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
37108 .addDef(ZReg)
37109 .addReg(ZReg, RegState::Undef)
37110 .addReg(ZReg, RegState::Undef);
37111
37112 // Read the current SSP Register value to the zeroed register.
37113 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37114 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37115 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37116
37117 // Write the SSP register value to pointer slot 3 of the input memory buffer.
37118 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37119 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
37120 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37121 const unsigned MemOpndSlot = 1;
37122 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37123 if (i == X86::AddrDisp)
37124 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37125 else
37126 MIB.add(MI.getOperand(MemOpndSlot + i));
37127 }
37128 MIB.addReg(SSPCopyReg);
37129 MIB.setMemRefs(MMOs);
37130}
37131
37132MachineBasicBlock *
37133X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37134 MachineBasicBlock *MBB) const {
37135 const DebugLoc &DL = MI.getDebugLoc();
37136 MachineFunction *MF = MBB->getParent();
37137 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37138 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37139 MachineRegisterInfo &MRI = MF->getRegInfo();
37140
37141 const BasicBlock *BB = MBB->getBasicBlock();
37142 MachineFunction::iterator I = ++MBB->getIterator();
37143
37144 // Memory Reference
37145 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37146 MI.memoperands_end());
37147
37148 unsigned DstReg;
37149 unsigned MemOpndSlot = 0;
37150
37151 unsigned CurOp = 0;
37152
37153 DstReg = MI.getOperand(CurOp++).getReg();
37154 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37155 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37156 (void)TRI;
37157 Register mainDstReg = MRI.createVirtualRegister(RC);
37158 Register restoreDstReg = MRI.createVirtualRegister(RC);
37159
37160 MemOpndSlot = CurOp;
37161
37162 MVT PVT = getPointerTy(MF->getDataLayout());
37163 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37164 "Invalid Pointer Size!");
37165
37166 // For v = setjmp(buf), we generate
37167 //
37168 // thisMBB:
37169 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37170 // SjLjSetup restoreMBB
37171 //
37172 // mainMBB:
37173 // v_main = 0
37174 //
37175 // sinkMBB:
37176 // v = phi(main, restore)
37177 //
37178 // restoreMBB:
37179 // if base pointer being used, load it from frame
37180 // v_restore = 1
37181
37182 MachineBasicBlock *thisMBB = MBB;
37183 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37184 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37185 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37186 MF->insert(I, mainMBB);
37187 MF->insert(I, sinkMBB);
37188 MF->push_back(restoreMBB);
37189 restoreMBB->setMachineBlockAddressTaken();
37190
37191 MachineInstrBuilder MIB;
37192
37193 // Transfer the remainder of BB and its successor edges to sinkMBB.
37194 sinkMBB->splice(sinkMBB->begin(), MBB,
37195 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37196 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37197
37198 // thisMBB:
37199 unsigned PtrStoreOpc = 0;
37200 unsigned LabelReg = 0;
37201 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37202 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37203 !isPositionIndependent();
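// The jump buffer is addressed in pointer-sized slots: slot 1 (LabelOffset)
// receives the address of restoreMBB below, and, when shadow stacks are in
// use, emitSetJmpShadowStackFix stores the current SSP into slot 3. The
// remaining slots (frame and stack pointer) are filled elsewhere.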
37204
37205 // Prepare IP either in reg or imm.
37206 if (!UseImmLabel) {
37207 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37208 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37209 LabelReg = MRI.createVirtualRegister(PtrRC);
37210 if (Subtarget.is64Bit()) {
37211 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
37212 .addReg(X86::RIP)
37213 .addImm(0)
37214 .addReg(0)
37215 .addMBB(restoreMBB)
37216 .addReg(0);
37217 } else {
37218 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37219 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
37220 .addReg(XII->getGlobalBaseReg(MF))
37221 .addImm(0)
37222 .addReg(0)
37223 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37224 .addReg(0);
37225 }
37226 } else
37227 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37228 // Store IP
37229 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
37230 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37231 if (i == X86::AddrDisp)
37232 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37233 else
37234 MIB.add(MI.getOperand(MemOpndSlot + i));
37235 }
37236 if (!UseImmLabel)
37237 MIB.addReg(LabelReg);
37238 else
37239 MIB.addMBB(restoreMBB);
37240 MIB.setMemRefs(MMOs);
37241
37242 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37243 emitSetJmpShadowStackFix(MI, thisMBB);
37244 }
37245
37246 // Setup
37247 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
37248 .addMBB(restoreMBB);
37249
37250 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37251 MIB.addRegMask(RegInfo->getNoPreservedMask());
37252 thisMBB->addSuccessor(mainMBB);
37253 thisMBB->addSuccessor(restoreMBB);
37254
37255 // mainMBB:
37256 // EAX = 0
37257 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
37258 mainMBB->addSuccessor(sinkMBB);
37259
37260 // sinkMBB:
37261 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
37262 TII->get(X86::PHI), DstReg)
37263 .addReg(mainDstReg).addMBB(mainMBB)
37264 .addReg(restoreDstReg).addMBB(restoreMBB);
37265
37266 // restoreMBB:
37267 if (RegInfo->hasBasePointer(*MF)) {
37268 const bool Uses64BitFramePtr =
37269 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37270 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37271 X86FI->setRestoreBasePointer(MF);
37272 Register FramePtr = RegInfo->getFrameRegister(*MF);
37273 Register BasePtr = RegInfo->getBaseRegister();
37274 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37275 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
37276 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37277 .setMIFlag(MachineInstr::FrameSetup);
37278 }
37279 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37280 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37281 restoreMBB->addSuccessor(sinkMBB);
37282
37283 MI.eraseFromParent();
37284 return sinkMBB;
37285}
37286
37287/// Fix the shadow stack using the previously saved SSP pointer.
37288/// \sa emitSetJmpShadowStackFix
37289/// \param [in] MI The temporary Machine Instruction for the builtin.
37290/// \param [in] MBB The Machine Basic Block that will be modified.
37291/// \return The sink MBB that will perform the future indirect branch.
37292MachineBasicBlock *
37293X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37294 MachineBasicBlock *MBB) const {
37295 const DebugLoc &DL = MI.getDebugLoc();
37296 MachineFunction *MF = MBB->getParent();
37297 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37298 MachineRegisterInfo &MRI = MF->getRegInfo();
37299
37300 // Memory Reference
37301 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37302 MI.memoperands_end());
37303
37304 MVT PVT = getPointerTy(MF->getDataLayout());
37305 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37306
37307 // checkSspMBB:
37308 // xor vreg1, vreg1
37309 // rdssp vreg1
37310 // test vreg1, vreg1
37311 // je sinkMBB # Jump if Shadow Stack is not supported
37312 // fallMBB:
37313 // mov buf+24/12(%rip), vreg2
37314 // sub vreg1, vreg2
37315 // jbe sinkMBB # No need to fix the Shadow Stack
37316 // fixShadowMBB:
37317 // shr 3/2, vreg2
37318 // incssp vreg2 # fix the SSP according to the lower 8 bits
37319 // shr 8, vreg2
37320 // je sinkMBB
37321 // fixShadowLoopPrepareMBB:
37322 // shl vreg2
37323 // mov 128, vreg3
37324 // fixShadowLoopMBB:
37325 // incssp vreg3
37326 // dec vreg2
37327 // jne fixShadowLoopMBB # Iterate until you finish fixing
37328 // # the Shadow Stack
37329 // sinkMBB:
37330
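        // Annotation (not part of the original source): the block comment above
        // compresses the delta arithmetic. A minimal sketch, assuming a
        // hypothetical helper incssp_units() standing in for the INCSSP
        // instruction (on 32-bit the shift is 2 instead of 3):
        //
        //   uint64_t Delta = (PrevSSP - CurSSP) >> 3;  // 8-byte slots on x86-64
        //   incssp_units(Delta & 0xFF);                // INCSSP uses 8 bits
        //   for (uint64_t N = (Delta >> 8) << 1; N != 0; --N)
        //     incssp_units(128);                       // 2 x 128 = 256 slots per
        //                                              // (Delta >> 8) unit
        //
        // Each iteration advances the SSP by 128 slots; the counter is the high
        // part of the delta doubled, so the loop covers Delta & ~0xFF exactly.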
37331 MachineFunction::iterator I = ++MBB->getIterator();
37332 const BasicBlock *BB = MBB->getBasicBlock();
37333
37334 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37335 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37336 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37337 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37338 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37339 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37340 MF->insert(I, checkSspMBB);
37341 MF->insert(I, fallMBB);
37342 MF->insert(I, fixShadowMBB);
37343 MF->insert(I, fixShadowLoopPrepareMBB);
37344 MF->insert(I, fixShadowLoopMBB);
37345 MF->insert(I, sinkMBB);
37346
37347 // Transfer the remainder of BB and its successor edges to sinkMBB.
37348 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37349 MBB->end());
37350 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37351
37352 MBB->addSuccessor(checkSspMBB);
37353
37354 // Initialize a register with zero.
37355 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37356 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
37357
37358 if (PVT == MVT::i64) {
37359 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37360 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37361 .addImm(0)
37362 .addReg(ZReg)
37363 .addImm(X86::sub_32bit);
37364 ZReg = TmpZReg;
37365 }
37366
37367 // Read the current SSP Register value to the zeroed register.
37368 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37369 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37370 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37371
37372   // Check whether the value read from the SSP register is zero and jump
37373   // directly to the sink.
37374 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37375 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
37376 .addReg(SSPCopyReg)
37377 .addReg(SSPCopyReg);
37378 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37379 checkSspMBB->addSuccessor(sinkMBB);
37380 checkSspMBB->addSuccessor(fallMBB);
37381
37382 // Reload the previously saved SSP register value.
37383 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37384 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37385 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37386 MachineInstrBuilder MIB =
37387 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
37388 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37389 const MachineOperand &MO = MI.getOperand(i);
37390 if (i == X86::AddrDisp)
37391 MIB.addDisp(MO, SPPOffset);
37392 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37393 // preserve kill flags.
37394 MIB.addReg(MO.getReg());
37395 else
37396 MIB.add(MO);
37397 }
37398 MIB.setMemRefs(MMOs);
37399
37400 // Subtract the current SSP from the previous SSP.
37401 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37402 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37403 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
37404 .addReg(PrevSSPReg)
37405 .addReg(SSPCopyReg);
37406
37407 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37408 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
37409 fallMBB->addSuccessor(sinkMBB);
37410 fallMBB->addSuccessor(fixShadowMBB);
37411
37412 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37413 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37414 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37415 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37416 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
37417 .addReg(SspSubReg)
37418 .addImm(Offset);
37419
37420   // Increase the SSP, looking only at the lower 8 bits of the delta.
37421 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37422 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37423
37424 // Reset the lower 8 bits.
37425 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37426 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
37427 .addReg(SspFirstShrReg)
37428 .addImm(8);
37429
37430 // Jump if the result of the shift is zero.
37431 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37432 fixShadowMBB->addSuccessor(sinkMBB);
37433 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37434
37435 // Do a single shift left.
37436 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
37437 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37438 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
37439 .addReg(SspSecondShrReg);
37440
37441 // Save the value 128 to a register (will be used next with incssp).
37442 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37443 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37444 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
37445 .addImm(128);
37446 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37447
37448 // Since incssp only looks at the lower 8 bits, we might need to do several
37449 // iterations of incssp until we finish fixing the shadow stack.
37450 Register DecReg = MRI.createVirtualRegister(PtrRC);
37451 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37452 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
37453 .addReg(SspAfterShlReg)
37454 .addMBB(fixShadowLoopPrepareMBB)
37455 .addReg(DecReg)
37456 .addMBB(fixShadowLoopMBB);
37457
37458 // Every iteration we increase the SSP by 128.
37459 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
37460
37461 // Every iteration we decrement the counter by 1.
37462 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37463 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
37464
37465 // Jump if the counter is not zero yet.
37466 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
37467 fixShadowLoopMBB->addSuccessor(sinkMBB);
37468 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37469
37470 return sinkMBB;
37471}
37472
37473MachineBasicBlock *
37474X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37475 MachineBasicBlock *MBB) const {
37476 const DebugLoc &DL = MI.getDebugLoc();
37477 MachineFunction *MF = MBB->getParent();
37478 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37479 MachineRegisterInfo &MRI = MF->getRegInfo();
37480
37481 // Memory Reference
37482 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37483 MI.memoperands_end());
37484
37485 MVT PVT = getPointerTy(MF->getDataLayout());
37486   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37487          "Invalid Pointer Size!");
37488
37489 const TargetRegisterClass *RC =
37490 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37491 Register Tmp = MRI.createVirtualRegister(RC);
37492 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37493 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37494 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37495 Register SP = RegInfo->getStackRegister();
37496
37497 MachineInstrBuilder MIB;
37498
37499 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37500 const int64_t SPOffset = 2 * PVT.getStoreSize();
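  // Annotation (not part of the original source): the offsets below follow the
  // SjLj buffer layout visible in this function and in
  // emitLongJmpShadowStackFix: slot 0 holds the frame pointer, slot 1
  // (LabelOffset) the resume IP, slot 2 (SPOffset) the stack pointer, and
  // slot 3 the previously saved SSP when shadow stacks are in use. Each slot
  // is PVT.getStoreSize() bytes wide.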
37501
37502 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37503 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37504
37505 MachineBasicBlock *thisMBB = MBB;
37506
37507   // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
37508 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37509 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37510 }
37511
37512 // Reload FP
37513 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37514 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37515 const MachineOperand &MO = MI.getOperand(i);
37516 if (MO.isReg()) // Don't add the whole operand, we don't want to
37517 // preserve kill flags.
37518 MIB.addReg(MO.getReg());
37519 else
37520 MIB.add(MO);
37521 }
37522 MIB.setMemRefs(MMOs);
37523
37524 // Reload IP
37525 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37526 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37527 const MachineOperand &MO = MI.getOperand(i);
37528 if (i == X86::AddrDisp)
37529 MIB.addDisp(MO, LabelOffset);
37530 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37531 // preserve kill flags.
37532 MIB.addReg(MO.getReg());
37533 else
37534 MIB.add(MO);
37535 }
37536 MIB.setMemRefs(MMOs);
37537
37538 // Reload SP
37539 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37540 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37541 if (i == X86::AddrDisp)
37542 MIB.addDisp(MI.getOperand(i), SPOffset);
37543 else
37544 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37545 // the last instruction of the expansion.
37546 }
37547 MIB.setMemRefs(MMOs);
37548
37549 // Jump
37550 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37551
37552 MI.eraseFromParent();
37553 return thisMBB;
37554}
37555
37556void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37557 MachineBasicBlock *MBB,
37558 MachineBasicBlock *DispatchBB,
37559 int FI) const {
37560 const DebugLoc &DL = MI.getDebugLoc();
37561 MachineFunction *MF = MBB->getParent();
37562 MachineRegisterInfo *MRI = &MF->getRegInfo();
37563 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37564
37565 MVT PVT = getPointerTy(MF->getDataLayout());
37566   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37567
37568 unsigned Op = 0;
37569 unsigned VR = 0;
37570
37571 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37572 !isPositionIndependent();
37573
37574 if (UseImmLabel) {
37575 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37576 } else {
37577 const TargetRegisterClass *TRC =
37578 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37579 VR = MRI->createVirtualRegister(TRC);
37580 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37581
37582 if (Subtarget.is64Bit())
37583 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37584 .addReg(X86::RIP)
37585 .addImm(1)
37586 .addReg(0)
37587 .addMBB(DispatchBB)
37588 .addReg(0);
37589 else
37590 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37591 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37592 .addImm(1)
37593 .addReg(0)
37594 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37595 .addReg(0);
37596 }
37597
37598 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37599 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37600 if (UseImmLabel)
37601 MIB.addMBB(DispatchBB);
37602 else
37603 MIB.addReg(VR);
37604}
37605
37606MachineBasicBlock *
37607X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37608 MachineBasicBlock *BB) const {
37609 const DebugLoc &DL = MI.getDebugLoc();
37610 MachineFunction *MF = BB->getParent();
37611 MachineRegisterInfo *MRI = &MF->getRegInfo();
37612 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37613 int FI = MF->getFrameInfo().getFunctionContextIndex();
37614
37615 // Get a mapping of the call site numbers to all of the landing pads they're
37616 // associated with.
37617 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37618 unsigned MaxCSNum = 0;
37619 for (auto &MBB : *MF) {
37620 if (!MBB.isEHPad())
37621 continue;
37622
37623 MCSymbol *Sym = nullptr;
37624 for (const auto &MI : MBB) {
37625 if (MI.isDebugInstr())
37626 continue;
37627
37628       assert(MI.isEHLabel() && "expected EH_LABEL");
37629 Sym = MI.getOperand(0).getMCSymbol();
37630 break;
37631 }
37632
37633 if (!MF->hasCallSiteLandingPad(Sym))
37634 continue;
37635
37636 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37637 CallSiteNumToLPad[CSI].push_back(&MBB);
37638 MaxCSNum = std::max(MaxCSNum, CSI);
37639 }
37640 }
37641
37642 // Get an ordered list of the machine basic blocks for the jump table.
37643 std::vector<MachineBasicBlock *> LPadList;
37644 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37645 LPadList.reserve(CallSiteNumToLPad.size());
37646
37647 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37648 for (auto &LP : CallSiteNumToLPad[CSI]) {
37649 LPadList.push_back(LP);
37650 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37651 }
37652 }
37653
37654   assert(!LPadList.empty() &&
37655          "No landing pad destinations for the dispatch jump table!");
37656
37657 // Create the MBBs for the dispatch code.
37658
37659 // Shove the dispatch's address into the return slot in the function context.
37660 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37661 DispatchBB->setIsEHPad(true);
37662
37663 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37664 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37665 DispatchBB->addSuccessor(TrapBB);
37666
37667 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37668 DispatchBB->addSuccessor(DispContBB);
37669
37670 // Insert MBBs.
37671 MF->push_back(DispatchBB);
37672 MF->push_back(DispContBB);
37673 MF->push_back(TrapBB);
37674
37675 // Insert code into the entry block that creates and registers the function
37676 // context.
37677 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37678
37679 // Create the jump table and associated information
37680 unsigned JTE = getJumpTableEncoding();
37681 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37682 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37683
37684 const X86RegisterInfo &RI = TII->getRegisterInfo();
37685 // Add a register mask with no preserved registers. This results in all
37686 // registers being marked as clobbered.
37687 if (RI.hasBasePointer(*MF)) {
37688 const bool FPIs64Bit =
37689 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37690 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37691 MFI->setRestoreBasePointer(MF);
37692
37693 Register FP = RI.getFrameRegister(*MF);
37694 Register BP = RI.getBaseRegister();
37695 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37696 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37697 MFI->getRestoreBasePointerOffset())
37698 .addRegMask(RI.getNoPreservedMask());
37699 } else {
37700 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37701 .addRegMask(RI.getNoPreservedMask());
37702 }
37703
37704 // IReg is used as an index in a memory operand and therefore can't be SP
37705 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37706 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37707 Subtarget.is64Bit() ? 8 : 4);
37708 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37709 .addReg(IReg)
37710 .addImm(LPadList.size());
37711 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37712
37713 if (Subtarget.is64Bit()) {
37714 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37715 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37716
37717 // leaq .LJTI0_0(%rip), BReg
37718 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37719 .addReg(X86::RIP)
37720 .addImm(1)
37721 .addReg(0)
37722 .addJumpTableIndex(MJTI)
37723 .addReg(0);
37724 // movzx IReg64, IReg
37725 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37726 .addImm(0)
37727 .addReg(IReg)
37728 .addImm(X86::sub_32bit);
37729
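    // Annotation (not part of the original source): the two encodings handled
    // below differ in entry size and addressing. EK_BlockAddress entries are
    // absolute block addresses (8 bytes each on x86-64), so a scaled indirect
    // jump suffices. EK_LabelDifference32 entries are 4-byte offsets relative
    // to the jump table base (the PIC-friendly form), so each entry is loaded,
    // sign-extended, and added back to BReg before the indirect jump.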
37730 switch (JTE) {
37731 case MachineJumpTableInfo::EK_BlockAddress:
37732 // jmpq *(BReg,IReg64,8)
37733 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37734 .addReg(BReg)
37735 .addImm(8)
37736 .addReg(IReg64)
37737 .addImm(0)
37738 .addReg(0);
37739 break;
37740 case MachineJumpTableInfo::EK_LabelDifference32: {
37741 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37742 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37743 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37744
37745 // movl (BReg,IReg64,4), OReg
37746 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37747 .addReg(BReg)
37748 .addImm(4)
37749 .addReg(IReg64)
37750 .addImm(0)
37751 .addReg(0);
37752 // movsx OReg64, OReg
37753 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37754 // addq BReg, OReg64, TReg
37755 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37756 .addReg(OReg64)
37757 .addReg(BReg);
37758 // jmpq *TReg
37759 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37760 break;
37761 }
37762 default:
37763       llvm_unreachable("Unexpected jump table encoding");
37764 }
37765 } else {
37766 // jmpl *.LJTI0_0(,IReg,4)
37767 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
37768 .addReg(0)
37769 .addImm(4)
37770 .addReg(IReg)
37771 .addJumpTableIndex(MJTI)
37772 .addReg(0);
37773 }
37774
37775 // Add the jump table entries as successors to the MBB.
37776 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37777 for (auto &LP : LPadList)
37778 if (SeenMBBs.insert(LP).second)
37779 DispContBB->addSuccessor(LP);
37780
37781 // N.B. the order the invoke BBs are processed in doesn't matter here.
37782 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37783 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37784 for (MachineBasicBlock *MBB : InvokeBBs) {
37785 // Remove the landing pad successor from the invoke block and replace it
37786 // with the new dispatch block.
37787 // Keep a copy of Successors since it's modified inside the loop.
37788 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37789 MBB->succ_rend());
37790 // FIXME: Avoid quadratic complexity.
37791 for (auto *MBBS : Successors) {
37792 if (MBBS->isEHPad()) {
37793 MBB->removeSuccessor(MBBS);
37794 MBBLPads.push_back(MBBS);
37795 }
37796 }
37797
37798 MBB->addSuccessor(DispatchBB);
37799
37800 // Find the invoke call and mark all of the callee-saved registers as
37801 // 'implicit defined' so that they're spilled. This prevents code from
37802 // moving instructions to before the EH block, where they will never be
37803 // executed.
37804 for (auto &II : reverse(*MBB)) {
37805 if (!II.isCall())
37806 continue;
37807
37808 DenseMap<unsigned, bool> DefRegs;
37809 for (auto &MOp : II.operands())
37810 if (MOp.isReg())
37811 DefRegs[MOp.getReg()] = true;
37812
37813 MachineInstrBuilder MIB(*MF, &II);
37814 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37815 unsigned Reg = SavedRegs[RegIdx];
37816 if (!DefRegs[Reg])
37817 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37818 }
37819
37820 break;
37821 }
37822 }
37823
37824 // Mark all former landing pads as non-landing pads. The dispatch is the only
37825 // landing pad now.
37826 for (auto &LP : MBBLPads)
37827 LP->setIsEHPad(false);
37828
37829 // The instruction is gone now.
37830 MI.eraseFromParent();
37831 return BB;
37832}
37833
37834MachineBasicBlock *
37835X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37836 MachineBasicBlock *BB) const {
37837 MachineFunction *MF = BB->getParent();
37838 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37839 const DebugLoc &DL = MI.getDebugLoc();
37840
37841 auto TMMImmToTMMReg = [](unsigned Imm) {
37842     assert(Imm < 8 && "Illegal tmm index");
37843 return X86::TMM0 + Imm;
37844 };
37845 switch (MI.getOpcode()) {
37846   default: llvm_unreachable("Unexpected instr type to insert");
37847 case X86::TLS_addr32:
37848 case X86::TLS_addr64:
37849 case X86::TLS_addrX32:
37850 case X86::TLS_base_addr32:
37851 case X86::TLS_base_addr64:
37852 case X86::TLS_base_addrX32:
37853 return EmitLoweredTLSAddr(MI, BB);
37854 case X86::INDIRECT_THUNK_CALL32:
37855 case X86::INDIRECT_THUNK_CALL64:
37856 case X86::INDIRECT_THUNK_TCRETURN32:
37857 case X86::INDIRECT_THUNK_TCRETURN64:
37858 return EmitLoweredIndirectThunk(MI, BB);
37859 case X86::CATCHRET:
37860 return EmitLoweredCatchRet(MI, BB);
37861 case X86::SEG_ALLOCA_32:
37862 case X86::SEG_ALLOCA_64:
37863 return EmitLoweredSegAlloca(MI, BB);
37864 case X86::PROBED_ALLOCA_32:
37865 case X86::PROBED_ALLOCA_64:
37866 return EmitLoweredProbedAlloca(MI, BB);
37867 case X86::TLSCall_32:
37868 case X86::TLSCall_64:
37869 return EmitLoweredTLSCall(MI, BB);
37870 case X86::CMOV_FR16:
37871 case X86::CMOV_FR16X:
37872 case X86::CMOV_FR32:
37873 case X86::CMOV_FR32X:
37874 case X86::CMOV_FR64:
37875 case X86::CMOV_FR64X:
37876 case X86::CMOV_GR8:
37877 case X86::CMOV_GR16:
37878 case X86::CMOV_GR32:
37879 case X86::CMOV_RFP32:
37880 case X86::CMOV_RFP64:
37881 case X86::CMOV_RFP80:
37882 case X86::CMOV_VR64:
37883 case X86::CMOV_VR128:
37884 case X86::CMOV_VR128X:
37885 case X86::CMOV_VR256:
37886 case X86::CMOV_VR256X:
37887 case X86::CMOV_VR512:
37888 case X86::CMOV_VK1:
37889 case X86::CMOV_VK2:
37890 case X86::CMOV_VK4:
37891 case X86::CMOV_VK8:
37892 case X86::CMOV_VK16:
37893 case X86::CMOV_VK32:
37894 case X86::CMOV_VK64:
37895 return EmitLoweredSelect(MI, BB);
37896
37897 case X86::FP80_ADDr:
37898 case X86::FP80_ADDm32: {
37899 // Change the floating point control register to use double extended
37900 // precision when performing the addition.
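    // Annotation (not part of the original source): the x87 control word's
    // precision-control field occupies bits 8-9; 0b11 there selects 64-bit
    // (double extended) precision, which is why 0x300 is OR'ed in below.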
37901 int OrigCWFrameIdx =
37902 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37903 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
37904 OrigCWFrameIdx);
37905
37906 // Load the old value of the control word...
37907 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37908 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37909 OrigCWFrameIdx);
37910
37911     // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37912     // precision.
37913 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37914 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37915 .addReg(OldCW, RegState::Kill)
37916 .addImm(0x300);
37917
37918 // Extract to 16 bits.
37919 Register NewCW16 =
37920 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37921 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37922 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37923
37924 // Prepare memory for FLDCW.
37925 int NewCWFrameIdx =
37926 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37927 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37928 NewCWFrameIdx)
37929 .addReg(NewCW16, RegState::Kill);
37930
37931 // Reload the modified control word now...
37932 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37933 NewCWFrameIdx);
37934
37935 // Do the addition.
37936 if (MI.getOpcode() == X86::FP80_ADDr) {
37937 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
37938 .add(MI.getOperand(0))
37939 .add(MI.getOperand(1))
37940 .add(MI.getOperand(2));
37941 } else {
37942 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
37943 .add(MI.getOperand(0))
37944 .add(MI.getOperand(1))
37945 .add(MI.getOperand(2))
37946 .add(MI.getOperand(3))
37947 .add(MI.getOperand(4))
37948 .add(MI.getOperand(5))
37949 .add(MI.getOperand(6));
37950 }
37951
37952 // Reload the original control word now.
37953 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37954 OrigCWFrameIdx);
37955
37956 MI.eraseFromParent(); // The pseudo instruction is gone now.
37957 return BB;
37958 }
37959
37960 case X86::FP32_TO_INT16_IN_MEM:
37961 case X86::FP32_TO_INT32_IN_MEM:
37962 case X86::FP32_TO_INT64_IN_MEM:
37963 case X86::FP64_TO_INT16_IN_MEM:
37964 case X86::FP64_TO_INT32_IN_MEM:
37965 case X86::FP64_TO_INT64_IN_MEM:
37966 case X86::FP80_TO_INT16_IN_MEM:
37967 case X86::FP80_TO_INT32_IN_MEM:
37968 case X86::FP80_TO_INT64_IN_MEM: {
37969 // Change the floating point control register to use "round towards zero"
37970 // mode when truncating to an integer value.
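    // Annotation (not part of the original source): the x87 control word's
    // rounding-control field occupies bits 10-11; 0b11 there selects
    // "round toward zero" (truncation), which is why 0xC00 is OR'ed in below.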
37971 int OrigCWFrameIdx =
37972 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37973 addFrameReference(BuildMI(*BB, MI, DL,
37974 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
37975
37976 // Load the old value of the control word...
37977 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37978 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37979 OrigCWFrameIdx);
37980
37981     // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37982 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37983 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37984 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37985
37986 // Extract to 16 bits.
37987 Register NewCW16 =
37988 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37989 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37990 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37991
37992 // Prepare memory for FLDCW.
37993 int NewCWFrameIdx =
37994 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37995 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37996 NewCWFrameIdx)
37997 .addReg(NewCW16, RegState::Kill);
37998
37999 // Reload the modified control word now...
38000 addFrameReference(BuildMI(*BB, MI, DL,
38001 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
38002
38003 // Get the X86 opcode to use.
38004 unsigned Opc;
38005 switch (MI.getOpcode()) {
38006     default: llvm_unreachable("illegal opcode!");
38007 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
38008 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
38009 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
38010 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
38011 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
38012 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
38013 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
38014 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
38015 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
38016 }
38017
38018 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38019 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
38020 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
38021
38022 // Reload the original control word now.
38023 addFrameReference(BuildMI(*BB, MI, DL,
38024 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
38025
38026 MI.eraseFromParent(); // The pseudo instruction is gone now.
38027 return BB;
38028 }
38029
38030 // xbegin
38031 case X86::XBEGIN:
38032 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
38033
38034 case X86::VAARG_64:
38035 case X86::VAARG_X32:
38036 return EmitVAARGWithCustomInserter(MI, BB);
38037
38038 case X86::EH_SjLj_SetJmp32:
38039 case X86::EH_SjLj_SetJmp64:
38040 return emitEHSjLjSetJmp(MI, BB);
38041
38042 case X86::EH_SjLj_LongJmp32:
38043 case X86::EH_SjLj_LongJmp64:
38044 return emitEHSjLjLongJmp(MI, BB);
38045
38046 case X86::Int_eh_sjlj_setup_dispatch:
38047 return EmitSjLjDispatchBlock(MI, BB);
38048
38049 case TargetOpcode::STATEPOINT:
38050 // As an implementation detail, STATEPOINT shares the STACKMAP format at
38051 // this point in the process. We diverge later.
38052 return emitPatchPoint(MI, BB);
38053
38054 case TargetOpcode::STACKMAP:
38055 case TargetOpcode::PATCHPOINT:
38056 return emitPatchPoint(MI, BB);
38057
38058 case TargetOpcode::PATCHABLE_EVENT_CALL:
38059 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38060 return BB;
38061
38062 case X86::LCMPXCHG8B: {
38063 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38064     // In addition to the four E[ABCD] registers implied by its encoding,
38065     // CMPXCHG8B requires a memory operand. If the current architecture is
38066     // i686 and the current function needs a base pointer - which is ESI
38067     // on i686 - the register allocator would not be able to allocate
38068     // registers for an address of the form X(%reg, %reg, Y): there would
38069     // never be enough unreserved registers during regalloc (without the
38070     // base pointer the only option would be X(%edi, %esi, Y)).
38071     // We give the register allocator a hand by precomputing the address in
38072     // a new vreg using LEA.
38073
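    // Annotation (not part of the original source): a sketch of the rewrite
    // this block performs, in AT&T syntax:
    //
    //   before:  lock cmpxchg8b disp(%base, %index, scale)
    //   after:   leal disp(%base, %index, scale), %vreg
    //            lock cmpxchg8b (%vreg)
    //
    // so the register allocator only has to find a single address register.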
38074 // If it is not i686 or there is no base pointer - nothing to do here.
38075 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38076 return BB;
38077
38078     // Even though this code does not necessarily need the base pointer to
38079     // be ESI, we check for that. The reason: if this assert fails, something
38080     // has changed in the compiler's base pointer handling, and it most
38081     // probably has to be addressed here as well.
38082     assert(TRI->getBaseRegister() == X86::ESI &&
38083            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38084            "base pointer in mind");
38085
38086 MachineRegisterInfo &MRI = MF->getRegInfo();
38087 MVT SPTy = getPointerTy(MF->getDataLayout());
38088 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38089 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38090
38091 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38092 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38093 // does not use index register.
38094 if (AM.IndexReg == X86::NoRegister)
38095 return BB;
38096
38097 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38098 // four operand definitions that are E[ABCD] registers. We skip them and
38099 // then insert the LEA.
38100 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38101 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
38102 RMBBI->definesRegister(X86::EBX) ||
38103 RMBBI->definesRegister(X86::ECX) ||
38104 RMBBI->definesRegister(X86::EDX))) {
38105 ++RMBBI;
38106 }
38107 MachineBasicBlock::iterator MBBI(RMBBI);
38108 addFullAddress(
38109 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
38110
38111 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38112
38113 return BB;
38114 }
38115 case X86::LCMPXCHG16B_NO_RBX: {
38116 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38117 Register BasePtr = TRI->getBaseRegister();
38118 if (TRI->hasBasePointer(*MF) &&
38119 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38120 if (!BB->isLiveIn(BasePtr))
38121 BB->addLiveIn(BasePtr);
38122 // Save RBX into a virtual register.
38123 Register SaveRBX =
38124 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38125 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38126 .addReg(X86::RBX);
38127 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38128 MachineInstrBuilder MIB =
38129 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38130 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38131 MIB.add(MI.getOperand(Idx));
38132 MIB.add(MI.getOperand(X86::AddrNumOperands));
38133 MIB.addReg(SaveRBX);
38134 } else {
38135 // Simple case, just copy the virtual register to RBX.
38136 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
38137 .add(MI.getOperand(X86::AddrNumOperands));
38138 MachineInstrBuilder MIB =
38139 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
38140 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38141 MIB.add(MI.getOperand(Idx));
38142 }
38143 MI.eraseFromParent();
38144 return BB;
38145 }
38146 case X86::MWAITX: {
38147 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38148 Register BasePtr = TRI->getBaseRegister();
38149 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38150     // If there is no need to save the base pointer, we generate MWAITXrrr;
38151     // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38152 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38153 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38154 .addReg(MI.getOperand(0).getReg());
38155 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38156 .addReg(MI.getOperand(1).getReg());
38157 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
38158 .addReg(MI.getOperand(2).getReg());
38159 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
38160 MI.eraseFromParent();
38161 } else {
38162 if (!BB->isLiveIn(BasePtr)) {
38163 BB->addLiveIn(BasePtr);
38164 }
38165 // Parameters can be copied into ECX and EAX but not EBX yet.
38166 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38167 .addReg(MI.getOperand(0).getReg());
38168 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38169 .addReg(MI.getOperand(1).getReg());
38170       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38171 // Save RBX into a virtual register.
38172 Register SaveRBX =
38173 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38174 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38175 .addReg(X86::RBX);
38176 // Generate mwaitx pseudo.
38177 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38178 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
38179 .addDef(Dst) // Destination tied in with SaveRBX.
38180 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38181 .addUse(SaveRBX); // Save of base pointer.
38182 MI.eraseFromParent();
38183 }
38184 return BB;
38185 }
38186 case TargetOpcode::PREALLOCATED_SETUP: {
38187     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38188 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38189 MFI->setHasPreallocatedCall(true);
38190 int64_t PreallocatedId = MI.getOperand(0).getImm();
38191 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38192     assert(StackAdjustment != 0 && "0 stack adjustment");
38193     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38194                       << StackAdjustment << "\n");
38195 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
38196 .addReg(X86::ESP)
38197 .addImm(StackAdjustment);
38198 MI.eraseFromParent();
38199 return BB;
38200 }
38201 case TargetOpcode::PREALLOCATED_ARG: {
38202     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38203 int64_t PreallocatedId = MI.getOperand(1).getImm();
38204 int64_t ArgIdx = MI.getOperand(2).getImm();
38205 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38206 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38207     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38208                       << ", arg offset " << ArgOffset << "\n");
38209 // stack pointer + offset
38210 addRegOffset(
38211 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
38212 X86::ESP, false, ArgOffset);
38213 MI.eraseFromParent();
38214 return BB;
38215 }
38216 case X86::PTDPBSSD:
38217 case X86::PTDPBSUD:
38218 case X86::PTDPBUSD:
38219 case X86::PTDPBUUD:
38220 case X86::PTDPBF16PS:
38221 case X86::PTDPFP16PS: {
38222 unsigned Opc;
38223 switch (MI.getOpcode()) {
38224     default: llvm_unreachable("illegal opcode!");
38225 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38226 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38227 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38228 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38229 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38230 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38231 }
38232
38233 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38234 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38235 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38236 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38237 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38238
38239 MI.eraseFromParent(); // The pseudo is gone now.
38240 return BB;
38241 }
38242 case X86::PTILEZERO: {
38243 unsigned Imm = MI.getOperand(0).getImm();
38244 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38245 MI.eraseFromParent(); // The pseudo is gone now.
38246 return BB;
38247 }
38248 case X86::PTILELOADD:
38249 case X86::PTILELOADDT1:
38250 case X86::PTILESTORED: {
38251 unsigned Opc;
38252 switch (MI.getOpcode()) {
38253     default: llvm_unreachable("illegal opcode!");
38254 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
38255 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
38256 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
38257 }
38258
38259 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38260 unsigned CurOp = 0;
38261 if (Opc != X86::TILESTORED)
38262 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38263 RegState::Define);
38264
38265 MIB.add(MI.getOperand(CurOp++)); // base
38266 MIB.add(MI.getOperand(CurOp++)); // scale
38267 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38268 MIB.add(MI.getOperand(CurOp++)); // displacement
38269 MIB.add(MI.getOperand(CurOp++)); // segment
38270
38271 if (Opc == X86::TILESTORED)
38272 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38273 RegState::Undef);
38274
38275 MI.eraseFromParent(); // The pseudo is gone now.
38276 return BB;
38277 }
38278 case X86::PTCMMIMFP16PS:
38279 case X86::PTCMMRLFP16PS: {
38280 const DebugLoc &DL = MI.getDebugLoc();
38281 unsigned Opc;
38282 switch (MI.getOpcode()) {
38283     default: llvm_unreachable("Unexpected instruction!");
38284 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
38285 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
38286 }
38287 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38288 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38289 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38290 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38291 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38292 MI.eraseFromParent(); // The pseudo is gone now.
38293 return BB;
38294 }
38295 }
38296}
38297
38298//===----------------------------------------------------------------------===//
38299// X86 Optimization Hooks
38300//===----------------------------------------------------------------------===//
38301
38302bool
38303X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38304 const APInt &DemandedBits,
38305 const APInt &DemandedElts,
38306 TargetLoweringOpt &TLO) const {
38307 EVT VT = Op.getValueType();
38308 unsigned Opcode = Op.getOpcode();
38309 unsigned EltSize = VT.getScalarSizeInBits();
38310
38311 if (VT.isVector()) {
38312     // If the constant is all sign bits within the active bits, then we should
38313     // sign-extend it to the entire constant to allow it to act as a boolean
38314     // constant vector.
38315 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38316 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38317 return false;
38318 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38319 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38320 continue;
38321 const APInt &Val = V.getConstantOperandAPInt(i);
38322 if (Val.getBitWidth() > Val.getNumSignBits() &&
38323 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38324 return true;
38325 }
38326 return false;
38327 };
38328 // For vectors - if we have a constant, then try to sign extend.
38329 // TODO: Handle AND/ANDN cases.
38330 unsigned ActiveBits = DemandedBits.getActiveBits();
38331 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38332 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
38333 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38334 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38335 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38336 VT.getVectorNumElements());
38337 SDValue NewC =
38338 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38339 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38340 SDValue NewOp =
38341 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38342 return TLO.CombineTo(Op, NewOp);
38343 }
38344 return false;
38345 }
38346
38347 // Only optimize Ands to prevent shrinking a constant that could be
38348 // matched by movzx.
38349 if (Opcode != ISD::AND)
38350 return false;
38351
38352 // Make sure the RHS really is a constant.
38353 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38354 if (!C)
38355 return false;
38356
38357 const APInt &Mask = C->getAPIntValue();
38358
38359 // Clear all non-demanded bits initially.
38360 APInt ShrunkMask = Mask & DemandedBits;
38361
38362 // Find the width of the shrunk mask.
38363 unsigned Width = ShrunkMask.getActiveBits();
38364
38365 // If the mask is all 0s there's nothing to do here.
38366 if (Width == 0)
38367 return false;
38368
38369 // Find the next power of 2 width, rounding up to a byte.
38370 Width = llvm::bit_ceil(std::max(Width, 8U));
38371 // Truncate the width to size to handle illegal types.
38372 Width = std::min(Width, EltSize);
38373
38374 // Calculate a possible zero extend mask for this constant.
38375 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
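  // Annotation (not part of the original source), an illustrative example:
  // with EltSize = 32, Mask = 0xFE and DemandedBits = 0xFFFFFFFE, ShrunkMask
  // is 0xFE, Width rounds up to 8, and ZeroExtendMask becomes 0xFF. Since 0xFF
  // is a subset of Mask | ~DemandedBits (= 0xFF), the AND constant is widened
  // to 0xFF, which can later be matched as a movzbl.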
38376
38377 // If we aren't changing the mask, just return true to keep it and prevent
38378 // the caller from optimizing.
38379 if (ZeroExtendMask == Mask)
38380 return true;
38381
38382 // Make sure the new mask can be represented by a combination of mask bits
38383 // and non-demanded bits.
38384 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38385 return false;
38386
38387 // Replace the constant with the zero extend mask.
38388 SDLoc DL(Op);
38389 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38390 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38391 return TLO.CombineTo(Op, NewOp);
38392}
38393
38394void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38395 KnownBits &Known,
38396 const APInt &DemandedElts,
38397 const SelectionDAG &DAG,
38398 unsigned Depth) const {
38399 unsigned BitWidth = Known.getBitWidth();
38400 unsigned NumElts = DemandedElts.getBitWidth();
38401 unsigned Opc = Op.getOpcode();
38402 EVT VT = Op.getValueType();
38403   assert((Opc >= ISD::BUILTIN_OP_END ||
38404           Opc == ISD::INTRINSIC_WO_CHAIN ||
38405           Opc == ISD::INTRINSIC_W_CHAIN ||
38406           Opc == ISD::INTRINSIC_VOID) &&
38407          "Should use MaskedValueIsZero if you don't know whether Op"
38408          " is a target node!");
38409
38410 Known.resetAll();
38411 switch (Opc) {
38412 default: break;
38413 case X86ISD::MUL_IMM: {
38414 KnownBits Known2;
38415 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38416 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38417 Known = KnownBits::mul(Known, Known2);
38418 break;
38419 }
38420 case X86ISD::SETCC:
38421 Known.Zero.setBitsFrom(1);
38422 break;
38423 case X86ISD::MOVMSK: {
38424 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38425 Known.Zero.setBitsFrom(NumLoBits);
38426 break;
38427 }
38428 case X86ISD::PEXTRB:
38429 case X86ISD::PEXTRW: {
38430 SDValue Src = Op.getOperand(0);
38431 EVT SrcVT = Src.getValueType();
38432 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38433 Op.getConstantOperandVal(1));
38434 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38435 Known = Known.anyextOrTrunc(BitWidth);
38436 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38437 break;
38438 }
38439 case X86ISD::VSRAI:
38440 case X86ISD::VSHLI:
38441 case X86ISD::VSRLI: {
38442 unsigned ShAmt = Op.getConstantOperandVal(1);
38443 if (ShAmt >= VT.getScalarSizeInBits()) {
38444 // Out of range logical bit shifts are guaranteed to be zero.
38445 // Out of range arithmetic bit shifts splat the sign bit.
38446 if (Opc != X86ISD::VSRAI) {
38447 Known.setAllZero();
38448 break;
38449 }
38450
38451 ShAmt = VT.getScalarSizeInBits() - 1;
38452 }
38453
38454 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38455 if (Opc == X86ISD::VSHLI) {
38456 Known.Zero <<= ShAmt;
38457 Known.One <<= ShAmt;
38458 // Low bits are known zero.
38459 Known.Zero.setLowBits(ShAmt);
38460 } else if (Opc == X86ISD::VSRLI) {
38461 Known.Zero.lshrInPlace(ShAmt);
38462 Known.One.lshrInPlace(ShAmt);
38463 // High bits are known zero.
38464 Known.Zero.setHighBits(ShAmt);
38465 } else {
38466 Known.Zero.ashrInPlace(ShAmt);
38467 Known.One.ashrInPlace(ShAmt);
38468 }
38469 break;
38470 }
38471 case X86ISD::PACKUS: {
38472 // PACKUS is just a truncation if the upper half is zero.
38473 APInt DemandedLHS, DemandedRHS;
38474 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38475
38476 Known.One = APInt::getAllOnes(BitWidth * 2);
38477 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38478
38479 KnownBits Known2;
38480 if (!!DemandedLHS) {
38481 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38482 Known = KnownBits::commonBits(Known, Known2);
38483 }
38484 if (!!DemandedRHS) {
38485 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38486 Known = KnownBits::commonBits(Known, Known2);
38487 }
38488
38489 if (Known.countMinLeadingZeros() < BitWidth)
38490 Known.resetAll();
38491 Known = Known.trunc(BitWidth);
38492 break;
38493 }
38494 case X86ISD::VBROADCAST: {
38495 SDValue Src = Op.getOperand(0);
38496 if (!Src.getSimpleValueType().isVector()) {
38497 Known = DAG.computeKnownBits(Src, Depth + 1);
38498 return;
38499 }
38500 break;
38501 }
38502 case X86ISD::AND: {
38503 if (Op.getResNo() == 0) {
38504 KnownBits Known2;
38505 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38506 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38507 Known &= Known2;
38508 }
38509 break;
38510 }
38511 case X86ISD::ANDNP: {
38512 KnownBits Known2;
38513 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38514 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38515
38516 // ANDNP = (~X & Y);
38517 Known.One &= Known2.Zero;
38518 Known.Zero |= Known2.One;
38519 break;
38520 }
38521 case X86ISD::FOR: {
38522 KnownBits Known2;
38523 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38524 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38525
38526 Known |= Known2;
38527 break;
38528 }
38529 case X86ISD::PSADBW: {
38530     assert(VT.getScalarType() == MVT::i64 &&
38531            Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38532            "Unexpected PSADBW types");
38533
38534 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
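    // Annotation (not part of the original source): each i64 lane is the sum
    // of absolute differences of eight byte pairs, so its maximum value is
    // 8 * 255 = 2040, which needs only 11 bits, well within the low 16 bits
    // kept here.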
38535 Known.Zero.setBitsFrom(16);
38536 break;
38537 }
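// Worked example for the PSADBW bound above: each i64 lane holds the sum of
// absolute differences of eight unsigned bytes, so the largest possible value
// is 8 * 255 = 2040 < 2^16, which is why only the low 16 bits can be nonzero.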
38538 case X86ISD::PMULUDQ: {
38539 KnownBits Known2;
38540 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38541 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38542
38543 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38544 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38545 Known = KnownBits::mul(Known, Known2);
38546 break;
38547 }
38548 case X86ISD::CMOV: {
38549 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38550 // If we don't know any bits, early out.
38551 if (Known.isUnknown())
38552 break;
38553 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38554
38555 // Only known if known in both the LHS and RHS.
38556 Known = KnownBits::commonBits(Known, Known2);
38557 break;
38558 }
38559 case X86ISD::BEXTR:
38560 case X86ISD::BEXTRI: {
38561 SDValue Op0 = Op.getOperand(0);
38562 SDValue Op1 = Op.getOperand(1);
38563
38564 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38565 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38566 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38567
38568 // If the length is 0, the result is 0.
38569 if (Length == 0) {
38570 Known.setAllZero();
38571 break;
38572 }
38573
38574 if ((Shift + Length) <= BitWidth) {
38575 Known = DAG.computeKnownBits(Op0, Depth + 1);
38576 Known = Known.extractBits(Length, Shift);
38577 Known = Known.zextOrTrunc(BitWidth);
38578 }
38579 }
38580 break;
38581 }
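// Worked example of the BEXTR/BEXTRI control operand decoded above: bits [7:0]
// hold the start bit and bits [15:8] hold the length. A control of 0x0804
// (Shift = 4, Length = 8) extracts bits [11:4] of Op0 and zero-extends them,
// so the known bits of that 8-bit slice are shifted down and widened.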
38582 case X86ISD::PDEP: {
38583 KnownBits Known2;
38584 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38585 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38586 // Zeros are retained from the mask operand. But not ones.
38587 Known.One.clearAllBits();
38588 // The result will have at least as many trailing zeros as the non-mask
38589 // operand since bits can only map to the same or higher bit position.
38590 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38591 break;
38592 }
38593 case X86ISD::PEXT: {
38594 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38595 // The result has as many leading zeros as the number of zeroes in the mask.
38596 unsigned Count = Known.Zero.popcount();
38597 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38598 Known.One.clearAllBits();
38599 break;
38600 }
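// Worked examples for the PDEP/PEXT reasoning above (8-bit values for brevity):
// PDEP(src=0b00001011, mask=0b11110000) scatters the low source bits into the
// mask's set positions, giving 0b10110000 - the mask's zero bits stay zero and
// the result has at least as many trailing zeros as the source. Conversely,
// PEXT(src=0b10100000, mask=0b11110000) packs bits [7:4] of src into bits
// [3:0], giving 0b00001010, so the top popcount(~mask) bits are always zero.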
38601 case X86ISD::VTRUNC:
38602 case X86ISD::VTRUNCS:
38603 case X86ISD::VTRUNCUS:
38604 case X86ISD::CVTSI2P:
38605 case X86ISD::CVTUI2P:
38606 case X86ISD::CVTP2SI:
38607 case X86ISD::CVTP2UI:
38608 case X86ISD::MCVTP2SI:
38609 case X86ISD::MCVTP2UI:
38610 case X86ISD::CVTTP2SI:
38611 case X86ISD::CVTTP2UI:
38612 case X86ISD::MCVTTP2SI:
38613 case X86ISD::MCVTTP2UI:
38614 case X86ISD::MCVTSI2P:
38615 case X86ISD::MCVTUI2P:
38616 case X86ISD::VFPROUND:
38617 case X86ISD::VMFPROUND:
38618 case X86ISD::CVTPS2PH:
38619 case X86ISD::MCVTPS2PH: {
38620 // Truncations/Conversions - upper elements are known zero.
38621 EVT SrcVT = Op.getOperand(0).getValueType();
38622 if (SrcVT.isVector()) {
38623 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38624 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38625 Known.setAllZero();
38626 }
38627 break;
38628 }
38629 case X86ISD::STRICT_CVTTP2SI:
38630 case X86ISD::STRICT_CVTTP2UI:
38631 case X86ISD::STRICT_CVTSI2P:
38632 case X86ISD::STRICT_CVTUI2P:
38633 case X86ISD::STRICT_VFPROUND:
38634 case X86ISD::STRICT_CVTPS2PH: {
38635 // Strict Conversions - upper elements are known zero.
38636 EVT SrcVT = Op.getOperand(1).getValueType();
38637 if (SrcVT.isVector()) {
38638 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38639 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38640 Known.setAllZero();
38641 }
38642 break;
38643 }
38644 case X86ISD::MOVQ2DQ: {
38645 // Move from MMX to XMM. Upper half of XMM should be 0.
38646 if (DemandedElts.countr_zero() >= (NumElts / 2))
38647 Known.setAllZero();
38648 break;
38649 }
38650 case X86ISD::VBROADCAST_LOAD: {
38651 APInt UndefElts;
38652 SmallVector<APInt, 16> EltBits;
38653 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38654 /*AllowWholeUndefs*/ false,
38655 /*AllowPartialUndefs*/ false)) {
38656 Known.Zero.setAllBits();
38657 Known.One.setAllBits();
38658 for (unsigned I = 0; I != NumElts; ++I) {
38659 if (!DemandedElts[I])
38660 continue;
38661 if (UndefElts[I]) {
38662 Known.resetAll();
38663 break;
38664 }
38665 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38666 Known = KnownBits::commonBits(Known, Known2);
38667 }
38668 return;
38669 }
38670 break;
38671 }
38672 }
38673
38674 // Handle target shuffles.
38675 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38676 if (isTargetShuffle(Opc)) {
38677 SmallVector<int, 64> Mask;
38678 SmallVector<SDValue, 2> Ops;
38679 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38680 unsigned NumOps = Ops.size();
38681 unsigned NumElts = VT.getVectorNumElements();
38682 if (Mask.size() == NumElts) {
38683 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38684 Known.Zero.setAllBits(); Known.One.setAllBits();
38685 for (unsigned i = 0; i != NumElts; ++i) {
38686 if (!DemandedElts[i])
38687 continue;
38688 int M = Mask[i];
38689 if (M == SM_SentinelUndef) {
38690 // For UNDEF elements, we don't know anything about the common state
38691 // of the shuffle result.
38692 Known.resetAll();
38693 break;
38694 }
38695 if (M == SM_SentinelZero) {
38696 Known.One.clearAllBits();
38697 continue;
38698 }
38699 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38700 "Shuffle index out of range");
38701
38702 unsigned OpIdx = (unsigned)M / NumElts;
38703 unsigned EltIdx = (unsigned)M % NumElts;
38704 if (Ops[OpIdx].getValueType() != VT) {
38705 // TODO - handle target shuffle ops with different value types.
38706 Known.resetAll();
38707 break;
38708 }
38709 DemandedOps[OpIdx].setBit(EltIdx);
38710 }
38711 // Known bits are the values that are shared by every demanded element.
38712 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38713 if (!DemandedOps[i])
38714 continue;
38715 KnownBits Known2 =
38716 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38717 Known = KnownBits::commonBits(Known, Known2);
38718 }
38719 }
38720 }
38721 }
38722}
38723
38724unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38725 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38726 unsigned Depth) const {
38727 EVT VT = Op.getValueType();
38728 unsigned VTBits = VT.getScalarSizeInBits();
38729 unsigned Opcode = Op.getOpcode();
38730 switch (Opcode) {
38731 case X86ISD::SETCC_CARRY:
38732 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38733 return VTBits;
38734
38735 case X86ISD::VTRUNC: {
38736 SDValue Src = Op.getOperand(0);
38737 MVT SrcVT = Src.getSimpleValueType();
38738 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38739 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38740 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38741 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38742 if (Tmp > (NumSrcBits - VTBits))
38743 return Tmp - (NumSrcBits - VTBits);
38744 return 1;
38745 }
38746
38747 case X86ISD::PACKSS: {
38748 // PACKSS is just a truncation if the sign bits extend to the packed size.
38749 APInt DemandedLHS, DemandedRHS;
38750 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38751 DemandedRHS);
38752
38753 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38754 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38755 if (!!DemandedLHS)
38756 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38757 if (!!DemandedRHS)
38758 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38759 unsigned Tmp = std::min(Tmp0, Tmp1);
38760 if (Tmp > (SrcBits - VTBits))
38761 return Tmp - (SrcBits - VTBits);
38762 return 1;
38763 }
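// Worked example for the PACKSS case above: packing v8i32 sources that each
// have at least 20 known sign bits into v16i16 yields 20 - (32 - 16) = 4 known
// sign bits per i16 element; with 16 or fewer source sign bits the saturating
// truncation could change the value, so only the minimum of 1 is returned.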
38764
38765 case X86ISD::VBROADCAST: {
38766 SDValue Src = Op.getOperand(0);
38767 if (!Src.getSimpleValueType().isVector())
38768 return DAG.ComputeNumSignBits(Src, Depth + 1);
38769 break;
38770 }
38771
38772 case X86ISD::VSHLI: {
38773 SDValue Src = Op.getOperand(0);
38774 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38775 if (ShiftVal.uge(VTBits))
38776 return VTBits; // Shifted all bits out --> zero.
38777 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38778 if (ShiftVal.uge(Tmp))
38779 return 1; // Shifted all sign bits out --> unknown.
38780 return Tmp - ShiftVal.getZExtValue();
38781 }
38782
38783 case X86ISD::VSRAI: {
38784 SDValue Src = Op.getOperand(0);
38785 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38786 if (ShiftVal.uge(VTBits - 1))
38787 return VTBits; // Sign splat.
38788 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38789 ShiftVal += Tmp;
38790 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38791 }
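// Worked example for the VSHLI/VSRAI sign-bit math above: a v4i32 source with
// 3 known sign bits shifted right arithmetically by 5 yields min(3 + 5, 32) = 8
// sign bits, while shifting the same source left by 5 discards all 3 known sign
// bits, so only the minimum of 1 is reported.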
38792
38793 case X86ISD::FSETCC:
38794 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38795 if (VT == MVT::f32 || VT == MVT::f64 ||
38796 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38797 return VTBits;
38798 break;
38799
38800 case X86ISD::PCMPGT:
38801 case X86ISD::PCMPEQ:
38802 case X86ISD::CMPP:
38803 case X86ISD::VPCOM:
38804 case X86ISD::VPCOMU:
38805 // Vector compares return zero/all-bits result values.
38806 return VTBits;
38807
38808 case X86ISD::ANDNP: {
38809 unsigned Tmp0 =
38810 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38811 if (Tmp0 == 1) return 1; // Early out.
38812 unsigned Tmp1 =
38813 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38814 return std::min(Tmp0, Tmp1);
38815 }
38816
38817 case X86ISD::CMOV: {
38818 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38819 if (Tmp0 == 1) return 1; // Early out.
38820 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38821 return std::min(Tmp0, Tmp1);
38822 }
38823 }
38824
38825 // Handle target shuffles.
38826 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38827 if (isTargetShuffle(Opcode)) {
38828 SmallVector<int, 64> Mask;
38829 SmallVector<SDValue, 2> Ops;
38830 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38831 unsigned NumOps = Ops.size();
38832 unsigned NumElts = VT.getVectorNumElements();
38833 if (Mask.size() == NumElts) {
38834 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38835 for (unsigned i = 0; i != NumElts; ++i) {
38836 if (!DemandedElts[i])
38837 continue;
38838 int M = Mask[i];
38839 if (M == SM_SentinelUndef) {
38840 // For UNDEF elements, we don't know anything about the common state
38841 // of the shuffle result.
38842 return 1;
38843 } else if (M == SM_SentinelZero) {
38844 // Zero = all sign bits.
38845 continue;
38846 }
38847 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38848 "Shuffle index out of range");
38849
38850 unsigned OpIdx = (unsigned)M / NumElts;
38851 unsigned EltIdx = (unsigned)M % NumElts;
38852 if (Ops[OpIdx].getValueType() != VT) {
38853 // TODO - handle target shuffle ops with different value types.
38854 return 1;
38855 }
38856 DemandedOps[OpIdx].setBit(EltIdx);
38857 }
38858 unsigned Tmp0 = VTBits;
38859 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38860 if (!DemandedOps[i])
38861 continue;
38862 unsigned Tmp1 =
38863 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38864 Tmp0 = std::min(Tmp0, Tmp1);
38865 }
38866 return Tmp0;
38867 }
38868 }
38869 }
38870
38871 // Fallback case.
38872 return 1;
38873}
38874
38875SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
38876 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38877 return N->getOperand(0);
38878 return N;
38879}
38880
38881// Helper to look for a normal load that can be narrowed into a vzload with the
38882// specified VT and memory VT. Returns SDValue() on failure.
38883static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38884 SelectionDAG &DAG) {
38885 // Can't if the load is volatile or atomic.
38886 if (!LN->isSimple())
38887 return SDValue();
38888
38889 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38890 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38891 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38892 LN->getPointerInfo(), LN->getOriginalAlign(),
38893 LN->getMemOperand()->getFlags());
38894}
38895
38896// Attempt to match a combined shuffle mask against supported unary shuffle
38897// instructions.
38898// TODO: Investigate sharing more of this with shuffle lowering.
38899static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38900 bool AllowFloatDomain, bool AllowIntDomain,
38901 SDValue V1, const SelectionDAG &DAG,
38902 const X86Subtarget &Subtarget, unsigned &Shuffle,
38903 MVT &SrcVT, MVT &DstVT) {
38904 unsigned NumMaskElts = Mask.size();
38905 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38906
38907 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38908 if (Mask[0] == 0 &&
38909 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38910 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38911 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38912 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38913 Shuffle = X86ISD::VZEXT_MOVL;
38914 if (MaskEltSize == 16)
38915 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38916 else
38917 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38918 return true;
38919 }
38920 }
38921
38922 // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
38923 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38924 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38925 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38926 unsigned MaxScale = 64 / MaskEltSize;
38927 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38928 bool MatchAny = true;
38929 bool MatchZero = true;
38930 unsigned NumDstElts = NumMaskElts / Scale;
38931 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
38932 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38933 MatchAny = MatchZero = false;
38934 break;
38935 }
38936 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
38937 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
38938 }
38939 if (MatchAny || MatchZero) {
38940 assert(MatchZero && "Failed to match zext but matched aext?");
38941 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38942 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
38943 MVT::getIntegerVT(MaskEltSize);
38944 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38945
38946 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
38947 if (SrcVT.getVectorNumElements() != NumDstElts)
38948 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38949
38950 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38951 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38952 return true;
38953 }
38954 }
38955 }
38956
38957 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
38958 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38959 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38960 isUndefOrEqual(Mask[0], 0) &&
38961 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38962 Shuffle = X86ISD::VZEXT_MOVL;
38963 if (MaskEltSize == 16)
38964 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38965 else
38966 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38967 return true;
38968 }
38969
38970 // Check if we have SSE3, which will let us use MOVDDUP etc. These
38971 // instructions are no slower than UNPCKLPD but have the option to
38972 // fold the input operand into even an unaligned memory load.
38973 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
38974 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
38975 Shuffle = X86ISD::MOVDDUP;
38976 SrcVT = DstVT = MVT::v2f64;
38977 return true;
38978 }
38979 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38980 Shuffle = X86ISD::MOVSLDUP;
38981 SrcVT = DstVT = MVT::v4f32;
38982 return true;
38983 }
38984 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
38985 Shuffle = X86ISD::MOVSHDUP;
38986 SrcVT = DstVT = MVT::v4f32;
38987 return true;
38988 }
38989 }
38990
38991 if (MaskVT.is256BitVector() && AllowFloatDomain) {
38992 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38993 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38994 Shuffle = X86ISD::MOVDDUP;
38995 SrcVT = DstVT = MVT::v4f64;
38996 return true;
38997 }
38998 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38999 V1)) {
39000 Shuffle = X86ISD::MOVSLDUP;
39001 SrcVT = DstVT = MVT::v8f32;
39002 return true;
39003 }
39004 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39005 V1)) {
39006 Shuffle = X86ISD::MOVSHDUP;
39007 SrcVT = DstVT = MVT::v8f32;
39008 return true;
39009 }
39010 }
39011
39012 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39013 assert(Subtarget.hasAVX512() &&
39014 "AVX512 required for 512-bit vector shuffles");
39015 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39016 V1)) {
39017 Shuffle = X86ISD::MOVDDUP;
39018 SrcVT = DstVT = MVT::v8f64;
39019 return true;
39020 }
39021 if (isTargetShuffleEquivalent(
39022 MaskVT, Mask,
39023 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39024 Shuffle = X86ISD::MOVSLDUP;
39025 SrcVT = DstVT = MVT::v16f32;
39026 return true;
39027 }
39028 if (isTargetShuffleEquivalent(
39029 MaskVT, Mask,
39030 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39031 Shuffle = X86ISD::MOVSHDUP;
39032 SrcVT = DstVT = MVT::v16f32;
39033 return true;
39034 }
39035 }
39036
39037 return false;
39038}
39039
39040// Attempt to match a combined shuffle mask against supported unary immediate
39041// permute instructions.
39042// TODO: Investigate sharing more of this with shuffle lowering.
39043static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39044 const APInt &Zeroable,
39045 bool AllowFloatDomain, bool AllowIntDomain,
39046 const SelectionDAG &DAG,
39047 const X86Subtarget &Subtarget,
39048 unsigned &Shuffle, MVT &ShuffleVT,
39049 unsigned &PermuteImm) {
39050 unsigned NumMaskElts = Mask.size();
39051 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39052 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39053 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39054 bool ContainsZeros = isAnyZero(Mask);
39055
39056 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39057 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39058 // Check for lane crossing permutes.
39059 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39060 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39061 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39062 Shuffle = X86ISD::VPERMI;
39063 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39064 PermuteImm = getV4X86ShuffleImm(Mask);
39065 return true;
39066 }
39067 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39068 SmallVector<int, 4> RepeatedMask;
39069 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39070 Shuffle = X86ISD::VPERMI;
39071 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39072 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39073 return true;
39074 }
39075 }
39076 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39077 // VPERMILPD can permute with a non-repeating shuffle.
39078 Shuffle = X86ISD::VPERMILPI;
39079 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39080 PermuteImm = 0;
39081 for (int i = 0, e = Mask.size(); i != e; ++i) {
39082 int M = Mask[i];
39083 if (M == SM_SentinelUndef)
39084 continue;
39085 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39086 PermuteImm |= (M & 1) << i;
39087 }
39088 return true;
39089 }
39090 }
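// Worked example of the VPERMILPD immediate built above: for a v4f64 mask
// {1, 0, 3, 2} each element contributes its low bit (the within-128-bit-lane
// selector) at bit position i, giving
// PermuteImm = 1 | (0 << 1) | (1 << 2) | (0 << 3) = 0b0101.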
39091
39092 // We are checking for either a shuffle match or a shift match. Loop twice so
39093 // we can order which we try to match first depending on target preference.
39094 for (unsigned Order = 0; Order < 2; ++Order) {
39095 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39096 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39097 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
39098 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39099 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39100 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39101 SmallVector<int, 4> RepeatedMask;
39102 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39103 // Narrow the repeated mask to create 32-bit element permutes.
39104 SmallVector<int, 4> WordMask = RepeatedMask;
39105 if (MaskScalarSizeInBits == 64)
39106 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39107
39108 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39109 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39110 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39111 PermuteImm = getV4X86ShuffleImm(WordMask);
39112 return true;
39113 }
39114 }
39115
39116 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39117 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39118 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39119 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39120 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39121 SmallVector<int, 4> RepeatedMask;
39122 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39123 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39124 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39125
39126 // PSHUFLW: permute lower 4 elements only.
39127 if (isUndefOrInRange(LoMask, 0, 4) &&
39128 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39129 Shuffle = X86ISD::PSHUFLW;
39130 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39131 PermuteImm = getV4X86ShuffleImm(LoMask);
39132 return true;
39133 }
39134
39135 // PSHUFHW: permute upper 4 elements only.
39136 if (isUndefOrInRange(HiMask, 4, 8) &&
39137 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39138 // Offset the HiMask so that we can create the shuffle immediate.
39139 int OffsetHiMask[4];
39140 for (int i = 0; i != 4; ++i)
39141 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39142
39143 Shuffle = X86ISD::PSHUFHW;
39144 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39145 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39146 return true;
39147 }
39148 }
39149 }
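// Worked example for the PSHUFLW/PSHUFHW immediates above: getV4X86ShuffleImm
// packs four 2-bit element indices, element 0 in the low bits, so a HiMask of
// {5, 4, 7, 6} is first offset to {1, 0, 3, 2} and then encoded as the PSHUFHW
// immediate 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0b10110001.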
39150 } else {
39151 // Attempt to match against bit rotates.
39152 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39153 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39154 Subtarget.hasAVX512())) {
39155 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39156 Subtarget, Mask);
39157 if (0 < RotateAmt) {
39158 Shuffle = X86ISD::VROTLI;
39159 PermuteImm = (unsigned)RotateAmt;
39160 return true;
39161 }
39162 }
39163 }
39164 // Attempt to match against byte/bit shifts.
39165 if (AllowIntDomain &&
39166 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39167 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39168 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39169 int ShiftAmt =
39170 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39171 Zeroable, Subtarget);
39172 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39173 32 <= ShuffleVT.getScalarSizeInBits())) {
39174 // Byte shifts can be slower so only match them on second attempt.
39175 if (Order == 0 &&
39176 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39177 continue;
39178
39179 PermuteImm = (unsigned)ShiftAmt;
39180 return true;
39181 }
39182
39183 }
39184 }
39185
39186 return false;
39187}
39188
39189// Attempt to match a combined unary shuffle mask against supported binary
39190// shuffle instructions.
39191// TODO: Investigate sharing more of this with shuffle lowering.
39192static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39193 bool AllowFloatDomain, bool AllowIntDomain,
39194 SDValue &V1, SDValue &V2, const SDLoc &DL,
39195 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39196 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39197 bool IsUnary) {
39198 unsigned NumMaskElts = Mask.size();
39199 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39200 unsigned SizeInBits = MaskVT.getSizeInBits();
39201
39202 if (MaskVT.is128BitVector()) {
39203 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39204 AllowFloatDomain) {
39205 V2 = V1;
39206 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39207 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39208 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39209 return true;
39210 }
39211 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39212 AllowFloatDomain) {
39213 V2 = V1;
39214 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39215 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39216 return true;
39217 }
39218 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39219 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39220 std::swap(V1, V2);
39221 Shuffle = X86ISD::MOVSD;
39222 SrcVT = DstVT = MVT::v2f64;
39223 return true;
39224 }
39225 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39226 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39227 Shuffle = X86ISD::MOVSS;
39228 SrcVT = DstVT = MVT::v4f32;
39229 return true;
39230 }
39231 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39232 DAG) &&
39233 Subtarget.hasFP16()) {
39234 Shuffle = X86ISD::MOVSH;
39235 SrcVT = DstVT = MVT::v8f16;
39236 return true;
39237 }
39238 }
39239
39240 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39241 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39242 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39243 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39244 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39245 Subtarget)) {
39246 DstVT = MaskVT;
39247 return true;
39248 }
39249 }
39250
39251 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39252 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39253 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39254 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39255 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39256 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
39257 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39258 Subtarget)) {
39259 SrcVT = DstVT = MaskVT;
39260 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39261 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39262 return true;
39263 }
39264 }
39265
39266 // Attempt to match against an OR if we're performing a blend shuffle and the
39267 // non-blended source element is zero in each case.
39268 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39269 if (SizeInBits == V1.getValueSizeInBits() &&
39270 SizeInBits == V2.getValueSizeInBits() &&
39271 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39272 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39273 bool IsBlend = true;
39274 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39275 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39276 unsigned Scale1 = NumV1Elts / NumMaskElts;
39277 unsigned Scale2 = NumV2Elts / NumMaskElts;
39278 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39279 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39280 for (unsigned i = 0; i != NumMaskElts; ++i) {
39281 int M = Mask[i];
39282 if (M == SM_SentinelUndef)
39283 continue;
39284 if (M == SM_SentinelZero) {
39285 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39286 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39287 continue;
39288 }
39289 if (M == (int)i) {
39290 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39291 continue;
39292 }
39293 if (M == (int)(i + NumMaskElts)) {
39294 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39295 continue;
39296 }
39297 IsBlend = false;
39298 break;
39299 }
39300 if (IsBlend) {
39301 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39302 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39303 Shuffle = ISD::OR;
39304 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39305 return true;
39306 }
39307 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39308 // FIXME: handle mismatched sizes?
39309 // TODO: investigate if `ISD::OR` handling in
39310 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39311 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39312 unsigned NumElts = V.getValueType().getVectorNumElements();
39313 KnownBits Known(NumElts);
39314 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39315 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39316 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39317 if (PeepholeKnown.isZero())
39318 Known.Zero.setBit(EltIdx);
39319 if (PeepholeKnown.isAllOnes())
39320 Known.One.setBit(EltIdx);
39321 }
39322 return Known;
39323 };
39324
39325 KnownBits V1Known = computeKnownBitsElementWise(V1);
39326 KnownBits V2Known = computeKnownBitsElementWise(V2);
39327
39328 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39329 int M = Mask[i];
39330 if (M == SM_SentinelUndef)
39331 continue;
39332 if (M == SM_SentinelZero) {
39333 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39334 continue;
39335 }
39336 if (M == (int)i) {
39337 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39338 continue;
39339 }
39340 if (M == (int)(i + NumMaskElts)) {
39341 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39342 continue;
39343 }
39344 llvm_unreachable("will not get here.")::llvm::llvm_unreachable_internal("will not get here.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 39344)
;
39345 }
39346 if (IsBlend) {
39347 Shuffle = ISD::OR;
39348 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39349 return true;
39350 }
39351 }
39352 }
39353 }
39354
39355 return false;
39356}
39357
39358static bool matchBinaryPermuteShuffle(
39359 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39360 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39361 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39362 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39363 unsigned NumMaskElts = Mask.size();
39364 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39365
39366 // Attempt to match against VALIGND/VALIGNQ rotate.
39367 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39368 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39369 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39370 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39371 if (!isAnyZero(Mask)) {
39372 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39373 if (0 < Rotation) {
39374 Shuffle = X86ISD::VALIGN;
39375 if (EltSizeInBits == 64)
39376 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39377 else
39378 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39379 PermuteImm = Rotation;
39380 return true;
39381 }
39382 }
39383 }
39384
39385 // Attempt to match against PALIGNR byte rotate.
39386 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39387 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39388 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39389 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39390 if (0 < ByteRotation) {
39391 Shuffle = X86ISD::PALIGNR;
39392 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39393 PermuteImm = ByteRotation;
39394 return true;
39395 }
39396 }
39397
39398 // Attempt to combine to X86ISD::BLENDI.
39399 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39400 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39401 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39402 uint64_t BlendMask = 0;
39403 bool ForceV1Zero = false, ForceV2Zero = false;
39404 SmallVector<int, 8> TargetMask(Mask);
39405 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39406 ForceV2Zero, BlendMask)) {
39407 if (MaskVT == MVT::v16i16) {
39408 // We can only use v16i16 PBLENDW if the lanes are repeated.
39409 SmallVector<int, 8> RepeatedMask;
39410 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39411 RepeatedMask)) {
39412 assert(RepeatedMask.size() == 8 &&
39413 "Repeated mask size doesn't match!");
39414 PermuteImm = 0;
39415 for (int i = 0; i < 8; ++i)
39416 if (RepeatedMask[i] >= 8)
39417 PermuteImm |= 1 << i;
39418 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39419 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39420 Shuffle = X86ISD::BLENDI;
39421 ShuffleVT = MaskVT;
39422 return true;
39423 }
39424 } else {
39425 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39426 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39427 PermuteImm = (unsigned)BlendMask;
39428 Shuffle = X86ISD::BLENDI;
39429 ShuffleVT = MaskVT;
39430 return true;
39431 }
39432 }
39433 }
39434
39435 // Attempt to combine to INSERTPS, but only if it has elements that need to
39436 // be set to zero.
39437 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39438 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39439 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39440 Shuffle = X86ISD::INSERTPS;
39441 ShuffleVT = MVT::v4f32;
39442 return true;
39443 }
39444
39445 // Attempt to combine to SHUFPD.
39446 if (AllowFloatDomain && EltSizeInBits == 64 &&
39447 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39448 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39449 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39450 bool ForceV1Zero = false, ForceV2Zero = false;
39451 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39452 PermuteImm, Mask, Zeroable)) {
39453 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39454 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39455 Shuffle = X86ISD::SHUFP;
39456 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39457 return true;
39458 }
39459 }
39460
39461 // Attempt to combine to SHUFPS.
39462 if (AllowFloatDomain && EltSizeInBits == 32 &&
39463 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39464 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39465 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39466 SmallVector<int, 4> RepeatedMask;
39467 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39468 // Match each half of the repeated mask to determine if it's just
39469 // referencing one of the vectors, is zeroable, or is entirely undef.
39470 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39471 int M0 = RepeatedMask[Offset];
39472 int M1 = RepeatedMask[Offset + 1];
39473
39474 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39475 return DAG.getUNDEF(MaskVT);
39476 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39477 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39478 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39479 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39480 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39481 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39482 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39483 return V1;
39484 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39485 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39486 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39487 return V2;
39488 }
39489
39490 return SDValue();
39491 };
39492
39493 int ShufMask[4] = {-1, -1, -1, -1};
39494 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39495 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39496
39497 if (Lo && Hi) {
39498 V1 = Lo;
39499 V2 = Hi;
39500 Shuffle = X86ISD::SHUFP;
39501 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39502 PermuteImm = getV4X86ShuffleImm(ShufMask);
39503 return true;
39504 }
39505 }
39506 }
39507
39508 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39509 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39510 MaskVT.is128BitVector() &&
39511 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39512 Shuffle = X86ISD::INSERTPS;
39513 ShuffleVT = MVT::v4f32;
39514 return true;
39515 }
39516
39517 return false;
39518}
39519
39520static SDValue combineX86ShuffleChainWithExtract(
39521 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39522 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39523 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39524 const X86Subtarget &Subtarget);
39525
39526/// Combine an arbitrary chain of shuffles into a single instruction if
39527/// possible.
39528///
39529/// This is the leaf of the recursive combine below. When we have found some
39530/// chain of single-use x86 shuffle instructions and accumulated the combined
39531/// shuffle mask represented by them, this will try to pattern match that mask
39532/// into either a single instruction if there is a special purpose instruction
39533/// for this operation, or into a PSHUFB instruction which is a fully general
39534/// instruction but should only be used to replace chains over a certain depth.
39535static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39536 ArrayRef<int> BaseMask, int Depth,
39537 bool HasVariableMask,
39538 bool AllowVariableCrossLaneMask,
39539 bool AllowVariablePerLaneMask,
39540 SelectionDAG &DAG,
39541 const X86Subtarget &Subtarget) {
39542 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39543 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39544 "Unexpected number of shuffle inputs!");
39545
39546 SDLoc DL(Root);
39547 MVT RootVT = Root.getSimpleValueType();
39548 unsigned RootSizeInBits = RootVT.getSizeInBits();
39549 unsigned NumRootElts = RootVT.getVectorNumElements();
39550
39551 // Canonicalize shuffle input op to the requested type.
39552 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39553 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39554 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39555 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39556 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39557 return DAG.getBitcast(VT, Op);
39558 };
39559
39560 // Find the inputs that enter the chain. Note that multiple uses are OK
39561 // here; we're not going to remove the operands we find.
39562 bool UnaryShuffle = (Inputs.size() == 1);
39563 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39564 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39565 : peekThroughBitcasts(Inputs[1]));
39566
39567 MVT VT1 = V1.getSimpleValueType();
39568 MVT VT2 = V2.getSimpleValueType();
39569 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39570 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39571
39572 SDValue Res;
39573
39574 unsigned NumBaseMaskElts = BaseMask.size();
39575 if (NumBaseMaskElts == 1) {
39576 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39577 return CanonicalizeShuffleInput(RootVT, V1);
39578 }
39579
39580 bool OptForSize = DAG.shouldOptForSize();
39581 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39582 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39583 (RootVT.isFloatingPoint() && Depth >= 1) ||
39584 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39585
39586 // Don't combine if we are an AVX512/EVEX target and the mask element size
39587 // is different from the root element size - this would prevent writemasks
39588 // from being reused.
39589 bool IsMaskedShuffle = false;
39590 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39591 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
39592 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39593 IsMaskedShuffle = true;
39594 }
39595 }
39596
39597 // If we are shuffling a splat (and not introducing zeros) then we can just
39598 // use it directly. This works for smaller elements as well as they already
39599 // repeat across each mask element.
39600 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39601 V1.getValueSizeInBits() >= RootSizeInBits &&
39602 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39603 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39604 return CanonicalizeShuffleInput(RootVT, V1);
39605 }
39606
39607 SmallVector<int, 64> Mask(BaseMask);
39608
39609 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39610 // etc. can be simplified.
39611 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39612 SmallVector<int> ScaledMask, IdentityMask;
39613 unsigned NumElts = VT1.getVectorNumElements();
39614 if (Mask.size() <= NumElts &&
39615 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39616 for (unsigned i = 0; i != NumElts; ++i)
39617 IdentityMask.push_back(i);
39618 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39619 V2))
39620 return CanonicalizeShuffleInput(RootVT, V1);
39621 }
39622 }
39623
39624 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39625 if (RootVT.is512BitVector() &&
39626 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39627 // If the upper subvectors are zeroable, then an extract+insert is more
39628 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39629 // to zero the upper subvectors.
39630 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39631 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39632 return SDValue(); // Nothing to do!
39633 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39634 "Unexpected lane shuffle");
39635 Res = CanonicalizeShuffleInput(RootVT, V1);
39636 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39637 bool UseZero = isAnyZero(Mask);
39638 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39639 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39640 }
39641
39642 // Narrow shuffle mask to v4x128.
39643 SmallVector<int, 4> ScaledMask;
39644 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39645 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39646
39647 // Try to lower to vshuf64x2/vshuf32x4.
39648 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39649 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39650 SelectionDAG &DAG) {
39651 unsigned PermMask = 0;
39652 // Ensure elements came from the same Op.
39653 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39654 for (int i = 0; i < 4; ++i) {
39655 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39656 if (ScaledMask[i] < 0)
39657 continue;
39658
39659 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39660 unsigned OpIndex = i / 2;
39661 if (Ops[OpIndex].isUndef())
39662 Ops[OpIndex] = Op;
39663 else if (Ops[OpIndex] != Op)
39664 return SDValue();
39665
39666 // Convert the 128-bit shuffle mask selection values into 128-bit
39667 // selection bits defined by a vshuf64x2 instruction's immediate control
39668 // byte.
39669 PermMask |= (ScaledMask[i] % 4) << (i * 2);
39670 }
39671
39672 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39673 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39674 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39675 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39676 };
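// Worked example of the SHUF128 immediate built above: each result 128-bit lane
// gets a 2-bit selector at bit position i * 2, so a ScaledMask of {0, 1, 6, 7}
// (lanes 0/1 from Ops[0], lanes 2/3 from Ops[1]) encodes to
// PermMask = 0 | (1 << 2) | (2 << 4) | (3 << 6) = 0b11100100.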
39677
39678 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39679 // doesn't work because our mask is for 128 bits and we don't have an MVT
39680 // to match that.
39681 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39682 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39683 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39684 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39685 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39686 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39687 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39688 ScaledMask[1] == (ScaledMask[3] % 2));
39689
39690 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39691 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39692 return SDValue(); // Nothing to do!
39693 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39694 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39695 return DAG.getBitcast(RootVT, V);
39696 }
39697 }
39698
39699 // Handle 128-bit lane shuffles of 256-bit vectors.
39700 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39701 // If the upper half is zeroable, then an extract+insert is more optimal
39702 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39703 // zero the upper half.
39704 if (isUndefOrZero(Mask[1])) {
39705 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39706 return SDValue(); // Nothing to do!
39707 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39708 Res = CanonicalizeShuffleInput(RootVT, V1);
39709 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39710 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39711 256);
39712 }
39713
39714 // If we're inserting the low subvector, an insert-subvector 'concat'
39715 // pattern is quicker than VPERM2X128.
39716 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39717 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39718 !Subtarget.hasAVX2()) {
39719 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39720 return SDValue(); // Nothing to do!
39721 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39722 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39723 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39724 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39725 }
39726
39727 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39728 return SDValue(); // Nothing to do!
39729
39730 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39731 // we need to use the zeroing feature.
39732 // Prefer blends for sequential shuffles unless we are optimizing for size.
39733 if (UnaryShuffle &&
39734 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39735 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39736 unsigned PermMask = 0;
39737 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39738 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39739 return DAG.getNode(
39740 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39741 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39742 }
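// Worked example of the VPERM2X128 immediate built above: bits [1:0] select the
// 128-bit source lane for the low half of the result and bits [5:4] select it
// for the high half, with 0x8 in a nibble zeroing that half. A widened unary
// mask of {1, 0} (swap the two lanes of V1) therefore encodes to
// PermMask = (1 << 0) | (0 << 4) = 0x01, while an undef/zero mask entry maps to
// 0x8 and zeroes the corresponding half.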
39743
39744 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39745 return SDValue(); // Nothing to do!
39746
39747 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39748 if (!UnaryShuffle && !IsMaskedShuffle) {
39749 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39750 "Unexpected shuffle sentinel value");
39751 // Prefer blends to X86ISD::VPERM2X128.
39752 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39753 unsigned PermMask = 0;
39754 PermMask |= ((Mask[0] & 3) << 0);
39755 PermMask |= ((Mask[1] & 3) << 4);
39756 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39757 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39758 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39759 CanonicalizeShuffleInput(RootVT, LHS),
39760 CanonicalizeShuffleInput(RootVT, RHS),
39761 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39762 }
39763 }
39764 }
39765
39766 // For masks that have been widened to 128-bit elements or more,
39767 // narrow back down to 64-bit elements.
39768 if (BaseMaskEltSizeInBits > 64) {
39769 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39770 int MaskScale = BaseMaskEltSizeInBits / 64;
39771 SmallVector<int, 64> ScaledMask;
39772 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39773 Mask = std::move(ScaledMask);
39774 }
39775
39776 // For masked shuffles, we're trying to match the root width for better
39777 // writemask folding, attempt to scale the mask.
39778 // TODO - variable shuffles might need this to be widened again.
39779 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39780 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39781 int MaskScale = NumRootElts / Mask.size();
39782 SmallVector<int, 64> ScaledMask;
39783 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39784 Mask = std::move(ScaledMask);
39785 }
39786
39787 unsigned NumMaskElts = Mask.size();
39788 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39789
39790 // Determine the effective mask value type.
39791 FloatDomain &= (32 <= MaskEltSizeInBits);
39792 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39793 : MVT::getIntegerVT(MaskEltSizeInBits);
39794 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39795
39796 // Only allow legal mask types.
39797 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39798 return SDValue();
39799
39800 // Attempt to match the mask against known shuffle patterns.
39801 MVT ShuffleSrcVT, ShuffleVT;
39802 unsigned Shuffle, PermuteImm;
39803
39804 // Which shuffle domains are permitted?
39805 // Permit domain crossing at higher combine depths.
39806 // TODO: Should we indicate which domain is preferred if both are allowed?
39807 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39808 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39809 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39810
39811 // Determine zeroable mask elements.
39812 APInt KnownUndef, KnownZero;
39813 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39814 APInt Zeroable = KnownUndef | KnownZero;
39815
39816 if (UnaryShuffle) {
39817 // Attempt to match against broadcast-from-vector.
39818 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39819 if ((Subtarget.hasAVX2() ||
39820 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39821 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39822 if (isUndefOrEqual(Mask, 0)) {
39823 if (V1.getValueType() == MaskVT &&
39824 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39825 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39826 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39827 return SDValue(); // Nothing to do!
39828 Res = V1.getOperand(0);
39829 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39830 return DAG.getBitcast(RootVT, Res);
39831 }
39832 if (Subtarget.hasAVX2()) {
39833 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39834 return SDValue(); // Nothing to do!
39835 Res = CanonicalizeShuffleInput(MaskVT, V1);
39836 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39837 return DAG.getBitcast(RootVT, Res);
39838 }
39839 }
39840 }
39841
39842 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39843 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39844 (!IsMaskedShuffle ||
39845 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39846 if (Depth == 0 && Root.getOpcode() == Shuffle)
39847 return SDValue(); // Nothing to do!
39848 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39849 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39850 return DAG.getBitcast(RootVT, Res);
39851 }
39852
39853 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39854 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39855 PermuteImm) &&
39856 (!IsMaskedShuffle ||
39857 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39858 if (Depth == 0 && Root.getOpcode() == Shuffle)
39859 return SDValue(); // Nothing to do!
39860 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39861 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39862 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39863 return DAG.getBitcast(RootVT, Res);
39864 }
39865 }
39866
39867 // Attempt to combine to INSERTPS, but only if the inserted element has come
39868 // from a scalar.
39869 // TODO: Handle other insertions here as well?
39870 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39871 Subtarget.hasSSE41() &&
39872 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39873 if (MaskEltSizeInBits == 32) {
39874 SDValue SrcV1 = V1, SrcV2 = V2;
39875 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39876 DAG) &&
39877 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39878 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39879 return SDValue(); // Nothing to do!
39880 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39881 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39882 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39883 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39884 return DAG.getBitcast(RootVT, Res);
39885 }
39886 }
39887 if (MaskEltSizeInBits == 64 &&
39888 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39889 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39890 V2.getScalarValueSizeInBits() <= 32) {
39891 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39892 return SDValue(); // Nothing to do!
39893 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39894 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39895 CanonicalizeShuffleInput(MVT::v4f32, V1),
39896 CanonicalizeShuffleInput(MVT::v4f32, V2),
39897 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39898 return DAG.getBitcast(RootVT, Res);
39899 }
39900 }
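For reference, the INSERTPS immediate built here packs three fields: bits [7:6] select the source element, bits [5:4] select the destination slot, and bits [3:0] zero out destination elements. A small standalone sketch (helper name invented):

// Illustrative: pack an INSERTPS immediate. SrcIdx/DstIdx are 0-3 element
// indices, ZeroMask has one bit per destination element to force to zero.
static unsigned buildInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                 unsigned ZeroMask) {
  return ((SrcIdx & 0x3) << 6) | ((DstIdx & 0x3) << 4) | (ZeroMask & 0xF);
}
// e.g. buildInsertPSImm(0, 2, 0) == 0x20 - insert src element 0 into dst slot 2.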
39901
39902 SDValue NewV1 = V1; // Save operands in case early exit happens.
39903 SDValue NewV2 = V2;
39904 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39905 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39906 ShuffleVT, UnaryShuffle) &&
39907 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39908 if (Depth == 0 && Root.getOpcode() == Shuffle)
39909 return SDValue(); // Nothing to do!
39910 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39911 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39912 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39913 return DAG.getBitcast(RootVT, Res);
39914 }
39915
39916 NewV1 = V1; // Save operands in case early exit happens.
39917 NewV2 = V2;
39918 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39919 AllowIntDomain, NewV1, NewV2, DL, DAG,
39920 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39921 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39922 if (Depth == 0 && Root.getOpcode() == Shuffle)
39923 return SDValue(); // Nothing to do!
39924 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39925 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39926 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39927 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39928 return DAG.getBitcast(RootVT, Res);
39929 }
39930
39931 // Typically from here on, we need an integer version of MaskVT.
39932 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39933 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39934
39935 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39936 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39937 uint64_t BitLen, BitIdx;
39938 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39939 Zeroable)) {
39940 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39941 return SDValue(); // Nothing to do!
39942 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39943 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39944 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39945 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39946 return DAG.getBitcast(RootVT, Res);
39947 }
39948
39949 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39950 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39951 return SDValue(); // Nothing to do!
39952 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39953 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39954 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39955 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39956 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39957 return DAG.getBitcast(RootVT, Res);
39958 }
39959 }
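For context, SSE4A's EXTRQ/INSERTQ describe a bit field in the low 64-bit lane by a bit length and a starting bit index. A rough scalar model of the extract half (illustrative only; instruction-encoding edge cases are not modelled):

#include <cstdint>

// Rough scalar model of the EXTRQ bit-field extract on the low 64-bit lane:
// take `Len` bits starting at bit `Idx` and zero-extend them into the result.
static uint64_t extrqModel(uint64_t Src, unsigned Idx, unsigned Len) {
  uint64_t Mask = Len >= 64 ? ~0ULL : ((1ULL << Len) - 1);
  return (Src >> Idx) & Mask;
}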
39960
39961 // Match shuffle against TRUNCATE patterns.
39962 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39963 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39964 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39965 Subtarget)) {
39966 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39967 ShuffleSrcVT.getVectorNumElements();
39968 unsigned Opc =
39969 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
39970 if (Depth == 0 && Root.getOpcode() == Opc)
39971 return SDValue(); // Nothing to do!
39972 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39973 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
39974 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
39975 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
39976 return DAG.getBitcast(RootVT, Res);
39977 }
39978
39979 // Do we need a more general binary truncation pattern?
39980 if (RootSizeInBits < 512 &&
39981 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
39982 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
39983 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
39984 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
39985 // Bail if this was already a truncation or PACK node.
39986 // We sometimes fail to match PACK if we demand known undef elements.
39987 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39988 Root.getOpcode() == X86ISD::PACKSS ||
39989 Root.getOpcode() == X86ISD::PACKUS))
39990 return SDValue(); // Nothing to do!
39991 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39992 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
39993 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39994 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
39995 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39996 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
39997 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
39998 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
39999 return DAG.getBitcast(RootVT, Res);
40000 }
40001 }
40002
40003 // Don't try to re-form single instruction chains under any circumstances now
40004 // that we've done encoding canonicalization for them.
40005 if (Depth < 1)
40006 return SDValue();
40007
40008 // Depth threshold above which we can efficiently use variable mask shuffles.
40009 int VariableCrossLaneShuffleDepth =
40010 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40011 int VariablePerLaneShuffleDepth =
40012 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40013 AllowVariableCrossLaneMask &=
40014 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40015 AllowVariablePerLaneMask &=
40016 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40017 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40018 // higher depth before combining them.
40019 bool AllowBWIVPERMV3 =
40020 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40021
40022 bool MaskContainsZeros = isAnyZero(Mask);
40023
40024 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40025 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40026 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40027 if (Subtarget.hasAVX2() &&
40028 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40029 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40030 Res = CanonicalizeShuffleInput(MaskVT, V1);
40031 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40032 return DAG.getBitcast(RootVT, Res);
40033 }
40034 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40035 if ((Subtarget.hasAVX512() &&
40036 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40037 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40038 (Subtarget.hasBWI() &&
40039 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40040 (Subtarget.hasVBMI() &&
40041 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40042 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40043 V2 = DAG.getUNDEF(MaskVT);
40044 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40045 return DAG.getBitcast(RootVT, Res);
40046 }
40047 }
40048
40049 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40050 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40051 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40052 ((Subtarget.hasAVX512() &&
40053 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40054 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40055 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40056 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40057 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40058 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40059 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40060 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40061 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40062 for (unsigned i = 0; i != NumMaskElts; ++i)
40063 if (Mask[i] == SM_SentinelZero)
40064 Mask[i] = NumMaskElts + i;
40065 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40066 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40067 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40068 return DAG.getBitcast(RootVT, Res);
40069 }
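The trick above is to give VPERMV3 an explicit all-zero second source and rewrite each zeroed mask element to index into it. A minimal sketch of that mask rewrite (standalone; the zero sentinel is written as -2 here):

#include <vector>

// Illustrative: replace "zero" sentinels (-2 here) with indices into a second,
// all-zero source operand, so a unary+zero shuffle becomes a plain two-input
// permute. Element i of the zero vector lives at index NumElts + i.
static void mapZerosToSecondSource(std::vector<int> &Mask) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int I = 0; I != NumElts; ++I)
    if (Mask[I] == -2 /* zero sentinel */)
      Mask[I] = NumElts + I;
}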
40070
40071 // If that failed and either input is extracted then try to combine as a
40072 // shuffle with the larger type.
40073 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40074 Inputs, Root, BaseMask, Depth, HasVariableMask,
40075 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40076 Subtarget))
40077 return WideShuffle;
40078
40079 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40080 // (non-VLX will pad to 512-bit shuffles).
40081 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40082 ((Subtarget.hasAVX512() &&
40083 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40084 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40085 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40086 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40087 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40088 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40089 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40090 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40091 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40092 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40093 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40094 return DAG.getBitcast(RootVT, Res);
40095 }
40096 return SDValue();
40097 }
40098
40099 // See if we can combine a single input shuffle with zeros to a bit-mask,
40100 // which is much simpler than any shuffle.
40101 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40102 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40103 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
40104 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40105 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40106 APInt UndefElts(NumMaskElts, 0);
40107 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40108 for (unsigned i = 0; i != NumMaskElts; ++i) {
40109 int M = Mask[i];
40110 if (M == SM_SentinelUndef) {
40111 UndefElts.setBit(i);
40112 continue;
40113 }
40114 if (M == SM_SentinelZero)
40115 continue;
40116 EltBits[i] = AllOnes;
40117 }
40118 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40119 Res = CanonicalizeShuffleInput(MaskVT, V1);
40120 unsigned AndOpcode =
40121 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40122 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40123 return DAG.getBitcast(RootVT, Res);
40124 }
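Because every element here either keeps its position, is undef, or is forced to zero, the whole shuffle collapses to an AND with a constant vector. A simplified sketch of building that constant for 32-bit elements (sentinels written as -1/-2; helper invented for illustration):

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative: for a mask where element i is either i (keep), -1 (undef) or
// -2 (zero), the shuffle is equivalent to AND with a constant vector that has
// all-ones lanes for kept elements and zero lanes for zeroed ones.
static std::vector<uint32_t> shuffleToAndMask32(const std::vector<int> &Mask) {
  std::vector<uint32_t> Bits(Mask.size(), 0);
  for (std::size_t I = 0; I != Mask.size(); ++I)
    if (Mask[I] != -2) // keep (or undef, which may be anything) -> all ones
      Bits[I] = 0xFFFFFFFFu;
  return Bits;
}
// e.g. Mask {0, -2, 2, -2} -> AND with {~0u, 0, ~0u, 0}.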
40125
40126 // If we have a single input shuffle with different shuffle patterns in
40127 // the 128-bit lanes, use the variable mask to VPERMILPS.
40128 // TODO: Combine other mask types at higher depths.
40129 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40130 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40131 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40132 SmallVector<SDValue, 16> VPermIdx;
40133 for (int M : Mask) {
40134 SDValue Idx =
40135 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40136 VPermIdx.push_back(Idx);
40137 }
40138 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40139 Res = CanonicalizeShuffleInput(MaskVT, V1);
40140 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40141 return DAG.getBitcast(RootVT, Res);
40142 }
40143
40144 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40145 // to VPERMIL2PD/VPERMIL2PS.
40146 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40147 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40148 MaskVT == MVT::v8f32)) {
40149 // VPERMIL2 Operation.
40150 // Bits[3] - Match Bit.
40151 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40152 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40153 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40154 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40155 SmallVector<int, 8> VPerm2Idx;
40156 unsigned M2ZImm = 0;
40157 for (int M : Mask) {
40158 if (M == SM_SentinelUndef) {
40159 VPerm2Idx.push_back(-1);
40160 continue;
40161 }
40162 if (M == SM_SentinelZero) {
40163 M2ZImm = 2;
40164 VPerm2Idx.push_back(8);
40165 continue;
40166 }
40167 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40168 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40169 VPerm2Idx.push_back(Index);
40170 }
40171 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40172 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40173 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40174 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40175 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40176 return DAG.getBitcast(RootVT, Res);
40177 }
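Following the bit layout documented in the comment above, each VPERMIL2PS selector uses its low two bits for the element within the 128-bit lane and bit 2 to switch between the two sources (bit 3 interacts with the M2Z immediate for zeroing). A standalone sketch of the index computation in the loop (helper invented):

// Illustrative: compute one VPERMIL2PS selector (per 128-bit lane, 4 floats
// per lane). Mask indices >= NumMaskElts refer to the second source; bit 2 of
// the selector flips between the two sources, bits [1:0] pick the element.
static int vpermil2psSelector(int M, int NumMaskElts) {
  const int NumEltsPerLane = 4;
  return (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
}
// e.g. with an 8-element v8f32 mask, M == 9 (second source, element 1 of its
// lane) -> selector 5 (bit 2 set, element 1).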
40178
40179 // If we have 3 or more shuffle instructions or a chain involving a variable
40180 // mask, we can replace them with a single PSHUFB instruction profitably.
40181 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40182 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40183 // more aggressive.
40184 if (UnaryShuffle && AllowVariablePerLaneMask &&
40185 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40186 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40187 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40188 SmallVector<SDValue, 16> PSHUFBMask;
40189 int NumBytes = RootVT.getSizeInBits() / 8;
40190 int Ratio = NumBytes / NumMaskElts;
40191 for (int i = 0; i < NumBytes; ++i) {
40192 int M = Mask[i / Ratio];
40193 if (M == SM_SentinelUndef) {
40194 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40195 continue;
40196 }
40197 if (M == SM_SentinelZero) {
40198 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40199 continue;
40200 }
40201 M = Ratio * M + i % Ratio;
40202 assert((M / 16) == (i / 16) && "Lane crossing detected");
40203 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40204 }
40205 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40206 Res = CanonicalizeShuffleInput(ByteVT, V1);
40207 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40208 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40209 return DAG.getBitcast(RootVT, Res);
40210 }
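PSHUFB control bytes index within the same 128-bit lane, and any control byte with its top bit set zeroes the destination byte; the Ratio arithmetic above simply repeats each wider mask element across its bytes. A simplified standalone sketch (undef and zero sentinels both mapped to 0x80 here; helper invented):

#include <cstdint>
#include <vector>

// Illustrative: expand an element-level shuffle mask into PSHUFB control
// bytes. Each element of size EltBytes expands into EltBytes byte selectors;
// 0x80 in a control byte zeroes the destination byte.
static std::vector<uint8_t> buildPSHUFBMask(const std::vector<int> &Mask,
                                            int EltBytes) {
  std::vector<uint8_t> Bytes;
  for (int M : Mask)
    for (int B = 0; B != EltBytes; ++B)
      // Negative sentinels (undef/zero) become 0x80, which zeroes the byte;
      // otherwise select byte B of source element M.
      Bytes.push_back(M < 0 ? uint8_t(0x80) : uint8_t(M * EltBytes + B));
  return Bytes;
}
// e.g. a v4i32 mask {3, -2, 1, 0} with EltBytes == 4 becomes
// {12,13,14,15, 0x80,0x80,0x80,0x80, 4,5,6,7, 0,1,2,3}.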
40211
40212 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40213 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40214 // slower than PSHUFB on targets that support both.
40215 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40216 Subtarget.hasXOP()) {
40217 // VPPERM Mask Operation
40218 // Bits[4:0] - Byte Index (0 - 31)
40219 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40220 SmallVector<SDValue, 16> VPPERMMask;
40221 int NumBytes = 16;
40222 int Ratio = NumBytes / NumMaskElts;
40223 for (int i = 0; i < NumBytes; ++i) {
40224 int M = Mask[i / Ratio];
40225 if (M == SM_SentinelUndef) {
40226 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40227 continue;
40228 }
40229 if (M == SM_SentinelZero) {
40230 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40231 continue;
40232 }
40233 M = Ratio * M + i % Ratio;
40234 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40235 }
40236 MVT ByteVT = MVT::v16i8;
40237 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40238 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40239 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40240 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40241 return DAG.getBitcast(RootVT, Res);
40242 }
40243
40244 // If that failed and either input is extracted then try to combine as a
40245 // shuffle with the larger type.
40246 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40247 Inputs, Root, BaseMask, Depth, HasVariableMask,
40248 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40249 return WideShuffle;
40250
40251 // If we have a dual input shuffle then lower to VPERMV3,
40252 // (non-VLX will pad to 512-bit shuffles)
40253 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40254 ((Subtarget.hasAVX512() &&
40255 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40256 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40257 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40258 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40259 MaskVT == MVT::v16i32)) ||
40260 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40261 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40262 MaskVT == MVT::v32i16)) ||
40263 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40264 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40265 MaskVT == MVT::v64i8)))) {
40266 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40267 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40268 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40269 return DAG.getBitcast(RootVT, Res);
40270 }
40271
40272 // Failed to find any combines.
40273 return SDValue();
40274}
40275
40276// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40277// instruction if possible.
40278//
40279// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40280// type size to attempt to combine:
40281// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40282// -->
40283// extract_subvector(shuffle(x,y,m2),0)
40284static SDValue combineX86ShuffleChainWithExtract(
40285 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40286 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40287 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40288 const X86Subtarget &Subtarget) {
40289 unsigned NumMaskElts = BaseMask.size();
40290 unsigned NumInputs = Inputs.size();
40291 if (NumInputs == 0)
40292 return SDValue();
40293
40294 EVT RootVT = Root.getValueType();
40295 unsigned RootSizeInBits = RootVT.getSizeInBits();
40296 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40297 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40298
40299 // Peek through extract_subvector to find widest legal vector.
40300 // TODO: Handle ISD::TRUNCATE
40301 unsigned WideSizeInBits = RootSizeInBits;
40302 for (unsigned I = 0; I != NumInputs; ++I) {
40303 SDValue Input = peekThroughBitcasts(Inputs[I]);
40304 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
40305 Input = peekThroughBitcasts(Input.getOperand(0));
40306 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40307 WideSizeInBits < Input.getValueSizeInBits())
40308 WideSizeInBits = Input.getValueSizeInBits();
40309 }
40310
40311 // Bail if we fail to find a source larger than the existing root.
40312 unsigned Scale = WideSizeInBits / RootSizeInBits;
40313 if (WideSizeInBits <= RootSizeInBits ||
40314 (WideSizeInBits % RootSizeInBits) != 0)
40315 return SDValue();
40316
40317 // Create new mask for larger type.
40318 SmallVector<int, 64> WideMask(BaseMask);
40319 for (int &M : WideMask) {
40320 if (M < 0)
40321 continue;
40322 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40323 }
40324 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
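The remapping above keeps each element's offset within its operand while growing the per-operand stride from NumMaskElts to Scale * NumMaskElts, then pads with undefs. A quick standalone illustration (helper invented):

#include <vector>

// Illustrative: rebase mask indices when each shuffle operand is widened by
// `Scale`. Element offsets within an operand are preserved; only the operand
// stride changes from NumElts to Scale * NumElts.
static void widenMaskForScaledOps(std::vector<int> &Mask, int NumElts,
                                  int Scale) {
  for (int &M : Mask)
    if (M >= 0)
      M = (M % NumElts) + (M / NumElts) * Scale * NumElts;
  Mask.insert(Mask.end(), (Scale - 1) * NumElts, -1); // pad with undef
}
// e.g. NumElts = 4, Scale = 2: {0, 5, 2, 7} -> {0, 9, 2, 11, -1, -1, -1, -1}.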
40325
40326 // Attempt to peek through inputs and adjust mask when we extract from an
40327 // upper subvector.
40328 int AdjustedMasks = 0;
40329 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
40330 for (unsigned I = 0; I != NumInputs; ++I) {
40331 SDValue &Input = WideInputs[I];
40332 Input = peekThroughBitcasts(Input);
40333 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40334 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40335 uint64_t Idx = Input.getConstantOperandVal(1);
40336 if (Idx != 0) {
40337 ++AdjustedMasks;
40338 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40339 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40340
40341 int lo = I * WideMask.size();
40342 int hi = (I + 1) * WideMask.size();
40343 for (int &M : WideMask)
40344 if (lo <= M && M < hi)
40345 M += Idx;
40346 }
40347 Input = peekThroughBitcasts(Input.getOperand(0));
40348 }
40349 }
40350
40351 // Remove unused/repeated shuffle source ops.
40352 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40353 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40354
40355 // Bail if we're always extracting from the lowest subvectors
40356 // (combineX86ShuffleChain should match this for the current width), or if
40357 // the shuffle still references too many inputs.
40358 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40359 return SDValue();
40360
40361 // Minor canonicalization of the accumulated shuffle mask to make it easier
40362 // to match below. All this does is detect masks with sequential pairs of
40363 // elements, and shrink them to the half-width mask. It does this in a loop
40364 // so it will reduce the size of the mask to the minimal width mask which
40365 // performs an equivalent shuffle.
40366 while (WideMask.size() > 1) {
40367 SmallVector<int, 64> WidenedMask;
40368 if (!canWidenShuffleElements(WideMask, WidenedMask))
40369 break;
40370 WideMask = std::move(WidenedMask);
40371 }
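Each iteration of this loop asks whether adjacent mask elements form consecutive even/odd pairs, in which case the mask can be expressed with elements twice as wide. A simplified sketch of one widening step (the real canWidenShuffleElements is more permissive about undef/zero sentinels; helper invented):

#include <cstddef>
#include <vector>

// Illustrative: try to halve a shuffle mask's width. Each adjacent pair
// (2k, 2k+1) must either be both undef (-1) or form a consecutive even/odd
// pair {2j, 2j+1}; the widened element is then j.
static bool widenMaskOnce(const std::vector<int> &Mask,
                          std::vector<int> &Widened) {
  Widened.clear();
  for (std::size_t I = 0; I + 1 < Mask.size(); I += 2) {
    int Lo = Mask[I], Hi = Mask[I + 1];
    if (Lo < 0 && Hi < 0)
      Widened.push_back(-1);
    else if (Lo >= 0 && (Lo % 2) == 0 && Hi == Lo + 1)
      Widened.push_back(Lo / 2);
    else
      return false; // cannot express this pair with a wider element
  }
  return true;
}
// e.g. {2, 3, 0, 1} widens to {1, 0}; {1, 0, 2, 3} does not widen.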
40372
40373 // Canonicalization of binary shuffle masks to improve pattern matching by
40374 // commuting the inputs.
40375 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40376 ShuffleVectorSDNode::commuteMask(WideMask);
40377 std::swap(WideInputs[0], WideInputs[1]);
40378 }
40379
40380 // Increase depth for every upper subvector we've peeked through.
40381 Depth += AdjustedMasks;
40382
40383 // Attempt to combine wider chain.
40384 // TODO: Can we use a better Root?
40385 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40386 WideInputs.back().getValueSizeInBits()
40387 ? WideInputs.front()
40388 : WideInputs.back();
40389 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40390 "WideRootSize mismatch");
40391
40392 if (SDValue WideShuffle =
40393 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40394 HasVariableMask, AllowVariableCrossLaneMask,
40395 AllowVariablePerLaneMask, DAG, Subtarget)) {
40396 WideShuffle =
40397 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40398 return DAG.getBitcast(RootVT, WideShuffle);
40399 }
40400
40401 return SDValue();
40402}
40403
40404// Canonicalize the combined shuffle mask chain with horizontal ops.
40405// NOTE: This may update the Ops and Mask.
40406static SDValue canonicalizeShuffleMaskWithHorizOp(
40407 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40408 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40409 const X86Subtarget &Subtarget) {
40410 if (Mask.empty() || Ops.empty())
40411 return SDValue();
40412
40413 SmallVector<SDValue> BC;
40414 for (SDValue Op : Ops)
40415 BC.push_back(peekThroughBitcasts(Op));
40416
40417 // All ops must be the same horizop + type.
40418 SDValue BC0 = BC[0];
40419 EVT VT0 = BC0.getValueType();
40420 unsigned Opcode0 = BC0.getOpcode();
40421 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40422 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40423 }))
40424 return SDValue();
40425
40426 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40427 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40428 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40429 if (!isHoriz && !isPack)
40430 return SDValue();
40431
40432 // Do all ops have a single use?
40433 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40434 return Op.hasOneUse() &&
40435 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40436 });
40437
40438 int NumElts = VT0.getVectorNumElements();
40439 int NumLanes = VT0.getSizeInBits() / 128;
40440 int NumEltsPerLane = NumElts / NumLanes;
40441 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40442 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40443 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40444
40445 if (NumEltsPerLane >= 4 &&
40446 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40447 SmallVector<int> LaneMask, ScaledMask;
40448 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40449 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40450 // See if we can remove the shuffle by resorting the HOP chain so that
40451 // the HOP args are pre-shuffled.
40452 // TODO: Generalize to any sized/depth chain.
40453 // TODO: Add support for PACKSS/PACKUS.
40454 if (isHoriz) {
40455 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40456 auto GetHOpSrc = [&](int M) {
40457 if (M == SM_SentinelUndef)
40458 return DAG.getUNDEF(VT0);
40459 if (M == SM_SentinelZero)
40460 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40461 SDValue Src0 = BC[M / 4];
40462 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40463 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40464 return Src1.getOperand(M % 2);
40465 return SDValue();
40466 };
40467 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40468 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40469 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40470 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40471 if (M0 && M1 && M2 && M3) {
40472 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40473 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40474 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40475 }
40476 }
40477 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40478 if (Ops.size() >= 2) {
40479 SDValue LHS, RHS;
40480 auto GetHOpSrc = [&](int M, int &OutM) {
40481 // TODO: Support SM_SentinelZero
40482 if (M < 0)
40483 return M == SM_SentinelUndef;
40484 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40485 if (!LHS || LHS == Src) {
40486 LHS = Src;
40487 OutM = (M % 2);
40488 return true;
40489 }
40490 if (!RHS || RHS == Src) {
40491 RHS = Src;
40492 OutM = (M % 2) + 2;
40493 return true;
40494 }
40495 return false;
40496 };
40497 int PostMask[4] = {-1, -1, -1, -1};
40498 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40499 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40500 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40501 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40502 LHS = DAG.getBitcast(SrcVT, LHS);
40503 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40504 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40505 // Use SHUFPS for the permute so this will work on SSE3 targets,
40506 // shuffle combining and domain handling will simplify this later on.
40507 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40508 Res = DAG.getBitcast(ShuffleVT, Res);
40509 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40510 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40511 }
40512 }
40513 }
40514 }
40515
40516 if (2 < Ops.size())
40517 return SDValue();
40518
40519 SDValue BC1 = BC[BC.size() - 1];
40520 if (Mask.size() == VT0.getVectorNumElements()) {
40521 // Canonicalize binary shuffles of horizontal ops that use the
40522 // same sources to a unary shuffle.
40523 // TODO: Try to perform this fold even if the shuffle remains.
40524 if (Ops.size() == 2) {
40525 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40526 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40527 };
40528 // Commute if all BC0's ops are contained in BC1.
40529 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40530 ContainsOps(BC1, BC0.getOperand(1))) {
40531 ShuffleVectorSDNode::commuteMask(Mask);
40532 std::swap(Ops[0], Ops[1]);
40533 std::swap(BC0, BC1);
40534 }
40535
40536 // If BC1 can be represented by BC0, then convert to unary shuffle.
40537 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40538 ContainsOps(BC0, BC1.getOperand(1))) {
40539 for (int &M : Mask) {
40540 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40541 continue;
40542 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40543 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40544 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40545 M += NumHalfEltsPerLane;
40546 }
40547 }
40548 }
40549
40550 // Canonicalize unary horizontal ops to only refer to lower halves.
40551 for (int i = 0; i != NumElts; ++i) {
40552 int &M = Mask[i];
40553 if (isUndefOrZero(M))
40554 continue;
40555 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40556 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40557 M -= NumHalfEltsPerLane;
40558 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40559 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40560 M -= NumHalfEltsPerLane;
40561 }
40562 }
40563
40564 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40565 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40566 // represents the LHS/RHS inputs for the lower/upper halves.
40567 SmallVector<int, 16> TargetMask128, WideMask128;
40568 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40569 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40570 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40571 bool SingleOp = (Ops.size() == 1);
40572 if (isPack || OneUseOps ||
40573 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40574 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40575 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40576 Lo = Lo.getOperand(WideMask128[0] & 1);
40577 Hi = Hi.getOperand(WideMask128[1] & 1);
40578 if (SingleOp) {
40579 SDValue Undef = DAG.getUNDEF(SrcVT);
40580 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40581 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40582 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40583 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40584 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40585 }
40586 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40587 }
40588 }
40589
40590 return SDValue();
40591}
40592
40593// Attempt to constant fold all of the constant source ops.
40594// Returns true if the entire shuffle is folded to a constant.
40595// TODO: Extend this to merge multiple constant Ops and update the mask.
40596static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
40597 ArrayRef<int> Mask, SDValue Root,
40598 bool HasVariableMask,
40599 SelectionDAG &DAG,
40600 const X86Subtarget &Subtarget) {
40601 MVT VT = Root.getSimpleValueType();
40602
40603 unsigned SizeInBits = VT.getSizeInBits();
40604 unsigned NumMaskElts = Mask.size();
40605 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40606 unsigned NumOps = Ops.size();
40607
40608 // Extract constant bits from each source op.
40609 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40610 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40611 for (unsigned I = 0; I != NumOps; ++I)
40612 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40613 RawBitsOps[I]))
40614 return SDValue();
40615
40616 // If we're optimizing for size, only fold if at least one of the constants is
40617 // only used once or the combined shuffle has included a variable mask
40618 // shuffle; this is to avoid constant pool bloat.
40619 bool IsOptimizingSize = DAG.shouldOptForSize();
40620 if (IsOptimizingSize && !HasVariableMask &&
40621 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40622 return SDValue();
40623
40624 // Shuffle the constant bits according to the mask.
40625 SDLoc DL(Root);
40626 APInt UndefElts(NumMaskElts, 0);
40627 APInt ZeroElts(NumMaskElts, 0);
40628 APInt ConstantElts(NumMaskElts, 0);
40629 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40630 APInt::getZero(MaskSizeInBits));
40631 for (unsigned i = 0; i != NumMaskElts; ++i) {
40632 int M = Mask[i];
40633 if (M == SM_SentinelUndef) {
40634 UndefElts.setBit(i);
40635 continue;
40636 } else if (M == SM_SentinelZero) {
40637 ZeroElts.setBit(i);
40638 continue;
40639 }
40640 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40641
40642 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40643 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40644
40645 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40646 if (SrcUndefElts[SrcMaskIdx]) {
40647 UndefElts.setBit(i);
40648 continue;
40649 }
40650
40651 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40652 APInt &Bits = SrcEltBits[SrcMaskIdx];
40653 if (!Bits) {
40654 ZeroElts.setBit(i);
40655 continue;
40656 }
40657
40658 ConstantElts.setBit(i);
40659 ConstantBitData[i] = Bits;
40660 }
40661 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40662
40663 // Attempt to create a zero vector.
40664 if ((UndefElts | ZeroElts).isAllOnes())
40665 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
40666
40667 // Create the constant data.
40668 MVT MaskSVT;
40669 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40670 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40671 else
40672 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40673
40674 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40675 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40676 return SDValue();
40677
40678 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40679 return DAG.getBitcast(VT, CstOp);
40680}
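Conceptually, combineX86ShufflesConstants above is indexing into the concatenated constant operands element by element (with undef/zero bookkeeping on top). A simplified standalone sketch for 32-bit constants, with sentinels folded to zero for brevity (names invented):

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative: fold shuffle(C0, C1, Mask) where C0/C1 are constant vectors.
// Mask index M selects element M % N of operand M / N; undef (-1) and zero
// (-2) sentinels are both treated as zero here for simplicity.
static std::vector<uint32_t>
foldConstantShuffle(const std::vector<std::vector<uint32_t>> &Ops,
                    const std::vector<int> &Mask) {
  std::vector<uint32_t> Result;
  const std::size_t N = Mask.size();
  for (int M : Mask)
    Result.push_back(M < 0 ? 0u : Ops[M / N][M % N]);
  return Result;
}
// e.g. Ops = {{1,2,3,4},{5,6,7,8}}, Mask = {0, 4, -2, 7} -> {1, 5, 0, 8}.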
40681
40682namespace llvm {
40683 namespace X86 {
40684 enum {
40685 MaxShuffleCombineDepth = 8
40686 };
40687 }
40688} // namespace llvm
40689
40690/// Fully generic combining of x86 shuffle instructions.
40691///
40692/// This should be the last combine run over the x86 shuffle instructions. Once
40693/// they have been fully optimized, this will recursively consider all chains
40694/// of single-use shuffle instructions, build a generic model of the cumulative
40695/// shuffle operation, and check for simpler instructions which implement this
40696/// operation. We use this primarily for two purposes:
40697///
40698/// 1) Collapse generic shuffles to specialized single instructions when
40699/// equivalent. In most cases, this is just an encoding size win, but
40700/// sometimes we will collapse multiple generic shuffles into a single
40701/// special-purpose shuffle.
40702/// 2) Look for sequences of shuffle instructions with 3 or more total
40703/// instructions, and replace them with the slightly more expensive SSSE3
40704/// PSHUFB instruction if available. We do this as the last combining step
40705/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40706/// a suitable short sequence of other instructions. The PSHUFB will either
40707/// use a register or have to read from memory and so is slightly (but only
40708/// slightly) more expensive than the other shuffle instructions.
40709///
40710/// Because this is inherently a quadratic operation (for each shuffle in
40711/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40712/// This should never be an issue in practice as the shuffle lowering doesn't
40713/// produce sequences of more than 8 instructions.
40714///
40715/// FIXME: We will currently miss some cases where the redundant shuffling
40716/// would simplify under the threshold for PSHUFB formation because of
40717/// combine-ordering. To fix this, we should do the redundant instruction
40718/// combining in this recursive walk.
40719static SDValue combineX86ShufflesRecursively(
40720 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40721 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40722 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40723 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40724 const X86Subtarget &Subtarget) {
40725 assert(!RootMask.empty() &&
40726 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40727 "Illegal shuffle root mask");
40728 MVT RootVT = Root.getSimpleValueType();
40729 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40730 unsigned RootSizeInBits = RootVT.getSizeInBits();
40731
40732 // Bound the depth of our recursive combine because this is ultimately
40733 // quadratic in nature.
40734 if (Depth >= MaxDepth)
40735 return SDValue();
40736
40737 // Directly rip through bitcasts to find the underlying operand.
40738 SDValue Op = SrcOps[SrcOpIndex];
40739 Op = peekThroughOneUseBitcasts(Op);
40740
40741 EVT VT = Op.getValueType();
40742 if (!VT.isVector() || !VT.isSimple())
40743 return SDValue(); // Bail if we hit a non-simple non-vector.
40744
40745 // FIXME: Just bail on f16 for now.
40746 if (VT.getVectorElementType() == MVT::f16)
40747 return SDValue();
40748
40749 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40750 "Can only combine shuffles upto size of the root op.");
40751
40752 // Create a demanded elts mask from the referenced elements of Op.
40753 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40754 for (int M : RootMask) {
40755 int BaseIdx = RootMask.size() * SrcOpIndex;
40756 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40757 OpDemandedElts.setBit(M - BaseIdx);
40758 }
40759 if (RootSizeInBits != VT.getSizeInBits()) {
40760 // Op is smaller than Root - extract the demanded elts for the subvector.
40761 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40762 unsigned NumOpMaskElts = RootMask.size() / Scale;
40763 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40764 assert(OpDemandedElts
40765 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40766 .isZero() &&
40767 "Out of range elements referenced in root mask");
40768 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40769 }
40770 OpDemandedElts =
40771 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
40772
40773 // Extract target shuffle mask and resolve sentinels and inputs.
40774 SmallVector<int, 64> OpMask;
40775 SmallVector<SDValue, 2> OpInputs;
40776 APInt OpUndef, OpZero;
40777 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40778 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40779 OpZero, DAG, Depth, false)) {
40780 // Shuffle inputs must not be larger than the shuffle result.
40781 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40782 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40783 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40784 }))
40785 return SDValue();
40786 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40787 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40788 !isNullConstant(Op.getOperand(1))) {
40789 SDValue SrcVec = Op.getOperand(0);
40790 int ExtractIdx = Op.getConstantOperandVal(1);
40791 unsigned NumElts = VT.getVectorNumElements();
40792 OpInputs.assign({SrcVec});
40793 OpMask.assign(NumElts, SM_SentinelUndef);
40794 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40795 OpZero = OpUndef = APInt::getZero(NumElts);
40796 } else {
40797 return SDValue();
40798 }
40799
40800 // If the shuffle result was smaller than the root, we need to adjust the
40801 // mask indices and pad the mask with undefs.
40802 if (RootSizeInBits > VT.getSizeInBits()) {
40803 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40804 unsigned OpMaskSize = OpMask.size();
40805 if (OpInputs.size() > 1) {
40806 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40807 for (int &M : OpMask) {
40808 if (M < 0)
40809 continue;
40810 int EltIdx = M % OpMaskSize;
40811 int OpIdx = M / OpMaskSize;
40812 M = (PaddedMaskSize * OpIdx) + EltIdx;
40813 }
40814 }
40815 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40816 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40817 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40818 }
40819
40820 SmallVector<int, 64> Mask;
40821 SmallVector<SDValue, 16> Ops;
40822
40823 // We don't need to merge masks if the root is empty.
40824 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40825 if (EmptyRoot) {
40826 // Only resolve zeros if it will remove an input, otherwise we might end
40827 // up in an infinite loop.
40828 bool ResolveKnownZeros = true;
40829 if (!OpZero.isZero()) {
40830 APInt UsedInputs = APInt::getZero(OpInputs.size());
40831 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40832 int M = OpMask[i];
40833 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40834 continue;
40835 UsedInputs.setBit(M / OpMask.size());
40836 if (UsedInputs.isAllOnes()) {
40837 ResolveKnownZeros = false;
40838 break;
40839 }
40840 }
40841 }
40842 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40843 ResolveKnownZeros);
40844
40845 Mask = OpMask;
40846 Ops.append(OpInputs.begin(), OpInputs.end());
40847 } else {
40848 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40849
40850 // Add the inputs to the Ops list, avoiding duplicates.
40851 Ops.append(SrcOps.begin(), SrcOps.end());
40852
40853 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40854 // Attempt to find an existing match.
40855 SDValue InputBC = peekThroughBitcasts(Input);
40856 for (int i = 0, e = Ops.size(); i < e; ++i)
40857 if (InputBC == peekThroughBitcasts(Ops[i]))
40858 return i;
40859 // Match failed - should we replace an existing Op?
40860 if (InsertionPoint >= 0) {
40861 Ops[InsertionPoint] = Input;
40862 return InsertionPoint;
40863 }
40864 // Add to the end of the Ops list.
40865 Ops.push_back(Input);
40866 return Ops.size() - 1;
40867 };
40868
40869 SmallVector<int, 2> OpInputIdx;
40870 for (SDValue OpInput : OpInputs)
40871 OpInputIdx.push_back(
40872 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40873
40874 assert(((RootMask.size() > OpMask.size() &&
40875 RootMask.size() % OpMask.size() == 0) ||
40876 (OpMask.size() > RootMask.size() &&
40877 OpMask.size() % RootMask.size() == 0) ||
40878 OpMask.size() == RootMask.size()) &&
40879 "The smaller number of elements must divide the larger.");
40880
40881 // This function can be performance-critical, so we rely on the power-of-2
40882 // knowledge that we have about the mask sizes to replace div/rem ops with
40883 // bit-masks and shifts.
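// (For a power-of-2 size N, i % N == (i & (N - 1)) and i / N == (i >> log2(N));
// that identity is all the masking and shifting below relies on.)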
40884 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
40885 "Non-power-of-2 shuffle mask sizes");
40886 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
40887 "Non-power-of-2 shuffle mask sizes");
40888 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
40889 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
40890
40891 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40892 unsigned RootRatio =
40893 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40894 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40895 assert((RootRatio == 1 || OpRatio == 1) &&
40896 "Must not have a ratio for both incoming and op masks!");
40897
40898 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40899 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40900 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40901 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
40902 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
40903
40904 Mask.resize(MaskWidth, SM_SentinelUndef);
40905
40906 // Merge this shuffle operation's mask into our accumulated mask. Note that
40907 // this shuffle's mask will be the first applied to the input, followed by
40908 // the root mask to get us all the way to the root value arrangement. The
40909 // reason for this order is that we are recursing up the operation chain.
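// (E.g. with equal-sized masks and a single input, this composes to
// Mask[i] = OpMask[RootMask[i]]: RootMask <2,0,1,3> over OpMask <1,0,3,2>
// gives <3,1,0,2>.)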
40910 for (unsigned i = 0; i < MaskWidth; ++i) {
40911 unsigned RootIdx = i >> RootRatioLog2;
40912 if (RootMask[RootIdx] < 0) {
40913 // This is a zero or undef lane, we're done.
40914 Mask[i] = RootMask[RootIdx];
40915 continue;
40916 }
40917
40918 unsigned RootMaskedIdx =
40919 RootRatio == 1
40920 ? RootMask[RootIdx]
40921 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40922
40923 // Just insert the scaled root mask value if it references an input other
40924 // than the SrcOp we're currently inserting.
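// (Only indices in [SrcOpIndex * MaskWidth, (SrcOpIndex + 1) * MaskWidth)
// select from the op currently being merged.)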
40925 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40926 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40927 Mask[i] = RootMaskedIdx;
40928 continue;
40929 }
40930
40931 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40932 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40933 if (OpMask[OpIdx] < 0) {
40934 // The incoming lanes are zero or undef; it doesn't matter which ones we
40935 // are using.
40936 Mask[i] = OpMask[OpIdx];
40937 continue;
40938 }
40939
40940 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40941 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40942 : (OpMask[OpIdx] << OpRatioLog2) +
40943 (RootMaskedIdx & (OpRatio - 1));
40944
40945 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40946 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40947 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40948 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40949
40950 Mask[i] = OpMaskedIdx;
40951 }
40952 }
40953
40954 // Peek through vector widenings and set out of bounds mask indices to undef.
40955 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
40956 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
40957 SDValue &Op = Ops[I];
40958 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
40959 isNullConstant(Op.getOperand(2))) {
40960 Op = Op.getOperand(1);
40961 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
40962 int Lo = I * Mask.size();
40963 int Hi = (I + 1) * Mask.size();
40964 int NewHi = Lo + (Mask.size() / Scale);
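// Mask values in [NewHi, Hi) index the undef padding of the widened Op, so
// they can safely be marked undef below.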
40965 for (int &M : Mask) {
40966 if (Lo <= M && NewHi <= M && M < Hi)
40967 M = SM_SentinelUndef;
40968 }
40969 }
40970 }
40971
40972 // Peek through any free extract_subvector nodes back to root size.
40973 for (SDValue &Op : Ops)
40974 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40975 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40976 isNullConstant(Op.getOperand(1)))
40977 Op = Op.getOperand(0);
40978
40979 // Remove unused/repeated shuffle source ops.
40980 resolveTargetShuffleInputsAndMask(Ops, Mask);
40981
40982 // Handle the all undef/zero/ones cases early.
40983 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
40984 return DAG.getUNDEF(RootVT);
40985 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
40986 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
40987 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
40988 !llvm::is_contained(Mask, SM_SentinelZero))
40989 return getOnesVector(RootVT, DAG, SDLoc(Root));
40990
40991 assert(!Ops.empty() && "Shuffle with no inputs detected");
40992 HasVariableMask |= IsOpVariableMask;
40993
40994 // Update the list of shuffle nodes that have been combined so far.
40995 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
40996 SrcNodes.end());
40997 CombinedNodes.push_back(Op.getNode());
40998
40999 // See if we can recurse into each shuffle source op (if it's a target
41000 // shuffle). The source op should only be generally combined if it either has
41001 // a single use (i.e. current Op) or all its users have already been combined;
41002 // if not, we can still combine but should prevent generation of variable
41003 // shuffles to avoid constant pool bloat.
41004 // Don't recurse if we already have more source ops than we can combine in
41005 // the remaining recursion depth.
41006 if (Ops.size() < (MaxDepth - Depth)) {
41007 for (int i = 0, e = Ops.size(); i < e; ++i) {
41008 // For empty roots, we need to resolve zeroable elements before combining
41009 // them with other shuffles.
41010 SmallVector<int, 64> ResolvedMask = Mask;
41011 if (EmptyRoot)
41012 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41013 bool AllowCrossLaneVar = false;
41014 bool AllowPerLaneVar = false;
41015 if (Ops[i].getNode()->hasOneUse() ||
41016 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41017 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41018 AllowPerLaneVar = AllowVariablePerLaneMask;
41019 }
41020 if (SDValue Res = combineX86ShufflesRecursively(
41021 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41022 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41023 Subtarget))
41024 return Res;
41025 }
41026 }
41027
41028 // Attempt to constant fold all of the constant source ops.
41029 if (SDValue Cst = combineX86ShufflesConstants(
41030 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
41031 return Cst;
41032
41033 // If constant folding failed and we only have constants, then we have
41034 // multiple uses by a single non-variable shuffle - just bail.
41035 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41036 APInt UndefElts;
41037 SmallVector<APInt> RawBits;
41038 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41039 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41040 RawBits);
41041 })) {
41042 return SDValue();
41043 }
41044
41045 // Canonicalize the combined shuffle mask chain with horizontal ops.
41046 // NOTE: This will update the Ops and Mask.
41047 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41048 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
41049 return DAG.getBitcast(RootVT, HOp);
41050
41051 // Try to refine our inputs given our knowledge of target shuffle mask.
41052 for (auto I : enumerate(Ops)) {
41053 int OpIdx = I.index();
41054 SDValue &Op = I.value();
41055
41056 // What range of shuffle mask element values results in picking from Op?
41057 int Lo = OpIdx * Mask.size();
41058 int Hi = Lo + Mask.size();
41059
41060 // Which elements of Op do we demand, given the mask's granularity?
41061 APInt OpDemandedElts(Mask.size(), 0);
41062 for (int MaskElt : Mask) {
41063 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41064 int OpEltIdx = MaskElt - Lo;
41065 OpDemandedElts.setBit(OpEltIdx);
41066 }
41067 }
41068
41069 // Is the shuffle result smaller than the root?
41070 if (Op.getValueSizeInBits() < RootSizeInBits) {
41071 // We padded the mask with undefs. But we now need to undo that.
41072 unsigned NumExpectedVectorElts = Mask.size();
41073 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41074 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41075 assert(!OpDemandedElts.extractBits(
41076 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41077 "Demanding the virtual undef widening padding?");
41078 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41079 }
41080
41081 // The Op itself may be of different VT, so we need to scale the mask.
41082 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41083 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41084
41085 // Can this operand be simplified any further, given its demanded elements?
41086 if (SDValue NewOp =
41087 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41088 Op, OpScaledDemandedElts, DAG))
41089 Op = NewOp;
41090 }
41091 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41092
41093 // Widen any subvector shuffle inputs we've collected.
41094 // TODO: Remove this to avoid generating temporary nodes, we should only
41095 // widen once combineX86ShuffleChain has found a match.
41096 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41097 return Op.getValueSizeInBits() < RootSizeInBits;
41098 })) {
41099 for (SDValue &Op : Ops)
41100 if (Op.getValueSizeInBits() < RootSizeInBits)
41101 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41102 RootSizeInBits);
41103 // Reresolve - we might have repeated subvector sources.
41104 resolveTargetShuffleInputsAndMask(Ops, Mask);
41105 }
41106
41107 // We can only combine unary and binary shuffle mask cases.
41108 if (Ops.size() <= 2) {
41109 // Minor canonicalization of the accumulated shuffle mask to make it easier
41110 // to match below. All this does is detect masks with sequential pairs of
41111 // elements, and shrink them to the half-width mask. It does this in a loop
41112 // so it will reduce the size of the mask to the minimal width mask which
41113 // performs an equivalent shuffle.
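// (E.g. a v4i32-style mask <0,1,4,5> widens to the v2i64-style mask <0,2>.)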
41114 while (Mask.size() > 1) {
41115 SmallVector<int, 64> WidenedMask;
41116 if (!canWidenShuffleElements(Mask, WidenedMask))
41117 break;
41118 Mask = std::move(WidenedMask);
41119 }
41120
41121 // Canonicalization of binary shuffle masks to improve pattern matching by
41122 // commuting the inputs.
41123 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41124 ShuffleVectorSDNode::commuteMask(Mask);
41125 std::swap(Ops[0], Ops[1]);
41126 }
41127
41128 // Try to combine into a single shuffle instruction.
41129 if (SDValue Shuffle = combineX86ShuffleChain(
41130 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41131 AllowVariablePerLaneMask, DAG, Subtarget))
41132 return Shuffle;
41133
41134 // If all the operands come from the same larger vector, fallthrough and try
41135 // to use combineX86ShuffleChainWithExtract.
41136 SDValue LHS = peekThroughBitcasts(Ops.front());
41137 SDValue RHS = peekThroughBitcasts(Ops.back());
41138 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41139 (RootSizeInBits / Mask.size()) != 64 ||
41140 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41141 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41142 LHS.getOperand(0) != RHS.getOperand(0))
41143 return SDValue();
41144 }
41145
41146 // If that failed and any input is extracted then try to combine as a
41147 // shuffle with the larger type.
41148 return combineX86ShuffleChainWithExtract(
41149 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41150 AllowVariablePerLaneMask, DAG, Subtarget);
41151}
41152
41153/// Helper entry wrapper to combineX86ShufflesRecursively.
41154static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41155 const X86Subtarget &Subtarget) {
41156 return combineX86ShufflesRecursively(
41157 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41158 /*HasVarMask*/ false,
41159 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41160 Subtarget);
41161}
41162
41163/// Get the PSHUF-style mask from a PSHUF node.
41164///
41165/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41166/// PSHUF-style masks that can be reused with such instructions.
41167static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41168 MVT VT = N.getSimpleValueType();
41169 SmallVector<int, 4> Mask;
41170 SmallVector<SDValue, 2> Ops;
41171 bool HaveMask =
41172 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
41173 (void)HaveMask;
41174 assert(HaveMask);
41175
41176 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41177 // matter. Check that the upper masks are repeats and remove them.
41178 if (VT.getSizeInBits() > 128) {
41179 int LaneElts = 128 / VT.getScalarSizeInBits();
41180#ifndef NDEBUG
41181 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41182 for (int j = 0; j < LaneElts; ++j)
41183 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41184 "Mask doesn't repeat in high 128-bit lanes!");
41185#endif
41186 Mask.resize(LaneElts);
41187 }
41188
41189 switch (N.getOpcode()) {
41190 case X86ISD::PSHUFD:
41191 return Mask;
41192 case X86ISD::PSHUFLW:
41193 Mask.resize(4);
41194 return Mask;
41195 case X86ISD::PSHUFHW:
41196 Mask.erase(Mask.begin(), Mask.begin() + 4);
41197 for (int &M : Mask)
41198 M -= 4;
41199 return Mask;
41200 default:
41201 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41201)
;
41202 }
41203}
41204
41205/// Search for a combinable shuffle across a chain ending in pshufd.
41206///
41207/// We walk up the chain and look for a combinable shuffle, skipping over
41208/// shuffles that we could hoist this shuffle's transformation past without
41209/// altering anything.
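/// E.g. back-to-back pshufd nodes are merged by composing their dword masks.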
41210static SDValue
41211combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41212 SelectionDAG &DAG) {
41213 assert(N.getOpcode() == X86ISD::PSHUFD &&
41214 "Called with something other than an x86 128-bit half shuffle!");
41215 SDLoc DL(N);
41216
41217 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41218 // of the shuffles in the chain so that we can form a fresh chain to replace
41219 // this one.
41220 SmallVector<SDValue, 8> Chain;
41221 SDValue V = N.getOperand(0);
41222 for (; V.hasOneUse(); V = V.getOperand(0)) {
41223 switch (V.getOpcode()) {
41224 default:
41225 return SDValue(); // Nothing combined!
41226
41227 case ISD::BITCAST:
41228 // Skip bitcasts as we always know the type for the target specific
41229 // instructions.
41230 continue;
41231
41232 case X86ISD::PSHUFD:
41233 // Found another dword shuffle.
41234 break;
41235
41236 case X86ISD::PSHUFLW:
41237 // Check that the low words (being shuffled) are the identity in the
41238 // dword shuffle, and the high words are self-contained.
41239 if (Mask[0] != 0 || Mask[1] != 1 ||
41240 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41241 return SDValue();
41242
41243 Chain.push_back(V);
41244 continue;
41245
41246 case X86ISD::PSHUFHW:
41247 // Check that the high words (being shuffled) are the identity in the
41248 // dword shuffle, and the low words are self-contained.
41249 if (Mask[2] != 2 || Mask[3] != 3 ||
41250 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41251 return SDValue();
41252
41253 Chain.push_back(V);
41254 continue;
41255
41256 case X86ISD::UNPCKL:
41257 case X86ISD::UNPCKH:
41258 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41259 // shuffle into a preceding word shuffle.
41260 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41261 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41262 return SDValue();
41263
41264 // Search for a half-shuffle which we can combine with.
41265 unsigned CombineOp =
41266 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41267 if (V.getOperand(0) != V.getOperand(1) ||
41268 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41269 return SDValue();
41270 Chain.push_back(V);
41271 V = V.getOperand(0);
41272 do {
41273 switch (V.getOpcode()) {
41274 default:
41275 return SDValue(); // Nothing to combine.
41276
41277 case X86ISD::PSHUFLW:
41278 case X86ISD::PSHUFHW:
41279 if (V.getOpcode() == CombineOp)
41280 break;
41281
41282 Chain.push_back(V);
41283
41284 [[fallthrough]];
41285 case ISD::BITCAST:
41286 V = V.getOperand(0);
41287 continue;
41288 }
41289 break;
41290 } while (V.hasOneUse());
41291 break;
41292 }
41293 // Break out of the loop if we break out of the switch.
41294 break;
41295 }
41296
41297 if (!V.hasOneUse())
41298 // We fell out of the loop without finding a viable combining instruction.
41299 return SDValue();
41300
41301 // Merge this node's mask and our incoming mask.
41302 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41303 for (int &M : Mask)
41304 M = VMask[M];
41305 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41306 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41307
41308 // Rebuild the chain around this new shuffle.
41309 while (!Chain.empty()) {
41310 SDValue W = Chain.pop_back_val();
41311
41312 if (V.getValueType() != W.getOperand(0).getValueType())
41313 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41314
41315 switch (W.getOpcode()) {
41316 default:
41317 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41317)
;
41318
41319 case X86ISD::UNPCKL:
41320 case X86ISD::UNPCKH:
41321 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41322 break;
41323
41324 case X86ISD::PSHUFD:
41325 case X86ISD::PSHUFLW:
41326 case X86ISD::PSHUFHW:
41327 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41328 break;
41329 }
41330 }
41331 if (V.getValueType() != N.getValueType())
41332 V = DAG.getBitcast(N.getValueType(), V);
41333
41334 // Return the new chain to replace N.
41335 return V;
41336}
41337
41338// Attempt to commute shufps LHS loads:
41339// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
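// Commuting puts the foldable load in shufps' second (memory-capable) operand;
// the shufps imm8 nibbles are swapped and the surrounding shuffle's immediate
// adjusted to compensate.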
41340static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41341 SelectionDAG &DAG) {
41342 // TODO: Add vXf64 support.
41343 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41344 return SDValue();
41345
41346 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41347 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41348 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41349 return SDValue();
41350 SDValue N0 = V.getOperand(0);
41351 SDValue N1 = V.getOperand(1);
41352 unsigned Imm = V.getConstantOperandVal(2);
41353 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41354 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41355 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41356 return SDValue();
41357 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41358 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41359 DAG.getTargetConstant(Imm, DL, MVT::i8));
41360 };
41361
41362 switch (N.getOpcode()) {
41363 case X86ISD::VPERMILPI:
41364 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41365 unsigned Imm = N.getConstantOperandVal(1);
41366 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41367 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41368 }
41369 break;
41370 case X86ISD::SHUFP: {
41371 SDValue N0 = N.getOperand(0);
41372 SDValue N1 = N.getOperand(1);
41373 unsigned Imm = N.getConstantOperandVal(2);
41374 if (N0 == N1) {
41375 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41376 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41377 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41378 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41379 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41380 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41381 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41382 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41383 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41384 }
41385 break;
41386 }
41387 }
41388
41389 return SDValue();
41390}
41391
41392// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
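// E.g. pshufd(and(x, c)) -> and(pshufd(x), pshufd(c)) when the shuffle can fold
// into the binop's operands (constants, splats, other shuffles) for free.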
41393static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
41394 const SDLoc &DL) {
41395 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41396 EVT ShuffleVT = N.getValueType();
41397
41398 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
41399 // AllZeros/AllOnes constants are freely shuffled and will peek through
41400 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41401 // merge with target shuffles if it has one use so shuffle combining is
41402 // likely to kick in. Shuffles of splats are expected to be removed.
41403 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41404 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41405 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41406 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41407 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41408 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41409 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41410 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41411 };
41412 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41413 // Ensure we only shuffle whole vector src elements, unless it's a logical
41414 // binop where we can more aggressively move shuffles from dst to src.
41415 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
41416 BinOp == X86ISD::ANDNP ||
41417 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41418 };
41419
41420 unsigned Opc = N.getOpcode();
41421 switch (Opc) {
41422 // Unary and Unary+Permute Shuffles.
41423 case X86ISD::PSHUFB: {
41424 // Don't merge PSHUFB if it contains zero'd elements.
41425 SmallVector<int> Mask;
41426 SmallVector<SDValue> Ops;
41427 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
41428 Mask))
41429 break;
41430 [[fallthrough]];
41431 }
41432 case X86ISD::VBROADCAST:
41433 case X86ISD::MOVDDUP:
41434 case X86ISD::PSHUFD:
41435 case X86ISD::PSHUFHW:
41436 case X86ISD::PSHUFLW:
41437 case X86ISD::VPERMI:
41438 case X86ISD::VPERMILPI: {
41439 if (N.getOperand(0).getValueType() == ShuffleVT &&
41440 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41441 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41442 unsigned SrcOpcode = N0.getOpcode();
41443 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41444 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41445 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41446 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
41447 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
41448 SDValue LHS, RHS;
41449 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41450 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41451 if (N.getNumOperands() == 2) {
41452 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41453 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41454 } else {
41455 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41456 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41457 }
41458 EVT OpVT = N0.getValueType();
41459 return DAG.getBitcast(ShuffleVT,
41460 DAG.getNode(SrcOpcode, DL, OpVT,
41461 DAG.getBitcast(OpVT, LHS),
41462 DAG.getBitcast(OpVT, RHS)));
41463 }
41464 }
41465 }
41466 break;
41467 }
41468 // Binary and Binary+Permute Shuffles.
41469 case X86ISD::INSERTPS: {
41470 // Don't merge INSERTPS if it contains zero'd elements.
41471 unsigned InsertPSMask = N.getConstantOperandVal(2);
41472 unsigned ZeroMask = InsertPSMask & 0xF;
41473 if (ZeroMask != 0)
41474 break;
41475 [[fallthrough]];
41476 }
41477 case X86ISD::MOVSD:
41478 case X86ISD::MOVSS:
41479 case X86ISD::BLENDI:
41480 case X86ISD::SHUFP:
41481 case X86ISD::UNPCKH:
41482 case X86ISD::UNPCKL: {
41483 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41484 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41485 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41486 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41487 unsigned SrcOpcode = N0.getOpcode();
41488 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41489 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41490 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41491 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41492 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41493 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41494 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41495 // Ensure the total number of shuffles doesn't increase by folding this
41496 // shuffle through to the source ops.
41497 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41498 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41499 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41500 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41501 SDValue LHS, RHS;
41502 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41503 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41504 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41505 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41506 if (N.getNumOperands() == 3) {
41507 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41508 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41509 } else {
41510 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41511 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41512 }
41513 EVT OpVT = N0.getValueType();
41514 return DAG.getBitcast(ShuffleVT,
41515 DAG.getNode(SrcOpcode, DL, OpVT,
41516 DAG.getBitcast(OpVT, LHS),
41517 DAG.getBitcast(OpVT, RHS)));
41518 }
41519 }
41520 }
41521 break;
41522 }
41523 }
41524 return SDValue();
41525}
41526
41527/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
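/// E.g. vperm2x128(movddup(x), movddup(y)) -> movddup(vperm2x128(x, y)).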
41528static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41529 SelectionDAG &DAG,
41530 const SDLoc &DL) {
41531 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41532
41533 MVT VT = V.getSimpleValueType();
41534 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41535 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41536 unsigned SrcOpc0 = Src0.getOpcode();
41537 unsigned SrcOpc1 = Src1.getOpcode();
41538 EVT SrcVT0 = Src0.getValueType();
41539 EVT SrcVT1 = Src1.getValueType();
41540
41541 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41542 return SDValue();
41543
41544 switch (SrcOpc0) {
41545 case X86ISD::MOVDDUP: {
41546 SDValue LHS = Src0.getOperand(0);
41547 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41548 SDValue Res =
41549 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41550 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41551 return DAG.getBitcast(VT, Res);
41552 }
41553 case X86ISD::VPERMILPI:
41554 // TODO: Handle v4f64 permutes with different low/high lane masks.
41555 if (SrcVT0 == MVT::v4f64) {
41556 uint64_t Mask = Src0.getConstantOperandVal(1);
41557 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41558 break;
41559 }
41560 [[fallthrough]];
41561 case X86ISD::VSHLI:
41562 case X86ISD::VSRLI:
41563 case X86ISD::VSRAI:
41564 case X86ISD::PSHUFD:
41565 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41566 SDValue LHS = Src0.getOperand(0);
41567 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41568 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41569 V.getOperand(2));
41570 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41571 return DAG.getBitcast(VT, Res);
41572 }
41573 break;
41574 }
41575
41576 return SDValue();
41577}
41578
41579/// Try to combine x86 target specific shuffles.
41580static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41581 TargetLowering::DAGCombinerInfo &DCI,
41582 const X86Subtarget &Subtarget) {
41583 SDLoc DL(N);
41584 MVT VT = N.getSimpleValueType();
41585 SmallVector<int, 4> Mask;
41586 unsigned Opcode = N.getOpcode();
41587
41588 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41589 return R;
41590
41591 // Handle specific target shuffles.
41592 switch (Opcode) {
41593 case X86ISD::MOVDDUP: {
41594 SDValue Src = N.getOperand(0);
41595 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41596 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41597 ISD::isNormalLoad(Src.getNode())) {
41598 LoadSDNode *LN = cast<LoadSDNode>(Src);
41599 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41600 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41601 DCI.CombineTo(N.getNode(), Movddup);
41602 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41603 DCI.recursivelyDeleteUnusedNodes(LN);
41604 return N; // Return N so it doesn't get rechecked!
41605 }
41606 }
41607
41608 return SDValue();
41609 }
41610 case X86ISD::VBROADCAST: {
41611 SDValue Src = N.getOperand(0);
41612 SDValue BC = peekThroughBitcasts(Src);
41613 EVT SrcVT = Src.getValueType();
41614 EVT BCVT = BC.getValueType();
41615
41616 // If broadcasting from another shuffle, attempt to simplify it.
41617 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41618 if (isTargetShuffle(BC.getOpcode()) &&
41619 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41620 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41621 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41622 SM_SentinelUndef);
41623 for (unsigned i = 0; i != Scale; ++i)
41624 DemandedMask[i] = i;
41625 if (SDValue Res = combineX86ShufflesRecursively(
41626 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41627 X86::MaxShuffleCombineDepth,
41628 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41629 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41630 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41631 DAG.getBitcast(SrcVT, Res));
41632 }
41633
41634 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41635 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41636 if (Src.getOpcode() == ISD::BITCAST &&
41637 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41638 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41639 FixedVectorType::isValidElementType(
41640 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41641 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41642 VT.getVectorNumElements());
41643 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41644 }
41645
41646 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41647 // If we're re-broadcasting a smaller type then broadcast with that type and
41648 // bitcast.
41649 // TODO: Do this for any splat?
41650 if (Src.getOpcode() == ISD::BITCAST &&
41651 (BC.getOpcode() == X86ISD::VBROADCAST ||
41652 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41653 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41654 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41655 MVT NewVT =
41656 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41657 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41658 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41659 }
41660
41661 // Reduce broadcast source vector to lowest 128-bits.
41662 if (SrcVT.getSizeInBits() > 128)
41663 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41664 extract128BitVector(Src, 0, DAG, DL));
41665
41666 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41667 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
41668 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41669
41670 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41671 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41672 isNullConstant(Src.getOperand(1)) &&
41673 DAG.getTargetLoweringInfo().isTypeLegal(
41674 Src.getOperand(0).getValueType()))
41675 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41676
41677 // Share broadcast with the longest vector and extract low subvector (free).
41678 // Ensure the same SDValue from the SDNode use is being used.
41679 for (SDNode *User : Src->uses())
41680 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41681 Src == User->getOperand(0) &&
41682 User->getValueSizeInBits(0).getFixedValue() >
41683 VT.getFixedSizeInBits()) {
41684 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41685 VT.getSizeInBits());
41686 }
41687
41688 // vbroadcast(scalarload X) -> vbroadcast_load X
41689 // For float loads, extract other uses of the scalar from the broadcast.
41690 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41691 ISD::isNormalLoad(Src.getNode())) {
41692 LoadSDNode *LN = cast<LoadSDNode>(Src);
41693 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41694 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41695 SDValue BcastLd =
41696 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41697 LN->getMemoryVT(), LN->getMemOperand());
41698 // If the load value is used only by N, replace it via CombineTo N.
41699 bool NoReplaceExtract = Src.hasOneUse();
41700 DCI.CombineTo(N.getNode(), BcastLd);
41701 if (NoReplaceExtract) {
41702 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41703 DCI.recursivelyDeleteUnusedNodes(LN);
41704 } else {
41705 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41706 DAG.getIntPtrConstant(0, DL));
41707 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41708 }
41709 return N; // Return N so it doesn't get rechecked!
41710 }
41711
41712 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41713 // i16. So shrink it ourselves if we can make a broadcast_load.
41714 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41715 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41716 assert(Subtarget.hasAVX2() && "Expected AVX2");
41717 SDValue TruncIn = Src.getOperand(0);
41718
41719 // If this is a truncate of a non-extending load, we can just narrow it to
41720 // use a broadcast_load.
41721 if (ISD::isNormalLoad(TruncIn.getNode())) {
41722 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41724 // Unless it's volatile or atomic.
41724 if (LN->isSimple()) {
41725 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41726 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41727 SDValue BcastLd = DAG.getMemIntrinsicNode(
41728 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41729 LN->getPointerInfo(), LN->getOriginalAlign(),
41730 LN->getMemOperand()->getFlags());
41731 DCI.CombineTo(N.getNode(), BcastLd);
41732 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41733 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41734 return N; // Return N so it doesn't get rechecked!
41735 }
41736 }
41737
41738 // If this is a truncate of an i16 extload, we can directly replace it.
41739 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41740 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41741 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41742 if (LN->getMemoryVT().getSizeInBits() == 16) {
41743 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41744 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41745 SDValue BcastLd =
41746 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41747 LN->getMemoryVT(), LN->getMemOperand());
41748 DCI.CombineTo(N.getNode(), BcastLd);
41749 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41750 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41751 return N; // Return N so it doesn't get rechecked!
41752 }
41753 }
41754
41755 // If this is a truncate of a load that has been shifted right, we can
41756 // offset the pointer and use a narrower load.
41757 if (TruncIn.getOpcode() == ISD::SRL &&
41758 TruncIn.getOperand(0).hasOneUse() &&
41759 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41760 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41761 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41762 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41763 // Make sure the shift amount and the load size are divisible by 16.
41764 // Don't do this if the load is volatile or atomic.
41765 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41766 LN->isSimple()) {
41767 unsigned Offset = ShiftAmt / 8;
41768 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41769 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
41770 TypeSize::Fixed(Offset), DL);
41771 SDValue Ops[] = { LN->getChain(), Ptr };
41772 SDValue BcastLd = DAG.getMemIntrinsicNode(
41773 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41774 LN->getPointerInfo().getWithOffset(Offset),
41775 LN->getOriginalAlign(),
41776 LN->getMemOperand()->getFlags());
41777 DCI.CombineTo(N.getNode(), BcastLd);
41778 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41779 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41780 return N; // Return N so it doesn't get rechecked!
41781 }
41782 }
41783 }
41784
41785 // vbroadcast(vzload X) -> vbroadcast_load X
41786 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41787 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41788 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41789 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41790 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41791 SDValue BcastLd =
41792 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41793 LN->getMemoryVT(), LN->getMemOperand());
41794 DCI.CombineTo(N.getNode(), BcastLd);
41795 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41796 DCI.recursivelyDeleteUnusedNodes(LN);
41797 return N; // Return N so it doesn't get rechecked!
41798 }
41799 }
41800
41801 // vbroadcast(vector load X) -> vbroadcast_load
41802 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41803 SrcVT == MVT::v4i32) &&
41804 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41805 LoadSDNode *LN = cast<LoadSDNode>(Src);
41806 // Unless the load is volatile or atomic.
41807 if (LN->isSimple()) {
41808 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41809 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41810 SDValue BcastLd = DAG.getMemIntrinsicNode(
41811 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
41812 LN->getPointerInfo(), LN->getOriginalAlign(),
41813 LN->getMemOperand()->getFlags());
41814 DCI.CombineTo(N.getNode(), BcastLd);
41815 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41816 DCI.recursivelyDeleteUnusedNodes(LN);
41817 return N; // Return N so it doesn't get rechecked!
41818 }
41819 }
41820
41821 return SDValue();
41822 }
41823 case X86ISD::VZEXT_MOVL: {
41824 SDValue N0 = N.getOperand(0);
41825
41826 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
41827 // the load is volatile.
41828 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
41829 auto *LN = cast<LoadSDNode>(N0);
41830 if (SDValue VZLoad =
41831 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
41832 DCI.CombineTo(N.getNode(), VZLoad);
41833 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41834 DCI.recursivelyDeleteUnusedNodes(LN);
41835 return N;
41836 }
41837 }
41838
41839 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
41840 // and can just use a VZEXT_LOAD.
41841 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
41842 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
41843 auto *LN = cast<MemSDNode>(N0);
41844 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
41845 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41846 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41847 SDValue VZLoad =
41848 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
41849 LN->getMemoryVT(), LN->getMemOperand());
41850 DCI.CombineTo(N.getNode(), VZLoad);
41851 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41852 DCI.recursivelyDeleteUnusedNodes(LN);
41853 return N;
41854 }
41855 }
41856
41857 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
41858 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
41859 // if the upper bits of the i64 are zero.
41860 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41861 N0.getOperand(0).hasOneUse() &&
41862 N0.getOperand(0).getValueType() == MVT::i64) {
41863 SDValue In = N0.getOperand(0);
41864 APInt Mask = APInt::getHighBitsSet(64, 32);
41865 if (DAG.MaskedValueIsZero(In, Mask)) {
41866 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
41867 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
41868 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
41869 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
41870 return DAG.getBitcast(VT, Movl);
41871 }
41872 }
41873
41874 // Load a scalar integer constant directly to XMM instead of transferring an
41875 // immediate value from GPR.
41876 // vzext_movl (scalar_to_vector C) --> load [C,0...]
41877 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
41878 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
41879 // Create a vector constant - scalar constant followed by zeros.
41880 EVT ScalarVT = N0.getOperand(0).getValueType();
41881 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
41882 unsigned NumElts = VT.getVectorNumElements();
41883 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
41884 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
41885 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
41886
41887 // Load the vector constant from constant pool.
41888 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
41889 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
41890 MachinePointerInfo MPI =
41891 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
41892 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41893 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
41894 MachineMemOperand::MOLoad);
41895 }
41896 }
41897
41898 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
41899 // insert into a zero vector. This helps get VZEXT_MOVL closer to
41900 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
41901 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
41902 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41903 SDValue V = peekThroughOneUseBitcasts(N0);
41904
41905 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41906 isNullConstant(V.getOperand(2))) {
41907 SDValue In = V.getOperand(1);
41908 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
41909 In.getValueSizeInBits() /
41910 VT.getScalarSizeInBits());
41911 In = DAG.getBitcast(SubVT, In);
41912 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41913 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41914 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41915 V.getOperand(2));
41916 }
41917 }
41918
41919 return SDValue();
41920 }
41921 case X86ISD::BLENDI: {
41922 SDValue N0 = N.getOperand(0);
41923 SDValue N1 = N.getOperand(1);
41924
41925 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41926 // TODO: Handle MVT::v16i16 repeated blend mask.
41927 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41928 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41929 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41930 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41931 SrcVT.getScalarSizeInBits() >= 32) {
41932 unsigned BlendMask = N.getConstantOperandVal(2);
41933 unsigned Size = VT.getVectorNumElements();
41934 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
41935 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
41936 return DAG.getBitcast(
41937 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41938 N1.getOperand(0),
41939 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
41940 }
41941 }
41942 return SDValue();
41943 }
41944 case X86ISD::SHUFP: {
41945 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41946 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41947 // TODO: Support types other than v4f32.
41948 if (VT == MVT::v4f32) {
41949 bool Updated = false;
41950 SmallVector<int> Mask;
41951 SmallVector<SDValue> Ops;
41952 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
41953 Ops.size() == 2) {
41954 for (int i = 0; i != 2; ++i) {
41955 SmallVector<SDValue> SubOps;
41956 SmallVector<int> SubMask, SubScaledMask;
41957 SDValue Sub = peekThroughBitcasts(Ops[i]);
41958 // TODO: Scaling might be easier if we specify the demanded elts.
41959 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41960 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41961 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41962 int Ofs = i * 2;
41963 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41964 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41965 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41966 Updated = true;
41967 }
41968 }
41969 }
41970 if (Updated) {
41971 for (int &M : Mask)
41972 M %= 4;
41973 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41974 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
41975 }
41976 }
41977 return SDValue();
41978 }
41979 case X86ISD::VPERMI: {
41980 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
41981 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
41982 SDValue N0 = N.getOperand(0);
41983 SDValue N1 = N.getOperand(1);
41984 unsigned EltSizeInBits = VT.getScalarSizeInBits();
41985 if (N0.getOpcode() == ISD::BITCAST &&
41986 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
41987 SDValue Src = N0.getOperand(0);
41988 EVT SrcVT = Src.getValueType();
41989 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
41990 return DAG.getBitcast(VT, Res);
41991 }
41992 return SDValue();
41993 }
41994 case X86ISD::VPERM2X128: {
41995 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
41996 SDValue LHS = N->getOperand(0);
41997 SDValue RHS = N->getOperand(1);
41998 if (LHS.getOpcode() == ISD::BITCAST &&
41999 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42000 EVT SrcVT = LHS.getOperand(0).getValueType();
42001 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42002 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42003 DAG.getBitcast(SrcVT, LHS),
42004 DAG.getBitcast(SrcVT, RHS),
42005 N->getOperand(2)));
42006 }
42007 }
42008
42009 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42010 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42011 return Res;
42012
42013 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42014 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42015 auto FindSubVector128 = [&](unsigned Idx) {
42016 if (Idx > 3)
42017 return SDValue();
42018 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42019 SmallVector<SDValue> SubOps;
42020 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42021 return SubOps[Idx & 1];
42022 unsigned NumElts = Src.getValueType().getVectorNumElements();
42023 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42024 Src.getOperand(1).getValueSizeInBits() == 128 &&
42025 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42026 return Src.getOperand(1);
42027 }
42028 return SDValue();
42029 };
42030 unsigned Imm = N.getConstantOperandVal(2);
42031 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42032 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42033 MVT SubVT = VT.getHalfNumVectorElementsVT();
42034 SubLo = DAG.getBitcast(SubVT, SubLo);
42035 SubHi = DAG.getBitcast(SubVT, SubHi);
42036 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42037 }
42038 }
42039 return SDValue();
42040 }
42041 case X86ISD::PSHUFD:
42042 case X86ISD::PSHUFLW:
42043 case X86ISD::PSHUFHW:
42044 Mask = getPSHUFShuffleMask(N);
42045 assert(Mask.size() == 4);
42046 break;
42047 case X86ISD::MOVSD:
42048 case X86ISD::MOVSH:
42049 case X86ISD::MOVSS: {
42050 SDValue N0 = N.getOperand(0);
42051 SDValue N1 = N.getOperand(1);
42052
42053 // Canonicalize scalar FPOps:
42054 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42055 // If commutable, allow OP(N1[0], N0[0]).
42056 unsigned Opcode1 = N1.getOpcode();
42057 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42058 Opcode1 == ISD::FDIV) {
42059 SDValue N10 = N1.getOperand(0);
42060 SDValue N11 = N1.getOperand(1);
42061 if (N10 == N0 ||
42062 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42063 if (N10 != N0)
42064 std::swap(N10, N11);
42065 MVT SVT = VT.getVectorElementType();
42066 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
42067 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42068 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42069 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42070 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42071 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42072 }
42073 }
42074
42075 return SDValue();
42076 }
42077 case X86ISD::INSERTPS: {
42078 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42079 SDValue Op0 = N.getOperand(0);
42080 SDValue Op1 = N.getOperand(1);
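// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0].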
42081 unsigned InsertPSMask = N.getConstantOperandVal(2);
42082 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42083 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42084 unsigned ZeroMask = InsertPSMask & 0xF;
42085
42086 // If we zero out all elements from Op0 then we don't need to reference it.
42087 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42088 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42089 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42090
42091 // If we zero out the element from Op1 then we don't need to reference it.
42092 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42093 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42094 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42095
42096 // Attempt to merge insertps Op1 with an inner target shuffle node.
42097 SmallVector<int, 8> TargetMask1;
42098 SmallVector<SDValue, 2> Ops1;
42099 APInt KnownUndef1, KnownZero1;
42100 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42101 KnownZero1)) {
42102 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42103 // Zero/UNDEF insertion - zero out element and remove dependency.
42104 InsertPSMask |= (1u << DstIdx);
42105 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42106 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42107 }
42108 // Update insertps mask srcidx and reference the source input directly.
42109 int M = TargetMask1[SrcIdx];
42110 assert(0 <= M && M < 8 && "Shuffle index out of range");
42111 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42112 Op1 = Ops1[M < 4 ? 0 : 1];
42113 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42114 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42115 }
42116
42117 // Attempt to merge insertps Op0 with an inner target shuffle node.
42118 SmallVector<int, 8> TargetMask0;
42119 SmallVector<SDValue, 2> Ops0;
42120 APInt KnownUndef0, KnownZero0;
42121 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42122 KnownZero0)) {
42123 bool Updated = false;
42124 bool UseInput00 = false;
42125 bool UseInput01 = false;
42126 for (int i = 0; i != 4; ++i) {
42127 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42128 // No change if element is already zero or the inserted element.
42129 continue;
42130 }
42131
42132 if (KnownUndef0[i] || KnownZero0[i]) {
42133 // If the target mask is undef/zero then we must zero the element.
42134 InsertPSMask |= (1u << i);
42135 Updated = true;
42136 continue;
42137 }
42138
42139 // The input vector element must be inline.
42140 int M = TargetMask0[i];
42141 if (M != i && M != (i + 4))
42142 return SDValue();
42143
42144 // Determine which inputs of the target shuffle we're using.
42145 UseInput00 |= (0 <= M && M < 4);
42146 UseInput01 |= (4 <= M);
42147 }
42148
42149 // If we're not using both inputs of the target shuffle then use the
42150 // referenced input directly.
42151 if (UseInput00 && !UseInput01) {
42152 Updated = true;
42153 Op0 = Ops0[0];
42154 } else if (!UseInput00 && UseInput01) {
42155 Updated = true;
42156 Op0 = Ops0[1];
42157 }
42158
42159 if (Updated)
42160 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42161 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42162 }
42163
42164 // If we're inserting an element from a vbroadcast load, fold the
42165 // load into the X86insertps instruction. We need to convert the scalar
42166 // load to a vector and clear the source lane of the INSERTPS control.
42167 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42168 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42169 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42170 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42171 MemIntr->getBasePtr(),
42172 MemIntr->getMemOperand());
42173 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42174 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42175 Load),
42176 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42177 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42178 return Insert;
42179 }
42180 }
42181
42182 return SDValue();
42183 }
42184 default:
42185 return SDValue();
42186 }
42187
42188 // Nuke no-op shuffles that show up after combining.
42189 if (isNoopShuffleMask(Mask))
42190 return N.getOperand(0);
42191
42192 // Look for simplifications involving one or two shuffle instructions.
42193 SDValue V = N.getOperand(0);
42194 switch (N.getOpcode()) {
42195 default:
42196 break;
42197 case X86ISD::PSHUFLW:
42198 case X86ISD::PSHUFHW:
42199 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42200
42201 // See if this reduces to a PSHUFD which is no more expensive and can
42202 // combine with more operations. Note that it has to at least flip the
42203 // dwords as otherwise it would have been removed as a no-op.
42204 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
42205 int DMask[] = {0, 1, 2, 3};
42206 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42207 DMask[DOffset + 0] = DOffset + 1;
42208 DMask[DOffset + 1] = DOffset + 0;
42209 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
42210 V = DAG.getBitcast(DVT, V);
42211 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42212 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42213 return DAG.getBitcast(VT, V);
42214 }
42215
42216 // Look for shuffle patterns which can be implemented as a single unpack.
42217 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42218 // only works when we have a PSHUFD followed by two half-shuffles.
42219 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42220 (V.getOpcode() == X86ISD::PSHUFLW ||
42221 V.getOpcode() == X86ISD::PSHUFHW) &&
42222 V.getOpcode() != N.getOpcode() &&
42223 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42224 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42225 if (D.getOpcode() == X86ISD::PSHUFD) {
42226 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42227 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42228 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42229 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
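// The two half-word shuffles touch disjoint halves, so their combined effect can
// be expressed as a single 8-element word mask before mapping it through the
// dword shuffle below.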
42230 int WordMask[8];
42231 for (int i = 0; i < 4; ++i) {
42232 WordMask[i + NOffset] = Mask[i] + NOffset;
42233 WordMask[i + VOffset] = VMask[i] + VOffset;
42234 }
42235 // Map the word mask through the DWord mask.
42236 int MappedMask[8];
42237 for (int i = 0; i < 8; ++i)
42238 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42239 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42240 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42241 // We can replace all three shuffles with an unpack.
42242 V = DAG.getBitcast(VT, D.getOperand(0));
42243 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42244 : X86ISD::UNPCKH,
42245 DL, VT, V, V);
42246 }
42247 }
42248 }
42249
42250 break;
42251
42252 case X86ISD::PSHUFD:
42253 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
42254 return NewN;
42255
42256 break;
42257 }
42258
42259 return SDValue();
42260}
42261
42262/// Checks if the shuffle mask takes subsequent elements
42263/// alternately from two vectors.
42264/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42265static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42266
42267 int ParitySrc[2] = {-1, -1};
42268 unsigned Size = Mask.size();
42269 for (unsigned i = 0; i != Size; ++i) {
42270 int M = Mask[i];
42271 if (M < 0)
42272 continue;
42273
42274 // Make sure we are using the matching element from the input.
42275 if ((M % Size) != i)
42276 return false;
42277
42278 // Make sure we use the same input for all elements of the same parity.
42279 int Src = M / Size;
42280 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42281 return false;
42282 ParitySrc[i % 2] = Src;
42283 }
42284
42285 // Make sure each input is used.
42286 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42287 return false;
42288
42289 Op0Even = ParitySrc[0] == 0;
42290 return true;
42291}
42292
42293 /// Returns true iff the shuffle node \p N can be replaced with an
42294 /// ADDSUB(SUBADD) operation. If true is returned then the operands of the
42295 /// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
42296 ///
42297 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
42298 /// shuffle nodes so it is easier to generically match. We also insert dummy
42299 /// vector shuffle nodes for the operands which explicitly discard the lanes
42300 /// which are unused by this operation, to try to flow the fact that they're
42301 /// unused through the rest of the combiner.
42302static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42303 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42304 bool &IsSubAdd) {
42305
42306 EVT VT = N->getValueType(0);
42307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42308 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42309 !VT.getSimpleVT().isFloatingPoint())
42310 return false;
42311
42312 // We only handle target-independent shuffles.
42313 // FIXME: It would be easy and harmless to use the target shuffle mask
42314 // extraction tool to support more.
42315 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42316 return false;
42317
42318 SDValue V1 = N->getOperand(0);
42319 SDValue V2 = N->getOperand(1);
42320
42321 // Make sure we have an FADD and an FSUB.
42322 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42323 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42324 V1.getOpcode() == V2.getOpcode())
42325 return false;
42326
42327 // If there are other uses of these operations we can't fold them.
42328 if (!V1->hasOneUse() || !V2->hasOneUse())
42329 return false;
42330
42331 // Ensure that both operations have the same operands. Note that we can
42332 // commute the FADD operands.
42333 SDValue LHS, RHS;
42334 if (V1.getOpcode() == ISD::FSUB) {
42335 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42336 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42337 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42338 return false;
42339 } else {
42340 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42341 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42342 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42343 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42344 return false;
42345 }
42346
42347 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42348 bool Op0Even;
42349 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42350 return false;
42351
42352 // It's a subadd if the vector in the even parity is an FADD.
42353 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42354 : V2->getOpcode() == ISD::FADD;
42355
42356 Opnd0 = LHS;
42357 Opnd1 = RHS;
42358 return true;
42359}
42360
42361/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42362static SDValue combineShuffleToFMAddSub(SDNode *N,
42363 const X86Subtarget &Subtarget,
42364 SelectionDAG &DAG) {
42365 // We only handle target-independent shuffles.
42366 // FIXME: It would be easy and harmless to use the target shuffle mask
42367 // extraction tool to support more.
42368 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42369 return SDValue();
42370
42371 MVT VT = N->getSimpleValueType(0);
42372 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42373 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42374 return SDValue();
42375
42376 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42377 SDValue Op0 = N->getOperand(0);
42378 SDValue Op1 = N->getOperand(1);
42379 SDValue FMAdd = Op0, FMSub = Op1;
42380 if (FMSub.getOpcode() != X86ISD::FMSUB)
42381 std::swap(FMAdd, FMSub);
42382
42383 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42384 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42385 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42386 FMAdd.getOperand(2) != FMSub.getOperand(2))
42387 return SDValue();
42388
42389 // Check for correct shuffle mask.
42390 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42391 bool Op0Even;
42392 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42393 return SDValue();
42394
42395 // FMAddSub takes zeroth operand from FMSub node.
42396 SDLoc DL(N);
42397 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42398 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42399 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42400 FMAdd.getOperand(2));
42401}
42402
42403/// Try to combine a shuffle into a target-specific add-sub or
42404/// mul-add-sub node.
42405static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
42406 const X86Subtarget &Subtarget,
42407 SelectionDAG &DAG) {
42408 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
42409 return V;
42410
42411 SDValue Opnd0, Opnd1;
42412 bool IsSubAdd;
42413 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42414 return SDValue();
42415
42416 MVT VT = N->getSimpleValueType(0);
42417 SDLoc DL(N);
42418
42419 // Try to generate X86ISD::FMADDSUB node here.
42420 SDValue Opnd2;
42421 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42422 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42423 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42424 }
42425
42426 if (IsSubAdd)
42427 return SDValue();
42428
42429 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42430 // the ADDSUB idiom has been successfully recognized. There are no known
42431 // X86 targets with 512-bit ADDSUB instructions!
42432 if (VT.is512BitVector())
42433 return SDValue();
42434
42435 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42436 // the ADDSUB idiom has been successfully recognized. There are no known
42437 // X86 targets with FP16 ADDSUB instructions!
42438 if (VT.getVectorElementType() == MVT::f16)
42439 return SDValue();
42440
42441 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42442}
42443
42444// We are looking for a shuffle where both sources are concatenated with undef
42445// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42446// if we can express this as a single-source shuffle, that's preferable.
42447static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
42448 const X86Subtarget &Subtarget) {
42449 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42450 return SDValue();
42451
42452 EVT VT = N->getValueType(0);
42453
42454 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42455 if (!VT.is128BitVector() && !VT.is256BitVector())
42456 return SDValue();
42457
42458 if (VT.getVectorElementType() != MVT::i32 &&
42459 VT.getVectorElementType() != MVT::i64 &&
42460 VT.getVectorElementType() != MVT::f32 &&
42461 VT.getVectorElementType() != MVT::f64)
42462 return SDValue();
42463
42464 SDValue N0 = N->getOperand(0);
42465 SDValue N1 = N->getOperand(1);
42466
42467 // Check that both sources are concats with undef.
42468 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42469 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42470 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42471 !N1.getOperand(1).isUndef())
42472 return SDValue();
42473
42474 // Construct the new shuffle mask. Elements from the first source retain their
42475 // index, but elements from the second source no longer need to skip an undef.
42476 SmallVector<int, 8> Mask;
42477 int NumElts = VT.getVectorNumElements();
42478
42479 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
42480 for (int Elt : SVOp->getMask())
42481 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42482
42483 SDLoc DL(N);
42484 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42485 N1.getOperand(0));
42486 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42487}
42488
42489/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42490/// low half of each source vector and does not set any high half elements in
42491/// the destination vector, narrow the shuffle to half its original size.
42492static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42493 if (!Shuf->getValueType(0).isSimple())
42494 return SDValue();
42495 MVT VT = Shuf->getSimpleValueType(0);
42496 if (!VT.is256BitVector() && !VT.is512BitVector())
42497 return SDValue();
42498
42499 // See if we can ignore all of the high elements of the shuffle.
42500 ArrayRef<int> Mask = Shuf->getMask();
42501 if (!isUndefUpperHalf(Mask))
42502 return SDValue();
42503
42504 // Check if the shuffle mask accesses only the low half of each input vector
42505 // (half-index output is 0 or 2).
42506 int HalfIdx1, HalfIdx2;
42507 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42508 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42509 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42510 return SDValue();
42511
42512 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42513 // The trick is knowing that all of the insert/extract are actually free
42514 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42515 // of narrow inputs into a narrow output, and that is always cheaper than
42516 // the wide shuffle that we started with.
42517 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42518 Shuf->getOperand(1), HalfMask, HalfIdx1,
42519 HalfIdx2, false, DAG, /*UseConcat*/true);
42520}
42521
42522static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42523 TargetLowering::DAGCombinerInfo &DCI,
42524 const X86Subtarget &Subtarget) {
42525 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42526 if (SDValue V = narrowShuffle(Shuf, DAG))
42527 return V;
42528
42529 // If we have legalized the vector types, look for blends of FADD and FSUB
42530 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42531 SDLoc dl(N);
42532 EVT VT = N->getValueType(0);
42533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42534 if (TLI.isTypeLegal(VT))
42535 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42536 return AddSub;
42537
42538 // Attempt to combine into a vector load/broadcast.
42539 if (SDValue LD = combineToConsecutiveLoads(
42540 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42541 return LD;
42542
42543 // For AVX2, we sometimes want to combine
42544 // (vector_shuffle <mask> (concat_vectors t1, undef)
42545 // (concat_vectors t2, undef))
42546 // Into:
42547 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42548 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42549 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42550 return ShufConcat;
42551
42552 if (isTargetShuffle(N->getOpcode())) {
42553 SDValue Op(N, 0);
42554 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42555 return Shuffle;
42556
42557 // Try recursively combining arbitrary sequences of x86 shuffle
42558 // instructions into higher-order shuffles. We do this after combining
42559 // specific PSHUF instruction sequences into their minimal form so that we
42560 // can evaluate how many specialized shuffle instructions are involved in
42561 // a particular chain.
42562 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42563 return Res;
42564
42565 // Simplify source operands based on shuffle mask.
42566 // TODO - merge this into combineX86ShufflesRecursively.
42567 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42568 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42569 return SDValue(N, 0);
42570
42571 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42572 // Perform this after other shuffle combines to allow inner shuffles to be
42573 // combined away first.
42574 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
42575 return BinOp;
42576 }
42577
42578 return SDValue();
42579}
42580
42581// Simplify variable target shuffle masks based on the demanded elements.
42582// TODO: Handle DemandedBits in mask indices as well?
42583bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42584 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42585 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42586 // If we're demanding all elements, don't bother trying to simplify the mask.
42587 unsigned NumElts = DemandedElts.getBitWidth();
42588 if (DemandedElts.isAllOnes())
42589 return false;
42590
42591 SDValue Mask = Op.getOperand(MaskIndex);
42592 if (!Mask.hasOneUse())
42593 return false;
42594
42595 // Attempt to generically simplify the variable shuffle mask.
42596 APInt MaskUndef, MaskZero;
42597 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42598 Depth + 1))
42599 return true;
42600
42601 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42602 // TODO: Support other types from getTargetShuffleMaskIndices?
42603 SDValue BC = peekThroughOneUseBitcasts(Mask);
42604 EVT BCVT = BC.getValueType();
42605 auto *Load = dyn_cast<LoadSDNode>(BC);
42606 if (!Load)
42607 return false;
42608
42609 const Constant *C = getTargetConstantFromNode(Load);
42610 if (!C)
42611 return false;
42612
42613 Type *CTy = C->getType();
42614 if (!CTy->isVectorTy() ||
42615 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42616 return false;
42617
42618 // Handle scaling for i64 elements on 32-bit targets.
42619 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42620 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42621 return false;
42622 unsigned Scale = NumCstElts / NumElts;
42623
42624 // Simplify mask if we have an undemanded element that is not undef.
42625 bool Simplified = false;
42626 SmallVector<Constant *, 32> ConstVecOps;
42627 for (unsigned i = 0; i != NumCstElts; ++i) {
42628 Constant *Elt = C->getAggregateElement(i);
42629 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42630 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42631 Simplified = true;
42632 continue;
42633 }
42634 ConstVecOps.push_back(Elt);
42635 }
42636 if (!Simplified)
42637 return false;
42638
42639 // Generate new constant pool entry + legalize immediately for the load.
42640 SDLoc DL(Op);
42641 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42642 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42643 SDValue NewMask = TLO.DAG.getLoad(
42644 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42645 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42646 Load->getAlign());
42647 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42648}
42649
42650bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42651 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42652 TargetLoweringOpt &TLO, unsigned Depth) const {
42653 int NumElts = DemandedElts.getBitWidth();
42654 unsigned Opc = Op.getOpcode();
42655 EVT VT = Op.getValueType();
42656
42657 // Handle special case opcodes.
42658 switch (Opc) {
42659 case X86ISD::PMULDQ:
42660 case X86ISD::PMULUDQ: {
42661 APInt LHSUndef, LHSZero;
42662 APInt RHSUndef, RHSZero;
42663 SDValue LHS = Op.getOperand(0);
42664 SDValue RHS = Op.getOperand(1);
42665 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42666 Depth + 1))
42667 return true;
42668 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42669 Depth + 1))
42670 return true;
42671 // Multiply by zero.
42672 KnownZero = LHSZero | RHSZero;
42673 break;
42674 }
42675 case X86ISD::VPMADDWD: {
42676 APInt LHSUndef, LHSZero;
42677 APInt RHSUndef, RHSZero;
42678 SDValue LHS = Op.getOperand(0);
42679 SDValue RHS = Op.getOperand(1);
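// Each i32 result element of VPMADDWD sums the products of two adjacent i16
// source elements, so every demanded result element demands a pair of source
// elements.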
42680 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
42681
42682 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
42683 Depth + 1))
42684 return true;
42685 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
42686 Depth + 1))
42687 return true;
42688
42689 // TODO: Multiply by zero.
42690
42691 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
42692 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
42693 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
42694 Depth + 1))
42695 return true;
42696 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
42697 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
42698 Depth + 1))
42699 return true;
42700 break;
42701 }
42702 case X86ISD::PSADBW: {
42703 SDValue LHS = Op.getOperand(0);
42704 SDValue RHS = Op.getOperand(1);
42705 assert(VT.getScalarType() == MVT::i64 &&
42706 LHS.getValueType() == RHS.getValueType() &&
42707 LHS.getValueType().getScalarType() == MVT::i8 &&
42708 "Unexpected PSADBW types");
42709
42710 // Aggressively peek through ops to get at the demanded elts.
42711 if (!DemandedElts.isAllOnes()) {
42712 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
42713 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
42714 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
42715 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42716 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
42717 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42718 if (NewLHS || NewRHS) {
42719 NewLHS = NewLHS ? NewLHS : LHS;
42720 NewRHS = NewRHS ? NewRHS : RHS;
42721 return TLO.CombineTo(
42722 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42723 }
42724 }
42725 break;
42726 }
42727 case X86ISD::VSHL:
42728 case X86ISD::VSRL:
42729 case X86ISD::VSRA: {
42730 // We only need the bottom 64-bits of the (128-bit) shift amount.
42731 SDValue Amt = Op.getOperand(1);
42732 MVT AmtVT = Amt.getSimpleValueType();
42733 assert(AmtVT.is128BitVector() && "Unexpected value type");
42734
42735 // If the shift amount is reused only as an SSE shift amount then we know
42736 // that only the bottom 64-bits are ever used.
42737 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
42738 unsigned UseOpc = Use->getOpcode();
42739 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
42740 UseOpc == X86ISD::VSRA) &&
42741 Use->getOperand(0) != Amt;
42742 });
42743
42744 APInt AmtUndef, AmtZero;
42745 unsigned NumAmtElts = AmtVT.getVectorNumElements();
42746 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
42747 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
42748 Depth + 1, AssumeSingleUse))
42749 return true;
42750 [[fallthrough]];
42751 }
42752 case X86ISD::VSHLI:
42753 case X86ISD::VSRLI:
42754 case X86ISD::VSRAI: {
42755 SDValue Src = Op.getOperand(0);
42756 APInt SrcUndef;
42757 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
42758 Depth + 1))
42759 return true;
42760
42761 // Fold shift(0,x) -> 0
42762 if (DemandedElts.isSubsetOf(KnownZero))
42763 return TLO.CombineTo(
42764 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42765
42766 // Aggressively peek through ops to get at the demanded elts.
42767 if (!DemandedElts.isAllOnes())
42768 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42769 Src, DemandedElts, TLO.DAG, Depth + 1))
42770 return TLO.CombineTo(
42771 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
42772 break;
42773 }
42774 case X86ISD::VPSHA:
42775 case X86ISD::VPSHL:
42776 case X86ISD::VSHLV:
42777 case X86ISD::VSRLV:
42778 case X86ISD::VSRAV: {
42779 APInt LHSUndef, LHSZero;
42780 APInt RHSUndef, RHSZero;
42781 SDValue LHS = Op.getOperand(0);
42782 SDValue RHS = Op.getOperand(1);
42783 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42784 Depth + 1))
42785 return true;
42786
42787 // Fold shift(0,x) -> 0
42788 if (DemandedElts.isSubsetOf(LHSZero))
42789 return TLO.CombineTo(
42790 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42791
42792 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42793 Depth + 1))
42794 return true;
42795
42796 KnownZero = LHSZero;
42797 break;
42798 }
42799 case X86ISD::KSHIFTL: {
42800 SDValue Src = Op.getOperand(0);
42801 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42802 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42803 unsigned ShiftAmt = Amt->getZExtValue();
42804
42805 if (ShiftAmt == 0)
42806 return TLO.CombineTo(Op, Src);
42807
42808 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42809 // single shift. We can do this if the bottom bits (which are shifted
42810 // out) are never demanded.
42811 if (Src.getOpcode() == X86ISD::KSHIFTR) {
42812 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
42813 unsigned C1 = Src.getConstantOperandVal(1);
42814 unsigned NewOpc = X86ISD::KSHIFTL;
42815 int Diff = ShiftAmt - C1;
42816 if (Diff < 0) {
42817 Diff = -Diff;
42818 NewOpc = X86ISD::KSHIFTR;
42819 }
42820
42821 SDLoc dl(Op);
42822 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42823 return TLO.CombineTo(
42824 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42825 }
42826 }
42827
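// KSHIFTL moves mask element i to element i + ShiftAmt, so the demanded source
// elements are the demanded result elements shifted back down.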
42828 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
42829 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42830 Depth + 1))
42831 return true;
42832
42833 KnownUndef <<= ShiftAmt;
42834 KnownZero <<= ShiftAmt;
42835 KnownZero.setLowBits(ShiftAmt);
42836 break;
42837 }
42838 case X86ISD::KSHIFTR: {
42839 SDValue Src = Op.getOperand(0);
42840 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42841 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42842 unsigned ShiftAmt = Amt->getZExtValue();
42843
42844 if (ShiftAmt == 0)
42845 return TLO.CombineTo(Op, Src);
42846
42847 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42848 // single shift. We can do this if the top bits (which are shifted
42849 // out) are never demanded.
42850 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42851 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42852 unsigned C1 = Src.getConstantOperandVal(1);
42853 unsigned NewOpc = X86ISD::KSHIFTR;
42854 int Diff = ShiftAmt - C1;
42855 if (Diff < 0) {
42856 Diff = -Diff;
42857 NewOpc = X86ISD::KSHIFTL;
42858 }
42859
42860 SDLoc dl(Op);
42861 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42862 return TLO.CombineTo(
42863 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42864 }
42865 }
42866
42867 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42868 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42869 Depth + 1))
42870 return true;
42871
42872 KnownUndef.lshrInPlace(ShiftAmt);
42873 KnownZero.lshrInPlace(ShiftAmt);
42874 KnownZero.setHighBits(ShiftAmt);
42875 break;
42876 }
42877 case X86ISD::ANDNP: {
42878 // ANDNP = (~LHS & RHS);
42879 SDValue LHS = Op.getOperand(0);
42880 SDValue RHS = Op.getOperand(1);
42881
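// If one operand is a constant we can shrink what we demand from the other: the
// LHS only matters where the RHS constant has nonzero bits, and the RHS only
// matters where the inverted LHS constant has nonzero bits.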
42882 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42883 APInt UndefElts;
42884 SmallVector<APInt> EltBits;
42885 int NumElts = VT.getVectorNumElements();
42886 int EltSizeInBits = VT.getScalarSizeInBits();
42887 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42888 APInt OpElts = DemandedElts;
42889 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42890 EltBits)) {
42891 OpBits.clearAllBits();
42892 OpElts.clearAllBits();
42893 for (int I = 0; I != NumElts; ++I) {
42894 if (!DemandedElts[I])
42895 continue;
42896 if (UndefElts[I]) {
42897 // We can't assume an undef src element gives an undef dst - the
42898 // other src might be zero.
42899 OpBits.setAllBits();
42900 OpElts.setBit(I);
42901 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42902 (!Invert && !EltBits[I].isZero())) {
42903 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42904 OpElts.setBit(I);
42905 }
42906 }
42907 }
42908 return std::make_pair(OpBits, OpElts);
42909 };
42910 APInt BitsLHS, EltsLHS;
42911 APInt BitsRHS, EltsRHS;
42912 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42913 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42914
42915 APInt LHSUndef, LHSZero;
42916 APInt RHSUndef, RHSZero;
42917 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42918 Depth + 1))
42919 return true;
42920 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42921 Depth + 1))
42922 return true;
42923
42924 if (!DemandedElts.isAllOnes()) {
42925 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42926 TLO.DAG, Depth + 1);
42927 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42928 TLO.DAG, Depth + 1);
42929 if (NewLHS || NewRHS) {
42930 NewLHS = NewLHS ? NewLHS : LHS;
42931 NewRHS = NewRHS ? NewRHS : RHS;
42932 return TLO.CombineTo(
42933 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42934 }
42935 }
42936 break;
42937 }
42938 case X86ISD::CVTSI2P:
42939 case X86ISD::CVTUI2P: {
42940 SDValue Src = Op.getOperand(0);
42941 MVT SrcVT = Src.getSimpleValueType();
42942 APInt SrcUndef, SrcZero;
42943 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42944 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42945 Depth + 1))
42946 return true;
42947 break;
42948 }
42949 case X86ISD::PACKSS:
42950 case X86ISD::PACKUS: {
42951 SDValue N0 = Op.getOperand(0);
42952 SDValue N1 = Op.getOperand(1);
42953
42954 APInt DemandedLHS, DemandedRHS;
42955 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
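// PACKSS/PACKUS interleave their operands per 128-bit lane, so split the
// demanded result elements into the portions sourced from each operand.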
42956
42957 APInt LHSUndef, LHSZero;
42958 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42959 Depth + 1))
42960 return true;
42961 APInt RHSUndef, RHSZero;
42962 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42963 Depth + 1))
42964 return true;
42965
42966 // TODO - pass on known zero/undef.
42967
42968 // Aggressively peek through ops to get at the demanded elts.
42969 // TODO - we should do this for all target/faux shuffles ops.
42970 if (!DemandedElts.isAllOnes()) {
42971 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42972 TLO.DAG, Depth + 1);
42973 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42974 TLO.DAG, Depth + 1);
42975 if (NewN0 || NewN1) {
42976 NewN0 = NewN0 ? NewN0 : N0;
42977 NewN1 = NewN1 ? NewN1 : N1;
42978 return TLO.CombineTo(Op,
42979 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42980 }
42981 }
42982 break;
42983 }
42984 case X86ISD::HADD:
42985 case X86ISD::HSUB:
42986 case X86ISD::FHADD:
42987 case X86ISD::FHSUB: {
42988 SDValue N0 = Op.getOperand(0);
42989 SDValue N1 = Op.getOperand(1);
42990
42991 APInt DemandedLHS, DemandedRHS;
42992 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
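// Each horizontal-op result element reduces a pair of adjacent source elements;
// per 128-bit lane the low results come from the LHS and the high results from
// the RHS, so split the demand accordingly.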
42993
42994 APInt LHSUndef, LHSZero;
42995 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42996 Depth + 1))
42997 return true;
42998 APInt RHSUndef, RHSZero;
42999 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43000 Depth + 1))
43001 return true;
43002
43003 // TODO - pass on known zero/undef.
43004
43005 // Aggressively peek through ops to get at the demanded elts.
43006 // TODO: Handle repeated operands.
43007 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43008 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43009 TLO.DAG, Depth + 1);
43010 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43011 TLO.DAG, Depth + 1);
43012 if (NewN0 || NewN1) {
43013 NewN0 = NewN0 ? NewN0 : N0;
43014 NewN1 = NewN1 ? NewN1 : N1;
43015 return TLO.CombineTo(Op,
43016 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43017 }
43018 }
43019 break;
43020 }
43021 case X86ISD::VTRUNC:
43022 case X86ISD::VTRUNCS:
43023 case X86ISD::VTRUNCUS: {
43024 SDValue Src = Op.getOperand(0);
43025 MVT SrcVT = Src.getSimpleValueType();
43026 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43027 APInt SrcUndef, SrcZero;
43028 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43029 Depth + 1))
43030 return true;
43031 KnownZero = SrcZero.zextOrTrunc(NumElts);
43032 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43033 break;
43034 }
43035 case X86ISD::BLENDV: {
43036 APInt SelUndef, SelZero;
43037 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43038 SelZero, TLO, Depth + 1))
43039 return true;
43040
43041 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43042 APInt LHSUndef, LHSZero;
43043 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43044 LHSZero, TLO, Depth + 1))
43045 return true;
43046
43047 APInt RHSUndef, RHSZero;
43048 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43049 RHSZero, TLO, Depth + 1))
43050 return true;
43051
43052 KnownZero = LHSZero & RHSZero;
43053 KnownUndef = LHSUndef & RHSUndef;
43054 break;
43055 }
43056 case X86ISD::VZEXT_MOVL: {
43057 // If upper demanded elements are already zero then we have nothing to do.
43058 SDValue Src = Op.getOperand(0);
43059 APInt DemandedUpperElts = DemandedElts;
43060 DemandedUpperElts.clearLowBits(1);
43061 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43062 return TLO.CombineTo(Op, Src);
43063 break;
43064 }
43065 case X86ISD::VBROADCAST: {
43066 SDValue Src = Op.getOperand(0);
43067 MVT SrcVT = Src.getSimpleValueType();
43068 if (!SrcVT.isVector())
43069 break;
43070 // Don't bother broadcasting if we just need the 0'th element.
43071 if (DemandedElts == 1) {
43072 if (Src.getValueType() != VT)
43073 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43074 SDLoc(Op));
43075 return TLO.CombineTo(Op, Src);
43076 }
43077 APInt SrcUndef, SrcZero;
43078 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43079 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43080 Depth + 1))
43081 return true;
43082 // Aggressively peek through src to get at the demanded elt.
43083 // TODO - we should do this for all target/faux shuffles ops.
43084 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43085 Src, SrcElts, TLO.DAG, Depth + 1))
43086 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43087 break;
43088 }
43089 case X86ISD::VPERMV:
43090 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43091 Depth))
43092 return true;
43093 break;
43094 case X86ISD::PSHUFB:
43095 case X86ISD::VPERMV3:
43096 case X86ISD::VPERMILPV:
43097 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43098 Depth))
43099 return true;
43100 break;
43101 case X86ISD::VPPERM:
43102 case X86ISD::VPERMIL2:
43103 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43104 Depth))
43105 return true;
43106 break;
43107 }
43108
43109 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43110 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43111 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43112 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43113 DemandedElts.lshr(NumElts / 2) == 0) {
43114 unsigned SizeInBits = VT.getSizeInBits();
43115 unsigned ExtSizeInBits = SizeInBits / 2;
43116
43117 // See if 512-bit ops only use the bottom 128-bits.
43118 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43119 ExtSizeInBits = SizeInBits / 4;
43120
43121 switch (Opc) {
43122 // Scalar broadcast.
43123 case X86ISD::VBROADCAST: {
43124 SDLoc DL(Op);
43125 SDValue Src = Op.getOperand(0);
43126 if (Src.getValueSizeInBits() > ExtSizeInBits)
43127 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43128 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43129 ExtSizeInBits / VT.getScalarSizeInBits());
43130 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43131 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43132 TLO.DAG, DL, ExtSizeInBits));
43133 }
43134 case X86ISD::VBROADCAST_LOAD: {
43135 SDLoc DL(Op);
43136 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43137 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43138 ExtSizeInBits / VT.getScalarSizeInBits());
43139 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43140 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43141 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43142 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43143 MemIntr->getMemOperand());
43144 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43145 Bcst.getValue(1));
43146 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43147 TLO.DAG, DL, ExtSizeInBits));
43148 }
43149 // Subvector broadcast.
43150 case X86ISD::SUBV_BROADCAST_LOAD: {
43151 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43152 EVT MemVT = MemIntr->getMemoryVT();
43153 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43154 SDLoc DL(Op);
43155 SDValue Ld =
43156 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43157 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43158 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43159 Ld.getValue(1));
43160 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43161 TLO.DAG, DL, ExtSizeInBits));
43162 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43163 SDLoc DL(Op);
43164 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43165 ExtSizeInBits / VT.getScalarSizeInBits());
43166 if (SDValue BcstLd =
43167 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43168 return TLO.CombineTo(Op,
43169 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43170 TLO.DAG, DL, ExtSizeInBits));
43171 }
43172 break;
43173 }
43174 // Byte shifts by immediate.
43175 case X86ISD::VSHLDQ:
43176 case X86ISD::VSRLDQ:
43177 // Shift by uniform.
43178 case X86ISD::VSHL:
43179 case X86ISD::VSRL:
43180 case X86ISD::VSRA:
43181 // Shift by immediate.
43182 case X86ISD::VSHLI:
43183 case X86ISD::VSRLI:
43184 case X86ISD::VSRAI: {
43185 SDLoc DL(Op);
43186 SDValue Ext0 =
43187 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43188 SDValue ExtOp =
43189 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43190 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43191 SDValue Insert =
43192 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43193 return TLO.CombineTo(Op, Insert);
43194 }
43195 case X86ISD::VPERMI: {
43196 // Simplify PERMPD/PERMQ to extract_subvector.
43197 // TODO: This should be done in shuffle combining.
43198 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43199 SmallVector<int, 4> Mask;
43200 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43201 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43202 SDLoc DL(Op);
43203 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43204 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43205 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43206 return TLO.CombineTo(Op, Insert);
43207 }
43208 }
43209 break;
43210 }
43211 case X86ISD::VPERM2X128: {
43212 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43213 SDLoc DL(Op);
43214 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
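// Within the low nibble of the immediate, bit 3 zeroes the lane, bit 1 selects
// the source operand and bit 0 selects its low or high 128-bit half.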
43215 if (LoMask & 0x8)
43216 return TLO.CombineTo(
43217 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43218 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43219 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43220 SDValue ExtOp =
43221 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43222 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43223 SDValue Insert =
43224 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43225 return TLO.CombineTo(Op, Insert);
43226 }
43227 // Zero upper elements.
43228 case X86ISD::VZEXT_MOVL:
43229 // Target unary shuffles by immediate:
43230 case X86ISD::PSHUFD:
43231 case X86ISD::PSHUFLW:
43232 case X86ISD::PSHUFHW:
43233 case X86ISD::VPERMILPI:
43234 // (Non-Lane Crossing) Target Shuffles.
43235 case X86ISD::VPERMILPV:
43236 case X86ISD::VPERMIL2:
43237 case X86ISD::PSHUFB:
43238 case X86ISD::UNPCKL:
43239 case X86ISD::UNPCKH:
43240 case X86ISD::BLENDI:
43241 // Integer ops.
43242 case X86ISD::PACKSS:
43243 case X86ISD::PACKUS:
43244 // Horizontal Ops.
43245 case X86ISD::HADD:
43246 case X86ISD::HSUB:
43247 case X86ISD::FHADD:
43248 case X86ISD::FHSUB: {
43249 SDLoc DL(Op);
43250 SmallVector<SDValue, 4> Ops;
43251 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43252 SDValue SrcOp = Op.getOperand(i);
43253 EVT SrcVT = SrcOp.getValueType();
43254 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43255 "Unsupported vector size");
43256 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43257 ExtSizeInBits)
43258 : SrcOp);
43259 }
43260 MVT ExtVT = VT.getSimpleVT();
43261 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43262 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43263 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43264 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43265 SDValue Insert =
43266 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43267 return TLO.CombineTo(Op, Insert);
43268 }
43269 }
43270 }
43271
43272 // For splats, unless we *only* demand the 0'th element,
43273 // stop attempts at simplification here; we aren't going to improve things,
43274 // and this is better than any potential shuffle.
43275 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43276 return false;
43277
43278 // Get target/faux shuffle mask.
43279 APInt OpUndef, OpZero;
43280 SmallVector<int, 64> OpMask;
43281 SmallVector<SDValue, 2> OpInputs;
43282 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43283 OpZero, TLO.DAG, Depth, false))
43284 return false;
43285
43286 // Shuffle inputs must be the same size as the result.
43287 if (OpMask.size() != (unsigned)NumElts ||
43288 llvm::any_of(OpInputs, [VT](SDValue V) {
43289 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43290 !V.getValueType().isVector();
43291 }))
43292 return false;
43293
43294 KnownZero = OpZero;
43295 KnownUndef = OpUndef;
43296
43297 // Check if shuffle mask can be simplified to undef/zero/identity.
43298 int NumSrcs = OpInputs.size();
43299 for (int i = 0; i != NumElts; ++i)
43300 if (!DemandedElts[i])
43301 OpMask[i] = SM_SentinelUndef;
43302
43303 if (isUndefInRange(OpMask, 0, NumElts)) {
43304 KnownUndef.setAllBits();
43305 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43306 }
43307 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43308 KnownZero.setAllBits();
43309 return TLO.CombineTo(
43310 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43311 }
43312 for (int Src = 0; Src != NumSrcs; ++Src)
43313 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43314 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43315
43316 // Attempt to simplify inputs.
43317 for (int Src = 0; Src != NumSrcs; ++Src) {
43318 // TODO: Support inputs of different types.
43319 if (OpInputs[Src].getValueType() != VT)
43320 continue;
43321
43322 int Lo = Src * NumElts;
43323 APInt SrcElts = APInt::getZero(NumElts);
43324 for (int i = 0; i != NumElts; ++i)
43325 if (DemandedElts[i]) {
43326 int M = OpMask[i] - Lo;
43327 if (0 <= M && M < NumElts)
43328 SrcElts.setBit(M);
43329 }
43330
43331 // TODO - Propagate input undef/zero elts.
43332 APInt SrcUndef, SrcZero;
43333 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43334 TLO, Depth + 1))
43335 return true;
43336 }
43337
43338 // If we don't demand all elements, then attempt to combine to a simpler
43339 // shuffle.
43340 // We need to convert the depth to something combineX86ShufflesRecursively
43341 // can handle - so pretend the Depth is 0 again, and reduce the max depth
43342 // to match. This prevents combineX86ShuffleChain from returning a
43343 // combined shuffle that's the same as the original root, causing an
43344 // infinite loop.
43345 if (!DemandedElts.isAllOnes()) {
43346 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43347
43348 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43349 for (int i = 0; i != NumElts; ++i)
43350 if (DemandedElts[i])
43351 DemandedMask[i] = i;
43352
43353 SDValue NewShuffle = combineX86ShufflesRecursively(
43354 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43355 /*HasVarMask*/ false,
43356 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43357 Subtarget);
43358 if (NewShuffle)
43359 return TLO.CombineTo(Op, NewShuffle);
43360 }
43361
43362 return false;
43363}
43364
43365bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43366 SDValue Op, const APInt &OriginalDemandedBits,
43367 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43368 unsigned Depth) const {
43369 EVT VT = Op.getValueType();
43370 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43371 unsigned Opc = Op.getOpcode();
43372 switch(Opc) {
43373 case X86ISD::VTRUNC: {
43374 KnownBits KnownOp;
43375 SDValue Src = Op.getOperand(0);
43376 MVT SrcVT = Src.getSimpleValueType();
43377
43378 // Simplify the input, using demanded bit information.
43379 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43380 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43381 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43382 return true;
43383 break;
43384 }
43385 case X86ISD::PMULDQ:
43386 case X86ISD::PMULUDQ: {
43387 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43388 KnownBits KnownLHS, KnownRHS;
43389 SDValue LHS = Op.getOperand(0);
43390 SDValue RHS = Op.getOperand(1);
43391
43392 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43393 // FIXME: Can we bound this better?
43394 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43395 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43396 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43397
43398 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43399 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43400 DemandedMaskLHS = DemandedMask;
43401 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43402 DemandedMaskRHS = DemandedMask;
43403
43404 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43405 KnownLHS, TLO, Depth + 1))
43406 return true;
43407 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43408 KnownRHS, TLO, Depth + 1))
43409 return true;
43410
43411 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43412 KnownRHS = KnownRHS.trunc(32);
43413 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43414 KnownRHS.getConstant().isOne()) {
43415 SDLoc DL(Op);
43416 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43417 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43418 }
43419
43420 // Aggressively peek through ops to get at the demanded low bits.
43421 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43422 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43423 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43424 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43425 if (DemandedLHS || DemandedRHS) {
43426 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43427 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43428 return TLO.CombineTo(
43429 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43430 }
43431 break;
43432 }
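  // Illustrative sketch of why only the low halves matter above (hypothetical
  // scalar model, not from the source): per 64-bit lane,
  //   PMULUDQ: res = (uint64_t)(uint32_t)lhs * (uint64_t)(uint32_t)rhs;
  //   PMULDQ:  res = (int64_t)(int32_t)lhs  * (int64_t)(int32_t)rhs;
  // so bits [63:32] of each input lane never influence the result, which is
  // why DemandedMask keeps only the low 32 bits.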
43433 case X86ISD::VSHLI: {
43434 SDValue Op0 = Op.getOperand(0);
43435
43436 unsigned ShAmt = Op.getConstantOperandVal(1);
43437 if (ShAmt >= BitWidth)
43438 break;
43439
43440 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43441
43442 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43443 // single shift. We can do this if the bottom bits (which are shifted
43444 // out) are never demanded.
43445 if (Op0.getOpcode() == X86ISD::VSRLI &&
43446 OriginalDemandedBits.countr_zero() >= ShAmt) {
43447 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43448 if (Shift2Amt < BitWidth) {
43449 int Diff = ShAmt - Shift2Amt;
43450 if (Diff == 0)
43451 return TLO.CombineTo(Op, Op0.getOperand(0));
43452
43453 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43454 SDValue NewShift = TLO.DAG.getNode(
43455 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43456 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43457 return TLO.CombineTo(Op, NewShift);
43458 }
43459 }
43460
43461 // If we are only demanding sign bits then we can use the shift source directly.
43462 unsigned NumSignBits =
43463 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43464 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43465 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43466 return TLO.CombineTo(Op, Op0);
43467
43468 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43469 TLO, Depth + 1))
43470 return true;
43471
43472    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43473 Known.Zero <<= ShAmt;
43474 Known.One <<= ShAmt;
43475
43476 // Low bits known zero.
43477 Known.Zero.setLowBits(ShAmt);
43478 return false;
43479 }
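  // Illustrative example of the shift-merge fold above (values chosen for
  // illustration only): with OriginalDemandedBits = 0xFFFFFF00,
  //   ((X >>u 2) << 6)  -->  (X << 4)   // Diff = 6 - 2 = 4 > 0, use VSHLI
  //   ((X >>u 6) << 2)  -->  (X >>u 4)  // Diff = 2 - 6 = -4 < 0, use VSRLI
  // which is valid because the low ShAmt result bits are not demanded.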
43480 case X86ISD::VSRLI: {
43481 unsigned ShAmt = Op.getConstantOperandVal(1);
43482 if (ShAmt >= BitWidth)
43483 break;
43484
43485 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43486
43487 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
43488 OriginalDemandedElts, Known, TLO, Depth + 1))
43489 return true;
43490
43491    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43492 Known.Zero.lshrInPlace(ShAmt);
43493 Known.One.lshrInPlace(ShAmt);
43494
43495 // High bits known zero.
43496 Known.Zero.setHighBits(ShAmt);
43497 return false;
43498 }
43499 case X86ISD::VSRAI: {
43500 SDValue Op0 = Op.getOperand(0);
43501 SDValue Op1 = Op.getOperand(1);
43502
43503 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
43504 if (ShAmt >= BitWidth)
43505 break;
43506
43507 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43508
43509 // If we just want the sign bit then we don't need to shift it.
43510 if (OriginalDemandedBits.isSignMask())
43511 return TLO.CombineTo(Op, Op0);
43512
43513 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43514 if (Op0.getOpcode() == X86ISD::VSHLI &&
43515 Op.getOperand(1) == Op0.getOperand(1)) {
43516 SDValue Op00 = Op0.getOperand(0);
43517 unsigned NumSignBits =
43518 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43519 if (ShAmt < NumSignBits)
43520 return TLO.CombineTo(Op, Op00);
43521 }
43522
43523 // If any of the demanded bits are produced by the sign extension, we also
43524 // demand the input sign bit.
43525 if (OriginalDemandedBits.countl_zero() < ShAmt)
43526 DemandedMask.setSignBit();
43527
43528 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43529 TLO, Depth + 1))
43530 return true;
43531
43532    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43533 Known.Zero.lshrInPlace(ShAmt);
43534 Known.One.lshrInPlace(ShAmt);
43535
43536 // If the input sign bit is known to be zero, or if none of the top bits
43537 // are demanded, turn this into an unsigned shift right.
43538 if (Known.Zero[BitWidth - ShAmt - 1] ||
43539 OriginalDemandedBits.countl_zero() >= ShAmt)
43540 return TLO.CombineTo(
43541 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
43542
43543 // High bits are known one.
43544 if (Known.One[BitWidth - ShAmt - 1])
43545 Known.One.setHighBits(ShAmt);
43546 return false;
43547 }
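  // Illustrative note for the VSRAI(VSHLI(X, C), C) fold above (not from the
  // source): the shl/sra pair is the usual "sign-extend the low BitWidth - C
  // bits" idiom, e.g. for a 32-bit element and C = 24 it sign-extends an i8
  // held in the low byte. If X already has more than C known sign bits, that
  // extension is a no-op and X can be used directly.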
43548 case X86ISD::BLENDV: {
43549 SDValue Sel = Op.getOperand(0);
43550 SDValue LHS = Op.getOperand(1);
43551 SDValue RHS = Op.getOperand(2);
43552
43553 APInt SignMask = APInt::getSignMask(BitWidth);
43554 SDValue NewSel = SimplifyMultipleUseDemandedBits(
43555 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
43556 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
43557 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43558 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
43559 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43560
43561 if (NewSel || NewLHS || NewRHS) {
43562 NewSel = NewSel ? NewSel : Sel;
43563 NewLHS = NewLHS ? NewLHS : LHS;
43564 NewRHS = NewRHS ? NewRHS : RHS;
43565 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
43566 NewSel, NewLHS, NewRHS));
43567 }
43568 break;
43569 }
43570 case X86ISD::PEXTRB:
43571 case X86ISD::PEXTRW: {
43572 SDValue Vec = Op.getOperand(0);
43573 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
43574 MVT VecVT = Vec.getSimpleValueType();
43575 unsigned NumVecElts = VecVT.getVectorNumElements();
43576
43577 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
43578 unsigned Idx = CIdx->getZExtValue();
43579 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
43580
43581 // If we demand no bits from the vector then we must have demanded
43582 // bits from the implicit zext - simplify to zero.
43583 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
43584 if (DemandedVecBits == 0)
43585 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43586
43587 APInt KnownUndef, KnownZero;
43588 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
43589 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
43590 KnownZero, TLO, Depth + 1))
43591 return true;
43592
43593 KnownBits KnownVec;
43594 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
43595 KnownVec, TLO, Depth + 1))
43596 return true;
43597
43598 if (SDValue V = SimplifyMultipleUseDemandedBits(
43599 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
43600 return TLO.CombineTo(
43601 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
43602
43603 Known = KnownVec.zext(BitWidth);
43604 return false;
43605 }
43606 break;
43607 }
43608 case X86ISD::PINSRB:
43609 case X86ISD::PINSRW: {
43610 SDValue Vec = Op.getOperand(0);
43611 SDValue Scl = Op.getOperand(1);
43612 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43613 MVT VecVT = Vec.getSimpleValueType();
43614
43615 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
43616 unsigned Idx = CIdx->getZExtValue();
43617 if (!OriginalDemandedElts[Idx])
43618 return TLO.CombineTo(Op, Vec);
43619
43620 KnownBits KnownVec;
43621 APInt DemandedVecElts(OriginalDemandedElts);
43622 DemandedVecElts.clearBit(Idx);
43623 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
43624 KnownVec, TLO, Depth + 1))
43625 return true;
43626
43627 KnownBits KnownScl;
43628 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
43629 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
43630 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
43631 return true;
43632
43633 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
43634 Known = KnownBits::commonBits(KnownVec, KnownScl);
43635 return false;
43636 }
43637 break;
43638 }
43639 case X86ISD::PACKSS:
43640 // PACKSS saturates to MIN/MAX integer values. So if we just want the
43641 // sign bit then we can just ask for the source operands' sign bits.
43642 // TODO - add known bits handling.
43643 if (OriginalDemandedBits.isSignMask()) {
43644 APInt DemandedLHS, DemandedRHS;
43645 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
43646
43647 KnownBits KnownLHS, KnownRHS;
43648 APInt SignMask = APInt::getSignMask(BitWidth * 2);
43649 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
43650 KnownLHS, TLO, Depth + 1))
43651 return true;
43652 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
43653 KnownRHS, TLO, Depth + 1))
43654 return true;
43655
43656 // Attempt to avoid multi-use ops if we don't need anything from them.
43657 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43658 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
43659 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
43660 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
43661 if (DemandedOp0 || DemandedOp1) {
43662 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
43663 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
43664 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
43665 }
43666 }
43667 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
43668 break;
43669 case X86ISD::VBROADCAST: {
43670 SDValue Src = Op.getOperand(0);
43671 MVT SrcVT = Src.getSimpleValueType();
43672 APInt DemandedElts = APInt::getOneBitSet(
43673 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
43674 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
43675 TLO, Depth + 1))
43676 return true;
43677 // If we don't need the upper bits, attempt to narrow the broadcast source.
43678 // Don't attempt this on AVX512 as it might affect broadcast folding.
43679 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
43680 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
43681 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
43682 Src->hasOneUse()) {
43683 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
43684 SDValue NewSrc =
43685 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
43686 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
43687 SDValue NewBcst =
43688 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
43689 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
43690 }
43691 break;
43692 }
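  // Illustrative example of the broadcast narrowing above (types chosen for
  // illustration): if only the low 32 bits of each lane of a v4i64
  // X86ISD::VBROADCAST of an i64 scalar are demanded, it can be rebuilt as
  //   bitcast v4i64 (VBROADCAST v8i32 (trunc i32 src))
  // which keeps the splat but halves the broadcast element width.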
43693 case X86ISD::PCMPGT:
43694 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43695 // iff we only need the sign bit then we can use R directly.
43696 if (OriginalDemandedBits.isSignMask() &&
43697 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43698 return TLO.CombineTo(Op, Op.getOperand(1));
43699 break;
43700 case X86ISD::MOVMSK: {
43701 SDValue Src = Op.getOperand(0);
43702 MVT SrcVT = Src.getSimpleValueType();
43703 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43704 unsigned NumElts = SrcVT.getVectorNumElements();
43705
43706 // If we don't need the sign bits at all just return zero.
43707 if (OriginalDemandedBits.countr_zero() >= NumElts)
43708 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43709
43710 // See if we only demand bits from the lower 128-bit vector.
43711 if (SrcVT.is256BitVector() &&
43712 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43713 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43714 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43715 }
43716
43717 // Only demand the vector elements of the sign bits we need.
43718 APInt KnownUndef, KnownZero;
43719 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43720 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43721 TLO, Depth + 1))
43722 return true;
43723
43724 Known.Zero = KnownZero.zext(BitWidth);
43725 Known.Zero.setHighBits(BitWidth - NumElts);
43726
43727 // MOVMSK only uses the MSB from each vector element.
43728 KnownBits KnownSrc;
43729 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
43730 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
43731 Depth + 1))
43732 return true;
43733
43734 if (KnownSrc.One[SrcBits - 1])
43735 Known.One.setLowBits(NumElts);
43736 else if (KnownSrc.Zero[SrcBits - 1])
43737 Known.Zero.setLowBits(NumElts);
43738
43739 // Attempt to avoid multi-use ops if we don't need anything from it.
43740 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
43741 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
43742 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43743 return false;
43744 }
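  // Reminder of the MOVMSK semantics relied on above (illustrative): bit i of
  // the scalar result is the sign bit of vector element i, and all higher
  // result bits are zero, e.g. for v4i32:
  //   res = (msb(e0) << 0) | (msb(e1) << 1) | (msb(e2) << 2) | (msb(e3) << 3);
  // hence demanded result bits map 1:1 onto demanded vector elements.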
43745 case X86ISD::TESTP: {
43746 SDValue Op0 = Op.getOperand(0);
43747 SDValue Op1 = Op.getOperand(1);
43748 MVT OpVT = Op0.getSimpleValueType();
43749    assert((OpVT.getVectorElementType() == MVT::f32 ||
43750            OpVT.getVectorElementType() == MVT::f64) &&
43751           "Illegal vector type for X86ISD::TESTP");
43752
43753 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
43754 KnownBits KnownSrc;
43755 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
43756 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1) ||
43757 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1);
43758 }
43759 case X86ISD::BEXTR:
43760 case X86ISD::BEXTRI: {
43761 SDValue Op0 = Op.getOperand(0);
43762 SDValue Op1 = Op.getOperand(1);
43763
43764 // Only bottom 16-bits of the control bits are required.
43765 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43766 // NOTE: SimplifyDemandedBits won't do this for constants.
43767 uint64_t Val1 = Cst1->getZExtValue();
43768 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43769 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43770 SDLoc DL(Op);
43771 return TLO.CombineTo(
43772 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43773 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43774 }
43775
43776 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43777 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43778
43779 // If the length is 0, the result is 0.
43780 if (Length == 0) {
43781 Known.setAllZero();
43782 return false;
43783 }
43784
43785 if ((Shift + Length) <= BitWidth) {
43786 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43787 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43788 return true;
43789
43790 Known = Known.extractBits(Length, Shift);
43791 Known = Known.zextOrTrunc(BitWidth);
43792 return false;
43793 }
43794 } else {
43795      assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43796 KnownBits Known1;
43797 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43798 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43799 return true;
43800
43801 // If the length is 0, replace with 0.
43802 KnownBits LengthBits = Known1.extractBits(8, 8);
43803 if (LengthBits.isZero())
43804 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43805 }
43806
43807 break;
43808 }
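  // Sketch of the BEXTR control encoding used above (hypothetical scalar
  // model, not from the source): control bits [7:0] are the start position and
  // bits [15:8] the length, roughly
  //   res = (src >> start) & ((1u << len) - 1);   // len == 0 gives 0
  // which is why only the low 16 control bits matter and a zero length folds
  // to a zero result.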
43809 case X86ISD::PDEP: {
43810 SDValue Op0 = Op.getOperand(0);
43811 SDValue Op1 = Op.getOperand(1);
43812
43813 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
43814 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43815
43816 // If the demanded bits have leading zeroes, we don't demand those from the
43817 // mask.
43818 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43819 return true;
43820
43821 // The number of possible 1s in the mask determines the number of LSBs of
43822 // operand 0 used. Undemanded bits from the mask don't matter so filter
43823 // them before counting.
43824 KnownBits Known2;
43825 uint64_t Count = (~Known.Zero & LoMask).popcount();
43826 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43827 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43828 return true;
43829
43830 // Zeroes are retained from the mask, but not ones.
43831 Known.One.clearAllBits();
43832 // The result will have at least as many trailing zeros as the non-mask
43833 // operand since bits can only map to the same or higher bit position.
43834 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43835 return false;
43836 }
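  // Sketch of the PDEP semantics assumed above (illustrative scalar model):
  // the low popcount(mask) bits of the source are scattered, in order, to the
  // set bit positions of the mask; all other result bits are zero. Roughly:
  //   for (k = 0, i = 0; i < 64; ++i)
  //     if (mask & (1ull << i)) res |= ((src >> k++) & 1ull) << i;
  // so only Count = popcount(mask) low bits of operand 0 are ever read.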
43837 }
43838
43839 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43840 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43841}
43842
43843SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43844 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43845 SelectionDAG &DAG, unsigned Depth) const {
43846 int NumElts = DemandedElts.getBitWidth();
43847 unsigned Opc = Op.getOpcode();
43848 EVT VT = Op.getValueType();
43849
43850 switch (Opc) {
43851 case X86ISD::PINSRB:
43852 case X86ISD::PINSRW: {
43853 // If we don't demand the inserted element, return the base vector.
43854 SDValue Vec = Op.getOperand(0);
43855 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43856 MVT VecVT = Vec.getSimpleValueType();
43857 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43858 !DemandedElts[CIdx->getZExtValue()])
43859 return Vec;
43860 break;
43861 }
43862 case X86ISD::VSHLI: {
43863 // If we are only demanding sign bits then we can use the shift source
43864 // directly.
43865 SDValue Op0 = Op.getOperand(0);
43866 unsigned ShAmt = Op.getConstantOperandVal(1);
43867 unsigned BitWidth = DemandedBits.getBitWidth();
43868 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43869 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
43870 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43871 return Op0;
43872 break;
43873 }
43874 case X86ISD::VSRAI:
43875 // iff we only need the sign bit then we can use the source directly.
43876 // TODO: generalize where we only demand extended signbits.
43877 if (DemandedBits.isSignMask())
43878 return Op.getOperand(0);
43879 break;
43880 case X86ISD::PCMPGT:
43881 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43882 // iff we only need the sign bit then we can use R directly.
43883 if (DemandedBits.isSignMask() &&
43884 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43885 return Op.getOperand(1);
43886 break;
43887 case X86ISD::ANDNP: {
43888 // ANDNP = (~LHS & RHS);
43889 SDValue LHS = Op.getOperand(0);
43890 SDValue RHS = Op.getOperand(1);
43891
43892 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43893 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43894
43895 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
43896 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
43897 // this context, so return RHS.
43898 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43899 return RHS;
43900 break;
43901 }
43902 }
43903
43904 APInt ShuffleUndef, ShuffleZero;
43905 SmallVector<int, 16> ShuffleMask;
43906 SmallVector<SDValue, 2> ShuffleOps;
43907 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43908 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43909 // If all the demanded elts are from one operand and are inline,
43910 // then we can use the operand directly.
43911 int NumOps = ShuffleOps.size();
43912 if (ShuffleMask.size() == (unsigned)NumElts &&
43913 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43914 return VT.getSizeInBits() == V.getValueSizeInBits();
43915 })) {
43916
43917 if (DemandedElts.isSubsetOf(ShuffleUndef))
43918 return DAG.getUNDEF(VT);
43919 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43920 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43921
43922 // Bitmask that indicates which ops have only been accessed 'inline'.
43923 APInt IdentityOp = APInt::getAllOnes(NumOps);
43924 for (int i = 0; i != NumElts; ++i) {
43925 int M = ShuffleMask[i];
43926 if (!DemandedElts[i] || ShuffleUndef[i])
43927 continue;
43928 int OpIdx = M / NumElts;
43929 int EltIdx = M % NumElts;
43930 if (M < 0 || EltIdx != i) {
43931 IdentityOp.clearAllBits();
43932 break;
43933 }
43934 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43935 if (IdentityOp == 0)
43936 break;
43937 }
43938      assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
43939             "Multiple identity shuffles detected");
43940
43941 if (IdentityOp != 0)
43942 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
43943 }
43944 }
43945
43946 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43947 Op, DemandedBits, DemandedElts, DAG, Depth);
43948}
43949
43950bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43951 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43952 bool PoisonOnly, unsigned Depth) const {
43953 unsigned EltsBits = Op.getScalarValueSizeInBits();
43954 unsigned NumElts = DemandedElts.getBitWidth();
43955
43956 // TODO: Add more target shuffles.
43957 switch (Op.getOpcode()) {
43958 case X86ISD::PSHUFD:
43959 case X86ISD::VPERMILPI: {
43960 SmallVector<int, 8> Mask;
43961 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
43962
43963 APInt DemandedSrcElts = APInt::getZero(NumElts);
43964 for (unsigned I = 0; I != NumElts; ++I)
43965 if (DemandedElts[I])
43966 DemandedSrcElts.setBit(Mask[I]);
43967
43968 return DAG.isGuaranteedNotToBeUndefOrPoison(
43969 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
43970 }
43971 }
43972 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43973 Op, DemandedElts, DAG, PoisonOnly, Depth);
43974}
43975
43976bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
43977 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43978 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
43979
43980 // TODO: Add more target shuffles.
43981 switch (Op.getOpcode()) {
43982 case X86ISD::PSHUFD:
43983 case X86ISD::VPERMILPI:
43984 return false;
43985 }
43986 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
43987 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
43988}
43989
43990bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
43991 const APInt &DemandedElts,
43992 APInt &UndefElts,
43993 const SelectionDAG &DAG,
43994 unsigned Depth) const {
43995 unsigned NumElts = DemandedElts.getBitWidth();
43996 unsigned Opc = Op.getOpcode();
43997
43998 switch (Opc) {
43999 case X86ISD::VBROADCAST:
44000 case X86ISD::VBROADCAST_LOAD:
44001 UndefElts = APInt::getZero(NumElts);
44002 return true;
44003 }
44004
44005 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44006 DAG, Depth);
44007}
44008
44009// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44010// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44011static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44012 bool AllowTruncate) {
44013 switch (Src.getOpcode()) {
44014 case ISD::TRUNCATE:
44015 if (!AllowTruncate)
44016 return false;
44017 [[fallthrough]];
44018 case ISD::SETCC:
44019 return Src.getOperand(0).getValueSizeInBits() == Size;
44020 case ISD::AND:
44021 case ISD::XOR:
44022 case ISD::OR:
44023 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44024 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44025 case ISD::SELECT:
44026 case ISD::VSELECT:
44027 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44028 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44029 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44030 case ISD::BUILD_VECTOR:
44031 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44032 ISD::isBuildVectorAllOnes(Src.getNode());
44033 }
44034 return false;
44035}
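// Illustrative example for checkBitcastSrcVectorSize (not from the source):
// for Src = (v4i1 setcc (v4i64 a, b)) the compared vector is 256 bits wide, so
// checkBitcastSrcVectorSize(Src, 256, AllowTruncate) returns true (no truncate
// is involved), letting the caller sign-extend to v4i64 to match the compare.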
44036
44037// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44038static unsigned getAltBitOpcode(unsigned Opcode) {
44039 switch(Opcode) {
44040 case ISD::AND: return X86ISD::FAND;
44041 case ISD::OR: return X86ISD::FOR;
44042 case ISD::XOR: return X86ISD::FXOR;
44043 case X86ISD::ANDNP: return X86ISD::FANDN;
44044 }
44045 llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 44045)
;
44046}
44047
44048// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44049static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44050 const SDLoc &DL) {
44051 EVT SrcVT = Src.getValueType();
44052 if (SrcVT != MVT::v4i1)
44053 return SDValue();
44054
44055 switch (Src.getOpcode()) {
44056 case ISD::SETCC:
44057 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44058 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44059 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44060 SDValue Op0 = Src.getOperand(0);
44061 if (ISD::isNormalLoad(Op0.getNode()))
44062 return DAG.getBitcast(MVT::v4f32, Op0);
44063 if (Op0.getOpcode() == ISD::BITCAST &&
44064 Op0.getOperand(0).getValueType() == MVT::v4f32)
44065 return Op0.getOperand(0);
44066 }
44067 break;
44068 case ISD::AND:
44069 case ISD::XOR:
44070 case ISD::OR: {
44071 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44072 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44073 if (Op0 && Op1)
44074 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44075 Op1);
44076 break;
44077 }
44078 }
44079 return SDValue();
44080}
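// Note on the SSE1-only path above (illustrative): SSE1 has MOVMSKPS for
// v4f32 but no MOVMSK for v4i32, so a (v4i1 setcc (v4i32 X, 0, setlt)) is
// re-expressed on the v4f32 bit pattern (the per-element sign bits are the
// same bits), letting the caller emit MOVMSK on MVT::v4f32 without ever
// materializing a v4i32 operation.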
44081
44082// Helper to push sign extension of vXi1 SETCC result through bitops.
44083static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44084 SDValue Src, const SDLoc &DL) {
44085 switch (Src.getOpcode()) {
44086 case ISD::SETCC:
44087 case ISD::TRUNCATE:
44088 case ISD::BUILD_VECTOR:
44089 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44090 case ISD::AND:
44091 case ISD::XOR:
44092 case ISD::OR:
44093 return DAG.getNode(
44094 Src.getOpcode(), DL, SExtVT,
44095 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44096 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44097 case ISD::SELECT:
44098 case ISD::VSELECT:
44099 return DAG.getSelect(
44100 DL, SExtVT, Src.getOperand(0),
44101 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44102 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44103 }
44104 llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 44104)
;
44105}
44106
44107// Try to match patterns such as
44108// (i16 bitcast (v16i1 x))
44109// ->
44110// (i16 movmsk (16i8 sext (v16i1 x)))
44111// before the illegal vector is scalarized on subtargets that don't have legal
44112// vxi1 types.
44113static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44114 const SDLoc &DL,
44115 const X86Subtarget &Subtarget) {
44116 EVT SrcVT = Src.getValueType();
44117 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44118 return SDValue();
44119
44120 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44121 // legalization destroys the v4i32 type.
44122 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44123 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44124 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44125 DAG.getBitcast(MVT::v4f32, V));
44126 return DAG.getZExtOrTrunc(V, DL, VT);
44127 }
44128 }
44129
44130 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44131 // movmskb even with avx512. This will be better than truncating to vXi1 and
44132 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44133 // vpcmpeqb/vpcmpgtb.
44134 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44135 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44136 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44137 Src.getOperand(0).getValueType() == MVT::v64i8);
44138
44139 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44140 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44141 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44142 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44143 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44144 EVT CmpVT = Src.getOperand(0).getValueType();
44145 EVT EltVT = CmpVT.getVectorElementType();
44146 if (CmpVT.getSizeInBits() <= 256 &&
44147 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44148 PreferMovMsk = true;
44149 }
44150
44151 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44152 // MOVMSK is supported in SSE2 or later.
44153 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44154 return SDValue();
44155
44156 // If the upper ops of a concatenation are undef, then try to bitcast the
44157 // lower op and extend.
44158 SmallVector<SDValue, 4> SubSrcOps;
44159 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44160 SubSrcOps.size() >= 2) {
44161 SDValue LowerOp = SubSrcOps[0];
44162 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44163 if (LowerOp.getOpcode() == ISD::SETCC &&
44164 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44165 EVT SubVT = VT.getIntegerVT(
44166 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44167 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44168 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44169 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44170 }
44171 }
44172 }
44173
44174 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44175 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44176 // v8i16 and v16i16.
44177 // For these two cases, we can shuffle the upper element bytes to a
44178 // consecutive sequence at the start of the vector and treat the results as
44179 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44180 // for v16i16 this is not the case, because the shuffle is expensive, so we
44181 // avoid sign-extending to this type entirely.
44182 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44183 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44184 MVT SExtVT;
44185 bool PropagateSExt = false;
44186 switch (SrcVT.getSimpleVT().SimpleTy) {
44187 default:
44188 return SDValue();
44189 case MVT::v2i1:
44190 SExtVT = MVT::v2i64;
44191 break;
44192 case MVT::v4i1:
44193 SExtVT = MVT::v4i32;
44194 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44195 // sign-extend to a 256-bit operation to avoid truncation.
44196 if (Subtarget.hasAVX() &&
44197 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44198 SExtVT = MVT::v4i64;
44199 PropagateSExt = true;
44200 }
44201 break;
44202 case MVT::v8i1:
44203 SExtVT = MVT::v8i16;
44204 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44205 // sign-extend to a 256-bit operation to match the compare.
44206 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44207 // 256-bit because the shuffle is cheaper than sign extending the result of
44208 // the compare.
44209 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44210 checkBitcastSrcVectorSize(Src, 512, true))) {
44211 SExtVT = MVT::v8i32;
44212 PropagateSExt = true;
44213 }
44214 break;
44215 case MVT::v16i1:
44216 SExtVT = MVT::v16i8;
44217 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44218 // it is not profitable to sign-extend to 256-bit because this will
44219 // require an extra cross-lane shuffle which is more expensive than
44220 // truncating the result of the compare to 128-bits.
44221 break;
44222 case MVT::v32i1:
44223 SExtVT = MVT::v32i8;
44224 break;
44225 case MVT::v64i1:
44226 // If we have AVX512F but not AVX512BW, the input was already checked to be
44227 // a truncate from v64i8 earlier, so split the input and make two pmovmskbs.
44228 if (Subtarget.hasAVX512()) {
44229 if (Subtarget.hasBWI())
44230 return SDValue();
44231 SExtVT = MVT::v64i8;
44232 break;
44233 }
44234 // Split if this is a <64 x i8> comparison result.
44235 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44236 SExtVT = MVT::v64i8;
44237 break;
44238 }
44239 return SDValue();
44240 };
44241
44242 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44243 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44244
44245 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44246 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44247 } else {
44248 if (SExtVT == MVT::v8i16)
44249 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
44250 DAG.getUNDEF(MVT::v8i16));
44251 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44252 }
44253
44254 EVT IntVT =
44255 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44256 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44257 return DAG.getBitcast(VT, V);
44258}
44259
44260// Convert a vXi1 constant build vector to the same width scalar integer.
44261static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44262 EVT SrcVT = Op.getValueType();
44263  assert(SrcVT.getVectorElementType() == MVT::i1 &&
44264         "Expected a vXi1 vector");
44265  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
44266         "Expected a constant build vector");
44267
44268 APInt Imm(SrcVT.getVectorNumElements(), 0);
44269 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44270 SDValue In = Op.getOperand(Idx);
44271 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
44272 Imm.setBit(Idx);
44273 }
44274 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44275 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44276}
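// Illustrative example of the conversion above (values chosen arbitrarily): a
// constant (v4i1 <1, 0, 1, 1>) becomes the i4 value 0b1101, i.e. element 0
// maps to bit 0, element 1 to bit 1, and so on; undef elements become 0.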
44277
44278static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44279 TargetLowering::DAGCombinerInfo &DCI,
44280 const X86Subtarget &Subtarget) {
44281 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast")(static_cast <bool> (N->getOpcode() == ISD::BITCAST &&
"Expected a bitcast") ? void (0) : __assert_fail ("N->getOpcode() == ISD::BITCAST && \"Expected a bitcast\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 44281, __extension__
__PRETTY_FUNCTION__))
;
44282
44283 if (!DCI.isBeforeLegalizeOps())
44284 return SDValue();
44285
44286 // Only do this if we have k-registers.
44287 if (!Subtarget.hasAVX512())
44288 return SDValue();
44289
44290 EVT DstVT = N->getValueType(0);
44291 SDValue Op = N->getOperand(0);
44292 EVT SrcVT = Op.getValueType();
44293
44294 if (!Op.hasOneUse())
44295 return SDValue();
44296
44297 // Look for logic ops.
44298 if (Op.getOpcode() != ISD::AND &&
44299 Op.getOpcode() != ISD::OR &&
44300 Op.getOpcode() != ISD::XOR)
44301 return SDValue();
44302
44303 // Make sure we have a bitcast between mask registers and a scalar type.
44304 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44305 DstVT.isScalarInteger()) &&
44306 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44307 SrcVT.isScalarInteger()))
44308 return SDValue();
44309
44310 SDValue LHS = Op.getOperand(0);
44311 SDValue RHS = Op.getOperand(1);
44312
44313 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44314 LHS.getOperand(0).getValueType() == DstVT)
44315 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44316 DAG.getBitcast(DstVT, RHS));
44317
44318 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44319 RHS.getOperand(0).getValueType() == DstVT)
44320 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44321 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44322
44323 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44324 // Most of these have to move a constant from the scalar domain anyway.
44325 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44326 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44327 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44328 DAG.getBitcast(DstVT, LHS), RHS);
44329 }
44330
44331 return SDValue();
44332}
44333
44334static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44335 const X86Subtarget &Subtarget) {
44336 SDLoc DL(BV);
44337 unsigned NumElts = BV->getNumOperands();
44338 SDValue Splat = BV->getSplatValue();
44339
44340 // Build MMX element from integer GPR or SSE float values.
44341 auto CreateMMXElement = [&](SDValue V) {
44342 if (V.isUndef())
44343 return DAG.getUNDEF(MVT::x86mmx);
44344 if (V.getValueType().isFloatingPoint()) {
44345 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44346 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44347 V = DAG.getBitcast(MVT::v2i64, V);
44348 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44349 }
44350 V = DAG.getBitcast(MVT::i32, V);
44351 } else {
44352 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44353 }
44354 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44355 };
44356
44357 // Convert build vector ops to MMX data in the bottom elements.
44358 SmallVector<SDValue, 8> Ops;
44359
44360 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44361
44362 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44363 if (Splat) {
44364 if (Splat.isUndef())
44365 return DAG.getUNDEF(MVT::x86mmx);
44366
44367 Splat = CreateMMXElement(Splat);
44368
44369 if (Subtarget.hasSSE1()) {
44370 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44371 if (NumElts == 8)
44372 Splat = DAG.getNode(
44373 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44374 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44375 TLI.getPointerTy(DAG.getDataLayout())),
44376 Splat, Splat);
44377
44378 // Use PSHUFW to repeat 16-bit elements.
44379 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44380 return DAG.getNode(
44381 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44382 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44383 TLI.getPointerTy(DAG.getDataLayout())),
44384 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44385 }
44386 Ops.append(NumElts, Splat);
44387 } else {
44388 for (unsigned i = 0; i != NumElts; ++i)
44389 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44390 }
44391
44392 // Use tree of PUNPCKLs to build up general MMX vector.
44393 while (Ops.size() > 1) {
44394 unsigned NumOps = Ops.size();
44395 unsigned IntrinOp =
44396 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44397 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44398 : Intrinsic::x86_mmx_punpcklbw));
44399 SDValue Intrin = DAG.getTargetConstant(
44400 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44401 for (unsigned i = 0; i != NumOps; i += 2)
44402 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44403 Ops[i], Ops[i + 1]);
44404 Ops.resize(NumOps / 2);
44405 }
44406
44407 return Ops[0];
44408}
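// Note on the PUNPCKL tree above (illustrative): for a non-splat v8i8 build
// vector, the eight single-element MMX values are merged pairwise, 8 -> 4 with
// punpcklbw, 4 -> 2 with punpcklwd, and 2 -> 1 with punpckldq, so the final
// MMX register holds all elements in order.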
44409
44410// Recursive function that attempts to find if a bool vector node was originally
44411// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44412// integer. If so, replace the scalar ops with bool vector equivalents back down
44413// the chain.
44414static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44415 SelectionDAG &DAG,
44416 const X86Subtarget &Subtarget) {
44417 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44418 unsigned Opc = V.getOpcode();
44419 switch (Opc) {
44420 case ISD::BITCAST: {
44421 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44422 SDValue Src = V.getOperand(0);
44423 EVT SrcVT = Src.getValueType();
44424 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44425 return DAG.getBitcast(VT, Src);
44426 break;
44427 }
44428 case ISD::TRUNCATE: {
44429 // If we find a suitable source, a truncated scalar becomes a subvector.
44430 SDValue Src = V.getOperand(0);
44431 EVT NewSrcVT =
44432 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44433 if (TLI.isTypeLegal(NewSrcVT))
44434 if (SDValue N0 =
44435 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44436 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44437 DAG.getIntPtrConstant(0, DL));
44438 break;
44439 }
44440 case ISD::ANY_EXTEND:
44441 case ISD::ZERO_EXTEND: {
44442 // If we find a suitable source, an extended scalar becomes a subvector.
44443 SDValue Src = V.getOperand(0);
44444 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
44445 Src.getScalarValueSizeInBits());
44446 if (TLI.isTypeLegal(NewSrcVT))
44447 if (SDValue N0 =
44448 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44449 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44450 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
44451 : DAG.getConstant(0, DL, VT),
44452 N0, DAG.getIntPtrConstant(0, DL));
44453 break;
44454 }
44455 case ISD::OR: {
44456 // If we find suitable sources, we can just move an OR to the vector domain.
44457 SDValue Src0 = V.getOperand(0);
44458 SDValue Src1 = V.getOperand(1);
44459 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44460 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
44461 return DAG.getNode(Opc, DL, VT, N0, N1);
44462 break;
44463 }
44464 case ISD::SHL: {
44465 // If we find a suitable source, a SHL becomes a KSHIFTL.
44466 SDValue Src0 = V.getOperand(0);
44467 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
44468 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
44469 break;
44470
44471 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
44472 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44473 return DAG.getNode(
44474 X86ISD::KSHIFTL, DL, VT, N0,
44475 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
44476 break;
44477 }
44478 }
44479 return SDValue();
44480}
44481
44482static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
44483 TargetLowering::DAGCombinerInfo &DCI,
44484 const X86Subtarget &Subtarget) {
44485 SDValue N0 = N->getOperand(0);
44486 EVT VT = N->getValueType(0);
44487 EVT SrcVT = N0.getValueType();
44488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44489
44490 // Try to match patterns such as
44491 // (i16 bitcast (v16i1 x))
44492 // ->
44493 // (i16 movmsk (16i8 sext (v16i1 x)))
44494 // before the setcc result is scalarized on subtargets that don't have legal
44495 // vxi1 types.
44496 if (DCI.isBeforeLegalize()) {
44497 SDLoc dl(N);
44498 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
44499 return V;
44500
44501 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44502 // type, widen both sides to avoid a trip through memory.
44503 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
44504 Subtarget.hasAVX512()) {
44505 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
44506 N0 = DAG.getBitcast(MVT::v8i1, N0);
44507 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
44508 DAG.getIntPtrConstant(0, dl));
44509 }
44510
44511 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44512 // type, widen both sides to avoid a trip through memory.
44513 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
44514 Subtarget.hasAVX512()) {
44515 // Use zeros for the widening if we already have some zeroes. This can
44516 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
44517 // stream of this.
44518 // FIXME: It might make sense to detect a concat_vectors with a mix of
44519 // zeroes and undef and turn it into insert_subvector for i1 vectors as
44520 // a separate combine. What we can't do is canonicalize the operands of
44521 // such a concat or we'll get into a loop with SimplifyDemandedBits.
44522 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
44523 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
44524 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
44525 SrcVT = LastOp.getValueType();
44526 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44527 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
44528 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
44529 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44530 N0 = DAG.getBitcast(MVT::i8, N0);
44531 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44532 }
44533 }
44534
44535 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44536 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
44537 Ops[0] = N0;
44538 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44539 N0 = DAG.getBitcast(MVT::i8, N0);
44540 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44541 }
44542 } else {
44543 // If we're bitcasting from iX to vXi1, see if the integer originally
44544 // began as a vXi1 and whether we can remove the bitcast entirely.
44545 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
44546 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
44547 if (SDValue V =
44548 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
44549 return V;
44550 }
44551 }
44552
44553 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
44554 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
44555 // due to insert_subvector legalization on KNL. By promoting the copy to i16
44556 // we can help with known bits propagation from the vXi1 domain to the
44557 // scalar domain.
44558 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
44559 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44560 N0.getOperand(0).getValueType() == MVT::v16i1 &&
44561 isNullConstant(N0.getOperand(1)))
44562 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
44563 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
44564
44565 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
44566 // and the vbroadcast_load are both integer or both fp. In some cases this
44567 // will remove the bitcast entirely.
44568 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
44569 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
44570 auto *BCast = cast<MemIntrinsicSDNode>(N0);
44571 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
44572 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
44573 // Don't swap i8/i16 since we don't have fp types of that size.
44574 if (MemSize >= 32) {
44575 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
44576 : MVT::getIntegerVT(MemSize);
44577 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
44578 : MVT::getIntegerVT(SrcVTSize);
44579 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
44580
44581 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
44582 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
44583 SDValue ResNode =
44584 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44585 MemVT, BCast->getMemOperand());
44586 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
44587 return DAG.getBitcast(VT, ResNode);
44588 }
44589 }
44590
44591 // Since MMX types are special and don't usually play with other vector types,
44592 // it's better to handle them early to be sure we emit efficient code by
44593 // avoiding store-load conversions.
44594 if (VT == MVT::x86mmx) {
44595 // Detect MMX constant vectors.
44596 APInt UndefElts;
44597 SmallVector<APInt, 1> EltBits;
44598 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
44599 SDLoc DL(N0);
44600 // Handle zero-extension of i32 with MOVD.
44601 if (EltBits[0].countl_zero() >= 32)
44602 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
44603 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
44604 // Else, bitcast to a double.
44605 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44606 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
44607 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
44608 }
44609
44610 // Detect bitcasts to x86mmx low word.
44611 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44612 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
44613 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
44614 bool LowUndef = true, AllUndefOrZero = true;
44615 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
44616 SDValue Op = N0.getOperand(i);
44617 LowUndef &= Op.isUndef() || (i >= e/2);
44618 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
44619 }
44620 if (AllUndefOrZero) {
44621 SDValue N00 = N0.getOperand(0);
44622 SDLoc dl(N00);
44623 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
44624 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
44625 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
44626 }
44627 }
44628
44629 // Detect bitcasts of 64-bit build vectors and convert to a
44630 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44631 // lowest element.
44632 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44633 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44634 SrcVT == MVT::v8i8))
44635 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44636
44637 // Detect bitcasts between element or subvector extraction to x86mmx.
44638 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44639 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44640 isNullConstant(N0.getOperand(1))) {
44641 SDValue N00 = N0.getOperand(0);
44642 if (N00.getValueType().is128BitVector())
44643 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44644 DAG.getBitcast(MVT::v2i64, N00));
44645 }
44646
44647 // Detect bitcasts from FP_TO_SINT to x86mmx.
44648 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44649 SDLoc DL(N0);
44650 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44651 DAG.getUNDEF(MVT::v2i32));
44652 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44653 DAG.getBitcast(MVT::v2i64, Res));
44654 }
44655 }
44656
44657 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44658 // most of these to scalar anyway.
44659 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44660 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44661 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44662 return combinevXi1ConstantToInteger(N0, DAG);
44663 }
44664
44665 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44666 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44667 isa<ConstantSDNode>(N0)) {
44668 auto *C = cast<ConstantSDNode>(N0);
44669 if (C->isAllOnes())
44670 return DAG.getConstant(1, SDLoc(N0), VT);
44671 if (C->isZero())
44672 return DAG.getConstant(0, SDLoc(N0), VT);
44673 }
44674
44675 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44676 // Turn it into a sign bit compare that produces a k-register. This avoids
44677 // a trip through a GPR.
44678 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44679 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44680 isPowerOf2_32(VT.getVectorNumElements())) {
44681 unsigned NumElts = VT.getVectorNumElements();
44682 SDValue Src = N0;
44683
44684 // Peek through truncate.
44685 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44686 Src = N0.getOperand(0);
44687
44688 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44689 SDValue MovmskIn = Src.getOperand(0);
44690 MVT MovmskVT = MovmskIn.getSimpleValueType();
44691 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44692
44693 // We allow extra bits of the movmsk to be used since they are known zero.
44694 // We can't convert a VPMOVMSKB without avx512bw.
44695 if (MovMskElts <= NumElts &&
44696 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44697 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44698 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44699 SDLoc dl(N);
44700 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44701 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44702 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44703 if (EVT(CmpVT) == VT)
44704 return Cmp;
44705
44706 // Pad with zeroes up to original VT to replace the zeroes that were
44707 // being used from the MOVMSK.
44708 unsigned NumConcats = NumElts / MovMskElts;
44709 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44710 Ops[0] = Cmp;
44711 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44712 }
44713 }
44714 }
44715
44716 // Try to remove bitcasts from input and output of mask arithmetic to
44717 // remove GPR<->K-register crossings.
44718 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44719 return V;
44720
44721 // Convert a bitcasted integer logic operation that has one bitcasted
44722 // floating-point operand into a floating-point logic operation. This may
44723 // create a load of a constant, but that is cheaper than materializing the
44724 // constant in an integer register and transferring it to an SSE register or
44725 // transferring the SSE operand to integer register and back.
44726 unsigned FPOpcode;
44727 switch (N0.getOpcode()) {
44728 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44729 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44730 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44731 default: return SDValue();
44732 }
44733
44734 // Check if we have a bitcast from another integer type as well.
44735 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
44736 (Subtarget.hasSSE2() && VT == MVT::f64) ||
44737 (Subtarget.hasFP16() && VT == MVT::f16) ||
44738 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
44739 TLI.isTypeLegal(VT))))
44740 return SDValue();
44741
44742 SDValue LogicOp0 = N0.getOperand(0);
44743 SDValue LogicOp1 = N0.getOperand(1);
44744 SDLoc DL0(N0);
44745
44746 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
44747 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
44748 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
44749 LogicOp0.getOperand(0).getValueType() == VT &&
44750 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
44751 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
44752 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44753 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44754 }
44755 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44756 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44757 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44758 LogicOp1.getOperand(0).getValueType() == VT &&
44759 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44760 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44761 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44762 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44763 }
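// Illustrative example (editorial note, not part of the LLVM source): assuming
// SSE1 and a non-constant X : f32, the first fold above turns
//   bitcast f32 (and i32 (bitcast i32 X), Y)
// into
//   FAND f32 X, (bitcast f32 Y)
// so the integer logic op becomes an SSE logic op and no GPR<->XMM transfer of
// X is needed; if Y is a constant, the bitcast may become a constant-pool load.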
44764
44765 return SDValue();
44766}
44767
44768// (mul (zext a), (sext b))
44769static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44770 SDValue &Op1) {
44771 Op0 = Mul.getOperand(0);
44772 Op1 = Mul.getOperand(1);
44773
44774 // Canonicalize so that the sign-extended operand ends up in Op1.
44775 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44776 std::swap(Op0, Op1);
44777
44778 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44779 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44780 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44781 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44782 return true;
44783
44784 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44785 return (BV && BV->isConstant());
44786 };
44787
44788 // (dpbusd (zext a), (sext b)). The first operand must be an unsigned
44789 // value, so check that Op0 is a zero-extended value. Op1 must be a signed
44790 // value, so it is enough to check its significant (sign) bits.
44791 if ((IsFreeTruncation(Op0) &&
44792 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44793 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44794 return true;
44795
44796 return false;
44797}
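// Illustrative note (editorial, not part of the LLVM source): the accepted
// shape is e.g.
//   (mul (zext vXi8 A to vXi32), (sext vXi8 B to vXi32))
// where A supplies the unsigned operand (at most 8 known-active bits) and B
// the signed operand (at most 8 significant bits), matching VPDPBUSD's
// unsigned-i8 x signed-i8 inputs.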
44798
44799// Given an ABS node, detect the following pattern:
44800// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44801// This is useful as it is the input into a SAD pattern.
44802static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44803 SDValue AbsOp1 = Abs->getOperand(0);
44804 if (AbsOp1.getOpcode() != ISD::SUB)
44805 return false;
44806
44807 Op0 = AbsOp1.getOperand(0);
44808 Op1 = AbsOp1.getOperand(1);
44809
44810 // Check if the operands of the sub are zero-extended from vectors of i8.
44811 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44812 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44813 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44814 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44815 return false;
44816
44817 return true;
44818}
44819
44820static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44821 unsigned &LogBias, const SDLoc &DL,
44822 const X86Subtarget &Subtarget) {
44823 // Extend or truncate to MVT::i8 first.
44824 MVT Vi8VT =
44825 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44826 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44827 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44828
44829 // VPDPBUSD(<4 x i32> C, <16 x i8> A, <16 x i8> B): for each dst element,
44830 // C[0] = C[0] + A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3].
44831 // The source A/B element type is i8, but the dst C element type is i32.
44832 // The reduction stage count is computed from the vXi8 source type, so a
44833 // LogBias of 2 is needed to skip the 2 stages that VPDPBUSD already performs.
44834 LogBias = 2;
44835
44836 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44837 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44838 RegSize = std::max(512u, RegSize);
44839
44840 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44841 // fill in the missing vector elements with 0.
44842 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44843 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44844 Ops[0] = LHS;
44845 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44846 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44847 Ops[0] = RHS;
44848 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44849
44850 // Actually build the DotProduct, split as 256/512 bits for
44851 // AVXVNNI/AVX512VNNI.
44852 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44853 ArrayRef<SDValue> Ops) {
44854 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44855 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44856 };
44857 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44858 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44859
44860 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44861 DpBuilder, false);
44862}
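// Illustrative example (editorial, not part of the LLVM source): for a
// reduction over 16 i8 products, a plain shuffle+add tree would need
// log2(16) = 4 stages, but each VPDPBUSD i32 lane already sums a group of 4
// byte products, so the caller only performs 4 - LogBias = 2 remaining
// shuffle+add stages.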
44863
44864// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44865// to these zexts.
44866static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44867 const SDValue &Zext1, const SDLoc &DL,
44868 const X86Subtarget &Subtarget) {
44869 // Find the appropriate width for the PSADBW.
44870 EVT InVT = Zext0.getOperand(0).getValueType();
44871 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44872
44873 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44874 // fill in the missing vector elements with 0.
44875 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44876 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44877 Ops[0] = Zext0.getOperand(0);
44878 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44879 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44880 Ops[0] = Zext1.getOperand(0);
44881 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44882
44883 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44884 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44885 ArrayRef<SDValue> Ops) {
44886 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44887 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44888 };
44889 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44890 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44891 PSADBWBuilder);
44892}
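// Illustrative example (editorial, not part of the LLVM source): for zexts of
// v4i8 inputs, RegSize = max(128, 32) = 128 and NumConcat = 128 / 32 = 4, so
// each v4i8 operand is concatenated with three v4i8 zero vectors into a v16i8
// before the v2i64 PSADBW is built; the zero lanes contribute nothing to the
// sums.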
44893
44894// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44895// PHMINPOSUW.
44896static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44897 const X86Subtarget &Subtarget) {
44898 // Bail without SSE41.
44899 if (!Subtarget.hasSSE41())
44900 return SDValue();
44901
44902 EVT ExtractVT = Extract->getValueType(0);
44903 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44904 return SDValue();
44905
44906 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44907 ISD::NodeType BinOp;
44908 SDValue Src = DAG.matchBinOpReduction(
44909 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44910 if (!Src)
44911 return SDValue();
44912
44913 EVT SrcVT = Src.getValueType();
44914 EVT SrcSVT = SrcVT.getScalarType();
44915 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44916 return SDValue();
44917
44918 SDLoc DL(Extract);
44919 SDValue MinPos = Src;
44920
44921 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44922 while (SrcVT.getSizeInBits() > 128) {
44923 SDValue Lo, Hi;
44924 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44925 SrcVT = Lo.getValueType();
44926 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44927 }
44928 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44929 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44930 "Unexpected value type");
44931
44932 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
44933 // to flip the value accordingly.
44934 SDValue Mask;
44935 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44936 if (BinOp == ISD::SMAX)
44937 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44938 else if (BinOp == ISD::SMIN)
44939 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44940 else if (BinOp == ISD::UMAX)
44941 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44942
44943 if (Mask)
44944 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
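// Illustrative note (editorial, not part of the LLVM source): the XOR masks
// map the other reductions onto the unsigned-min that PHMINPOSUW computes;
// e.g. for i16 elements:
//   SMIN: x ^ 0x8000 flips the sign bit, so signed order becomes unsigned.
//   SMAX: x ^ 0x7FFF keeps the sign bit and inverts the magnitude bits,
//         mapping a signed max onto an unsigned min.
//   UMAX: x ^ 0xFFFF (a NOT) turns an unsigned max into an unsigned min.
// The matching XOR after the PHMINPOS below undoes the mapping on the result.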
44945
44946 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44947 // shuffling each upper element down and inserting zeros. This means that the
44948 // v16i8 UMIN will leave the upper byte of each pair as zero, performing the
44949 // zero-extension ready for the PHMINPOS.
44950 if (ExtractVT == MVT::i8) {
44951 SDValue Upper = DAG.getVectorShuffle(
44952 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44953 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44954 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44955 }
44956
44957 // Perform the PHMINPOS on a v8i16 vector.
44958 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44959 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44960 MinPos = DAG.getBitcast(SrcVT, MinPos);
44961
44962 if (Mask)
44963 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44964
44965 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
44966 DAG.getIntPtrConstant(0, DL));
44967}
44968
44969// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
44970static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
44971 const X86Subtarget &Subtarget) {
44972 // Bail without SSE2.
44973 if (!Subtarget.hasSSE2())
44974 return SDValue();
44975
44976 EVT ExtractVT = Extract->getValueType(0);
44977 unsigned BitWidth = ExtractVT.getSizeInBits();
44978 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
44979 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
44980 return SDValue();
44981
44982 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
44983 ISD::NodeType BinOp;
44984 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
44985 if (!Match && ExtractVT == MVT::i1)
44986 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
44987 if (!Match)
44988 return SDValue();
44989
44990 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
44991 // which we can't support here for now.
44992 if (Match.getScalarValueSizeInBits() != BitWidth)
44993 return SDValue();
44994
44995 SDValue Movmsk;
44996 SDLoc DL(Extract);
44997 EVT MatchVT = Match.getValueType();
44998 unsigned NumElts = MatchVT.getVectorNumElements();
44999 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45000 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45001 LLVMContext &Ctx = *DAG.getContext();
45002
45003 if (ExtractVT == MVT::i1) {
45004 // Special case for (pre-legalization) vXi1 reductions.
45005 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45006 return SDValue();
45007 if (Match.getOpcode() == ISD::SETCC) {
45008 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45009 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45010 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45011 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45012 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45013 X86::CondCode X86CC;
45014 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45015 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45016 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45017 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45018 DAG, X86CC))
45019 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45020 getSETCC(X86CC, V, DL, DAG));
45021 }
45022 }
45023 if (TLI.isTypeLegal(MatchVT)) {
45024 // If this is a legal AVX512 predicate type then we can just bitcast.
45025 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45026 Movmsk = DAG.getBitcast(MovmskVT, Match);
45027 } else {
45028 // Use combineBitcastvxi1 to create the MOVMSK.
45029 while (NumElts > MaxElts) {
45030 SDValue Lo, Hi;
45031 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45032 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45033 NumElts /= 2;
45034 }
45035 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45036 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45037 }
45038 if (!Movmsk)
45039 return SDValue();
45040 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45041 } else {
45042 // FIXME: Better handling of k-registers or 512-bit vectors?
45043 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45044 if (!(MatchSizeInBits == 128 ||
45045 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45046 return SDValue();
45047
45048 // Make sure this isn't a vector of 1 element. The perf win from using
45050 // MOVMSK diminishes with fewer elements in the reduction, but it is
45050 // generally better to get the comparison over to the GPRs as soon as
45051 // possible to reduce the number of vector ops.
45052 if (Match.getValueType().getVectorNumElements() < 2)
45053 return SDValue();
45054
45055 // Check that we are extracting a reduction of all sign bits.
45056 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45057 return SDValue();
45058
45059 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45060 SDValue Lo, Hi;
45061 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45062 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45063 MatchSizeInBits = Match.getValueSizeInBits();
45064 }
45065
45066 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45067 MVT MaskSrcVT;
45068 if (64 == BitWidth || 32 == BitWidth)
45069 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45070 MatchSizeInBits / BitWidth);
45071 else
45072 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45073
45074 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45075 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45076 NumElts = MaskSrcVT.getVectorNumElements();
45077 }
45078 assert((NumElts <= 32 || NumElts == 64) &&
45079 "Not expecting more than 64 elements");
45080
45081 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45082 if (BinOp == ISD::XOR) {
45083 // parity -> (PARITY(MOVMSK X))
45084 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45085 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45086 }
45087
45088 SDValue CmpC;
45089 ISD::CondCode CondCode;
45090 if (BinOp == ISD::OR) {
45091 // any_of -> MOVMSK != 0
45092 CmpC = DAG.getConstant(0, DL, CmpVT);
45093 CondCode = ISD::CondCode::SETNE;
45094 } else {
45095 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45096 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45097 DL, CmpVT);
45098 CondCode = ISD::CondCode::SETEQ;
45099 }
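// Illustrative example (editorial, not part of the LLVM source): for a v16i8
// compare input, MOVMSK yields one bit per element, so the reductions become:
//   any_of: MOVMSK != 0
//   all_of: MOVMSK == (1 << 16) - 1 = 0xFFFF
// which is exactly the APInt::getLowBitsSet(32, 16) constant built above.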
45100
45101 // The setcc produces an i8 of 0/1, so extend that to the result width and
45102 // negate to get the final 0/-1 mask value.
45103 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45104 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45105 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45106 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
45107 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
45108}
45109
45110static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45111 const X86Subtarget &Subtarget) {
45112 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45113 return SDValue();
45114
45115 EVT ExtractVT = Extract->getValueType(0);
45116 // Verify the type we're extracting is i32, as the output element type of
45117 // vpdpbusd is i32.
45118 if (ExtractVT != MVT::i32)
45119 return SDValue();
45120
45121 EVT VT = Extract->getOperand(0).getValueType();
45122 if (!isPowerOf2_32(VT.getVectorNumElements()))
45123 return SDValue();
45124
45125 // Match shuffle + add pyramid.
45126 ISD::NodeType BinOp;
45127 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45128
45129 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45130 // done by vpdpbusd computes a signed 16-bit product that will be sign-extended
45131 // before adding into the accumulator.
45132 // TODO:
45133 // We also need to verify that the multiply has at least 2x the number of bits
45134 // of the input. We shouldn't match
45135 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
45136 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45137 // Root = Root.getOperand(0);
45138
45139 // If there was a match, we want Root to be a mul.
45140 if (!Root || Root.getOpcode() != ISD::MUL)
45141 return SDValue();
45142
45143 // Check whether we have an extend and mul pattern
45144 SDValue LHS, RHS;
45145 if (!detectExtMul(DAG, Root, LHS, RHS))
45146 return SDValue();
45147
45148 // Create the dot product instruction.
45149 SDLoc DL(Extract);
45150 unsigned StageBias;
45151 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45152
45153 // If the original vector was wider than 4 elements, sum over the results
45154 // in the DP vector.
45155 unsigned Stages = Log2_32(VT.getVectorNumElements());
45156 EVT DpVT = DP.getValueType();
45157
45158 if (Stages > StageBias) {
45159 unsigned DpElems = DpVT.getVectorNumElements();
45160
45161 for (unsigned i = Stages - StageBias; i > 0; --i) {
45162 SmallVector<int, 16> Mask(DpElems, -1);
45163 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45164 Mask[j] = MaskEnd + j;
45165
45166 SDValue Shuffle =
45167 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45168 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45169 }
45170 }
45171
45172 // Return the lowest ExtractSizeInBits bits.
45173 EVT ResVT =
45174 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45175 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45176 DP = DAG.getBitcast(ResVT, DP);
45177 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45178 Extract->getOperand(1));
45179}
45180
45181static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45182 const X86Subtarget &Subtarget) {
45183 // PSADBW is only supported on SSE2 and up.
45184 if (!Subtarget.hasSSE2())
45185 return SDValue();
45186
45187 EVT ExtractVT = Extract->getValueType(0);
45188 // Verify the type we're extracting is either i32 or i64.
45189 // FIXME: Could support other types, but this is what we have coverage for.
45190 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45191 return SDValue();
45192
45193 EVT VT = Extract->getOperand(0).getValueType();
45194 if (!isPowerOf2_32(VT.getVectorNumElements()))
45195 return SDValue();
45196
45197 // Match shuffle + add pyramid.
45198 ISD::NodeType BinOp;
45199 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45200
45201 // The operand is expected to be zero-extended from i8
45202 // (verified in detectZextAbsDiff).
45203 // To convert to i64 and above, an additional any/zero/sign
45204 // extend is expected.
45205 // The zero extend from 32 bits has no mathematical effect on the result.
45206 // The sign extend also behaves like a zero extend here
45207 // (it extends the sign bit, which is zero).
45208 // So it is correct to skip the sign/zero extend instruction.
45209 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45210 Root.getOpcode() == ISD::ZERO_EXTEND ||
45211 Root.getOpcode() == ISD::ANY_EXTEND))
45212 Root = Root.getOperand(0);
45213
45214 // If there was a match, we want Root to be the ABS at the root of an
45215 // abs-diff pattern.
45216 if (!Root || Root.getOpcode() != ISD::ABS)
45217 return SDValue();
45218
45219 // Check whether we have an abs-diff pattern feeding into the ABS.
45220 SDValue Zext0, Zext1;
45221 if (!detectZextAbsDiff(Root, Zext0, Zext1))
45222 return SDValue();
45223
45224 // Create the SAD instruction.
45225 SDLoc DL(Extract);
45226 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45227
45228 // If the original vector was wider than 8 elements, sum over the results
45229 // in the SAD vector.
45230 unsigned Stages = Log2_32(VT.getVectorNumElements());
45231 EVT SadVT = SAD.getValueType();
45232 if (Stages > 3) {
45233 unsigned SadElems = SadVT.getVectorNumElements();
45234
45235 for(unsigned i = Stages - 3; i > 0; --i) {
45236 SmallVector<int, 16> Mask(SadElems, -1);
45237 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45238 Mask[j] = MaskEnd + j;
45239
45240 SDValue Shuffle =
45241 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45242 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45243 }
45244 }
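// Illustrative example (editorial, not part of the LLVM source): assuming AVX2
// and 32-element vectors (v32i8 abs-diff inputs), the PSADBW result is v4i64
// and Stages = log2(32) = 5, so the loop runs for i = 2 and i = 1 with masks
// {2,3,-1,-1} and {1,-1,-1,-1}, folding the upper partial sums into element 0
// before the final extract.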
45245
45246 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45247 // Return the lowest ExtractSizeInBits bits.
45248 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45249 SadVT.getSizeInBits() / ExtractSizeInBits);
45250 SAD = DAG.getBitcast(ResVT, SAD);
45251 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45252 Extract->getOperand(1));
45253}
45254
45255// Attempt to peek through a target shuffle and extract the scalar from the
45256// source.
45257static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45258 TargetLowering::DAGCombinerInfo &DCI,
45259 const X86Subtarget &Subtarget) {
45260 if (DCI.isBeforeLegalizeOps())
45261 return SDValue();
45262
45263 SDLoc dl(N);
45264 SDValue Src = N->getOperand(0);
45265 SDValue Idx = N->getOperand(1);
45266
45267 EVT VT = N->getValueType(0);
45268 EVT SrcVT = Src.getValueType();
45269 EVT SrcSVT = SrcVT.getVectorElementType();
45270 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45271 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45272
45273 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45274 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45275 return SDValue();
45276
45277 const APInt &IdxC = N->getConstantOperandAPInt(1);
45278 if (IdxC.uge(NumSrcElts))
45279 return SDValue();
45280
45281 SDValue SrcBC = peekThroughBitcasts(Src);
45282
45283 // Handle extract(bitcast(broadcast(scalar_value))).
45284 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45285 SDValue SrcOp = SrcBC.getOperand(0);
45286 EVT SrcOpVT = SrcOp.getValueType();
45287 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45288 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45289 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45290 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45291 // TODO support non-zero offsets.
45292 if (Offset == 0) {
45293 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45294 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45295 return SrcOp;
45296 }
45297 }
45298 }
45299
45300 // If we're extracting a single element from a broadcast load and there are
45301 // no other users, just create a single load.
45302 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45303 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45304 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45305 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45306 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45307 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45308 MemIntr->getBasePtr(),
45309 MemIntr->getPointerInfo(),
45310 MemIntr->getOriginalAlign(),
45311 MemIntr->getMemOperand()->getFlags());
45312 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45313 return Load;
45314 }
45315 }
45316
45317 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45318 // TODO: Move to DAGCombine?
45319 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45320 SrcBC.getValueType().isInteger() &&
45321 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45322 SrcBC.getScalarValueSizeInBits() ==
45323 SrcBC.getOperand(0).getValueSizeInBits()) {
45324 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45325 if (IdxC.ult(Scale)) {
45326 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45327 SDValue Scl = SrcBC.getOperand(0);
45328 EVT SclVT = Scl.getValueType();
45329 if (Offset) {
45330 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45331 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45332 }
45333 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45334 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45335 return Scl;
45336 }
45337 }
45338
45339 // Handle extract(truncate(x)) for 0'th index.
45340 // TODO: Treat this as a faux shuffle?
45341 // TODO: When can we use this for general indices?
45342 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45343 (SrcVT.getSizeInBits() % 128) == 0) {
45344 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45345 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45346 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45347 Idx);
45348 }
45349
45350 // We can only legally extract other elements from 128-bit vectors, and only
45351 // in certain circumstances depending on the SSE level.
45352 // TODO: Investigate float/double extraction if it will be just stored.
45353 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45354 unsigned Idx) {
45355 EVT VecSVT = VecVT.getScalarType();
45356 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45357 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45358 VecSVT == MVT::i64)) {
45359 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45360 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45361 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45362 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45363 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45364 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45365 Idx &= (NumEltsPerLane - 1);
45366 }
45367 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45368 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45369 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45370 DAG.getBitcast(VecVT, Vec),
45371 DAG.getIntPtrConstant(Idx, dl));
45372 }
45373 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45374 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45375 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45376 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45377 DAG.getTargetConstant(Idx, dl, MVT::i8));
45378 }
45379 return SDValue();
45380 };
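// Illustrative example (editorial, not part of the LLVM source): assuming a
// 256-bit v16i16 source (with Vec also using 16-bit elements) and Idx = 9:
// NumEltsPerLane = 8, so the upper 128-bit lane is extracted, Idx becomes
// 9 & 7 = 1, and the element is then read from the v8i16 lane with PEXTRW.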
45381
45382 // Resolve the target shuffle inputs and mask.
45383 SmallVector<int, 16> Mask;
45384 SmallVector<SDValue, 2> Ops;
45385 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45386 return SDValue();
45387
45388 // Shuffle inputs must be the same size as the result.
45389 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
45390 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
45391 }))
45392 return SDValue();
45393
45394 // Attempt to narrow/widen the shuffle mask to the correct size.
45395 if (Mask.size() != NumSrcElts) {
45396 if ((NumSrcElts % Mask.size()) == 0) {
45397 SmallVector<int, 16> ScaledMask;
45398 int Scale = NumSrcElts / Mask.size();
45399 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
45400 Mask = std::move(ScaledMask);
45401 } else if ((Mask.size() % NumSrcElts) == 0) {
45402 // Simplify Mask based on demanded element.
45403 int ExtractIdx = (int)IdxC.getZExtValue();
45404 int Scale = Mask.size() / NumSrcElts;
45405 int Lo = Scale * ExtractIdx;
45406 int Hi = Scale * (ExtractIdx + 1);
45407 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
45408 if (i < Lo || Hi <= i)
45409 Mask[i] = SM_SentinelUndef;
45410
45411 SmallVector<int, 16> WidenedMask;
45412 while (Mask.size() > NumSrcElts &&
45413 canWidenShuffleElements(Mask, WidenedMask))
45414 Mask = std::move(WidenedMask);
45415 }
45416 }
45417
45418 // If narrowing/widening failed, see if we can extract+zero-extend.
45419 int ExtractIdx;
45420 EVT ExtractVT;
45421 if (Mask.size() == NumSrcElts) {
45422 ExtractIdx = Mask[IdxC.getZExtValue()];
45423 ExtractVT = SrcVT;
45424 } else {
45425 unsigned Scale = Mask.size() / NumSrcElts;
45426 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
45427 return SDValue();
45428 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
45429 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
45430 return SDValue();
45431 ExtractIdx = Mask[ScaledIdx];
45432 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
45433 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
45434 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
45435 "Failed to widen vector type");
45436 }
45437
45438 // If the shuffle source element is undef/zero then we can just accept it.
45439 if (ExtractIdx == SM_SentinelUndef)
45440 return DAG.getUNDEF(VT);
45441
45442 if (ExtractIdx == SM_SentinelZero)
45443 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
45444 : DAG.getConstant(0, dl, VT);
45445
45446 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
45447 ExtractIdx = ExtractIdx % Mask.size();
45448 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
45449 return DAG.getZExtOrTrunc(V, dl, VT);
45450
45451 return SDValue();
45452}
45453
45454/// Extracting a scalar FP value from vector element 0 is free, so extract each
45455/// operand first, then perform the math as a scalar op.
45456static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
45457 const X86Subtarget &Subtarget) {
45458 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
45459 SDValue Vec = ExtElt->getOperand(0);
45460 SDValue Index = ExtElt->getOperand(1);
45461 EVT VT = ExtElt->getValueType(0);
45462 EVT VecVT = Vec.getValueType();
45463
45464 // TODO: If this is a unary/expensive/expand op, allow extraction from a
45465 // non-zero element because the shuffle+scalar op will be cheaper?
45466 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
45467 return SDValue();
45468
45469 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
45470 // extract, the condition code), so deal with those as a special-case.
45471 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
45472 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
45473 if (OpVT != MVT::f32 && OpVT != MVT::f64)
45474 return SDValue();
45475
45476 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
45477 SDLoc DL(ExtElt);
45478 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45479 Vec.getOperand(0), Index);
45480 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45481 Vec.getOperand(1), Index);
45482 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
45483 }
45484
45485 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
45486 VT != MVT::f64)
45487 return SDValue();
45488
45489 // Vector FP selects don't fit the pattern of FP math ops (because the
45490 // condition has a different type and we have to change the opcode), so deal
45491 // with those here.
45492 // FIXME: This is restricted to pre type legalization by ensuring the setcc
45493 // has i1 elements. If we loosen this we need to convert vector bool to a
45494 // scalar bool.
45495 if (Vec.getOpcode() == ISD::VSELECT &&
45496 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
45497 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
45498 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
45499 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
45500 SDLoc DL(ExtElt);
45501 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
45502 Vec.getOperand(0).getValueType().getScalarType(),
45503 Vec.getOperand(0), Index);
45504 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45505 Vec.getOperand(1), Index);
45506 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45507 Vec.getOperand(2), Index);
45508 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
45509 }
45510
45511 // TODO: This switch could include FNEG and the x86-specific FP logic ops
45512 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
45513 // missed load folding and fma+fneg combining.
45514 switch (Vec.getOpcode()) {
45515 case ISD::FMA: // Begin 3 operands
45516 case ISD::FMAD:
45517 case ISD::FADD: // Begin 2 operands
45518 case ISD::FSUB:
45519 case ISD::FMUL:
45520 case ISD::FDIV:
45521 case ISD::FREM:
45522 case ISD::FCOPYSIGN:
45523 case ISD::FMINNUM:
45524 case ISD::FMAXNUM:
45525 case ISD::FMINNUM_IEEE:
45526 case ISD::FMAXNUM_IEEE:
45527 case ISD::FMAXIMUM:
45528 case ISD::FMINIMUM:
45529 case X86ISD::FMAX:
45530 case X86ISD::FMIN:
45531 case ISD::FABS: // Begin 1 operand
45532 case ISD::FSQRT:
45533 case ISD::FRINT:
45534 case ISD::FCEIL:
45535 case ISD::FTRUNC:
45536 case ISD::FNEARBYINT:
45537 case ISD::FROUND:
45538 case ISD::FFLOOR:
45539 case X86ISD::FRCP:
45540 case X86ISD::FRSQRT: {
45541 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
45542 SDLoc DL(ExtElt);
45543 SmallVector<SDValue, 4> ExtOps;
45544 for (SDValue Op : Vec->ops())
45545 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
45546 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
45547 }
45548 default:
45549 return SDValue();
45550 }
45551 llvm_unreachable("All opcodes should return within switch");
45552}
45553
45554/// Try to convert a vector reduction sequence composed of binops and shuffles
45555/// into horizontal ops.
45556static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
45557 const X86Subtarget &Subtarget) {
45558 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45559
45560 // We need at least SSE2 to do anything here.
45561 if (!Subtarget.hasSSE2())
45562 return SDValue();
45563
45564 ISD::NodeType Opc;
45565 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
45566 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
45567 if (!Rdx)
45568 return SDValue();
45569
45570 SDValue Index = ExtElt->getOperand(1);
45571 assert(isNullConstant(Index) &&
45572 "Reduction doesn't end in an extract from index 0");
45573
45574 EVT VT = ExtElt->getValueType(0);
45575 EVT VecVT = Rdx.getValueType();
45576 if (VecVT.getScalarType() != VT)
45577 return SDValue();
45578
45579 SDLoc DL(ExtElt);
45580 unsigned NumElts = VecVT.getVectorNumElements();
45581 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45582
45583 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45584 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45585 if (V.getValueType() == MVT::v4i8) {
45586 if (ZeroExtend && Subtarget.hasSSE41()) {
45587 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45588 DAG.getConstant(0, DL, MVT::v4i32),
45589 DAG.getBitcast(MVT::i32, V),
45590 DAG.getIntPtrConstant(0, DL));
45591 return DAG.getBitcast(MVT::v16i8, V);
45592 }
45593 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45594 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45595 : DAG.getUNDEF(MVT::v4i8));
45596 }
45597 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45598 DAG.getUNDEF(MVT::v8i8));
45599 };
45600
45601 // vXi8 mul reduction - promote to vXi16 mul reduction.
45602 if (Opc == ISD::MUL) {
45603 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45604 return SDValue();
45605 if (VecVT.getSizeInBits() >= 128) {
45606 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45607 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45608 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45609 Lo = DAG.getBitcast(WideVT, Lo);
45610 Hi = DAG.getBitcast(WideVT, Hi);
45611 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45612 while (Rdx.getValueSizeInBits() > 128) {
45613 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45614 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45615 }
45616 } else {
45617 Rdx = WidenToV16I8(Rdx, false);
45618 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45619 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45620 }
45621 if (NumElts >= 8)
45622 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45623 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45624 {4, 5, 6, 7, -1, -1, -1, -1}));
45625 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45626 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45627 {2, 3, -1, -1, -1, -1, -1, -1}));
45628 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45629 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45630 {1, -1, -1, -1, -1, -1, -1, -1}));
45631 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45632 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45633 }
45634
45635 // vXi8 add reduction - sub-128-bit vector.
45636 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45637 Rdx = WidenToV16I8(Rdx, true);
45638 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45639 DAG.getConstant(0, DL, MVT::v16i8));
45640 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45641 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45642 }
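// Illustrative note (editorial, not part of the LLVM source): PSADBW against
// an all-zero vector computes the sum of absolute differences |x - 0| = x per
// 8-byte group, i.e. a horizontal byte sum into each i64 lane, which is what
// makes it usable as a small vXi8 ADD reduction here.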
45643
45644 // Must be a >=128-bit vector with pow2 elements.
45645 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45646 return SDValue();
45647
45648 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45649 if (VT == MVT::i8) {
45650 while (Rdx.getValueSizeInBits() > 128) {
45651 SDValue Lo, Hi;
45652 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45653 VecVT = Lo.getValueType();
45654 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45655 }
45656 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45657
45658 SDValue Hi = DAG.getVectorShuffle(
45659 MVT::v16i8, DL, Rdx, Rdx,
45660 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45661 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45662 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45663 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45664 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45665 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45666 }
45667
45668 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45669 // If the source vector values are 0-255, then we can use PSADBW to
45670 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45671 // TODO: See if it's worth avoiding vXi16/i32 truncations?
45672 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45673 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45674 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45675 Subtarget.hasAVX512())) {
45676 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45677 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45678 if (ByteVT.getSizeInBits() < 128)
45679 Rdx = WidenToV16I8(Rdx, true);
45680
45681 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45682 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45683 ArrayRef<SDValue> Ops) {
45684 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45685 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45686 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45687 };
45688 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45689 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45690
45691 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45692 while (Rdx.getValueSizeInBits() > 128) {
45693 SDValue Lo, Hi;
45694 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45695 VecVT = Lo.getValueType();
45696 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45697 }
45698 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45699
45700 if (NumElts > 8) {
45701 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45702 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45703 }
45704
45705 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45706 Rdx = DAG.getBitcast(VecVT, Rdx);
45707 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45708 }
45709
45710 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
45711 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45712 return SDValue();
45713
45714 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45715
45716 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45717 // across the whole vector, so we need an extract + hop preliminary stage.
45718 // This is the only step where the operands of the hop are not the same value.
45719 // TODO: We could extend this to handle 512-bit or even longer vectors.
45720 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
45721 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
45722 unsigned NumElts = VecVT.getVectorNumElements();
45723 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
45724 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
45725 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
45726 VecVT = Rdx.getValueType();
45727 }
45728 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
45729 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
45730 return SDValue();
45731
45732 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
45733 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
45734 for (unsigned i = 0; i != ReductionSteps; ++i)
45735 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
45736
45737 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45738}
45739
45740/// Detect vector gather/scatter index generation and convert it from being a
45741/// bunch of shuffles and extracts into a somewhat faster sequence.
45742/// For i686, the best sequence is apparently storing the value and loading
45743/// scalars back, while for x64 we should use 64-bit extracts and shifts.
45744static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
45745 TargetLowering::DAGCombinerInfo &DCI,
45746 const X86Subtarget &Subtarget) {
45747 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
45748 return NewOp;
45749
45750 SDValue InputVector = N->getOperand(0);
45751 SDValue EltIdx = N->getOperand(1);
45752 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
45753
45754 EVT SrcVT = InputVector.getValueType();
45755 EVT VT = N->getValueType(0);
45756 SDLoc dl(InputVector);
45757 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45758 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45759 unsigned NumEltBits = VT.getScalarSizeInBits();
45760 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45761
45762 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45763 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45764
45765 // Integer Constant Folding.
45766 if (CIdx && VT.isInteger()) {
45767 APInt UndefVecElts;
45768 SmallVector<APInt, 16> EltBits;
45769 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45770 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45771 EltBits, true, false)) {
45772 uint64_t Idx = CIdx->getZExtValue();
45773 if (UndefVecElts[Idx])
45774 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45775 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45776 }
45777
45778 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45779 // Improves lowering of bool masks in Rust, which splits them into a byte array.
45780 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45781 SDValue Src = peekThroughBitcasts(InputVector);
45782 if (Src.getValueType().getScalarType() == MVT::i1 &&
45783 TLI.isTypeLegal(Src.getValueType())) {
45784 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45785 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45786 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45787 return DAG.getBitcast(VT, Sub);
45788 }
45789 }
45790 }
45791
45792 if (IsPextr) {
45793 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45794 DCI))
45795 return SDValue(N, 0);
45796
45797 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45798 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45799 InputVector.getOpcode() == X86ISD::PINSRW) &&
45800 InputVector.getOperand(2) == EltIdx) {
45801 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45802 "Vector type mismatch");
45803 SDValue Scl = InputVector.getOperand(1);
45804 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45805 return DAG.getZExtOrTrunc(Scl, dl, VT);
45806 }
45807
45808 // TODO - Remove this once we can handle the implicit zero-extension of
45809 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45810 // combineBasicSADPattern.
45811 return SDValue();
45812 }
45813
45814 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
45815 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
45816 InputVector.getOpcode() == ISD::BITCAST &&
45817 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45818 isNullConstant(EltIdx) && InputVector.hasOneUse())
45819 return DAG.getBitcast(VT, InputVector);
45820
45821 // Detect mmx to i32 conversion through a v2i32 elt extract.
45822 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
45823 InputVector.getOpcode() == ISD::BITCAST &&
45824 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45825 isNullConstant(EltIdx) && InputVector.hasOneUse())
45826 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
45827 InputVector.getOperand(0));
45828
45829 // Check whether this extract is the root of a sum of absolute differences
45830 // pattern. This has to be done here because we really want it to happen
45831 // pre-legalization.
45832 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45833 return SAD;
45834
45835 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45836 return VPDPBUSD;
45837
45838 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45839 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45840 return Cmp;
45841
45842 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45843 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45844 return MinMax;
45845
45846 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
45847 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45848 return V;
45849
45850 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45851 return V;
45852
45853 // Attempt to extract an i1 element by using MOVMSK to extract the sign bits
45854 // and then testing the relevant element.
45855 //
45856 // Note that we only combine extracts on the *same* result number, i.e.
45857 // t0 = merge_values a0, a1, a2, a3
45858 // i1 = extract_vector_elt t0, Constant:i64<2>
45859 // i1 = extract_vector_elt t0, Constant:i64<3>
45860 // but not
45861 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45862 // since the latter would need its own MOVMSK.
45863 if (SrcVT.getScalarType() == MVT::i1) {
45864 bool IsVar = !CIdx;
45865 SmallVector<SDNode *, 16> BoolExtracts;
45866 unsigned ResNo = InputVector.getResNo();
45867 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45868 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45869 Use->getOperand(0).getResNo() == ResNo &&
45870 Use->getValueType(0) == MVT::i1) {
45871 BoolExtracts.push_back(Use);
45872 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45873 return true;
45874 }
45875 return false;
45876 };
45877 // TODO: Can we drop the oneuse check for constant extracts?
45878 if (all_of(InputVector->uses(), IsBoolExtract) &&
45879 (IsVar || BoolExtracts.size() > 1)) {
45880 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45881 if (SDValue BC =
45882 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45883 for (SDNode *Use : BoolExtracts) {
45884 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45885 // Mask = 1 << MaskIdx
45886 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45887 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45888 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45889 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45890 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45891 DCI.CombineTo(Use, Res);
45892 }
45893 return SDValue(N, 0);
45894 }
45895 }
45896 }
45897
45898 // If this extract is from a loaded vector value and will be used as an
45899 // integer, that requires a potentially expensive XMM -> GPR transfer.
45900 // Additionally, if we can convert to a scalar integer load, that will likely
45901 // be folded into a subsequent integer op.
45902 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
45903 // to a single use of the loaded vector. For the reasons above, we
45904 // expect this to be profitable even if it creates an extra load.
45905 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
45906 return Use->getOpcode() == ISD::STORE ||
45907 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45908 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45909 });
45910 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
45911 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45912 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
45913 !LikelyUsedAsVector && LoadVec->isSimple()) {
45914 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45915 SDValue NewPtr =
45916 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
45917 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
45918 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45919 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45920 SDValue Load =
45921 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45922 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45923 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45924 return Load;
45925 }
45926
45927 return SDValue();
45928}
45929
45930// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45931// This is more or less the reverse of combineBitcastvxi1.
45932static SDValue combineToExtendBoolVectorInReg(
45933 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45934 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45935 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45936 Opcode != ISD::ANY_EXTEND)
45937 return SDValue();
45938 if (!DCI.isBeforeLegalizeOps())
45939 return SDValue();
45940 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45941 return SDValue();
45942
45943 EVT SVT = VT.getScalarType();
45944 EVT InSVT = N0.getValueType().getScalarType();
45945 unsigned EltSizeInBits = SVT.getSizeInBits();
45946
45947 // Input type must be extending a bool vector (bit-casted from a scalar
45948 // integer) to legal integer types.
45949 if (!VT.isVector())
45950 return SDValue();
45951 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45952 return SDValue();
45953 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45954 return SDValue();
45955
45956 SDValue N00 = N0.getOperand(0);
45957 EVT SclVT = N00.getValueType();
45958 if (!SclVT.isScalarInteger())
45959 return SDValue();
45960
45961 SDValue Vec;
45962 SmallVector<int> ShuffleMask;
45963 unsigned NumElts = VT.getVectorNumElements();
45964 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45965
45966 // Broadcast the scalar integer to the vector elements.
45967 if (NumElts > EltSizeInBits) {
45968 // If the scalar integer is greater than the vector element size, then we
45969 // must split it down into sub-sections for broadcasting. For example:
45970 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45971 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45972 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
45973 unsigned Scale = NumElts / EltSizeInBits;
45974 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
45975 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45976 Vec = DAG.getBitcast(VT, Vec);
45977
45978 for (unsigned i = 0; i != Scale; ++i)
45979 ShuffleMask.append(EltSizeInBits, i);
45980 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45981 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
45982 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
45983 // If we have register broadcast instructions, use the scalar size as the
45984 // element type for the shuffle. Then cast to the wider element type. The
45985 // widened bits won't be used, and this might allow the use of a broadcast
45986 // load.
45987 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
45988 unsigned Scale = EltSizeInBits / NumElts;
45989 EVT BroadcastVT =
45990 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
45991 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45992 ShuffleMask.append(NumElts * Scale, 0);
45993 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
45994 Vec = DAG.getBitcast(VT, Vec);
45995 } else {
45996 // For a smaller scalar integer, we can simply any-extend it to the vector
45997 // element size (we don't care about the upper bits) and broadcast it to all
45998 // elements.
45999 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46000 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46001 ShuffleMask.append(NumElts, 0);
46002 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46003 }
46004
46005 // Now, mask the relevant bit in each element.
46006 SmallVector<SDValue, 32> Bits;
46007 for (unsigned i = 0; i != NumElts; ++i) {
46008 int BitIdx = (i % EltSizeInBits);
46009 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46010 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46011 }
46012 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46013 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46014
46015 // Compare against the bitmask and extend the result.
46016 EVT CCVT = VT.changeVectorElementType(MVT::i1);
46017 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46018 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46019
46020 // For SEXT, this is now done, otherwise shift the result down for
46021 // zero-extension.
46022 if (Opcode == ISD::SIGN_EXTEND)
46023 return Vec;
46024 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46025 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46026}
46027
46028/// If a vector select has an operand that is -1 or 0, try to simplify the
46029/// select to a bitwise logic operation.
46030/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46031static SDValue
46032combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
46033 TargetLowering::DAGCombinerInfo &DCI,
46034 const X86Subtarget &Subtarget) {
46035 SDValue Cond = N->getOperand(0);
46036 SDValue LHS = N->getOperand(1);
46037 SDValue RHS = N->getOperand(2);
46038 EVT VT = LHS.getValueType();
46039 EVT CondVT = Cond.getValueType();
46040 SDLoc DL(N);
46041 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46042
46043 if (N->getOpcode() != ISD::VSELECT)
46044 return SDValue();
46045
46046 assert(CondVT.isVector() && "Vector select expects a vector selector!");
46047
46048 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46049 // TODO: Can we assert that both operands are not zeros (because that should
46050 // get simplified at node creation time)?
46051 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46052 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46053
46054 // If both inputs are 0/undef, create a complete zero vector.
46055 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46056 if (TValIsAllZeros && FValIsAllZeros) {
46057 if (VT.isFloatingPoint())
46058 return DAG.getConstantFP(0.0, DL, VT);
46059 return DAG.getConstant(0, DL, VT);
46060 }
46061
46062 // To use the condition operand as a bitwise mask, it must have elements that
46063 // are the same size as the select elements. Ie, the condition operand must
46064 // have already been promoted from the IR select condition type <N x i1>.
46065 // Don't check if the types themselves are equal because that excludes
46066 // vector floating-point selects.
46067 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46068 return SDValue();
46069
46070 // Try to invert the condition if true value is not all 1s and false value is
46071 // not all 0s. Only do this if the condition has one use.
46072 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46073 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46074 // Check if the selector will be produced by CMPP*/PCMP*.
46075 Cond.getOpcode() == ISD::SETCC &&
46076 // Check if SETCC has already been promoted.
46077 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46078 CondVT) {
46079 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46080
46081 if (TValIsAllZeros || FValIsAllOnes) {
46082 SDValue CC = Cond.getOperand(2);
46083 ISD::CondCode NewCC = ISD::getSetCCInverse(
46084 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46085 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46086 NewCC);
46087 std::swap(LHS, RHS);
46088 TValIsAllOnes = FValIsAllOnes;
46089 FValIsAllZeros = TValIsAllZeros;
46090 }
46091 }
46092
46093 // Cond value must be 'sign splat' to be converted to a logical op.
46094 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46095 return SDValue();
46096
46097 // vselect Cond, 111..., 000... -> Cond
46098 if (TValIsAllOnes && FValIsAllZeros)
46099 return DAG.getBitcast(VT, Cond);
46100
46101 if (!TLI.isTypeLegal(CondVT))
46102 return SDValue();
46103
46104 // vselect Cond, 111..., X -> or Cond, X
46105 if (TValIsAllOnes) {
46106 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46107 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46108 return DAG.getBitcast(VT, Or);
46109 }
46110
46111 // vselect Cond, X, 000... -> and Cond, X
46112 if (FValIsAllZeros) {
46113 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46114 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46115 return DAG.getBitcast(VT, And);
46116 }
46117
46118 // vselect Cond, 000..., X -> andn Cond, X
46119 if (TValIsAllZeros) {
46120 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46121 SDValue AndN;
46122 // The canonical form differs for i1 vectors - x86andnp is not used
46123 if (CondVT.getScalarType() == MVT::i1)
46124 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46125 CastRHS);
46126 else
46127 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46128 return DAG.getBitcast(VT, AndN);
46129 }
46130
46131 return SDValue();
46132}
46133
46134/// If both arms of a vector select are concatenated vectors, split the select,
46135/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46136/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46137/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46138static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
46139 const X86Subtarget &Subtarget) {
46140 unsigned Opcode = N->getOpcode();
46141 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46142 return SDValue();
46143
46144 // TODO: Split 512-bit vectors too?
46145 EVT VT = N->getValueType(0);
46146 if (!VT.is256BitVector())
46147 return SDValue();
46148
46149 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46150 SDValue Cond = N->getOperand(0);
46151 SDValue TVal = N->getOperand(1);
46152 SDValue FVal = N->getOperand(2);
46153 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
46154 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46155 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
46156 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
46157 return SDValue();
46158
46159 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46160 ArrayRef<SDValue> Ops) {
46161 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46162 };
46163 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
46164 makeBlend, /*CheckBWI*/ false);
46165}
46166
46167static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
46168 SDValue Cond = N->getOperand(0);
46169 SDValue LHS = N->getOperand(1);
46170 SDValue RHS = N->getOperand(2);
46171 SDLoc DL(N);
46172
46173 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46174 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46175 if (!TrueC || !FalseC)
46176 return SDValue();
46177
46178 // Don't do this for crazy integer types.
46179 EVT VT = N->getValueType(0);
46180 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46181 return SDValue();
46182
46183 // We're going to use the condition bit in math or logic ops. We could allow
46184 // this with a wider condition value (post-legalization it becomes an i8),
46185 // but if nothing is creating selects that late, it doesn't matter.
46186 if (Cond.getValueType() != MVT::i1)
46187 return SDValue();
46188
46189 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46190 // 3, 5, or 9 with i32/i64, so those get transformed too.
46191 // TODO: For constants that overflow or do not differ by power-of-2 or small
46192 // multiplier, convert to 'and' + 'add'.
46193 const APInt &TrueVal = TrueC->getAPIntValue();
46194 const APInt &FalseVal = FalseC->getAPIntValue();
46195
46196 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46197 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46198 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46199 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46200 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46201 return SDValue();
46202 }
46203
46204 bool OV;
46205 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46206 if (OV)
46207 return SDValue();
46208
46209 APInt AbsDiff = Diff.abs();
46210 if (AbsDiff.isPowerOf2() ||
46211 ((VT == MVT::i32 || VT == MVT::i64) &&
46212 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46213
46214 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46215 // of the condition can usually be folded into a compare predicate, but even
46216 // without that, the sequence should be cheaper than a CMOV alternative.
46217 if (TrueVal.slt(FalseVal)) {
46218 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46219 std::swap(TrueC, FalseC);
46220 }
46221
46222 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
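// e.g. select Cond, 5, 2 --> (zext(Cond) * 3) + 2, where the multiply by 3
// can lower to an LEA instead of a CMOV (illustrative).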
46223 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46224
46225 // Multiply condition by the difference if non-one.
46226 if (!AbsDiff.isOne())
46227 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46228
46229 // Add the base if non-zero.
46230 if (!FalseC->isZero())
46231 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46232
46233 return R;
46234 }
46235
46236 return SDValue();
46237}
46238
46239/// If this is a *dynamic* select (non-constant condition) and we can match
46240/// this node with one of the variable blend instructions, restructure the
46241/// condition so that blends can use the high (sign) bit of each element.
46242/// This function will also call SimplifyDemandedBits on already created
46243/// BLENDV to perform additional simplifications.
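/// For example (sketch): a vselect whose condition is a lane-wise compare
/// result (all-ones or zero per element) can use BLENDVPS/PBLENDVB, which
/// only inspect the sign bit of each condition element.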
46244static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46245 TargetLowering::DAGCombinerInfo &DCI,
46246 const X86Subtarget &Subtarget) {
46247 SDValue Cond = N->getOperand(0);
46248 if ((N->getOpcode() != ISD::VSELECT &&
46249 N->getOpcode() != X86ISD::BLENDV) ||
46250 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46251 return SDValue();
46252
46253 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46254 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46255 EVT VT = N->getValueType(0);
46256
46257 // We can only handle the cases where VSELECT is directly legal on the
46258 // subtarget. We custom lower VSELECT nodes with constant conditions and
46259 // this makes it hard to see whether a dynamic VSELECT will correctly
46260 // lower, so we both check the operation's status and explicitly handle the
46261 // cases where a *dynamic* blend will fail even though a constant-condition
46262 // blend could be custom lowered.
46263 // FIXME: We should find a better way to handle this class of problems.
46264 // Potentially, we should combine constant-condition vselect nodes
46265 // pre-legalization into shuffles and not mark as many types as custom
46266 // lowered.
46267 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46268 return SDValue();
46269 // FIXME: We don't support i16-element blends currently. We could and
46270 // should support them by making *all* the bits in the condition be set
46271 // rather than just the high bit and using an i8-element blend.
46272 if (VT.getVectorElementType() == MVT::i16)
46273 return SDValue();
46274 // Dynamic blending was only available from SSE4.1 onward.
46275 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46276 return SDValue();
46277 // Byte blends are only available in AVX2
46278 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46279 return SDValue();
46280 // There are no 512-bit blend instructions that use sign bits.
46281 if (VT.is512BitVector())
46282 return SDValue();
46283
46284 // Don't optimize before the condition has been transformed to a legal type
46285 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46286 if (BitWidth < 8 || BitWidth > 64)
46287 return SDValue();
46288
46289 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46290 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
46291 UI != UE; ++UI)
46292 if ((UI->getOpcode() != ISD::VSELECT &&
46293 UI->getOpcode() != X86ISD::BLENDV) ||
46294 UI.getOperandNo() != 0)
46295 return false;
46296
46297 return true;
46298 };
46299
46300 APInt DemandedBits(APInt::getSignMask(BitWidth));
46301
46302 if (OnlyUsedAsSelectCond(Cond)) {
46303 KnownBits Known;
46304 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46305 !DCI.isBeforeLegalizeOps());
46306 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46307 return SDValue();
46308
46309 // If we changed the computation somewhere in the DAG, this change will
46310 // affect all users of Cond. Update all the nodes so that we do not use
46311 // the generic VSELECT anymore. Otherwise, we may perform wrong
46312 // optimizations as we messed with the actual expectation for the vector
46313 // boolean values.
46314 for (SDNode *U : Cond->uses()) {
46315 if (U->getOpcode() == X86ISD::BLENDV)
46316 continue;
46317
46318 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46319 Cond, U->getOperand(1), U->getOperand(2));
46320 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46321 DCI.AddToWorklist(U);
46322 }
46323 DCI.CommitTargetLoweringOpt(TLO);
46324 return SDValue(N, 0);
46325 }
46326
46327 // Otherwise we can still at least try to simplify multiple use bits.
46328 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46329 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
46330 N->getOperand(1), N->getOperand(2));
46331
46332 return SDValue();
46333}
46334
46335// Try to match:
46336// (or (and (M, (sub 0, X)), (pandn M, X)))
46337// which is a special case of:
46338// (select M, (sub 0, X), X)
46339// Per:
46340// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46341// We know that, if fNegate is 0 or 1:
46342// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46343//
46344// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46345// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46346// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46347// This lets us transform our vselect to:
46348// (add (xor X, M), (and M, 1))
46349// And further to:
46350// (sub (xor X, M), M)
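// Sanity check (illustrative): with M = all-ones, (xor X, M) - M = ~X + 1 = -X;
// with M = 0, (xor X, 0) - 0 = X.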
46351static SDValue combineLogicBlendIntoConditionalNegate(
46352 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46353 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46354 EVT MaskVT = Mask.getValueType();
46355 assert(MaskVT.isInteger() &&
46356 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46357 "Mask must be zero/all-bits");
46358
46359 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46360 return SDValue();
46361 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
46362 return SDValue();
46363
46364 auto IsNegV = [](SDNode *N, SDValue V) {
46365 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46366 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46367 };
46368
46369 SDValue V;
46370 if (IsNegV(Y.getNode(), X))
46371 V = X;
46372 else if (IsNegV(X.getNode(), Y))
46373 V = Y;
46374 else
46375 return SDValue();
46376
46377 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46378 SDValue SubOp2 = Mask;
46379
46380 // If the negate was on the false side of the select, then
46381 // the operands of the SUB need to be swapped. PR 27251.
46382 // This is because the pattern being matched above is
46383 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
46384 // but if the pattern matched was
46385 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
46386 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46387 // pattern also needs to be a negation of the replacement pattern above.
46388 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
46389 // sub accomplishes the negation of the replacement pattern.
46390 if (V == Y)
46391 std::swap(SubOp1, SubOp2);
46392
46393 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
46394 return DAG.getBitcast(VT, Res);
46395}
46396
46397/// Do target-specific dag combines on SELECT and VSELECT nodes.
46398static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
46399 TargetLowering::DAGCombinerInfo &DCI,
46400 const X86Subtarget &Subtarget) {
46401 SDLoc DL(N);
46402 SDValue Cond = N->getOperand(0);
46403 SDValue LHS = N->getOperand(1);
46404 SDValue RHS = N->getOperand(2);
46405
46406 // Try simplification again because we use this function to optimize
46407 // BLENDV nodes that are not handled by the generic combiner.
46408 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
46409 return V;
46410
46411 EVT VT = LHS.getValueType();
46412 EVT CondVT = Cond.getValueType();
46413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46414 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
46415
46416 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
46417 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
46418 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
46419 if (CondVT.isVector() && CondVT.isInteger() &&
46420 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
46421 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
46422 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
46423 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
46424 DL, DAG, Subtarget))
46425 return V;
46426
46427 // Convert vselects with constant condition into shuffles.
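// e.g. (vselect <-1,0,-1,0>, X, Y) -> shuffle X, Y, <0,5,2,7> (illustrative).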
46428 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
46429 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
46430 SmallVector<int, 64> Mask;
46431 if (createShuffleMaskFromVSELECT(Mask, Cond,
46432 N->getOpcode() == X86ISD::BLENDV))
46433 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
46434 }
46435
46436 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
46437 // by forcing the unselected elements to zero.
46438 // TODO: Can we handle more shuffles with this?
46439 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
46440 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
46441 LHS.hasOneUse() && RHS.hasOneUse()) {
46442 MVT SimpleVT = VT.getSimpleVT();
46443 SmallVector<SDValue, 1> LHSOps, RHSOps;
46444 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
46445 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
46446 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
46447 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
46448 int NumElts = VT.getVectorNumElements();
46449 for (int i = 0; i != NumElts; ++i) {
46450 // getConstVector sets negative shuffle mask values as undef, so ensure
46451 // we hardcode SM_SentinelZero values to zero (0x80).
46452 if (CondMask[i] < NumElts) {
46453 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
46454 RHSMask[i] = 0x80;
46455 } else {
46456 LHSMask[i] = 0x80;
46457 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
46458 }
46459 }
46460 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
46461 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
46462 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
46463 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
46464 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
46465 }
46466 }
46467
46468 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
46469 // instructions match the semantics of the common C idiom x<y?x:y but not
46470 // x<=y?x:y, because of how they handle negative zero (which can be
46471 // ignored in unsafe-math mode).
46472 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46473 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46474 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
46475 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46476 (Subtarget.hasSSE2() ||
46477 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46478 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46479
46480 unsigned Opcode = 0;
46481 // Check for x CC y ? x : y.
46482 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46483 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46484 switch (CC) {
46485 default: break;
46486 case ISD::SETULT:
46487 // Converting this to a min would handle NaNs incorrectly, and swapping
46488 // the operands would cause it to handle comparisons between positive
46489 // and negative zero incorrectly.
46490 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46491 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46492 !(DAG.isKnownNeverZeroFloat(LHS) ||
46493 DAG.isKnownNeverZeroFloat(RHS)))
46494 break;
46495 std::swap(LHS, RHS);
46496 }
46497 Opcode = X86ISD::FMIN;
46498 break;
46499 case ISD::SETOLE:
46500 // Converting this to a min would handle comparisons between positive
46501 // and negative zero incorrectly.
46502 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46503 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46504 break;
46505 Opcode = X86ISD::FMIN;
46506 break;
46507 case ISD::SETULE:
46508 // Converting this to a min would handle both negative zeros and NaNs
46509 // incorrectly, but we can swap the operands to fix both.
46510 std::swap(LHS, RHS);
46511 [[fallthrough]];
46512 case ISD::SETOLT:
46513 case ISD::SETLT:
46514 case ISD::SETLE:
46515 Opcode = X86ISD::FMIN;
46516 break;
46517
46518 case ISD::SETOGE:
46519 // Converting this to a max would handle comparisons between positive
46520 // and negative zero incorrectly.
46521 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46522 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46523 break;
46524 Opcode = X86ISD::FMAX;
46525 break;
46526 case ISD::SETUGT:
46527 // Converting this to a max would handle NaNs incorrectly, and swapping
46528 // the operands would cause it to handle comparisons between positive
46529 // and negative zero incorrectly.
46530 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46531 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46532 !(DAG.isKnownNeverZeroFloat(LHS) ||
46533 DAG.isKnownNeverZeroFloat(RHS)))
46534 break;
46535 std::swap(LHS, RHS);
46536 }
46537 Opcode = X86ISD::FMAX;
46538 break;
46539 case ISD::SETUGE:
46540 // Converting this to a max would handle both negative zeros and NaNs
46541 // incorrectly, but we can swap the operands to fix both.
46542 std::swap(LHS, RHS);
46543 [[fallthrough]];
46544 case ISD::SETOGT:
46545 case ISD::SETGT:
46546 case ISD::SETGE:
46547 Opcode = X86ISD::FMAX;
46548 break;
46549 }
46550 // Check for x CC y ? y : x -- a min/max with reversed arms.
46551 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46552 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46553 switch (CC) {
46554 default: break;
46555 case ISD::SETOGE:
46556 // Converting this to a min would handle comparisons between positive
46557 // and negative zero incorrectly, and swapping the operands would
46558 // cause it to handle NaNs incorrectly.
46559 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46560 !(DAG.isKnownNeverZeroFloat(LHS) ||
46561 DAG.isKnownNeverZeroFloat(RHS))) {
46562 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46563 break;
46564 std::swap(LHS, RHS);
46565 }
46566 Opcode = X86ISD::FMIN;
46567 break;
46568 case ISD::SETUGT:
46569 // Converting this to a min would handle NaNs incorrectly.
46570 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46571 break;
46572 Opcode = X86ISD::FMIN;
46573 break;
46574 case ISD::SETUGE:
46575 // Converting this to a min would handle both negative zeros and NaNs
46576 // incorrectly, but we can swap the operands to fix both.
46577 std::swap(LHS, RHS);
46578 [[fallthrough]];
46579 case ISD::SETOGT:
46580 case ISD::SETGT:
46581 case ISD::SETGE:
46582 Opcode = X86ISD::FMIN;
46583 break;
46584
46585 case ISD::SETULT:
46586 // Converting this to a max would handle NaNs incorrectly.
46587 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46588 break;
46589 Opcode = X86ISD::FMAX;
46590 break;
46591 case ISD::SETOLE:
46592 // Converting this to a max would handle comparisons between positive
46593 // and negative zero incorrectly, and swapping the operands would
46594 // cause it to handle NaNs incorrectly.
46595 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46596 !DAG.isKnownNeverZeroFloat(LHS) &&
46597 !DAG.isKnownNeverZeroFloat(RHS)) {
46598 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46599 break;
46600 std::swap(LHS, RHS);
46601 }
46602 Opcode = X86ISD::FMAX;
46603 break;
46604 case ISD::SETULE:
46605 // Converting this to a max would handle both negative zeros and NaNs
46606 // incorrectly, but we can swap the operands to fix both.
46607 std::swap(LHS, RHS);
46608 [[fallthrough]];
46609 case ISD::SETOLT:
46610 case ISD::SETLT:
46611 case ISD::SETLE:
46612 Opcode = X86ISD::FMAX;
46613 break;
46614 }
46615 }
46616
46617 if (Opcode)
46618 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46619 }
46620
46621 // Some mask scalar intrinsics rely on checking if only one bit is set
46622 // and implement it in C code like this:
46623 // A[0] = (U & 1) ? A[0] : W[0];
46624 // This creates some redundant instructions that break pattern matching.
46625 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46626 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46627 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46628 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46629 SDValue AndNode = Cond.getOperand(0);
46630 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46631 isNullConstant(Cond.getOperand(1)) &&
46632 isOneConstant(AndNode.getOperand(1))) {
46633 // LHS and RHS swapped due to
46634 // setcc outputting 1 when AND resulted in 0 and vice versa.
46635 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46636 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46637 }
46638 }
46639
46640 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46641 // lowering on KNL. In this case we convert it to
46642 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
46643 // The same situation applies to all vectors of i8 and i16 without BWI.
46644 // Make sure we extend these even before type legalization gets a chance to
46645 // split wide vectors.
46646 // Since SKX these selects have a proper lowering.
46647 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46648 CondVT.getVectorElementType() == MVT::i1 &&
46649 (VT.getVectorElementType() == MVT::i8 ||
46650 VT.getVectorElementType() == MVT::i16)) {
46651 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46652 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46653 }
46654
46655 // AVX512 - Extend select with zero to merge with target shuffle.
46656 // select(mask, extract_subvector(shuffle(x)), zero) -->
46657 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46658 // TODO - support non target shuffles as well.
46659 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46660 CondVT.getVectorElementType() == MVT::i1) {
46661 auto SelectableOp = [&TLI](SDValue Op) {
46662 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46663 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46664 isNullConstant(Op.getOperand(1)) &&
46665 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46666 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46667 };
46668
46669 bool SelectableLHS = SelectableOp(LHS);
46670 bool SelectableRHS = SelectableOp(RHS);
46671 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46672 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46673
46674 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46675 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46676 : RHS.getOperand(0).getValueType();
46677 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46678 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46679 VT.getSizeInBits());
46680 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46681 VT.getSizeInBits());
46682 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46683 DAG.getUNDEF(SrcCondVT), Cond,
46684 DAG.getIntPtrConstant(0, DL));
46685 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46686 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46687 }
46688 }
46689
46690 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46691 return V;
46692
46693 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46694 Cond.hasOneUse()) {
46695 EVT CondVT = Cond.getValueType();
46696 SDValue Cond0 = Cond.getOperand(0);
46697 SDValue Cond1 = Cond.getOperand(1);
46698 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46699
46700 // Canonicalize min/max:
46701 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46702 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46703 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46704 // the need for an extra compare against zero. e.g.
46705 // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
46706 // subl %esi, %edi
46707 // testl %edi, %edi
46708 // movl $0, %eax
46709 // cmovgl %edi, %eax
46710 // =>
46711 // xorl %eax, %eax
46712 // subl %esi, $edi
46713 // cmovsl %eax, %edi
46714 //
46715 // We can also canonicalize
46716 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46717 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46718 // This allows the use of a test instruction for the compare.
46719 if (LHS == Cond0 && RHS == Cond1) {
46720 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
46721 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
46722 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
46723 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46724 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46725 }
46726 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
46727 ISD::CondCode NewCC = ISD::SETUGE;
46728 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46729 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46730 }
46731 }
46732
46733 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
46734 // fold eq + gt/lt nested selects into ge/le selects
46735 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
46736 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
46737 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
46738 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
46739 // .. etc ..
46740 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
46741 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
46742 SDValue InnerSetCC = RHS.getOperand(0);
46743 ISD::CondCode InnerCC =
46744 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
46745 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
46746 Cond0 == InnerSetCC.getOperand(0) &&
46747 Cond1 == InnerSetCC.getOperand(1)) {
46748 ISD::CondCode NewCC;
46749 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46750 case ISD::SETGT: NewCC = ISD::SETGE; break;
46751 case ISD::SETLT: NewCC = ISD::SETLE; break;
46752 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46753 case ISD::SETULT: NewCC = ISD::SETULE; break;
46754 default: NewCC = ISD::SETCC_INVALID; break;
46755 }
46756 if (NewCC != ISD::SETCC_INVALID) {
46757 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46758 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46759 }
46760 }
46761 }
46762 }
46763
46764 // Check if the first operand is all zeros and Cond type is vXi1.
46765 // If this an avx512 target we can improve the use of zero masking by
46766 // swapping the operands and inverting the condition.
46767 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46768 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46769 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46770 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46771 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46772 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46773 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46774 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46775 }
46776
46777 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46778 // get split by legalization.
46779 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46780 CondVT.getVectorElementType() == MVT::i1 &&
46781 TLI.isTypeLegal(VT.getScalarType())) {
46782 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46783 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46784 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46785 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46786 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46787 }
46788 }
46789
46790 // Early exit check
46791 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
46792 return SDValue();
46793
46794 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
46795 return V;
46796
46797 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
46798 return V;
46799
46800 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
46801 return V;
46802
46803 // select(~Cond, X, Y) -> select(Cond, Y, X)
46804 if (CondVT.getScalarType() != MVT::i1) {
46805 if (SDValue CondNot = IsNOT(Cond, DAG))
46806 return DAG.getNode(N->getOpcode(), DL, VT,
46807 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46808
46809 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46810 // signbit.
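// (pcmpgt(X, -1) is the "X >= 0" mask while pcmpgt(0, X) is the "X < 0"
// mask, so the select operands are swapped below to compensate.)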
46811 if (Cond.getOpcode() == X86ISD::PCMPGT &&
46812 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
46813 Cond.hasOneUse()) {
46814 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46815 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46816 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46817 }
46818 }
46819
46820 // Try to optimize vXi1 selects if both operands are either all constants or
46821 // bitcasts from scalar integer type. In that case we can convert the operands
46822 // to integer and use an integer select which will be converted to a CMOV.
46823 // We need to take a little bit of care to avoid creating an i64 type after
46824 // type legalization.
46825 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46826 VT.getVectorElementType() == MVT::i1 &&
46827 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46828 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46829 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
46830 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46831 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46832
46833 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
46834 LHS.getOperand(0).getValueType() == IntVT)) &&
46835 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
46836 RHS.getOperand(0).getValueType() == IntVT))) {
46837 if (LHSIsConst)
46838 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46839 else
46840 LHS = LHS.getOperand(0);
46841
46842 if (RHSIsConst)
46843 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46844 else
46845 RHS = RHS.getOperand(0);
46846
46847 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46848 return DAG.getBitcast(VT, Select);
46849 }
46850 }
46851 }
46852
46853 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46854 // single bits, then invert the predicate and swap the select operands.
46855 // This can lower using a vector shift bit-hack rather than mask and compare.
46856 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46857 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46858 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46859 Cond.getOperand(0).getOpcode() == ISD::AND &&
46860 isNullOrNullSplat(Cond.getOperand(1)) &&
46861 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46862 Cond.getOperand(0).getValueType() == VT) {
46863 // The 'and' mask must be composed of power-of-2 constants.
46864 SDValue And = Cond.getOperand(0);
46865 auto *C = isConstOrConstSplat(And.getOperand(1));
46866 if (C && C->getAPIntValue().isPowerOf2()) {
46867 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46868 SDValue NotCond =
46869 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46870 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46871 }
46872
46873 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46874 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46875 // 16-bit lacks a proper blendv.
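// For example (sketch): for an i32 lane whose mask constant is 4, the shift
// amount is 32 - 1 - log2(4) = 29; (shl X, 29) < 0 then tests exactly bit 2,
// and the sign-bit result feeds a blend directly.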
46876 unsigned EltBitWidth = VT.getScalarSizeInBits();
46877 bool CanShiftBlend =
46878 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46879 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46880 (Subtarget.hasXOP()));
46881 if (CanShiftBlend &&
46882 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46883 return C->getAPIntValue().isPowerOf2();
46884 })) {
46885 // Create a left-shift constant to get the mask bits over to the sign-bit.
46886 SDValue Mask = And.getOperand(1);
46887 SmallVector<int, 32> ShlVals;
46888 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46889 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46890 ShlVals.push_back(EltBitWidth - 1 -
46891 MaskVal->getAPIntValue().exactLogBase2());
46892 }
46893 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46894 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46895 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46896 SDValue NewCond =
46897 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46898 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46899 }
46900 }
46901
46902 return SDValue();
46903}
46904
46905/// Combine:
46906/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46907/// to:
46908/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46909/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46910/// Note that this is only legal for some op/cc combinations.
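/// For example (sketch): rather than a "lock xadd" followed by a separate sign
/// test of the returned value, this lets us emit a plain "lock add" and use
/// COND_LE on its EFLAGS; the add's flag semantics (including signed overflow)
/// keep the two forms equivalent.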
46911static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46912 SelectionDAG &DAG,
46913 const X86Subtarget &Subtarget) {
46914 // This combine only operates on CMP-like nodes.
46915 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46916 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46917 return SDValue();
46918
46919 // Can't replace the cmp if it has more uses than the one we're looking at.
46920 // FIXME: We would like to be able to handle this, but would need to make sure
46921 // all uses were updated.
46922 if (!Cmp.hasOneUse())
46923 return SDValue();
46924
46925 // This only applies to variations of the common case:
46926 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46927 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46928 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46929 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46930 // Using the proper condcodes (see below), overflow is checked for.
46931
46932 // FIXME: We can generalize both constraints:
46933 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46934 // - LHS != 1
46935 // if the result is compared.
46936
46937 SDValue CmpLHS = Cmp.getOperand(0);
46938 SDValue CmpRHS = Cmp.getOperand(1);
46939 EVT CmpVT = CmpLHS.getValueType();
46940
46941 if (!CmpLHS.hasOneUse())
46942 return SDValue();
46943
46944 unsigned Opc = CmpLHS.getOpcode();
46945 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46946 return SDValue();
46947
46948 SDValue OpRHS = CmpLHS.getOperand(2);
46949 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46950 if (!OpRHSC)
46951 return SDValue();
46952
46953 APInt Addend = OpRHSC->getAPIntValue();
46954 if (Opc == ISD::ATOMIC_LOAD_SUB)
46955 Addend = -Addend;
46956
46957 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46958 if (!CmpRHSC)
46959 return SDValue();
46960
46961 APInt Comparison = CmpRHSC->getAPIntValue();
46962 APInt NegAddend = -Addend;
46963
46964 // See if we can adjust the CC to make the comparison match the negated
46965 // addend.
46966 if (Comparison != NegAddend) {
46967 APInt IncComparison = Comparison + 1;
46968 if (IncComparison == NegAddend) {
46969 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
46970 Comparison = IncComparison;
46971 CC = X86::COND_AE;
46972 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
46973 Comparison = IncComparison;
46974 CC = X86::COND_L;
46975 }
46976 }
46977 APInt DecComparison = Comparison - 1;
46978 if (DecComparison == NegAddend) {
46979 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
46980 Comparison = DecComparison;
46981 CC = X86::COND_A;
46982 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
46983 Comparison = DecComparison;
46984 CC = X86::COND_LE;
46985 }
46986 }
46987 }
46988
46989 // If the addend is the negation of the comparison value, then we can do
46990 // a full comparison by emitting the atomic arithmetic as a locked sub.
46991 if (Comparison == NegAddend) {
46992 // The CC is fine, but we need to rewrite the LHS of the comparison as an
46993 // atomic sub.
46994 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
46995 auto AtomicSub = DAG.getAtomic(
46996 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
46997 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
46998 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
46999 AN->getMemOperand());
47000 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47001 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47002 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47003 return LockOp;
47004 }
47005
47006 // We can handle comparisons with zero in a number of cases by manipulating
47007 // the CC used.
47008 if (!Comparison.isZero())
47009 return SDValue();
47010
47011 if (CC == X86::COND_S && Addend == 1)
47012 CC = X86::COND_LE;
47013 else if (CC == X86::COND_NS && Addend == 1)
47014 CC = X86::COND_G;
47015 else if (CC == X86::COND_G && Addend == -1)
47016 CC = X86::COND_GE;
47017 else if (CC == X86::COND_LE && Addend == -1)
47018 CC = X86::COND_L;
47019 else
47020 return SDValue();
47021
47022 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47023 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47024 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47025 return LockOp;
47026}
47027
47028// Check whether a boolean test is testing a boolean value generated by
47029// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47030// code.
47031//
47032// Simplify the following patterns:
47033// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47034// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47035// to (Op EFLAGS Cond)
47036//
47037// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47038// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47039// to (Op EFLAGS !Cond)
47040//
47041// where Op could be BRCOND or CMOV.
47042//
47043static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47044 // This combine only operates on CMP-like nodes.
47045 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47046 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47047 return SDValue();
47048
47049 // Quit if not used as a boolean value.
47050 if (CC != X86::COND_E && CC != X86::COND_NE)
47051 return SDValue();
47052
47053 // Check CMP operands. One of them should be 0 or 1 and the other should be
47054 // an SetCC or extended from it.
47055 SDValue Op1 = Cmp.getOperand(0);
47056 SDValue Op2 = Cmp.getOperand(1);
47057
47058 SDValue SetCC;
47059 const ConstantSDNode* C = nullptr;
47060 bool needOppositeCond = (CC == X86::COND_E);
47061 bool checkAgainstTrue = false; // Is it a comparison against 1?
47062
47063 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47064 SetCC = Op2;
47065 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47066 SetCC = Op1;
47067 else // Quit if all operands are not constants.
47068 return SDValue();
47069
47070 if (C->getZExtValue() == 1) {
47071 needOppositeCond = !needOppositeCond;
47072 checkAgainstTrue = true;
47073 } else if (C->getZExtValue() != 0)
47074 // Quit if the constant is neither 0 or 1.
47075 return SDValue();
47076
47077 bool truncatedToBoolWithAnd = false;
47078 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47079 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47080 SetCC.getOpcode() == ISD::TRUNCATE ||
47081 SetCC.getOpcode() == ISD::AND) {
47082 if (SetCC.getOpcode() == ISD::AND) {
47083 int OpIdx = -1;
47084 if (isOneConstant(SetCC.getOperand(0)))
47085 OpIdx = 1;
47086 if (isOneConstant(SetCC.getOperand(1)))
47087 OpIdx = 0;
47088 if (OpIdx < 0)
47089 break;
47090 SetCC = SetCC.getOperand(OpIdx);
47091 truncatedToBoolWithAnd = true;
47092 } else
47093 SetCC = SetCC.getOperand(0);
47094 }
47095
47096 switch (SetCC.getOpcode()) {
47097 case X86ISD::SETCC_CARRY:
47098 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47099 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47100 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47101 // truncated to i1 using 'and'.
47102 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47103 break;
47104 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47105 "Invalid use of SETCC_CARRY!");
47106 [[fallthrough]];
47107 case X86ISD::SETCC:
47108 // Set the condition code or opposite one if necessary.
47109 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47110 if (needOppositeCond)
47111 CC = X86::GetOppositeBranchCondition(CC);
47112 return SetCC.getOperand(1);
47113 case X86ISD::CMOV: {
47114 // Check whether false/true value has canonical one, i.e. 0 or 1.
47115 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47116 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47117 // Quit if true value is not a constant.
47118 if (!TVal)
47119 return SDValue();
47120 // Quit if false value is not a constant.
47121 if (!FVal) {
47122 SDValue Op = SetCC.getOperand(0);
47123 // Skip 'zext' or 'trunc' node.
47124 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47125 Op.getOpcode() == ISD::TRUNCATE)
47126 Op = Op.getOperand(0);
47127 // A special case for rdrand/rdseed, where 0 is set if false cond is
47128 // found.
47129 if ((Op.getOpcode() != X86ISD::RDRAND &&
47130 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47131 return SDValue();
47132 }
47133 // Quit if false value is not the constant 0 or 1.
47134 bool FValIsFalse = true;
47135 if (FVal && FVal->getZExtValue() != 0) {
47136 if (FVal->getZExtValue() != 1)
47137 return SDValue();
47138 // If FVal is 1, opposite cond is needed.
47139 needOppositeCond = !needOppositeCond;
47140 FValIsFalse = false;
47141 }
47142 // Quit if TVal is not the constant opposite of FVal.
47143 if (FValIsFalse && TVal->getZExtValue() != 1)
47144 return SDValue();
47145 if (!FValIsFalse && TVal->getZExtValue() != 0)
47146 return SDValue();
47147 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47148 if (needOppositeCond)
47149 CC = X86::GetOppositeBranchCondition(CC);
47150 return SetCC.getOperand(3);
47151 }
47152 }
47153
47154 return SDValue();
47155}
47156
47157/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47158/// Match:
47159/// (X86or (X86setcc) (X86setcc))
47160/// (X86cmp (and (X86setcc) (X86setcc)), 0)
47161static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47162 X86::CondCode &CC1, SDValue &Flags,
47163 bool &isAnd) {
47164 if (Cond->getOpcode() == X86ISD::CMP) {
47165 if (!isNullConstant(Cond->getOperand(1)))
47166 return false;
47167
47168 Cond = Cond->getOperand(0);
47169 }
47170
47171 isAnd = false;
47172
47173 SDValue SetCC0, SetCC1;
47174 switch (Cond->getOpcode()) {
47175 default: return false;
47176 case ISD::AND:
47177 case X86ISD::AND:
47178 isAnd = true;
47179 [[fallthrough]];
47180 case ISD::OR:
47181 case X86ISD::OR:
47182 SetCC0 = Cond->getOperand(0);
47183 SetCC1 = Cond->getOperand(1);
47184 break;
47185 };
47186
47187 // Make sure we have SETCC nodes, using the same flags value.
47188 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47189 SetCC1.getOpcode() != X86ISD::SETCC ||
47190 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47191 return false;
47192
47193 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47194 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47195 Flags = SetCC0->getOperand(1);
47196 return true;
47197}
47198
47199 // When legalizing carry, we create carries via add X, -1.
47200 // If that comes from an actual carry, via setcc, we use the
47201 // carry directly.
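// For example (sketch): (add (setcc COND_B, EFLAGS), -1) produces a carry-out
// exactly when the setcc produced 1, so the original EFLAGS value can be
// reused instead of re-materializing the boolean.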
47202static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47203 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47204 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47205 bool FoundAndLSB = false;
47206 SDValue Carry = EFLAGS.getOperand(0);
47207 while (Carry.getOpcode() == ISD::TRUNCATE ||
47208 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47209 (Carry.getOpcode() == ISD::AND &&
47210 isOneConstant(Carry.getOperand(1)))) {
47211 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47212 Carry = Carry.getOperand(0);
47213 }
47214 if (Carry.getOpcode() == X86ISD::SETCC ||
47215 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47216 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47217 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47218 SDValue CarryOp1 = Carry.getOperand(1);
47219 if (CarryCC == X86::COND_B)
47220 return CarryOp1;
47221 if (CarryCC == X86::COND_A) {
47222 // Try to convert COND_A into COND_B in an attempt to facilitate
47223 // materializing "setb reg".
47224 //
47225 // Do not flip "e > c", where "c" is a constant, because Cmp
47226 // instruction cannot take an immediate as its first operand.
47227 //
47228 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47229 CarryOp1.getNode()->hasOneUse() &&
47230 CarryOp1.getValueType().isInteger() &&
47231 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
47232 SDValue SubCommute =
47233 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
47234 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
47235 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
47236 }
47237 }
47238 // If this is a check of the z flag of an add with 1, switch to the
47239 // C flag.
47240 if (CarryCC == X86::COND_E &&
47241 CarryOp1.getOpcode() == X86ISD::ADD &&
47242 isOneConstant(CarryOp1.getOperand(1)))
47243 return CarryOp1;
47244 } else if (FoundAndLSB) {
47245 SDLoc DL(Carry);
47246 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
47247 if (Carry.getOpcode() == ISD::SRL) {
47248 BitNo = Carry.getOperand(1);
47249 Carry = Carry.getOperand(0);
47250 }
47251 return getBT(Carry, BitNo, DL, DAG);
47252 }
47253 }
47254 }
47255
47256 return SDValue();
47257}
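
As a standalone illustrative sketch (not part of this file): the reason the combine above can reuse the original SETCC carry is that, for a boolean carry X in {0, 1}, the carry-out of add X, -1 is exactly X again.

#include <cstdint>

// Carry-out (CF) of the 64-bit addition a + b, detected via unsigned wrap.
constexpr bool carryOut(uint64_t a, uint64_t b) { return a + b < a; }

// ADD X, -1 with X in {0, 1} produces CF == X.
static_assert(carryOut(0, UINT64_MAX) == false, "X == 0 -> CF == 0");
static_assert(carryOut(1, UINT64_MAX) == true, "X == 1 -> CF == 1");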
47258
47259/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
47260/// to avoid the inversion.
47261static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
47262 SelectionDAG &DAG,
47263 const X86Subtarget &Subtarget) {
47264 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
47265 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
47266 EFLAGS.getOpcode() != X86ISD::TESTP)
47267 return SDValue();
47268
47269 // PTEST/TESTP sets EFLAGS as:
47270 // TESTZ: ZF = (Op0 & Op1) == 0
47271 // TESTC: CF = (~Op0 & Op1) == 0
47272 // TESTNZC: ZF == 0 && CF == 0
47273 EVT VT = EFLAGS.getValueType();
47274 SDValue Op0 = EFLAGS.getOperand(0);
47275 SDValue Op1 = EFLAGS.getOperand(1);
47276 EVT OpVT = Op0.getValueType();
47277
47278 // TEST*(~X,Y) == TEST*(X,Y)
47279 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
47280 X86::CondCode InvCC;
47281 switch (CC) {
47282 case X86::COND_B:
47283 // testc -> testz.
47284 InvCC = X86::COND_E;
47285 break;
47286 case X86::COND_AE:
47287 // !testc -> !testz.
47288 InvCC = X86::COND_NE;
47289 break;
47290 case X86::COND_E:
47291 // testz -> testc.
47292 InvCC = X86::COND_B;
47293 break;
47294 case X86::COND_NE:
47295 // !testz -> !testc.
47296 InvCC = X86::COND_AE;
47297 break;
47298 case X86::COND_A:
47299 case X86::COND_BE:
47300 // testnzc -> testnzc (no change).
47301 InvCC = CC;
47302 break;
47303 default:
47304 InvCC = X86::COND_INVALID;
47305 break;
47306 }
47307
47308 if (InvCC != X86::COND_INVALID) {
47309 CC = InvCC;
47310 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47311 DAG.getBitcast(OpVT, NotOp0), Op1);
47312 }
47313 }
47314
47315 if (CC == X86::COND_B || CC == X86::COND_AE) {
47316 // TESTC(X,~X) == TESTC(X,-1)
47317 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47318 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
47319 SDLoc DL(EFLAGS);
47320 return DAG.getNode(EFLAGS.getOpcode(), DL, VT,
47321 DAG.getBitcast(OpVT, NotOp1),
47322 DAG.getAllOnesConstant(DL, OpVT));
47323 }
47324 }
47325 }
47326
47327 if (CC == X86::COND_E || CC == X86::COND_NE) {
47328 // TESTZ(X,~Y) == TESTC(Y,X)
47329 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47330 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47331 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47332 DAG.getBitcast(OpVT, NotOp1), Op0);
47333 }
47334
47335 if (Op0 == Op1) {
47336 SDValue BC = peekThroughBitcasts(Op0);
47337 EVT BCVT = BC.getValueType();
47338
47339 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
47340 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
47341 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47342 DAG.getBitcast(OpVT, BC.getOperand(0)),
47343 DAG.getBitcast(OpVT, BC.getOperand(1)));
47344 }
47345
47346 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
47347 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
47348 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47349 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47350 DAG.getBitcast(OpVT, BC.getOperand(0)),
47351 DAG.getBitcast(OpVT, BC.getOperand(1)));
47352 }
47353
47354 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
47355 // to more efficiently extract the sign bits and compare that.
47356 // TODO: Handle TESTC with comparison inversion.
47357 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
47358    // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
47359 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
47360 unsigned EltBits = BCVT.getScalarSizeInBits();
47361 if (DAG.ComputeNumSignBits(BC) == EltBits) {
47362 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
47363 APInt SignMask = APInt::getSignMask(EltBits);
47364 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47365 if (SDValue Res =
47366 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
47367            // For vXi16 cases we need to use pmovmskb and extract every other
47368 // sign bit.
47369 SDLoc DL(EFLAGS);
47370 if (EltBits == 32 || EltBits == 64) {
47371 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
47372 MVT FloatVT =
47373 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
47374 Res = DAG.getBitcast(FloatVT, Res);
47375 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
47376 } else if (EltBits == 16) {
47377 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
47378 Res = DAG.getBitcast(MovmskVT, Res);
47379 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47380 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
47381 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47382 } else {
47383 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47384 }
47385 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
47386 DAG.getConstant(0, DL, MVT::i32));
47387 }
47388 }
47389 }
47390 }
47391
47392 // TESTZ(-1,X) == TESTZ(X,X)
47393 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
47394 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
47395
47396 // TESTZ(X,-1) == TESTZ(X,X)
47397 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
47398 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
47399
47400 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
47401 // TODO: Add COND_NE handling?
47402 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
47403 SDValue Src0 = peekThroughBitcasts(Op0);
47404 SDValue Src1 = peekThroughBitcasts(Op1);
47405 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
47406 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
47407 peekThroughBitcasts(Src0.getOperand(1)), true);
47408 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
47409 peekThroughBitcasts(Src1.getOperand(1)), true);
47410 if (Src0 && Src1)
47411 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47412 DAG.getBitcast(MVT::v4i64, Src0),
47413 DAG.getBitcast(MVT::v4i64, Src1));
47414 }
47415 }
47416 }
47417
47418 return SDValue();
47419}
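
A minimal scalar model of the PTEST/TESTP flag semantics noted above (ZF = (Op0 & Op1) == 0, CF = (~Op0 & Op1) == 0), written as an editor's sketch to spot-check the TESTZ(X,~Y) == TESTC(Y,X) identity; the names here are ad hoc, not LLVM APIs.

#include <cstdint>

struct PTestFlags { bool ZF, CF; };

// Scalar model: treat each "vector" operand as a 64-bit bag of bits.
constexpr PTestFlags ptest(uint64_t Op0, uint64_t Op1) {
  return {(Op0 & Op1) == 0, (~Op0 & Op1) == 0};
}

// TESTZ(X, ~Y) == TESTC(Y, X): ZF of the first equals CF of the second.
constexpr bool testzNotEqualsTestc(uint64_t X, uint64_t Y) {
  return ptest(X, ~Y).ZF == ptest(Y, X).CF;
}

static_assert(testzNotEqualsTestc(0x0F, 0xF0), "");
static_assert(testzNotEqualsTestc(0xFF, 0x0F), "");
static_assert(testzNotEqualsTestc(0x00, 0x00), "");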
47420
47421// Attempt to simplify the MOVMSK input based on the comparison type.
47422static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
47423 SelectionDAG &DAG,
47424 const X86Subtarget &Subtarget) {
47425 // Handle eq/ne against zero (any_of).
47426 // Handle eq/ne against -1 (all_of).
47427 if (!(CC == X86::COND_E || CC == X86::COND_NE))
47428 return SDValue();
47429 if (EFLAGS.getValueType() != MVT::i32)
47430 return SDValue();
47431 unsigned CmpOpcode = EFLAGS.getOpcode();
47432 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
47433 return SDValue();
47434 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
47435 if (!CmpConstant)
47436 return SDValue();
47437 const APInt &CmpVal = CmpConstant->getAPIntValue();
47438
47439 SDValue CmpOp = EFLAGS.getOperand(0);
47440 unsigned CmpBits = CmpOp.getValueSizeInBits();
47441 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
47442
47443 // Peek through any truncate.
47444 if (CmpOp.getOpcode() == ISD::TRUNCATE)
47445 CmpOp = CmpOp.getOperand(0);
47446
47447 // Bail if we don't find a MOVMSK.
47448 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
47449 return SDValue();
47450
47451 SDValue Vec = CmpOp.getOperand(0);
47452 MVT VecVT = Vec.getSimpleValueType();
47453 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47454        "Unexpected MOVMSK operand");
47455 unsigned NumElts = VecVT.getVectorNumElements();
47456 unsigned NumEltBits = VecVT.getScalarSizeInBits();
47457
47458 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47459 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47460 NumElts <= CmpBits && CmpVal.isMask(NumElts);
47461 if (!IsAnyOf && !IsAllOf)
47462 return SDValue();
47463
47464  // TODO: Check more combining cases.
47465  // Here we check the CMP's use count to decide whether to combine or not.
47466  // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines,
47467  // covered by two existing tests, are known to fit this constraint.
47468 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47469
47470 // See if we can peek through to a vector with a wider element type, if the
47471 // signbits extend down to all the sub-elements as well.
47472 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47473 // potential SimplifyDemandedBits/Elts cases.
47474  // If we looked through a truncate that discards bits, we can't do this
47475 // transform.
47476 // FIXME: We could do this transform for truncates that discarded bits by
47477 // inserting an AND mask between the new MOVMSK and the CMP.
47478 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47479 SDValue BC = peekThroughBitcasts(Vec);
47480 MVT BCVT = BC.getSimpleValueType();
47481 unsigned BCNumElts = BCVT.getVectorNumElements();
47482 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47483 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47484 BCNumEltBits > NumEltBits &&
47485 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47486 SDLoc DL(EFLAGS);
47487 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47488 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47489 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47490 DAG.getConstant(CmpMask, DL, MVT::i32));
47491 }
47492 }
47493
47494 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47495 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47496 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47497 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
47498 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
47499 SmallVector<SDValue> Ops;
47500 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
47501 Ops.size() == 2) {
47502 SDLoc DL(EFLAGS);
47503 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
47504 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
47505 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
47506 DAG.getBitcast(SubVT, Ops[0]),
47507 DAG.getBitcast(SubVT, Ops[1]));
47508 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
47509 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47510 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
47511 DAG.getConstant(CmpMask, DL, MVT::i32));
47512 }
47513 }
47514
47515 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47516 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47517 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47518 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
47519 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
47520 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
47521 SDValue BC = peekThroughBitcasts(Vec);
47522 // Ensure MOVMSK was testing every signbit of BC.
47523 if (BC.getValueType().getVectorNumElements() <= NumElts) {
47524 if (BC.getOpcode() == X86ISD::PCMPEQ) {
47525 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
47526 BC.getOperand(0), BC.getOperand(1));
47527 V = DAG.getBitcast(TestVT, V);
47528 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47529 }
47530 // Check for 256-bit split vector cases.
47531 if (BC.getOpcode() == ISD::AND &&
47532 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47533 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47534 SDValue LHS = BC.getOperand(0);
47535 SDValue RHS = BC.getOperand(1);
47536 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
47537 LHS.getOperand(0), LHS.getOperand(1));
47538 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
47539 RHS.getOperand(0), RHS.getOperand(1));
47540 LHS = DAG.getBitcast(TestVT, LHS);
47541 RHS = DAG.getBitcast(TestVT, RHS);
47542 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47543 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47544 }
47545 }
47546 }
47547
47548 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47549 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47550 // sign bits prior to the comparison with zero unless we know that
47551 // the vXi16 splats the sign bit down to the lower i8 half.
47552 // TODO: Handle all_of patterns.
47553 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47554 SDValue VecOp0 = Vec.getOperand(0);
47555 SDValue VecOp1 = Vec.getOperand(1);
47556 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47557 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47558 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47559 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47560 SDLoc DL(EFLAGS);
47561 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47562 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47563 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47564 if (!SignExt0) {
47565 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47566 DAG.getConstant(0xAAAA, DL, MVT::i16));
47567 }
47568 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47569 DAG.getConstant(0, DL, MVT::i16));
47570 }
47571 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47572 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47573 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47574 (IsAnyOf || (SignExt0 && SignExt1))) {
47575 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47576 SDLoc DL(EFLAGS);
47577 SDValue Result = peekThroughBitcasts(Src);
47578 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47579 Result.getValueType().getVectorNumElements() <= NumElts) {
47580 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
47581 Result.getOperand(0), Result.getOperand(1));
47582 V = DAG.getBitcast(MVT::v4i64, V);
47583 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47584 }
47585 Result = DAG.getBitcast(MVT::v32i8, Result);
47586 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47587 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47588 if (!SignExt0 || !SignExt1) {
47589 assert(IsAnyOf &&
47590        "Only perform v16i16 signmasks for any_of patterns");
47591 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47592 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47593 }
47594 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47595 DAG.getConstant(CmpMask, DL, MVT::i32));
47596 }
47597 }
47598 }
47599
47600 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47601 SmallVector<int, 32> ShuffleMask;
47602 SmallVector<SDValue, 2> ShuffleInputs;
47603 if (NumElts <= CmpBits &&
47604 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47605 ShuffleMask, DAG) &&
47606 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
47607 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
47608 unsigned NumShuffleElts = ShuffleMask.size();
47609 APInt DemandedElts = APInt::getZero(NumShuffleElts);
47610 for (int M : ShuffleMask) {
47611 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
47612 DemandedElts.setBit(M);
47613 }
47614 if (DemandedElts.isAllOnes()) {
47615 SDLoc DL(EFLAGS);
47616 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47617 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47618 Result =
47619 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47620 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47621 EFLAGS.getOperand(1));
47622 }
47623 }
47624
47625 return SDValue();
47626}
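
The CONCAT folds above can be sanity-checked with a scalar model in which each MOVMSK result is just a bitmask of per-element sign bits; this is an illustrative sketch with made-up helper names, not LLVM code.

#include <cstdint>

// movmsk(concat(X, Y)) == (maskY << N) | maskX, for N elements per half.
constexpr unsigned concatMask(unsigned maskX, unsigned maskY, unsigned N) {
  return (maskY << N) | maskX;
}

// any_of: MOVMSK(CONCAT(X,Y)) == 0  <=>  MOVMSK(OR(X,Y)) == 0.
constexpr bool anyOfEquiv(unsigned mx, unsigned my, unsigned N) {
  return (concatMask(mx, my, N) == 0) == ((mx | my) == 0);
}

// all_of: MOVMSK(CONCAT(X,Y)) is all ones  <=>  MOVMSK(AND(X,Y)) is all ones.
constexpr bool allOfEquiv(unsigned mx, unsigned my, unsigned N) {
  unsigned full = (1u << N) - 1;
  return (concatMask(mx, my, N) == ((full << N) | full)) == ((mx & my) == full);
}

static_assert(anyOfEquiv(0b0000, 0b0100, 4) && allOfEquiv(0b1111, 0b1011, 4), "");
static_assert(anyOfEquiv(0b0000, 0b0000, 4) && allOfEquiv(0b1111, 0b1111, 4), "");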
47627
47628/// Optimize an EFLAGS definition used according to the condition code \p CC
47629/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47630/// uses of chain values.
47631static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47632 SelectionDAG &DAG,
47633 const X86Subtarget &Subtarget) {
47634 if (CC == X86::COND_B)
47635 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47636 return Flags;
47637
47638 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47639 return R;
47640
47641 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47642 return R;
47643
47644 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47645 return R;
47646
47647 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47648}
47649
47650/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47651static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47652 TargetLowering::DAGCombinerInfo &DCI,
47653 const X86Subtarget &Subtarget) {
47654 SDLoc DL(N);
47655
47656 SDValue FalseOp = N->getOperand(0);
47657 SDValue TrueOp = N->getOperand(1);
47658 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47659 SDValue Cond = N->getOperand(3);
47660
47661 // cmov X, X, ?, ? --> X
47662 if (TrueOp == FalseOp)
47663 return TrueOp;
47664
47665 // Try to simplify the EFLAGS and condition code operands.
47666 // We can't always do this as FCMOV only supports a subset of X86 cond.
47667 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47668 if (!(FalseOp.getValueType() == MVT::f80 ||
47669 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47670 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47671 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47672 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47673 Flags};
47674 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47675 }
47676 }
47677
47678 // If this is a select between two integer constants, try to do some
47679 // optimizations. Note that the operands are ordered the opposite of SELECT
47680 // operands.
47681 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47682 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47683 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47684 // larger than FalseC (the false value).
47685 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47686 CC = X86::GetOppositeBranchCondition(CC);
47687 std::swap(TrueC, FalseC);
47688 std::swap(TrueOp, FalseOp);
47689 }
47690
47691 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47692 // This is efficient for any integer data type (including i8/i16) and
47693 // shift amount.
47694 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47695 Cond = getSETCC(CC, Cond, DL, DAG);
47696
47697 // Zero extend the condition if needed.
47698 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
47699
47700 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
47701 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
47702 DAG.getConstant(ShAmt, DL, MVT::i8));
47703 return Cond;
47704 }
47705
47706      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
47707 // for any integer data type, including i8/i16.
47708 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
47709 Cond = getSETCC(CC, Cond, DL, DAG);
47710
47711 // Zero extend the condition if needed.
47712 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
47713 FalseC->getValueType(0), Cond);
47714 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47715 SDValue(FalseC, 0));
47716 return Cond;
47717 }
47718
47719 // Optimize cases that will turn into an LEA instruction. This requires
47720 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
47721 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
47722 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
47723 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
47724        "Implicit constant truncation");
47725
47726 bool isFastMultiplier = false;
47727 if (Diff.ult(10)) {
47728 switch (Diff.getZExtValue()) {
47729 default: break;
47730 case 1: // result = add base, cond
47731 case 2: // result = lea base( , cond*2)
47732 case 3: // result = lea base(cond, cond*2)
47733 case 4: // result = lea base( , cond*4)
47734 case 5: // result = lea base(cond, cond*4)
47735 case 8: // result = lea base( , cond*8)
47736 case 9: // result = lea base(cond, cond*8)
47737 isFastMultiplier = true;
47738 break;
47739 }
47740 }
47741
47742 if (isFastMultiplier) {
47743 Cond = getSETCC(CC, Cond, DL ,DAG);
47744 // Zero extend the condition if needed.
47745 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
47746 Cond);
47747 // Scale the condition by the difference.
47748 if (Diff != 1)
47749 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
47750 DAG.getConstant(Diff, DL, Cond.getValueType()));
47751
47752 // Add the base if non-zero.
47753 if (FalseC->getAPIntValue() != 0)
47754 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47755 SDValue(FalseC, 0));
47756 return Cond;
47757 }
47758 }
47759 }
47760 }
47761
47762 // Handle these cases:
47763  // (select (x != c), e, c) -> (select (x != c), e, x),
47764  // (select (x == c), c, e) -> (select (x == c), x, e)
47765  // where c is an integer constant, and the "select" is the combination
47766 // of CMOV and CMP.
47767 //
47768 // The rationale for this change is that the conditional-move from a constant
47769 // needs two instructions, however, conditional-move from a register needs
47770 // only one instruction.
47771 //
47772 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47773 // some instruction-combining opportunities. This opt needs to be
47774 // postponed as late as possible.
47775 //
47776 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47777 // the DCI.xxxx conditions are provided to postpone the optimization as
47778 // late as possible.
47779
47780 ConstantSDNode *CmpAgainst = nullptr;
47781 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47782 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47783 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47784
47785 if (CC == X86::COND_NE &&
47786 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47787 CC = X86::GetOppositeBranchCondition(CC);
47788 std::swap(TrueOp, FalseOp);
47789 }
47790
47791 if (CC == X86::COND_E &&
47792 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47793 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47794 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47795 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47796 }
47797 }
47798 }
47799
47800 // Transform:
47801 //
47802 // (cmov 1 T (uge T 2))
47803 //
47804 // to:
47805 //
47806 // (adc T 0 (sub T 1))
47807 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
47808 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
47809 SDValue Cond0 = Cond.getOperand(0);
47810 if (Cond0.getOpcode() == ISD::TRUNCATE)
47811 Cond0 = Cond0.getOperand(0);
47812 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
47813 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
47814 EVT CondVT = Cond->getValueType(0);
47815 EVT OuterVT = N->getValueType(0);
47816 // Subtract 1 and generate a carry.
47817 SDValue NewSub =
47818 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
47819 DAG.getConstant(1, DL, CondVT));
47820 SDValue EFLAGS(NewSub.getNode(), 1);
47821 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
47822 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
47823 }
47824 }
47825
47826 // Fold and/or of setcc's to double CMOV:
47827 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47828 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47829 //
47830 // This combine lets us generate:
47831 // cmovcc1 (jcc1 if we don't have CMOV)
47832 // cmovcc2 (same)
47833 // instead of:
47834 // setcc1
47835 // setcc2
47836 // and/or
47837 // cmovne (jne if we don't have CMOV)
47838 // When we can't use the CMOV instruction, it might increase branch
47839 // mispredicts.
47840 // When we can use CMOV, or when there is no mispredict, this improves
47841 // throughput and reduces register pressure.
47842 //
47843 if (CC == X86::COND_NE) {
47844 SDValue Flags;
47845 X86::CondCode CC0, CC1;
47846 bool isAndSetCC;
47847 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47848 if (isAndSetCC) {
47849 std::swap(FalseOp, TrueOp);
47850 CC0 = X86::GetOppositeBranchCondition(CC0);
47851 CC1 = X86::GetOppositeBranchCondition(CC1);
47852 }
47853
47854 SDValue LOps[] = {FalseOp, TrueOp,
47855 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47856 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47857 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47858 Flags};
47859 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47860 return CMOV;
47861 }
47862 }
47863
47864 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47865 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47866 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47867 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
47868 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47869 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47870 SDValue Add = TrueOp;
47871 SDValue Const = FalseOp;
47872 // Canonicalize the condition code for easier matching and output.
47873 if (CC == X86::COND_E)
47874 std::swap(Add, Const);
47875
47876 // We might have replaced the constant in the cmov with the LHS of the
47877 // compare. If so change it to the RHS of the compare.
47878 if (Const == Cond.getOperand(0))
47879 Const = Cond.getOperand(1);
47880
47881 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47882 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47883 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47884 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47885 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47886 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47887 EVT VT = N->getValueType(0);
47888 // This should constant fold.
47889 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47890 SDValue CMov =
47891 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47892 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47893 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47894 }
47895 }
47896
47897 return SDValue();
47898}
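
The (cmov 1, T, (uge T 2)) -> (adc T, 0, (sub T, 1)) transform handled above rests on a small unsigned identity; a standalone sketch (not LLVM code) that checks it:

#include <cstdint>

// Original form: select T when T >= 2 (unsigned), otherwise 1.
constexpr uint32_t cmovForm(uint32_t T) { return T >= 2 ? T : 1; }

// ADC form: SUB T, 1 sets CF exactly when T == 0, and ADC T, 0 adds that carry.
constexpr uint32_t adcForm(uint32_t T) {
  bool CF = T < 1; // borrow out of T - 1, i.e. T == 0
  return T + (CF ? 1 : 0);
}

static_assert(cmovForm(0) == adcForm(0), "");
static_assert(cmovForm(1) == adcForm(1), "");
static_assert(cmovForm(2) == adcForm(2), "");
static_assert(cmovForm(123) == adcForm(123), "");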
47899
47900/// Different mul shrinking modes.
47901enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47902
47903static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47904 EVT VT = N->getOperand(0).getValueType();
47905 if (VT.getScalarSizeInBits() != 32)
47906 return false;
47907
47908 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47909 unsigned SignBits[2] = {1, 1};
47910 bool IsPositive[2] = {false, false};
47911 for (unsigned i = 0; i < 2; i++) {
47912 SDValue Opd = N->getOperand(i);
47913
47914 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47915 IsPositive[i] = DAG.SignBitIsZero(Opd);
47916 }
47917
47918 bool AllPositive = IsPositive[0] && IsPositive[1];
47919 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47920 // When ranges are from -128 ~ 127, use MULS8 mode.
47921 if (MinSignBits >= 25)
47922 Mode = ShrinkMode::MULS8;
47923 // When ranges are from 0 ~ 255, use MULU8 mode.
47924 else if (AllPositive && MinSignBits >= 24)
47925 Mode = ShrinkMode::MULU8;
47926 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47927 else if (MinSignBits >= 17)
47928 Mode = ShrinkMode::MULS16;
47929 // When ranges are from 0 ~ 65535, use MULU16 mode.
47930 else if (AllPositive && MinSignBits >= 16)
47931 Mode = ShrinkMode::MULU16;
47932 else
47933 return false;
47934 return true;
47935}
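
The MinSignBits thresholds used above (25, 24, 17, 16) correspond to how many redundant sign/zero bits a 32-bit lane carries when its value fits in i8, u8, i16 or u16. A standalone sketch with a simple sign-bit counter (an assumed stand-in for ComputeNumSignBits, not the DAG API):

#include <cstdint>

// Number of leading bits equal to the sign bit, including the sign bit itself.
constexpr unsigned numSignBits(int32_t v) {
  uint32_t x = static_cast<uint32_t>(v);
  uint32_t sign = x >> 31;
  unsigned n = 0;
  for (int bit = 31; bit >= 0 && ((x >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}

// i8 range [-128, 127] leaves at least 25 sign bits (MULS8).
static_assert(numSignBits(127) >= 25 && numSignBits(-128) >= 25, "");
// u8 range [0, 255] leaves at least 24, with the sign bit known zero (MULU8).
static_assert(numSignBits(255) >= 24, "");
// i16 range [-32768, 32767] leaves at least 17 (MULS16).
static_assert(numSignBits(32767) >= 17 && numSignBits(-32768) >= 17, "");
// u16 range [0, 65535] leaves at least 16 (MULU16).
static_assert(numSignBits(65535) >= 16, "");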
47936
47937/// When the operands of vector mul are extended from smaller size values,
47938/// like i8 and i16, the type of mul may be shrunk to generate more
47939/// efficient code. Two typical patterns are handled:
47940/// Pattern1:
47941/// %2 = sext/zext <N x i8> %1 to <N x i32>
47942/// %4 = sext/zext <N x i8> %3 to <N x i32>
47943/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47944/// %5 = mul <N x i32> %2, %4
47945///
47946/// Pattern2:
47947/// %2 = zext/sext <N x i16> %1 to <N x i32>
47948/// %4 = zext/sext <N x i16> %3 to <N x i32>
47949/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47950/// %5 = mul <N x i32> %2, %4
47951///
47952/// There are four mul shrinking modes:
47953/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47954/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47955/// generate pmullw+sext32 for it (MULS8 mode).
47956/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
47957/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
47958/// generate pmullw+zext32 for it (MULU8 mode).
47959/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
47960/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47961/// generate pmullw+pmulhw for it (MULS16 mode).
47962/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
47963/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
47964/// generate pmullw+pmulhuw for it (MULU16 mode).
47965static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
47966 const X86Subtarget &Subtarget) {
47967 // Check for legality
47968  // pmullw/pmulhw are not available without SSE2.
47969 if (!Subtarget.hasSSE2())
47970 return SDValue();
47971
47972 // Check for profitability
47973 // pmulld is supported since SSE41. It is better to use pmulld
47974 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
47975 // the expansion.
47976 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
47977 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
47978 return SDValue();
47979
47980 ShrinkMode Mode;
47981 if (!canReduceVMulWidth(N, DAG, Mode))
47982 return SDValue();
47983
47984 SDLoc DL(N);
47985 SDValue N0 = N->getOperand(0);
47986 SDValue N1 = N->getOperand(1);
47987 EVT VT = N->getOperand(0).getValueType();
47988 unsigned NumElts = VT.getVectorNumElements();
47989 if ((NumElts % 2) != 0)
47990 return SDValue();
47991
47992 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
47993
47994 // Shrink the operands of mul.
47995 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
47996 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
47997
47998 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
47999 // lower part is needed.
48000 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48001 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48002 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48003 : ISD::SIGN_EXTEND,
48004 DL, VT, MulLo);
48005
48006 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48007 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48008 // the higher part is also needed.
48009 SDValue MulHi =
48010 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48011 ReducedVT, NewN0, NewN1);
48012
48013 // Repack the lower part and higher part result of mul into a wider
48014 // result.
48015 // Generate shuffle functioning as punpcklwd.
48016 SmallVector<int, 16> ShuffleMask(NumElts);
48017 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48018 ShuffleMask[2 * i] = i;
48019 ShuffleMask[2 * i + 1] = i + NumElts;
48020 }
48021 SDValue ResLo =
48022 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48023 ResLo = DAG.getBitcast(ResVT, ResLo);
48024 // Generate shuffle functioning as punpckhwd.
48025 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48026 ShuffleMask[2 * i] = i + NumElts / 2;
48027 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48028 }
48029 SDValue ResHi =
48030 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48031 ResHi = DAG.getBitcast(ResVT, ResHi);
48032 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48033}
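
Per lane, the repack above rebuilds each 32-bit product from the 16-bit halves produced by pmullw and pmulhw/pmulhuw; an illustrative scalar sketch of that identity for the unsigned (MULU16) case:

#include <cstdint>

// 32-bit product rebuilt from 16-bit low/high halves (MULU16 flavour).
constexpr uint32_t mulViaHalvesU16(uint16_t a, uint16_t b) {
  uint32_t full = static_cast<uint32_t>(a) * b;
  uint16_t lo = static_cast<uint16_t>(full);        // pmullw lane
  uint16_t hi = static_cast<uint16_t>(full >> 16);  // pmulhuw lane
  return (static_cast<uint32_t>(hi) << 16) | lo;    // punpck{l,h}wd interleave
}

static_assert(mulViaHalvesU16(0xFFFF, 0xFFFF) == 0xFFFFu * 0xFFFFu, "");
static_assert(mulViaHalvesU16(1234, 5678) == 1234u * 5678u, "");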
48034
48035static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48036 EVT VT, const SDLoc &DL) {
48037
48038 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48039 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48040 DAG.getConstant(Mult, DL, VT));
48041 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48042 DAG.getConstant(Shift, DL, MVT::i8));
48043 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48044 N->getOperand(0));
48045 return Result;
48046 };
48047
48048 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48049 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48050 DAG.getConstant(Mul1, DL, VT));
48051 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48052 DAG.getConstant(Mul2, DL, VT));
48053 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48054 N->getOperand(0));
48055 return Result;
48056 };
48057
48058 switch (MulAmt) {
48059 default:
48060 break;
48061 case 11:
48062 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48063 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48064 case 21:
48065 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48066 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48067 case 41:
48068 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48069 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48070 case 22:
48071 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48072 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48073 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48074 case 19:
48075 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48076 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48077 case 37:
48078 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48079 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48080 case 73:
48081 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48082 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48083 case 13:
48084 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48085 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48086 case 23:
48087 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48088 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48089 case 26:
48090 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48091 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48092 case 28:
48093 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48094 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48095 case 29:
48096 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48097 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48098 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48099 }
48100
48101  // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
48102  // by a single LEA.
48103  // First check if this is a sum of two powers of 2 because that's easy. Then
48104  // count how many trailing zeros there are up to the first set bit.
48105 // TODO: We can do this even without LEA at a cost of two shifts and an add.
48106 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48107 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48108 if (ScaleShift >= 1 && ScaleShift < 4) {
48109 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48110 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48111 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48112 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48113 DAG.getConstant(ScaleShift, DL, MVT::i8));
48114 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48115 }
48116 }
48117
48118 return SDValue();
48119}
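
The decompositions chosen in combineMulSpecial above are ordinary arithmetic identities; a standalone sketch (not LLVM code) that spot-checks a few of them:

#include <cstdint>

constexpr bool checkMul11(uint64_t x) { return ((x * 5) << 1) + x == x * 11; }
constexpr bool checkMul21(uint64_t x) { return ((x * 5) << 2) + x == x * 21; }
constexpr bool checkMul23(uint64_t x) { return ((x * 3) << 3) - x == x * 23; }
constexpr bool checkMul26(uint64_t x) { return ((x * 5) * 5) + x == x * 26; }
constexpr bool checkMul29(uint64_t x) { return ((x * 9) * 3) + x + x == x * 29; }

static_assert(checkMul11(7) && checkMul21(7) && checkMul23(7) &&
              checkMul26(7) && checkMul29(7), "");
static_assert(checkMul11(0xDEADBEEF) && checkMul23(0xDEADBEEF), "");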
48120
48121// If the upper 17 bits of either element are zero and the upper bits of the
48122// other element are all zero/sign bits, then we can use PMADDWD, which is
48123// always at least as quick as PMULLD, except on KNL.
48124static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
48125 const X86Subtarget &Subtarget) {
48126 if (!Subtarget.hasSSE2())
48127 return SDValue();
48128
48129 if (Subtarget.isPMADDWDSlow())
48130 return SDValue();
48131
48132 EVT VT = N->getValueType(0);
48133
48134 // Only support vXi32 vectors.
48135 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48136 return SDValue();
48137
48138 // Make sure the type is legal or can split/widen to a legal type.
48139 // With AVX512 but without BWI, we would need to split v32i16.
48140 unsigned NumElts = VT.getVectorNumElements();
48141 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48142 return SDValue();
48143
48144 // With AVX512 but without BWI, we would need to split v32i16.
48145 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48146 return SDValue();
48147
48148 SDValue N0 = N->getOperand(0);
48149 SDValue N1 = N->getOperand(1);
48150
48151  // If we are zero/sign extending in two steps without SSE4.1, it's better to
48152 // reduce the vmul width instead.
48153 if (!Subtarget.hasSSE41() &&
48154 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48155 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48156 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48157 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48158 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48159 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48160 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48161 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48162 return SDValue();
48163
48164  // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48165 // the vmul width instead.
48166 if (!Subtarget.hasSSE41() &&
48167 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48168 N0.getOperand(0).getValueSizeInBits() > 128) &&
48169 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48170 N1.getOperand(0).getValueSizeInBits() > 128))
48171 return SDValue();
48172
48173 // Sign bits must extend down to the lowest i16.
48174 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48175 DAG.ComputeMaxSignificantBits(N0) > 16)
48176 return SDValue();
48177
48178 // At least one of the elements must be zero in the upper 17 bits, or can be
48179 // safely made zero without altering the final result.
48180 auto GetZeroableOp = [&](SDValue Op) {
48181 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48182 if (DAG.MaskedValueIsZero(Op, Mask17))
48183 return Op;
48184 // Mask off upper 16-bits of sign-extended constants.
48185 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48186 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
48187 DAG.getConstant(0xFFFF, SDLoc(N), VT));
48188 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48189 SDValue Src = Op.getOperand(0);
48190 // Convert sext(vXi16) to zext(vXi16).
48191 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
48192 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48193 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
48194 // which will expand the extension.
48195 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
48196 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
48197 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
48198 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48199 }
48200 }
48201    // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
48202 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
48203 N->isOnlyUserOf(Op.getNode())) {
48204 SDValue Src = Op.getOperand(0);
48205 if (Src.getScalarValueSizeInBits() == 16)
48206 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
48207 }
48208 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
48209 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
48210 N->isOnlyUserOf(Op.getNode())) {
48211 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
48212 Op.getOperand(1));
48213 }
48214 return SDValue();
48215 };
48216 SDValue ZeroN0 = GetZeroableOp(N0);
48217 SDValue ZeroN1 = GetZeroableOp(N1);
48218 if (!ZeroN0 && !ZeroN1)
48219 return SDValue();
48220 N0 = ZeroN0 ? ZeroN0 : N0;
48221 N1 = ZeroN1 ? ZeroN1 : N1;
48222
48223 // Use SplitOpsAndApply to handle AVX splitting.
48224 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48225 ArrayRef<SDValue> Ops) {
48226 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
48227 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
48228 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
48229 DAG.getBitcast(OpVT, Ops[0]),
48230 DAG.getBitcast(OpVT, Ops[1]));
48231 };
48232 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
48233 PMADDWDBuilder);
48234}
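
A scalar sketch of why the upper-17-bit requirement above makes PMADDWD usable: when everything above bit 15 of each 32-bit lane is zero, the per-lane dot product collapses to the single low-half multiply. The pmaddwdLane helper is an illustrative model, not an LLVM API.

#include <cstdint>

// One PMADDWD lane: treat a 32-bit lane as two signed i16 halves and
// accumulate the two products.
constexpr int32_t pmaddwdLane(uint32_t a, uint32_t b) {
  int32_t a0 = static_cast<int16_t>(a), a1 = static_cast<int16_t>(a >> 16);
  int32_t b0 = static_cast<int16_t>(b), b1 = static_cast<int16_t>(b >> 16);
  return a0 * b0 + a1 * b1;
}

// If the upper 17 bits of both lanes are zero (values in [0, 0x7FFF]),
// the PMADDWD lane equals the ordinary 32-bit multiply.
constexpr bool matchesMul(uint32_t a, uint32_t b) {
  return pmaddwdLane(a, b) == static_cast<int32_t>(a * b);
}

static_assert(matchesMul(0x7FFF, 0x7FFF), "");
static_assert(matchesMul(123, 4567), "");
static_assert(matchesMul(0, 0x7FFF), "");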
48235
48236static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
48237 const X86Subtarget &Subtarget) {
48238 if (!Subtarget.hasSSE2())
48239 return SDValue();
48240
48241 EVT VT = N->getValueType(0);
48242
48243 // Only support vXi64 vectors.
48244 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
48245 VT.getVectorNumElements() < 2 ||
48246 !isPowerOf2_32(VT.getVectorNumElements()))
48247 return SDValue();
48248
48249 SDValue N0 = N->getOperand(0);
48250 SDValue N1 = N->getOperand(1);
48251
48252  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
48253 // 32-bits. We can lower with this if the sign bits stretch that far.
48254 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
48255 DAG.ComputeNumSignBits(N1) > 32) {
48256 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48257 ArrayRef<SDValue> Ops) {
48258 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
48259 };
48260 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48261 PMULDQBuilder, /*CheckBWI*/false);
48262 }
48263
48264 // If the upper bits are zero we can use a single pmuludq.
48265 APInt Mask = APInt::getHighBitsSet(64, 32);
48266 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
48267 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48268 ArrayRef<SDValue> Ops) {
48269 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
48270 };
48271 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48272 PMULUDQBuilder, /*CheckBWI*/false);
48273 }
48274
48275 return SDValue();
48276}
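
An illustrative scalar model of the PMULUDQ case above: when the high 32 bits of both i64 lanes are known zero, the full 64-bit product equals the unsigned 32x32->64 multiply of the low halves. Sketch only; the helper name is ad hoc.

#include <cstdint>

// Model of one PMULUDQ lane: unsigned multiply of the low 32 bits of each
// operand, producing a 64-bit result.
constexpr uint64_t pmuludqLane(uint64_t a, uint64_t b) {
  return static_cast<uint64_t>(static_cast<uint32_t>(a)) *
         static_cast<uint32_t>(b);
}

// Precondition of the combine: the upper 32 bits of both operands are zero.
constexpr bool matchesFullMul(uint64_t a, uint64_t b) {
  return pmuludqLane(a, b) == a * b;
}

static_assert(matchesFullMul(0xFFFFFFFFull, 0xFFFFFFFFull), "");
static_assert(matchesFullMul(123456789ull, 987654321ull), "");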
48277
48278static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
48279 TargetLowering::DAGCombinerInfo &DCI,
48280 const X86Subtarget &Subtarget) {
48281 EVT VT = N->getValueType(0);
48282
48283 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
48284 return V;
48285
48286 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
48287 return V;
48288
48289 if (DCI.isBeforeLegalize() && VT.isVector())
48290 return reduceVMULWidth(N, DAG, Subtarget);
48291
48292 // Optimize a single multiply with constant into two operations in order to
48293 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
48294 if (!MulConstantOptimization)
48295 return SDValue();
48296
48297 // An imul is usually smaller than the alternative sequence.
48298 if (DAG.getMachineFunction().getFunction().hasMinSize())
48299 return SDValue();
48300
48301 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
48302 return SDValue();
48303
48304 if (VT != MVT::i64 && VT != MVT::i32)
48305 return SDValue();
48306
48307 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
48308 if (!C)
48309 return SDValue();
48310 if (isPowerOf2_64(C->getZExtValue()))
48311 return SDValue();
48312
48313 int64_t SignMulAmt = C->getSExtValue();
48314 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
48315 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
48316
48317 SDLoc DL(N);
48318 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
48319 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48320 DAG.getConstant(AbsMulAmt, DL, VT));
48321 if (SignMulAmt < 0)
48322 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48323 NewMul);
48324
48325 return NewMul;
48326 }
48327
48328 uint64_t MulAmt1 = 0;
48329 uint64_t MulAmt2 = 0;
48330 if ((AbsMulAmt % 9) == 0) {
48331 MulAmt1 = 9;
48332 MulAmt2 = AbsMulAmt / 9;
48333 } else if ((AbsMulAmt % 5) == 0) {
48334 MulAmt1 = 5;
48335 MulAmt2 = AbsMulAmt / 5;
48336 } else if ((AbsMulAmt % 3) == 0) {
48337 MulAmt1 = 3;
48338 MulAmt2 = AbsMulAmt / 3;
48339 }
48340
48341 SDValue NewMul;
48342 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
48343 if (MulAmt2 &&
48344 (isPowerOf2_64(MulAmt2) ||
48345 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
48346
48347 if (isPowerOf2_64(MulAmt2) &&
48348 !(SignMulAmt >= 0 && N->hasOneUse() &&
48349 N->use_begin()->getOpcode() == ISD::ADD))
48350      // If the second multiplier is a power of 2, issue it first. We want the multiply by
48351 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
48352 // is an add. Only do this for positive multiply amounts since the
48353 // negate would prevent it from being used as an address mode anyway.
48354 std::swap(MulAmt1, MulAmt2);
48355
48356 if (isPowerOf2_64(MulAmt1))
48357 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48358 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
48359 else
48360 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48361 DAG.getConstant(MulAmt1, DL, VT));
48362
48363 if (isPowerOf2_64(MulAmt2))
48364 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
48365 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
48366 else
48367 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
48368 DAG.getConstant(MulAmt2, DL, VT));
48369
48370 // Negate the result.
48371 if (SignMulAmt < 0)
48372 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48373 NewMul);
48374 } else if (!Subtarget.slowLEA())
48375 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
48376
48377 if (!NewMul) {
48378 assert(C->getZExtValue() != 0 &&
48379        C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
48380        "Both cases that could cause potential overflows should have "
48381        "already been handled.");
48382 if (isPowerOf2_64(AbsMulAmt - 1)) {
48383 // (mul x, 2^N + 1) => (add (shl x, N), x)
48384 NewMul = DAG.getNode(
48385 ISD::ADD, DL, VT, N->getOperand(0),
48386 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48387 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
48388 MVT::i8)));
48389 // To negate, subtract the number from zero
48390 if (SignMulAmt < 0)
48391 NewMul = DAG.getNode(ISD::SUB, DL, VT,
48392 DAG.getConstant(0, DL, VT), NewMul);
48393 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
48394 // (mul x, 2^N - 1) => (sub (shl x, N), x)
48395 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48396 DAG.getConstant(Log2_64(AbsMulAmt + 1),
48397 DL, MVT::i8));
48398 // To negate, reverse the operands of the subtract.
48399 if (SignMulAmt < 0)
48400 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
48401 else
48402 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
48403 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
48404 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
48405 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48406 DAG.getConstant(Log2_64(AbsMulAmt - 2),
48407 DL, MVT::i8));
48408 NewMul = DAG.getNode(
48409 ISD::ADD, DL, VT, NewMul,
48410 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48411 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
48412 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48413 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48414 DAG.getConstant(Log2_64(AbsMulAmt + 2),
48415 DL, MVT::i8));
48416 NewMul = DAG.getNode(
48417 ISD::SUB, DL, VT, NewMul,
48418 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48419 }
48420 }
48421
48422 return NewMul;
48423}
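
The shift-and-add/sub forms used above for 2^N +/- 1 multipliers, and the negation via subtraction from zero, are plain identities modulo 2^64; a spot-check sketch (not LLVM code):

#include <cstdint>

// (mul x, 2^N + 1) => (add (shl x, N), x)
constexpr bool checkPow2Plus1(uint64_t x, unsigned N) {
  return (x << N) + x == x * ((1ull << N) + 1);
}
// (mul x, 2^N - 1) => (sub (shl x, N), x)
constexpr bool checkPow2Minus1(uint64_t x, unsigned N) {
  return (x << N) - x == x * ((1ull << N) - 1);
}
// Negative amounts: negate the result by subtracting it from zero.
constexpr bool checkNegate(uint64_t x, uint64_t amt) {
  return 0 - x * amt == x * (0 - amt);
}

static_assert(checkPow2Plus1(0xABCDull, 5) && checkPow2Minus1(0xABCDull, 5), "");
static_assert(checkNegate(0xABCDull, 33), "");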
48424
48425// Try to form a MULHU or MULHS node by looking for
48426// (srl (mul ext, ext), 16)
48427// TODO: This is X86 specific because we want to be able to handle wide types
48428// before type legalization. But we can only do it if the vector will be
48429// legalized via widening/splitting. Type legalization can't handle promotion
48430// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48431// combiner.
48432static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48433 const X86Subtarget &Subtarget) {
48434 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48435        "SRL or SRA node is required here!");
48436 SDLoc DL(N);
48437
48438 if (!Subtarget.hasSSE2())
48439 return SDValue();
48440
48441 // The operation feeding into the shift must be a multiply.
48442 SDValue ShiftOperand = N->getOperand(0);
48443 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
48444 return SDValue();
48445
48446 // Input type should be at least vXi32.
48447 EVT VT = N->getValueType(0);
48448 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
48449 return SDValue();
48450
48451 // Need a shift by 16.
48452 APInt ShiftAmt;
48453 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48454 ShiftAmt != 16)
48455 return SDValue();
48456
48457 SDValue LHS = ShiftOperand.getOperand(0);
48458 SDValue RHS = ShiftOperand.getOperand(1);
48459
48460 unsigned ExtOpc = LHS.getOpcode();
48461 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
48462 RHS.getOpcode() != ExtOpc)
48463 return SDValue();
48464
48465 // Peek through the extends.
48466 LHS = LHS.getOperand(0);
48467 RHS = RHS.getOperand(0);
48468
48469 // Ensure the input types match.
48470 EVT MulVT = LHS.getValueType();
48471 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
48472 return SDValue();
48473
48474 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
48475 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
48476
48477 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48478 return DAG.getNode(ExtOpc, DL, VT, Mulh);
48479}
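
A scalar sketch of the pattern matched above: shifting the widened product right by 16 recovers the high half of the 16-bit multiply, i.e. MULHU for zero-extended inputs and MULHS for sign-extended inputs. The signed case assumes an arithmetic right shift, which the compilers and targets involved provide.

#include <cstdint>

// Model of MULHU on i16: the high 16 bits of the 32-bit unsigned product.
constexpr uint16_t mulhu16(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((static_cast<uint32_t>(a) * b) >> 16);
}
// Model of MULHS on i16: the high 16 bits of the 32-bit signed product.
// Assumes arithmetic right shift for signed values.
constexpr int16_t mulhs16(int16_t a, int16_t b) {
  return static_cast<int16_t>((static_cast<int32_t>(a) * b) >> 16);
}

// (srl (mul (zext a), (zext b)), 16) == zext(mulhu16(a, b))
constexpr bool checkMulhu(uint16_t a, uint16_t b) {
  return ((static_cast<uint32_t>(a) * b) >> 16) == mulhu16(a, b);
}
// (sra (mul (sext a), (sext b)), 16) == sext(mulhs16(a, b))
constexpr bool checkMulhs(int16_t a, int16_t b) {
  return ((static_cast<int32_t>(a) * b) >> 16) == mulhs16(a, b);
}

static_assert(checkMulhu(0xFFFF, 0xFFFF) && checkMulhu(1234, 5678), "");
static_assert(checkMulhs(-32768, 32767) && checkMulhs(-5, 7), "");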
48480
48481static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
48482 SDValue N0 = N->getOperand(0);
48483 SDValue N1 = N->getOperand(1);
48484 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
48485 EVT VT = N0.getValueType();
48486
48487 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48488 // since the result of setcc_c is all zero's or all ones.
48489 if (VT.isInteger() && !VT.isVector() &&
48490 N1C && N0.getOpcode() == ISD::AND &&
48491 N0.getOperand(1).getOpcode() == ISD::Constant) {
48492 SDValue N00 = N0.getOperand(0);
48493 APInt Mask = N0.getConstantOperandAPInt(1);
48494 Mask <<= N1C->getAPIntValue();
48495 bool MaskOK = false;
48496 // We can handle cases concerning bit-widening nodes containing setcc_c if
48497 // we carefully interrogate the mask to make sure we are semantics
48498 // preserving.
48499 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
48500 // of the underlying setcc_c operation if the setcc_c was zero extended.
48501 // Consider the following example:
48502 // zext(setcc_c) -> i32 0x0000FFFF
48503 // c1 -> i32 0x0000FFFF
48504 // c2 -> i32 0x00000001
48505 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48506 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
48507 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
48508 MaskOK = true;
48509 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
48510 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48511 MaskOK = true;
48512 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
48513 N00.getOpcode() == ISD::ANY_EXTEND) &&
48514 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48515 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
48516 }
48517 if (MaskOK && Mask != 0) {
48518 SDLoc DL(N);
48519 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
48520 }
48521 }
48522
48523 return SDValue();
48524}
48525
48526static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
48527 const X86Subtarget &Subtarget) {
48528 SDValue N0 = N->getOperand(0);
48529 SDValue N1 = N->getOperand(1);
48530 EVT VT = N0.getValueType();
48531 unsigned Size = VT.getSizeInBits();
48532
48533 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48534 return V;
48535
48536 // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
48537 // into (shl (sext_in_reg a), [56,48,32,24,16] - SarConst) or
48538 // into (ashr (sext_in_reg a), SarConst - [56,48,32,24,16])
48539 // depending on the sign of (SarConst - [56,48,32,24,16]).
48540
48541 // sexts on X86 are MOVs. The MOVs have the same code size
48542 // as the SHIFTs above (only a SHIFT by 1 has a smaller encoding).
48543 // However, the MOVs have two advantages over a SHIFT:
48544 // 1. MOVs can write to a register that differs from the source.
48545 // 2. MOVs accept memory operands.
48546
48547 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48548 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48549 N0.getOperand(1).getOpcode() != ISD::Constant)
48550 return SDValue();
48551
48552 SDValue N00 = N0.getOperand(0);
48553 SDValue N01 = N0.getOperand(1);
48554 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
48555 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
48556 EVT CVT = N1.getValueType();
48557
48558 if (SarConst.isNegative())
48559 return SDValue();
48560
48561 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48562 unsigned ShiftSize = SVT.getSizeInBits();
48563 // Skip types without a corresponding sext/zext and ShlConst values
48564 // that are not one of [56,48,32,24,16].
48565 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48566 continue;
48567 SDLoc DL(N);
48568 SDValue NN =
48569 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48570 SarConst = SarConst - (Size - ShiftSize);
48571 if (SarConst == 0)
48572 return NN;
48573 if (SarConst.isNegative())
48574 return DAG.getNode(ISD::SHL, DL, VT, NN,
48575 DAG.getConstant(-SarConst, DL, CVT));
48576 return DAG.getNode(ISD::SRA, DL, VT, NN,
48577 DAG.getConstant(SarConst, DL, CVT));
48578 }
48579 return SDValue();
48580}
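// A worked example of the fold above (illustrative only), taking i64 values
// and ShlConst == 56 so that SVT == i8:
//   (sra (shl X, 56), 58) --> (sra (sext_in_reg X, i8), 2)
//   (sra (shl X, 56), 40) --> (shl (sext_in_reg X, i8), 16)
// The sign extension of the low byte is made explicit, and the remaining
// shift amount is |SarConst - 56| with its direction given by the sign of
// that difference.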
48581
48582static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48583 TargetLowering::DAGCombinerInfo &DCI,
48584 const X86Subtarget &Subtarget) {
48585 SDValue N0 = N->getOperand(0);
48586 SDValue N1 = N->getOperand(1);
48587 EVT VT = N0.getValueType();
48588
48589 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48590 return V;
48591
48592 // Only do this on the last DAG combine as it can interfere with other
48593 // combines.
48594 if (!DCI.isAfterLegalizeDAG())
48595 return SDValue();
48596
48597 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48598 // TODO: This is a generic DAG combine that became an x86-only combine to
48599 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48600 // and-not ('andn').
48601 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48602 return SDValue();
48603
48604 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48605 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48606 if (!ShiftC || !AndC)
48607 return SDValue();
48608
48609 // If we can shrink the constant mask below 8 bits or 32 bits, then this
48610 // transform should reduce code size. It may also enable secondary transforms
48611 // from improved known-bits analysis or instruction selection.
48612 APInt MaskVal = AndC->getAPIntValue();
48613
48614 // If this can be matched by a zero extend, don't optimize.
48615 if (MaskVal.isMask()) {
48616 unsigned TO = MaskVal.countr_one();
48617 if (TO >= 8 && isPowerOf2_32(TO))
48618 return SDValue();
48619 }
48620
48621 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48622 unsigned OldMaskSize = MaskVal.getSignificantBits();
48623 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48624 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48625 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48626 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48627 SDLoc DL(N);
48628 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48629 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48630 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48631 }
48632 return SDValue();
48633}
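// A worked example of the mask-shrinking fold above (illustrative only):
//   srl (and X, 0xFF0), 4 --> and (srl X, 4), 0xFF
// The original mask needs 12 significant bits while the shifted mask fits in
// 8, so the AND gets a shorter encoding; 0xFF0 is not a low-bits mask, so
// the zero-extend bail-out above does not fire.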
48634
48635static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48636 const X86Subtarget &Subtarget) {
48637 unsigned Opcode = N->getOpcode();
48638 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48639
48640 SDLoc DL(N);
48641 EVT VT = N->getValueType(0);
48642 SDValue N0 = N->getOperand(0);
48643 SDValue N1 = N->getOperand(1);
48644 EVT SrcVT = N0.getValueType();
48645
48646 SDValue BC0 =
48647 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48648 SDValue BC1 =
48649 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48650
48651 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48652 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
48653 // truncation trees that help us avoid lane-crossing shuffles.
48654 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48655 // TODO: We don't handle vXf64 shuffles yet.
48656 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48657 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48658 SmallVector<SDValue> ShuffleOps;
48659 SmallVector<int> ShuffleMask, ScaledMask;
48660 SDValue Vec = peekThroughBitcasts(BCSrc);
48661 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48662 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
48663 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48664 // shuffle to a v4X64 width - we can probably relax this in the future.
48665 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48666 ShuffleOps[0].getValueType().is256BitVector() &&
48667 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48668 SDValue Lo, Hi;
48669 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48670 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48671 Lo = DAG.getBitcast(SrcVT, Lo);
48672 Hi = DAG.getBitcast(SrcVT, Hi);
48673 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48674 Res = DAG.getBitcast(ShufVT, Res);
48675 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48676 return DAG.getBitcast(VT, Res);
48677 }
48678 }
48679 }
48680 }
48681
48682 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48683 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48684 // If either/both ops are a shuffle that can scale to v2x64,
48685 // then see if we can perform this as a v4x32 post shuffle.
48686 SmallVector<SDValue> Ops0, Ops1;
48687 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48688 bool IsShuf0 =
48689 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48690 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48691 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48692 bool IsShuf1 =
48693 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48694 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48695 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48696 if (IsShuf0 || IsShuf1) {
48697 if (!IsShuf0) {
48698 Ops0.assign({BC0});
48699 ScaledMask0.assign({0, 1});
48700 }
48701 if (!IsShuf1) {
48702 Ops1.assign({BC1});
48703 ScaledMask1.assign({0, 1});
48704 }
48705
48706 SDValue LHS, RHS;
48707 int PostShuffle[4] = {-1, -1, -1, -1};
48708 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
48709 if (M < 0)
48710 return true;
48711 Idx = M % 2;
48712 SDValue Src = Ops[M / 2];
48713 if (!LHS || LHS == Src) {
48714 LHS = Src;
48715 return true;
48716 }
48717 if (!RHS || RHS == Src) {
48718 Idx += 2;
48719 RHS = Src;
48720 return true;
48721 }
48722 return false;
48723 };
48724 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
48725 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
48726 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
48727 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
48728 LHS = DAG.getBitcast(SrcVT, LHS);
48729 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
48730 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48731 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
48732 Res = DAG.getBitcast(ShufVT, Res);
48733 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
48734 return DAG.getBitcast(VT, Res);
48735 }
48736 }
48737 }
48738
48739 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
48740 if (VT.is256BitVector() && Subtarget.hasInt256()) {
48741 SmallVector<int> Mask0, Mask1;
48742 SmallVector<SDValue> Ops0, Ops1;
48743 SmallVector<int, 2> ScaledMask0, ScaledMask1;
48744 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48745 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48746 !Ops0.empty() && !Ops1.empty() &&
48747 all_of(Ops0,
48748 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48749 all_of(Ops1,
48750 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48751 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48752 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
48753 SDValue Op00 = peekThroughBitcasts(Ops0.front());
48754 SDValue Op10 = peekThroughBitcasts(Ops1.front());
48755 SDValue Op01 = peekThroughBitcasts(Ops0.back());
48756 SDValue Op11 = peekThroughBitcasts(Ops1.back());
48757 if ((Op00 == Op11) && (Op01 == Op10)) {
48758 std::swap(Op10, Op11);
48759 ShuffleVectorSDNode::commuteMask(ScaledMask1);
48760 }
48761 if ((Op00 == Op10) && (Op01 == Op11)) {
48762 const int Map[4] = {0, 2, 1, 3};
48763 SmallVector<int, 4> ShuffleMask(
48764 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
48765 Map[ScaledMask1[1]]});
48766 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
48767 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
48768 DAG.getBitcast(SrcVT, Op01));
48769 Res = DAG.getBitcast(ShufVT, Res);
48770 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
48771 return DAG.getBitcast(VT, Res);
48772 }
48773 }
48774 }
48775
48776 return SDValue();
48777}
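// An illustrative example of the HOP(SHUFFLE,SHUFFLE) -> SHUFFLE(HOP)
// rewrites above, for v4i32 HADD where both inputs only permute whole
// 64-bit halves:
//   HADD(PSHUFD<2,3,0,1>(X), PSHUFD<2,3,0,1>(Y)) == PSHUFD<1,0,3,2>(HADD(X,Y))
// since HADD(X,Y) = [X0+X1, X2+X3, Y0+Y1, Y2+Y3] and swapping the 64-bit
// halves of each input merely swaps the corresponding pairs of sums.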
48778
48779static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48780 TargetLowering::DAGCombinerInfo &DCI,
48781 const X86Subtarget &Subtarget) {
48782 unsigned Opcode = N->getOpcode();
48783 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48784 "Unexpected pack opcode");
48785
48786 EVT VT = N->getValueType(0);
48787 SDValue N0 = N->getOperand(0);
48788 SDValue N1 = N->getOperand(1);
48789 unsigned NumDstElts = VT.getVectorNumElements();
48790 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48791 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48792 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48793 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48794 "Unexpected PACKSS/PACKUS input type");
48795
48796 bool IsSigned = (X86ISD::PACKSS == Opcode);
48797
48798 // Constant Folding.
48799 APInt UndefElts0, UndefElts1;
48800 SmallVector<APInt, 32> EltBits0, EltBits1;
48801 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48802 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48803 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
48804 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
48805 unsigned NumLanes = VT.getSizeInBits() / 128;
48806 unsigned NumSrcElts = NumDstElts / 2;
48807 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48808 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48809
48810 APInt Undefs(NumDstElts, 0);
48811 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48812 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48813 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48814 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48815 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48816 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48817
48818 if (UndefElts[SrcIdx]) {
48819 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48820 continue;
48821 }
48822
48823 APInt &Val = EltBits[SrcIdx];
48824 if (IsSigned) {
48825 // PACKSS: Truncate signed value with signed saturation.
48826 // Source values less than dst minint are saturated to minint.
48827 // Source values greater than dst maxint are saturated to maxint.
48828 if (Val.isSignedIntN(DstBitsPerElt))
48829 Val = Val.trunc(DstBitsPerElt);
48830 else if (Val.isNegative())
48831 Val = APInt::getSignedMinValue(DstBitsPerElt);
48832 else
48833 Val = APInt::getSignedMaxValue(DstBitsPerElt);
48834 } else {
48835 // PACKUS: Truncate signed value with unsigned saturation.
48836 // Source values less than zero are saturated to zero.
48837 // Source values greater than dst maxuint are saturated to maxuint.
48838 if (Val.isIntN(DstBitsPerElt))
48839 Val = Val.trunc(DstBitsPerElt);
48840 else if (Val.isNegative())
48841 Val = APInt::getZero(DstBitsPerElt);
48842 else
48843 Val = APInt::getAllOnes(DstBitsPerElt);
48844 }
48845 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48846 }
48847 }
48848
48849 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48850 }
48851
48852 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48853 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48854 return V;
48855
48856 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
48857 // truncate to create a larger truncate.
48858 if (Subtarget.hasAVX512() &&
48859 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48860 N0.getOperand(0).getValueType() == MVT::v8i32) {
48861 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48862 (!IsSigned &&
48863 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48864 if (Subtarget.hasVLX())
48865 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48866
48867 // Widen input to v16i32 so we can truncate that.
48868 SDLoc dl(N);
48869 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48870 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48871 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48872 }
48873 }
48874
48875 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48876 if (VT.is128BitVector()) {
48877 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48878 SDValue Src0, Src1;
48879 if (N0.getOpcode() == ExtOpc &&
48880 N0.getOperand(0).getValueType().is64BitVector() &&
48881 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48882 Src0 = N0.getOperand(0);
48883 }
48884 if (N1.getOpcode() == ExtOpc &&
48885 N1.getOperand(0).getValueType().is64BitVector() &&
48886 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48887 Src1 = N1.getOperand(0);
48888 }
48889 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48890 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48891 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48892 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48893 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48894 }
48895
48896 // Try again with pack(*_extend_vector_inreg, undef).
48897 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48898 : ISD::ZERO_EXTEND_VECTOR_INREG;
48899 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48900 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48901 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48902 DAG);
48903 }
48904
48905 // Attempt to combine as shuffle.
48906 SDValue Op(N, 0);
48907 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48908 return Res;
48909
48910 return SDValue();
48911}
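// A minimal scalar model of the per-element saturation performed by the
// constant folding above (an illustrative sketch only; packSat16To8 is a
// hypothetical helper, assuming <cstdint> types): PACKSS clamps each signed
// i16 to [-128, 127], PACKUS clamps it to [0, 255], then both truncate to
// 8 bits.
static inline uint8_t packSat16To8(int16_t V, bool IsSigned) {
  if (IsSigned) {
    // PACKSS: signed saturation; the truncation keeps the 8-bit pattern.
    int16_t C = V < -128 ? int16_t(-128) : (V > 127 ? int16_t(127) : V);
    return static_cast<uint8_t>(C);
  }
  // PACKUS: unsigned saturation of a signed source value.
  return static_cast<uint8_t>(V < 0 ? 0 : (V > 255 ? 255 : V));
}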
48912
48913static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48914 TargetLowering::DAGCombinerInfo &DCI,
48915 const X86Subtarget &Subtarget) {
48916 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48917 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48918 "Unexpected horizontal add/sub opcode");
48919
48920 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48921 MVT VT = N->getSimpleValueType(0);
48922 SDValue LHS = N->getOperand(0);
48923 SDValue RHS = N->getOperand(1);
48924
48925 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
48926 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48927 LHS.getOpcode() == RHS.getOpcode() &&
48928 LHS.getValueType() == RHS.getValueType() &&
48929 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48930 SDValue LHS0 = LHS.getOperand(0);
48931 SDValue LHS1 = LHS.getOperand(1);
48932 SDValue RHS0 = RHS.getOperand(0);
48933 SDValue RHS1 = RHS.getOperand(1);
48934 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48935 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48936 SDLoc DL(N);
48937 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48938 LHS0.isUndef() ? LHS1 : LHS0,
48939 RHS0.isUndef() ? RHS1 : RHS0);
48940 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48941 Res = DAG.getBitcast(ShufVT, Res);
48942 SDValue NewLHS =
48943 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48944 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48945 SDValue NewRHS =
48946 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48947 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48948 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48949 DAG.getBitcast(VT, NewRHS));
48950 }
48951 }
48952 }
48953
48954 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48955 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48956 return V;
48957
48958 return SDValue();
48959}
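// An illustrative example of the HOP(HOP'(X,X),HOP'(Y,Y)) rewrite above, for
// v4i32: HADD(X,Y) = [X0+X1, X2+X3, Y0+Y1, Y2+Y3], so
//   PSHUFD<0,1,0,1>(HADD(X,Y)) reproduces HADD(X,X) and
//   PSHUFD<2,3,2,3>(HADD(X,Y)) reproduces HADD(Y,Y).
// Computing HADD(X,Y) once and recreating both operands with PSHUFDs leaves
// two horizontal ops instead of three, which is preferable when horizontal
// ops are slow on the target.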
48960
48961static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
48962 TargetLowering::DAGCombinerInfo &DCI,
48963 const X86Subtarget &Subtarget) {
48964 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48965 X86ISD::VSRL == N->getOpcode()) &&
48966 "Unexpected shift opcode");
48967 EVT VT = N->getValueType(0);
48968 SDValue N0 = N->getOperand(0);
48969 SDValue N1 = N->getOperand(1);
48970
48971 // Shift zero -> zero.
48972 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48973 return DAG.getConstant(0, SDLoc(N), VT);
48974
48975 // Detect constant shift amounts.
48976 APInt UndefElts;
48977 SmallVector<APInt, 32> EltBits;
48978 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
48979 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48980 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
48981 EltBits[0].getZExtValue(), DAG);
48982 }
48983
48984 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48985 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
48986 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
48987 return SDValue(N, 0);
48988
48989 return SDValue();
48990}
48991
48992static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
48993 TargetLowering::DAGCombinerInfo &DCI,
48994 const X86Subtarget &Subtarget) {
48995 unsigned Opcode = N->getOpcode();
48996 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48997 X86ISD::VSRLI == Opcode) &&
48998 "Unexpected shift opcode");
48999 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49000 EVT VT = N->getValueType(0);
49001 SDValue N0 = N->getOperand(0);
49002 SDValue N1 = N->getOperand(1);
49003 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49004 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49005 "Unexpected value type");
49006 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49007
49008 // (shift undef, X) -> 0
49009 if (N0.isUndef())
49010 return DAG.getConstant(0, SDLoc(N), VT);
49011
49012 // Out of range logical bit shifts are guaranteed to be zero.
49013 // Out of range arithmetic bit shifts splat the sign bit.
49014 unsigned ShiftVal = N->getConstantOperandVal(1);
49015 if (ShiftVal >= NumBitsPerElt) {
49016 if (LogicalShift)
49017 return DAG.getConstant(0, SDLoc(N), VT);
49018 ShiftVal = NumBitsPerElt - 1;
49019 }
49020
49021 // (shift X, 0) -> X
49022 if (!ShiftVal)
49023 return N0;
49024
49025 // (shift 0, C) -> 0
49026 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49027 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49028 // result are all zeros, not undef.
49029 return DAG.getConstant(0, SDLoc(N), VT);
49030
49031 // (VSRAI -1, C) -> -1
49032 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49033 // N0 is all ones or undef. We guarantee that the bits shifted into the
49034 // result are all ones, not undef.
49035 return DAG.getConstant(-1, SDLoc(N), VT);
49036
49037 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49038 unsigned NewShiftVal = Amt0 + Amt1;
49039 if (NewShiftVal >= NumBitsPerElt) {
49040 // Out of range logical bit shifts are guaranteed to be zero.
49041 // Out of range arithmetic bit shifts splat the sign bit.
49042 if (LogicalShift)
49043 return DAG.getConstant(0, SDLoc(N), VT);
49044 NewShiftVal = NumBitsPerElt - 1;
49045 }
49046 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49047 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49048 };
49049
49050 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49051 if (Opcode == N0.getOpcode())
49052 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49053
49054 // (shl (add X, X), C) -> (shl X, (C + 1))
49055 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49056 N0.getOperand(0) == N0.getOperand(1))
49057 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49058
49059 // We can decode 'whole byte' logical bit shifts as shuffles.
49060 if (LogicalShift && (ShiftVal % 8) == 0) {
49061 SDValue Op(N, 0);
49062 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49063 return Res;
49064 }
49065
49066 auto TryConstantFold = [&](SDValue V) {
49067 APInt UndefElts;
49068 SmallVector<APInt, 32> EltBits;
49069 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
49070 return SDValue();
49071 assert(EltBits.size() == VT.getVectorNumElements() &&
49072 "Unexpected shift value type");
49073 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
49074 // created an undef input due to no input bits being demanded, but the user
49075 // still expects 0 in the other bits.
49076 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
49077 APInt &Elt = EltBits[i];
49078 if (UndefElts[i])
49079 Elt = 0;
49080 else if (X86ISD::VSHLI == Opcode)
49081 Elt <<= ShiftVal;
49082 else if (X86ISD::VSRAI == Opcode)
49083 Elt.ashrInPlace(ShiftVal);
49084 else
49085 Elt.lshrInPlace(ShiftVal);
49086 }
49087 // Reset undef elements since they were zeroed above.
49088 UndefElts = 0;
49089 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
49090 };
49091
49092 // Constant Folding.
49093 if (N->isOnlyUserOf(N0.getNode())) {
49094 if (SDValue C = TryConstantFold(N0))
49095 return C;
49096
49097 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
49098 // Don't break NOT patterns.
49099 SDValue BC = peekThroughOneUseBitcasts(N0);
49100 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
49101 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
49102 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
49103 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
49104 SDLoc DL(N);
49105 SDValue LHS = DAG.getNode(Opcode, DL, VT,
49106 DAG.getBitcast(VT, BC.getOperand(0)), N1);
49107 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
49108 }
49109 }
49110 }
49111
49112 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49113 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
49114 DCI))
49115 return SDValue(N, 0);
49116
49117 return SDValue();
49118}
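// Worked examples of the constant-amount merges above (illustrative only),
// for v8i16 where NumBitsPerElt == 16:
//   (VSRLI (VSRLI X, 3), 7) --> (VSRLI X, 10)
//   (VSRLI (VSRLI X, 9), 9) --> all-zeros      (logical shift out of range)
//   (VSRAI (VSRAI X, 9), 9) --> (VSRAI X, 15)  (arithmetic shift clamps)
//   (VSHLI (add X, X), 2)   --> (VSHLI X, 3)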
49119
49120static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
49121 TargetLowering::DAGCombinerInfo &DCI,
49122 const X86Subtarget &Subtarget) {
49123 EVT VT = N->getValueType(0);
49124 unsigned Opcode = N->getOpcode();
49125 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
49126 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
49127 Opcode == ISD::INSERT_VECTOR_ELT) &&
49128 "Unexpected vector insertion");
49129
49130 SDValue Vec = N->getOperand(0);
49131 SDValue Scl = N->getOperand(1);
49132 SDValue Idx = N->getOperand(2);
49133
49134 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
49135 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
49136 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
49137
49138 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
49139 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49140 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49141 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49142 APInt::getAllOnes(NumBitsPerElt), DCI))
49143 return SDValue(N, 0);
49144 }
49145
49146 // Attempt to combine insertion patterns to a shuffle.
49147 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
49148 SDValue Op(N, 0);
49149 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49150 return Res;
49151 }
49152
49153 return SDValue();
49154}
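// An illustrative example of the simplest fold above:
//   insert_vector_elt(undef v4i32, x, 0) --> scalar_to_vector(x)
// Inserting into lane 0 of an undef vector needs no blend at all.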
49155
49156/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
49157/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
49158/// OR -> CMPNEQSS.
49159static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
49160 TargetLowering::DAGCombinerInfo &DCI,
49161 const X86Subtarget &Subtarget) {
49162 unsigned opcode;
49163
49164 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
49165 // we're requiring SSE2 for both.
49166 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
49167 SDValue N0 = N->getOperand(0);
49168 SDValue N1 = N->getOperand(1);
49169 SDValue CMP0 = N0.getOperand(1);
49170 SDValue CMP1 = N1.getOperand(1);
49171 SDLoc DL(N);
49172
49173 // The SETCCs should both refer to the same CMP.
49174 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
49175 return SDValue();
49176
49177 SDValue CMP00 = CMP0->getOperand(0);
49178 SDValue CMP01 = CMP0->getOperand(1);
49179 EVT VT = CMP00.getValueType();
49180
49181 if (VT == MVT::f32 || VT == MVT::f64 ||
49182 (VT == MVT::f16 && Subtarget.hasFP16())) {
49183 bool ExpectingFlags = false;
49184 // Check for any users that want flags:
49185 for (const SDNode *U : N->uses()) {
49186 if (ExpectingFlags)
49187 break;
49188
49189 switch (U->getOpcode()) {
49190 default:
49191 case ISD::BR_CC:
49192 case ISD::BRCOND:
49193 case ISD::SELECT:
49194 ExpectingFlags = true;
49195 break;
49196 case ISD::CopyToReg:
49197 case ISD::SIGN_EXTEND:
49198 case ISD::ZERO_EXTEND:
49199 case ISD::ANY_EXTEND:
49200 break;
49201 }
49202 }
49203
49204 if (!ExpectingFlags) {
49205 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
49206 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
49207
49208 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
49209 X86::CondCode tmp = cc0;
49210 cc0 = cc1;
49211 cc1 = tmp;
49212 }
49213
49214 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
49215 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
49216 // FIXME: need symbolic constants for these magic numbers.
49217 // See X86ATTInstPrinter.cpp:printSSECC().
49218 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
49219 if (Subtarget.hasAVX512()) {
49220 SDValue FSetCC =
49221 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
49222 DAG.getTargetConstant(x86cc, DL, MVT::i8));
49223 // Need to fill with zeros to ensure the bitcast will produce zeroes
49224 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
49225 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
49226 DAG.getConstant(0, DL, MVT::v16i1),
49227 FSetCC, DAG.getIntPtrConstant(0, DL));
49228 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
49229 N->getSimpleValueType(0));
49230 }
49231 SDValue OnesOrZeroesF =
49232 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
49233 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
49234
49235 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
49236 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
49237
49238 if (is64BitFP && !Subtarget.is64Bit()) {
49239 // On a 32-bit target, we cannot bitcast the 64-bit float to a
49240 // 64-bit integer, since that's not a legal type. Since
49241 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
49242 // bits, but can do this little dance to extract the lowest 32 bits
49243 // and work with those going forward.
49244 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
49245 OnesOrZeroesF);
49246 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
49247 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
49248 Vector32, DAG.getIntPtrConstant(0, DL));
49249 IntVT = MVT::i32;
49250 }
49251
49252 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
49253 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
49254 DAG.getConstant(1, DL, IntVT));
49255 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
49256 ANDed);
49257 return OneBitOfTruth;
49258 }
49259 }
49260 }
49261 }
49262 return SDValue();
49263}
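// An illustrative example of the fold above (SSE path, illustrative only):
// the DAG for an ordered floating-point equality,
//   and (setcc_e (fcmp x, y)), (setcc_np (fcmp x, y)),
// becomes a single CMPEQSS producing an all-ones/all-zeros mask, followed by
// an AND with 1 and a truncate to extract the low bit, instead of
// materializing two flag-based setcc results from one UCOMISS.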
49264
49265/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
49266static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
49267 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49268
49269 MVT VT = N->getSimpleValueType(0);
49270 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
49271 return SDValue();
49272
49273 SDValue X, Y;
49274 SDValue N0 = N->getOperand(0);
49275 SDValue N1 = N->getOperand(1);
49276
49277 if (SDValue Not = IsNOT(N0, DAG)) {
49278 X = Not;
49279 Y = N1;
49280 } else if (SDValue Not = IsNOT(N1, DAG)) {
49281 X = Not;
49282 Y = N0;
49283 } else
49284 return SDValue();
49285
49286 X = DAG.getBitcast(VT, X);
49287 Y = DAG.getBitcast(VT, Y);
49288 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49289}
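// An illustrative example of the fold above, for v4i32:
//   and (xor X, splat(-1)), Y --> X86ISD::ANDNP X, Y
// which selects to a single PANDN instead of a PXOR with all-ones followed
// by a PAND.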
49290
49291/// Try to fold:
49292/// and (vector_shuffle<Z,...,Z>
49293/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49294/// ->
49295/// andnp (vector_shuffle<Z,...,Z>
49296/// (insert_vector_elt undef, X, Z), undef), Y
49297static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49298 const X86Subtarget &Subtarget) {
49299 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49300
49301 EVT VT = N->getValueType(0);
49302 // Do not split 256-bit and 512-bit vectors with SSE2, as doing so overwrites
49303 // the original value and requires extra moves.
49304 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49305 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49306 return SDValue();
49307
49308 auto GetNot = [&DAG](SDValue V) {
49309 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49310 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49311 // end-users are ISD::AND, including cases such as
49312 // (and(extract_vector_element(SVN), Y)).
49313 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49314 !SVN->getOperand(1).isUndef()) {
49315 return SDValue();
49316 }
49317 SDValue IVEN = SVN->getOperand(0);
49318 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49319 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49320 return SDValue();
49321 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49322 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49323 return SDValue();
49324 SDValue Src = IVEN.getOperand(1);
49325 if (SDValue Not = IsNOT(Src, DAG)) {
49326 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49327 SDValue NotIVEN =
49328 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49329 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49330 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49331 SVN->getOperand(1), SVN->getMask());
49332 }
49333 return SDValue();
49334 };
49335
49336 SDValue X, Y;
49337 SDValue N0 = N->getOperand(0);
49338 SDValue N1 = N->getOperand(1);
49339
49340 if (SDValue Not = GetNot(N0)) {
49341 X = Not;
49342 Y = N1;
49343 } else if (SDValue Not = GetNot(N1)) {
49344 X = Not;
49345 Y = N0;
49346 } else
49347 return SDValue();
49348
49349 X = DAG.getBitcast(VT, X);
49350 Y = DAG.getBitcast(VT, Y);
49351 SDLoc DL(N);
49352 // We do not split for SSE at all, but we need to split vectors for AVX1 and
49353 // AVX2.
49354 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
49355 SDValue LoX, HiX;
49356 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49357 SDValue LoY, HiY;
49358 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49359 EVT SplitVT = LoX.getValueType();
49360 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49361 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49362 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49363 }
49364 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49365}
49366
49367// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49368// logical operations, like in the example below.
49369 // or (and (truncate x), (truncate y)),
49370 //    (xor (truncate z), build_vector (constants))
49371 // Given a target type \p VT, we generate
49372 // or (and x, y), (xor z, zext(build_vector (constants)))
49373 // given that x, y and z are of type \p VT. We can do so if each operand is
49374 // either a truncate from a VT type, a vector of constants, or can itself be
49375 // recursively promoted.
49376static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
49377 unsigned Depth) {
49378 // Limit recursion to avoid excessive compile times.
49379 if (Depth >= SelectionDAG::MaxRecursionDepth)
49380 return SDValue();
49381
49382 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
49383 N->getOpcode() != ISD::OR)
49384 return SDValue();
49385
49386 SDValue N0 = N->getOperand(0);
49387 SDValue N1 = N->getOperand(1);
49388 SDLoc DL(N);
49389
49390 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49391 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
49392 return SDValue();
49393
49394 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
49395 N0 = NN0;
49396 else {
49397 // The Left side has to be a trunc.
49398 if (N0.getOpcode() != ISD::TRUNCATE)
49399 return SDValue();
49400
49401 // The type of the truncated inputs.
49402 if (N0.getOperand(0).getValueType() != VT)
49403 return SDValue();
49404
49405 N0 = N0.getOperand(0);
49406 }
49407
49408 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
49409 N1 = NN1;
49410 else {
49411 // The right side has to be a 'trunc' or a constant vector.
49412 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49413 N1.getOperand(0).getValueType() == VT;
49414 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
49415 return SDValue();
49416
49417 if (RHSTrunc)
49418 N1 = N1.getOperand(0);
49419 else
49420 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
49421 }
49422
49423 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
49424}
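// An illustrative example of the promotion above, with \p VT == v8i32:
//   or (and (trunc x), (trunc y)), (xor (trunc z), build_vector(Cs))
// becomes
//   or (and x, y), (xor z, zext(build_vector(Cs)))
// so the logic stays in the wide type and the truncates around it disappear.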
49425
49426 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
49427 // register. In most cases we actually compare or select YMM-sized registers,
49428 // and mixing the two types creates horrible code. This method optimizes
49429// some of the transition sequences.
49430// Even with AVX-512 this is still useful for removing casts around logical
49431// operations on vXi1 mask types.
49432static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
49433 const X86Subtarget &Subtarget) {
49434 EVT VT = N->getValueType(0);
49435 assert(VT.isVector() && "Expected vector type");
49436
49437 SDLoc DL(N);
49438 assert((N->getOpcode() == ISD::ANY_EXTEND ||
49439 N->getOpcode() == ISD::ZERO_EXTEND ||
49440 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
49441
49442 SDValue Narrow = N->getOperand(0);
49443 EVT NarrowVT = Narrow.getValueType();
49444
49445 // Generate the wide operation.
49446 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
49447 if (!Op)
49448 return SDValue();
49449 switch (N->getOpcode()) {
49450 default: llvm_unreachable("Unexpected opcode");
49451 case ISD::ANY_EXTEND:
49452 return Op;
49453 case ISD::ZERO_EXTEND:
49454 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
49455 case ISD::SIGN_EXTEND:
49456 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
49457 Op, DAG.getValueType(NarrowVT));
49458 }
49459}
49460
49461static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
49462 unsigned FPOpcode;
49463 switch (Opcode) {
49464 default: llvm_unreachable("Unexpected input node for FP logic conversion");
49465 case ISD::AND: FPOpcode = X86ISD::FAND; break;
49466 case ISD::OR: FPOpcode = X86ISD::FOR; break;
49467 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
49468 }
49469 return FPOpcode;
49470}
49471
49472/// If both input operands of a logic op are being cast from floating-point
49473/// types or FP compares, try to convert this into a floating-point logic node
49474/// to avoid unnecessary moves from SSE to integer registers.
49475static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
49476 TargetLowering::DAGCombinerInfo &DCI,
49477 const X86Subtarget &Subtarget) {
49478 EVT VT = N->getValueType(0);
49479 SDValue N0 = N->getOperand(0);
49480 SDValue N1 = N->getOperand(1);
49481 SDLoc DL(N);
49482
49483 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
49484 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
49485 return SDValue();
49486
49487 SDValue N00 = N0.getOperand(0);
49488 SDValue N10 = N1.getOperand(0);
49489 EVT N00Type = N00.getValueType();
49490 EVT N10Type = N10.getValueType();
49491
49492 // Ensure that both types are the same and are legal scalar fp types.
49493 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
49494 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49495 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49496 return SDValue();
49497
49498 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49499 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49500 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49501 return DAG.getBitcast(VT, FPLogic);
49502 }
49503
49504 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49505 !N1.hasOneUse())
49506 return SDValue();
49507
49508 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49509 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49510
49511 // The vector ISA for FP predicates is incomplete before AVX, so converting
49512 // COMIS* to CMPS* may not be a win before AVX.
49513 if (!Subtarget.hasAVX() &&
49514 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49515 return SDValue();
49516
49517 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49518 // and vector logic:
49519 // logic (setcc N00, N01), (setcc N10, N11) -->
49520 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
49521 unsigned NumElts = 128 / N00Type.getSizeInBits();
49522 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49523 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49524 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49525 SDValue N01 = N0.getOperand(1);
49526 SDValue N11 = N1.getOperand(1);
49527 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49528 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49529 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49530 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49531 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49532 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49533 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49534 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49535}
49536
49537// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49538// to reduce XMM->GPR traffic.
49539static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49540 unsigned Opc = N->getOpcode();
49541 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49542 "Unexpected bit opcode");
49543
49544 SDValue N0 = N->getOperand(0);
49545 SDValue N1 = N->getOperand(1);
49546
49547 // Both operands must be single use MOVMSK.
49548 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49549 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49550 return SDValue();
49551
49552 SDValue Vec0 = N0.getOperand(0);
49553 SDValue Vec1 = N1.getOperand(0);
49554 EVT VecVT0 = Vec0.getValueType();
49555 EVT VecVT1 = Vec1.getValueType();
49556
49557 // Both MOVMSK operands must be from vectors of the same size and same element
49558 // size, but an fp/int type difference is OK.
49559 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49560 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49561 return SDValue();
49562
49563 SDLoc DL(N);
49564 unsigned VecOpc =
49565 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49566 SDValue Result =
49567 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49568 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49569}
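// An illustrative example of the fold above:
//   or (movmsk V0), (movmsk V1) --> movmsk (or V0, V1)
// trading two MOVMSK transfers plus a GPR OR for one vector OR and a single
// MOVMSK, which is the XMM->GPR traffic reduction mentioned above.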
49570
49571// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49572// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49573// handles in InstCombine.
49574static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49575 unsigned Opc = N->getOpcode();
49576 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49577 "Unexpected bit opcode");
49578
49579 SDValue N0 = N->getOperand(0);
49580 SDValue N1 = N->getOperand(1);
49581 EVT VT = N->getValueType(0);
49582
49583 // Both operands must be single use.
49584 if (!N0.hasOneUse() || !N1.hasOneUse())
49585 return SDValue();
49586
49587 // Search for matching shifts.
49588 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49589 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49590
49591 unsigned BCOpc = BC0.getOpcode();
49592 EVT BCVT = BC0.getValueType();
49593 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49594 return SDValue();
49595
49596 switch (BCOpc) {
49597 case X86ISD::VSHLI:
49598 case X86ISD::VSRLI:
49599 case X86ISD::VSRAI: {
49600 if (BC0.getOperand(1) != BC1.getOperand(1))
49601 return SDValue();
49602
49603 SDLoc DL(N);
49604 SDValue BitOp =
49605 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49606 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49607 return DAG.getBitcast(VT, Shift);
49608 }
49609 }
49610
49611 return SDValue();
49612}
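// Illustrative example of the fold above (a sketch, assuming equal immediate
// shift amounts):
//   (xor (vsrli X, 3), (vsrli Y, 3)) --> (vsrli (xor X, Y), 3)
// which removes one of the two shifts.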
49613
49614/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
49615/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49616/// with a shift-right to eliminate loading the vector constant mask value.
49617static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49618 const X86Subtarget &Subtarget) {
49619 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49620 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49621 EVT VT = Op0.getValueType();
49622 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49623 return SDValue();
49624
49625 // Try to convert an "is positive" signbit masking operation into arithmetic
49626 // shift and "andn". This saves a materialization of a -1 vector constant.
49627 // The "is negative" variant should be handled more generally because it only
49628 // requires "and" rather than "andn":
49629 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49630 //
49631 // This is limited to the original type to avoid producing even more bitcasts.
49632 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49633 // will be profitable.
49634 if (N->getValueType(0) == VT &&
49635 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
49636 SDValue X, Y;
49637 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49638 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49639 X = Op1.getOperand(0);
49640 Y = Op0;
49641 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49642 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49643 X = Op0.getOperand(0);
49644 Y = Op1;
49645 }
49646 if (X && Y) {
49647 SDLoc DL(N);
49648 SDValue Sra =
49649 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49650 VT.getScalarSizeInBits() - 1, DAG);
49651 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49652 }
49653 }
49654
49655 APInt SplatVal;
49656 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
49657 !SplatVal.isMask())
49658 return SDValue();
49659
49660 // Don't prevent creation of ANDN.
49661 if (isBitwiseNot(Op0))
49662 return SDValue();
49663
49664 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
49665 return SDValue();
49666
49667 unsigned EltBitWidth = VT.getScalarSizeInBits();
49668 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49669 return SDValue();
49670
49671 SDLoc DL(N);
49672 unsigned ShiftVal = SplatVal.countr_one();
49673 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49674 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49675 return DAG.getBitcast(N->getValueType(0), Shift);
49676}
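// Illustrative example of the mask-to-shift fold above (a sketch, assuming a
// v4i32 Op0 whose elements are known all-zeros/all-ones, e.g. a pcmpeq result):
//   (and Op0, (splat 0x1)) --> (vsrli Op0, 31)
// which avoids loading the splat mask constant from the constant pool.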
49677
49678// Get the index node from the lowered DAG of a GEP IR instruction with one
49679// indexing dimension.
49680static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49681 if (Ld->isIndexed())
49682 return SDValue();
49683
49684 SDValue Base = Ld->getBasePtr();
49685
49686 if (Base.getOpcode() != ISD::ADD)
49687 return SDValue();
49688
49689 SDValue ShiftedIndex = Base.getOperand(0);
49690
49691 if (ShiftedIndex.getOpcode() != ISD::SHL)
49692 return SDValue();
49693
49694 return ShiftedIndex.getOperand(0);
49695
49696}
49697
49698static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
49699 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
49700 switch (VT.getSizeInBits()) {
49701 default: return false;
49702 case 64: return Subtarget.is64Bit();
49703 case 32: return true;
49704 }
49705 }
49706 return false;
49707}
49708
49709// This function recognizes cases where the X86 bzhi instruction can replace an
49710// 'and-load' sequence.
49711// In the case of loading an integer value from an array of constants defined
49712// as follows:
49713//
49714// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
49715//
49716// and then applying a bitwise and on the result with another input,
49717// this is equivalent to performing bzhi (zero high bits) on that input, with
49718// the same index as the load.
49719static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
49720 const X86Subtarget &Subtarget) {
49721 MVT VT = Node->getSimpleValueType(0);
49722 SDLoc dl(Node);
49723
49724 // Check if subtarget has BZHI instruction for the node's type
49725 if (!hasBZHI(Subtarget, VT))
49726 return SDValue();
49727
49728 // Try matching the pattern for both operands.
49729 for (unsigned i = 0; i < 2; i++) {
49730 SDValue N = Node->getOperand(i);
49731 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
49732
49733 // Bail out if the operand is not a load instruction.
49734 if (!Ld)
49735 return SDValue();
49736
49737 const Value *MemOp = Ld->getMemOperand()->getValue();
49738
49739 if (!MemOp)
49740 return SDValue();
49741
49742 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
49743 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
49744 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
49745
49746 Constant *Init = GV->getInitializer();
49747 Type *Ty = Init->getType();
49748 if (!isa<ConstantDataArray>(Init) ||
49749 !Ty->getArrayElementType()->isIntegerTy() ||
49750 Ty->getArrayElementType()->getScalarSizeInBits() !=
49751 VT.getSizeInBits() ||
49752 Ty->getArrayNumElements() >
49753 Ty->getArrayElementType()->getScalarSizeInBits())
49754 continue;
49755
49756 // Check if the array's constant elements are suitable for our case.
49757 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
49758 bool ConstantsMatch = true;
49759 for (uint64_t j = 0; j < ArrayElementCount; j++) {
49760 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
49761 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
49762 ConstantsMatch = false;
49763 break;
49764 }
49765 }
49766 if (!ConstantsMatch)
49767 continue;
49768
49769 // Do the transformation (for a 32-bit type):
49770 // -> (and (load arr[idx]), inp)
49771 // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
49772 // which will later be selected as a single bzhi instruction.
49773 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49774 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
49775
49776 // Get the Node which indexes into the array.
49777 SDValue Index = getIndexFromUnindexedLoad(Ld);
49778 if (!Index)
49779 return SDValue();
49780 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
49781
49782 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
49783 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
49784
49785 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
49786 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
49787
49788 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
49789 }
49790 }
49791 }
49792 }
49793 return SDValue();
49794}
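// Illustrative C source for the pattern recognized above (hypothetical names,
// not taken from a real test case); with BMI2 the load+and is replaced by a
// single BZHI of Val by Idx:
//   static const unsigned Masks[32] = {0x0, 0x1, 0x3, 0x7 /* ..., (1u<<i)-1 */};
//   unsigned zeroHighBits(unsigned Val, unsigned Idx) { return Val & Masks[Idx]; }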
49795
49796// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
49797// where C is a mask containing the same number of bits as the setcc and
49798// where the setcc will freely zero the upper bits of the k-register. We can
49799// replace the undef in the concat with 0s and remove the AND. This mainly
49800// helps with v2i1/v4i1 setcc being cast to scalar.
49801static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49802 const X86Subtarget &Subtarget) {
49803 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49804
49805 EVT VT = N->getValueType(0);
49806
49807 // Make sure this is an AND with a constant. We will check the value of the
49808 // constant later.
49809 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49810 if (!C1)
49811 return SDValue();
49812
49813 // This is implied by the ConstantSDNode.
49814 assert(!VT.isVector() && "Expected scalar VT!");
49815
49816 SDValue Src = N->getOperand(0);
49817 if (!Src.hasOneUse())
49818 return SDValue();
49819
49820 // (Optionally) peek through any_extend().
49821 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49822 if (!Src.getOperand(0).hasOneUse())
49823 return SDValue();
49824 Src = Src.getOperand(0);
49825 }
49826
49827 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49828 return SDValue();
49829
49830 Src = Src.getOperand(0);
49831 EVT SrcVT = Src.getValueType();
49832
49833 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49834 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49835 !TLI.isTypeLegal(SrcVT))
49836 return SDValue();
49837
49838 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49839 return SDValue();
49840
49841 // We only care about the first subvector of the concat; we expect the
49842 // other subvectors to be ignored due to the AND if we make the change.
49843 SDValue SubVec = Src.getOperand(0);
49844 EVT SubVecVT = SubVec.getValueType();
49845
49846 // The RHS of the AND should be a mask with as many bits as SubVec.
49847 if (!TLI.isTypeLegal(SubVecVT) ||
49848 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49849 return SDValue();
49850
49851 // The first subvector should be a setcc with a legal result type or an
49852 // AND containing at least one setcc with a legal result type.
49853 auto IsLegalSetCC = [&](SDValue V) {
49854 if (V.getOpcode() != ISD::SETCC)
49855 return false;
49856 EVT SetccVT = V.getOperand(0).getValueType();
49857 if (!TLI.isTypeLegal(SetccVT) ||
49858 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49859 return false;
49860 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49861 return false;
49862 return true;
49863 };
49864 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49865 (IsLegalSetCC(SubVec.getOperand(0)) ||
49866 IsLegalSetCC(SubVec.getOperand(1))))))
49867 return SDValue();
49868
49869 // We passed all the checks. Rebuild the concat_vectors with zeroes
49870 // and cast it back to VT.
49871 SDLoc dl(N);
49872 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49873 DAG.getConstant(0, dl, SubVecVT));
49874 Ops[0] = SubVec;
49875 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49876 Ops);
49877 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49878 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49879}
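// Illustrative example of the combine above (a sketch, assuming AVX-512VL and
// a v2i1 setcc widened by a concat to v8i1):
//   (and (bitcast i8 (concat (v2i1 setcc), undef, undef, undef)), 0x3)
// is rebuilt with the undef subvectors replaced by zero vectors, so the
// masking AND with 0x3 is no longer needed.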
49880
49881static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
49882 SDValue OpMustEq, SDValue Op, unsigned Depth) {
49883 // We don't want to go crazy with the recursion here. This isn't a super
49884 // important optimization.
49885 static constexpr unsigned kMaxDepth = 2;
49886
49887 // Only do this re-ordering if op has one use.
49888 if (!Op.hasOneUse())
49889 return SDValue();
49890
49891 SDLoc DL(Op);
49892 // If we hit another associative op, recurse further.
49893 if (Op.getOpcode() == Opc) {
49894 // Done recursing.
49895 if (Depth++ >= kMaxDepth)
49896 return SDValue();
49897
49898 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49899 if (SDValue R =
49900 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
49901 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
49902 Op.getOperand(1 - OpIdx));
49903
49904 } else if (Op.getOpcode() == ISD::SUB) {
49905 if (Opc == ISD::AND) {
49906 // BLSI: (and x, (sub 0, x))
49907 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
49908 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49909 }
49910 // Opc must be ISD::AND or ISD::XOR
49911 // BLSR: (and x, (sub x, 1))
49912 // BLSMSK: (xor x, (sub x, 1))
49913 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49914 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49915
49916 } else if (Op.getOpcode() == ISD::ADD) {
49917 // Opc must be ISD::AND or ISD::XOR
49918 // BLSR: (and x, (add x, -1))
49919 // BLSMSK: (xor x, (add x, -1))
49920 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49921 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49922 }
49923 return SDValue();
49924}
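// Illustrative example of the reassociation above (a sketch):
//   (and (and z, (sub 0, x)), x) --> (and (and x, (sub 0, x)), z)
// which exposes the inner (and x, (sub 0, x)) so it can be selected as BLSI.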
49925
49926static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
49927 const X86Subtarget &Subtarget) {
49928 EVT VT = N->getValueType(0);
49929 // Make sure this node is a candidate for BMI instructions.
49930 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49931 (VT != MVT::i32 && VT != MVT::i64))
49932 return SDValue();
49933
49934 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49935
49936 // Try and match LHS and RHS.
49937 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49938 if (SDValue OpMatch =
49939 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49940 N->getOperand(1 - OpIdx), 0))
49941 return OpMatch;
49942 return SDValue();
49943}
49944
49945static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49946 TargetLowering::DAGCombinerInfo &DCI,
49947 const X86Subtarget &Subtarget) {
49948 SDValue N0 = N->getOperand(0);
49949 SDValue N1 = N->getOperand(1);
49950 EVT VT = N->getValueType(0);
49951 SDLoc dl(N);
49952 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49953
49954 // If this is SSE1 only convert to FAND to avoid scalarization.
49955 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49956 return DAG.getBitcast(MVT::v4i32,
49957 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49958 DAG.getBitcast(MVT::v4f32, N0),
49959 DAG.getBitcast(MVT::v4f32, N1)));
49960 }
49961
49962 // Use a 32-bit and+zext if upper bits known zero.
49963 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49964 APInt HiMask = APInt::getHighBitsSet(64, 32);
49965 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49966 DAG.MaskedValueIsZero(N0, HiMask)) {
49967 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49968 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49969 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49970 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49971 }
49972 }
49973
49974 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49975 // TODO: Support multiple SrcOps.
49976 if (VT == MVT::i1) {
49977 SmallVector<SDValue, 2> SrcOps;
49978 SmallVector<APInt, 2> SrcPartials;
49979 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49980 SrcOps.size() == 1) {
49981 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49982 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49983 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49984 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49985 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49986 if (Mask) {
49987 assert(SrcPartials[0].getBitWidth() == NumElts &&
49988 "Unexpected partial reduction mask");
49989 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49990 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49991 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49992 }
49993 }
49994 }
49995
49996 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49997 return V;
49998
49999 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50000 return R;
50001
50002 if (SDValue R = combineBitOpWithShift(N, DAG))
50003 return R;
50004
50005 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50006 return FPLogic;
50007
50008 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
50009 return R;
50010
50011 if (DCI.isBeforeLegalizeOps())
50012 return SDValue();
50013
50014 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50015 return R;
50016
50017 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
50018 return R;
50019
50020 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
50021 return ShiftRight;
50022
50023 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
50024 return R;
50025
50026 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
50027 // iff c2 is an all-bits/no-bits per-element mask, i.e. a select-with-zero mask.
50028 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
50029 if (VT.isVector() && getTargetConstantFromNode(N1)) {
50030 unsigned Opc0 = N0.getOpcode();
50031 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
50032 getTargetConstantFromNode(N0.getOperand(1)) &&
50033 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
50034 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
50035 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
50036 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
50037 }
50038 }
50039
50040 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant.
50041 // This avoids a slow variable shift (moving the shift amount to ECX etc.).
50042 if (isOneConstant(N1) && N0->hasOneUse()) {
50043 SDValue Src = N0;
50044 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
50045 Src.getOpcode() == ISD::TRUNCATE) &&
50046 Src.getOperand(0)->hasOneUse())
50047 Src = Src.getOperand(0);
50048 bool ContainsNOT = false;
50049 X86::CondCode X86CC = X86::COND_B;
50050 // Peek through AND(NOT(SRL(X,Y)),1).
50051 if (isBitwiseNot(Src)) {
50052 Src = Src.getOperand(0);
50053 X86CC = X86::COND_AE;
50054 ContainsNOT = true;
50055 }
50056 if (Src.getOpcode() == ISD::SRL &&
50057 !isa<ConstantSDNode>(Src.getOperand(1))) {
50058 SDValue BitNo = Src.getOperand(1);
50059 Src = Src.getOperand(0);
50060 // Peek through AND(SRL(NOT(X),Y),1).
50061 if (isBitwiseNot(Src)) {
50062 Src = Src.getOperand(0);
50063 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
50064 ContainsNOT = true;
50065 }
50066 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50067 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50068 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50069 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50070 }
50071 }
50072
50073 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50074 // Attempt to recursively combine a bitmask AND with shuffles.
50075 SDValue Op(N, 0);
50076 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50077 return Res;
50078
50079 // If either operand is a constant mask, then only the elements that aren't
50080 // zero are actually demanded by the other operand.
50081 auto GetDemandedMasks = [&](SDValue Op) {
50082 APInt UndefElts;
50083 SmallVector<APInt> EltBits;
50084 int NumElts = VT.getVectorNumElements();
50085 int EltSizeInBits = VT.getScalarSizeInBits();
50086 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50087 APInt DemandedElts = APInt::getAllOnes(NumElts);
50088 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50089 EltBits)) {
50090 DemandedBits.clearAllBits();
50091 DemandedElts.clearAllBits();
50092 for (int I = 0; I != NumElts; ++I) {
50093 if (UndefElts[I]) {
50094 // We can't assume an undef src element gives an undef dst - the
50095 // other src might be zero.
50096 DemandedBits.setAllBits();
50097 DemandedElts.setBit(I);
50098 } else if (!EltBits[I].isZero()) {
50099 DemandedBits |= EltBits[I];
50100 DemandedElts.setBit(I);
50101 }
50102 }
50103 }
50104 return std::make_pair(DemandedBits, DemandedElts);
50105 };
50106 APInt Bits0, Elts0;
50107 APInt Bits1, Elts1;
50108 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50109 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50110
50111 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50112 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50113 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50114 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
50115 if (N->getOpcode() != ISD::DELETED_NODE)
50116 DCI.AddToWorklist(N);
50117 return SDValue(N, 0);
50118 }
50119
50120 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50121 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50122 if (NewN0 || NewN1)
50123 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50124 NewN1 ? NewN1 : N1);
50125 }
50126
50127 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50128 if ((VT.getScalarSizeInBits() % 8) == 0 &&
50129 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50130 isa<ConstantSDNode>(N0.getOperand(1))) {
50131 SDValue BitMask = N1;
50132 SDValue SrcVec = N0.getOperand(0);
50133 EVT SrcVecVT = SrcVec.getValueType();
50134
50135 // Check that the constant bitmask masks whole bytes.
50136 APInt UndefElts;
50137 SmallVector<APInt, 64> EltBits;
50138 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50139 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50140 llvm::all_of(EltBits, [](const APInt &M) {
50141 return M.isZero() || M.isAllOnes();
50142 })) {
50143 unsigned NumElts = SrcVecVT.getVectorNumElements();
50144 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50145 unsigned Idx = N0.getConstantOperandVal(1);
50146
50147 // Create a root shuffle mask from the byte mask and the extracted index.
50148 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50149 for (unsigned i = 0; i != Scale; ++i) {
50150 if (UndefElts[i])
50151 continue;
50152 int VecIdx = Scale * Idx + i;
50153 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50154 }
50155
50156 if (SDValue Shuffle = combineX86ShufflesRecursively(
50157 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50158 X86::MaxShuffleCombineDepth,
50159 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50160 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50161 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50162 N0.getOperand(1));
50163 }
50164 }
50165
50166 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50167 return R;
50168
50169 return SDValue();
50170}
50171
50172// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50173static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50174 const X86Subtarget &Subtarget) {
50175 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50176
50177 MVT VT = N->getSimpleValueType(0);
50178 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50179 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50180 return SDValue();
50181
50182 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50183 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50184 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
50185 return SDValue();
50186
50187 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50188 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50189 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
50190 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
50191 return SDValue();
50192
50193 // Attempt to extract constant byte masks.
50194 APInt UndefElts0, UndefElts1;
50195 SmallVector<APInt, 32> EltBits0, EltBits1;
50196 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
50197 false, false))
50198 return SDValue();
50199 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
50200 false, false))
50201 return SDValue();
50202
50203 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
50204 // TODO - add UNDEF elts support.
50205 if (UndefElts0[i] || UndefElts1[i])
50206 return SDValue();
50207 if (EltBits0[i] != ~EltBits1[i])
50208 return SDValue();
50209 }
50210
50211 SDLoc DL(N);
50212
50213 if (useVPTERNLOG(Subtarget, VT)) {
50214 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50215 // VPTERNLOG is only available as vXi32/64-bit types.
50216 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
50217 MVT OpVT =
50218 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50219 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
50220 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
50221 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
50222 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50223 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50224 DAG, Subtarget);
50225 return DAG.getBitcast(VT, Res);
50226 }
50227
50228 SDValue X = N->getOperand(0);
50229 SDValue Y =
50230 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
50231 DAG.getBitcast(VT, N1.getOperand(0)));
50232 return DAG.getNode(ISD::OR, DL, VT, X, Y);
50233}
50234
50235// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
50236static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50237 if (N->getOpcode() != ISD::OR)
50238 return false;
50239
50240 SDValue N0 = N->getOperand(0);
50241 SDValue N1 = N->getOperand(1);
50242
50243 // Canonicalize AND to LHS.
50244 if (N1.getOpcode() == ISD::AND)
50245 std::swap(N0, N1);
50246
50247 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50248 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
50249 return false;
50250
50251 Mask = N1.getOperand(0);
50252 X = N1.getOperand(1);
50253
50254 // Check to see if the mask appeared in both the AND and ANDNP.
50255 if (N0.getOperand(0) == Mask)
50256 Y = N0.getOperand(1);
50257 else if (N0.getOperand(1) == Mask)
50258 Y = N0.getOperand(0);
50259 else
50260 return false;
50261
50262 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
50263 // ANDNP combine allows other combines to happen that prevent matching.
50264 return true;
50265}
50266
50267// Try to fold:
50268// (or (and (m, y), (pandn m, x)))
50269// into:
50270// (vselect m, x, y)
50271// As a special case, try to fold:
50272// (or (and (m, (sub 0, x)), (pandn m, x)))
50273// into:
50274// (sub (xor X, M), M)
50275static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50276 const X86Subtarget &Subtarget) {
50277 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50278
50279 EVT VT = N->getValueType(0);
50280 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50281 (VT.is256BitVector() && Subtarget.hasInt256())))
50282 return SDValue();
50283
50284 SDValue X, Y, Mask;
50285 if (!matchLogicBlend(N, X, Y, Mask))
50286 return SDValue();
50287
50288 // Validate that X, Y, and Mask are bitcasts, and see through them.
50289 Mask = peekThroughBitcasts(Mask);
50290 X = peekThroughBitcasts(X);
50291 Y = peekThroughBitcasts(Y);
50292
50293 EVT MaskVT = Mask.getValueType();
50294 unsigned EltBits = MaskVT.getScalarSizeInBits();
50295
50296 // TODO: Attempt to handle floating point cases as well?
50297 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50298 return SDValue();
50299
50300 SDLoc DL(N);
50301
50302 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50303 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50304 DAG, Subtarget))
50305 return Res;
50306
50307 // PBLENDVB is only available on SSE 4.1.
50308 if (!Subtarget.hasSSE41())
50309 return SDValue();
50310
50311 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50312 if (Subtarget.hasVLX())
50313 return SDValue();
50314
50315 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50316
50317 X = DAG.getBitcast(BlendVT, X);
50318 Y = DAG.getBitcast(BlendVT, Y);
50319 Mask = DAG.getBitcast(BlendVT, Mask);
50320 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50321 return DAG.getBitcast(VT, Mask);
50322}
50323
50324// Helper function for combineOrCmpEqZeroToCtlzSrl
50325// Transforms:
50326// seteq(cmp x, 0)
50327// into:
50328// srl(ctlz x), log2(bitsize(x))
50329// Input pattern is checked by caller.
50330static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50331 SDValue Cmp = Op.getOperand(1);
50332 EVT VT = Cmp.getOperand(0).getValueType();
50333 unsigned Log2b = Log2_32(VT.getSizeInBits());
50334 SDLoc dl(Op);
50335 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50336 // The result of the shift is true or false, and on X86, the 32-bit
50337 // encoding of shr and lzcnt is more desirable.
50338 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50339 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50340 DAG.getConstant(Log2b, dl, MVT::i8));
50341 return Scc;
50342}
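// Illustrative example of the helper above (a sketch, assuming a 32-bit
// operand):
//   seteq(cmp x, 0) --> srl(ctlz x), 5
// since with LZCNT, ctlz(x) is 32 exactly when x == 0, and bit 5 of the count
// is set only in that case.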
50343
50344// Try to transform:
50345// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50346// into:
50347// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
50348// Will also attempt to match more generic cases, e.g.:
50349// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50350// Only applies if the target supports the FastLZCNT feature.
50351static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50352 TargetLowering::DAGCombinerInfo &DCI,
50353 const X86Subtarget &Subtarget) {
50354 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50355 return SDValue();
50356
50357 auto isORCandidate = [](SDValue N) {
50358 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50359 };
50360
50361 // Check that the zero extend is extending to 32 bits or more. The code generated by
50362 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
50363 // instructions to clear the upper bits.
50364 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50365 !isORCandidate(N->getOperand(0)))
50366 return SDValue();
50367
50368 // Check the node matches: setcc(eq, cmp 0)
50369 auto isSetCCCandidate = [](SDValue N) {
50370 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50371 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50372 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50373 isNullConstant(N->getOperand(1).getOperand(1)) &&
50374 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50375 };
50376
50377 SDNode *OR = N->getOperand(0).getNode();
50378 SDValue LHS = OR->getOperand(0);
50379 SDValue RHS = OR->getOperand(1);
50380
50381 // Save nodes matching or(or, setcc(eq, cmp 0)).
50382 SmallVector<SDNode *, 2> ORNodes;
50383 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50384 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50385 ORNodes.push_back(OR);
50386 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50387 LHS = OR->getOperand(0);
50388 RHS = OR->getOperand(1);
50389 }
50390
50391 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50392 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50393 !isORCandidate(SDValue(OR, 0)))
50394 return SDValue();
50395
50396 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
50397 // to
50398 // or(srl(ctlz),srl(ctlz)).
50399 // The dag combiner can then fold it into:
50400 // srl(or(ctlz, ctlz)).
50401 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50402 SDValue Ret, NewRHS;
50403 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50404 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50405
50406 if (!Ret)
50407 return SDValue();
50408
50409 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50410 while (!ORNodes.empty()) {
50411 OR = ORNodes.pop_back_val();
50412 LHS = OR->getOperand(0);
50413 RHS = OR->getOperand(1);
50414 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50415 if (RHS->getOpcode() == ISD::OR)
50416 std::swap(LHS, RHS);
50417 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50418 if (!NewRHS)
50419 return SDValue();
50420 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50421 }
50422
50423 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50424}
50425
50426static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50427 SDValue And1_L, SDValue And1_R,
50428 const SDLoc &DL, SelectionDAG &DAG) {
50429 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50430 return SDValue();
50431 SDValue NotOp = And0_L->getOperand(0);
50432 if (NotOp == And1_R)
50433 std::swap(And1_R, And1_L);
50434 if (NotOp != And1_L)
50435 return SDValue();
50436
50437 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50438 // --> ((And0_R ^ And1_R) & NotOp) ^ And0_R
50439 EVT VT = And1_L->getValueType(0);
50440 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50441 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50442 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50443 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50444 return Xor1;
50445}
50446
50447/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50448/// equivalent `((x ^ y) & m) ^ y` pattern.
50449/// This is typically a better representation for targets without a fused
50450/// "and-not" operation. This function is intended to be called from a
50451/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
50452static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50453 // Note that masked-merge variants using XOR or ADD expressions are
50454 // normalized to OR by InstCombine so we only check for OR.
50455 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50456 SDValue N0 = Node->getOperand(0);
50457 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50458 return SDValue();
50459 SDValue N1 = Node->getOperand(1);
50460 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50461 return SDValue();
50462
50463 SDLoc DL(Node);
50464 SDValue N00 = N0->getOperand(0);
50465 SDValue N01 = N0->getOperand(1);
50466 SDValue N10 = N1->getOperand(0);
50467 SDValue N11 = N1->getOperand(1);
50468 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50469 return Result;
50470 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50471 return Result;
50472 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50473 return Result;
50474 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50475 return Result;
50476 return SDValue();
50477}
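// Illustrative C equivalent of the masked-merge fold above (hypothetical
// names): without ANDN,
//   (M & X) | (~M & Y)        // and + not + and + or
// is rewritten as
//   ((X ^ Y) & M) ^ Y         // xor + and + xor
// which needs one fewer instruction.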
50478
50479/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50480/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50481/// with CMP+{ADC, SBB}.
50482/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
50483static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50484 SDValue X, SDValue Y,
50485 SelectionDAG &DAG,
50486 bool ZeroSecondOpOnly = false) {
50487 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50488 return SDValue();
50489
50490 // Look through a one-use zext.
50491 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50492 Y = Y.getOperand(0);
50493
50494 X86::CondCode CC;
50495 SDValue EFLAGS;
50496 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50497 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50498 EFLAGS = Y.getOperand(1);
50499 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50500 Y.hasOneUse()) {
50501 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50502 }
50503
50504 if (!EFLAGS)
50505 return SDValue();
50506
50507 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50508 // the general case below.
50509 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50510 if (ConstantX && !ZeroSecondOpOnly) {
50511 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50512 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50513 // This is a complicated way to get -1 or 0 from the carry flag:
50514 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50515 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50516 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50517 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50518 EFLAGS);
50519 }
50520
50521 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50522 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50523 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50524 EFLAGS.getValueType().isInteger() &&
50525 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50526 // Swap the operands of a SUB, and we have the same pattern as above.
50527 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50528 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50529 SDValue NewSub = DAG.getNode(
50530 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50531 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50532 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50533 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50534 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50535 NewEFLAGS);
50536 }
50537 }
50538 }
50539
50540 if (CC == X86::COND_B) {
50541 // X + SETB Z --> adc X, 0
50542 // X - SETB Z --> sbb X, 0
50543 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50544 DAG.getVTList(VT, MVT::i32), X,
50545 DAG.getConstant(0, DL, VT), EFLAGS);
50546 }
50547
50548 if (ZeroSecondOpOnly)
50549 return SDValue();
50550
50551 if (CC == X86::COND_A) {
50552 // Try to convert COND_A into COND_B in an attempt to facilitate
50553 // materializing "setb reg".
50554 //
50555 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
50556 // cannot take an immediate as its first operand.
50557 //
50558 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50559 EFLAGS.getValueType().isInteger() &&
50560 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50561 SDValue NewSub =
50562 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50563 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50564 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50565 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50566 DAG.getVTList(VT, MVT::i32), X,
50567 DAG.getConstant(0, DL, VT), NewEFLAGS);
50568 }
50569 }
50570
50571 if (CC == X86::COND_AE) {
50572 // X + SETAE --> sbb X, -1
50573 // X - SETAE --> adc X, -1
50574 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50575 DAG.getVTList(VT, MVT::i32), X,
50576 DAG.getConstant(-1, DL, VT), EFLAGS);
50577 }
50578
50579 if (CC == X86::COND_BE) {
50580 // X + SETBE --> sbb X, -1
50581 // X - SETBE --> adc X, -1
50582 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50583 // materializing "setae reg".
50584 //
50585 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
50586 // cannot take an immediate as its first operand.
50587 //
50588 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50589 EFLAGS.getValueType().isInteger() &&
50590 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50591 SDValue NewSub =
50592 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50593 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50594 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50595 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50596 DAG.getVTList(VT, MVT::i32), X,
50597 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50598 }
50599 }
50600
50601 if (CC != X86::COND_E && CC != X86::COND_NE)
50602 return SDValue();
50603
50604 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50605 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50606 !EFLAGS.getOperand(0).getValueType().isInteger())
50607 return SDValue();
50608
50609 SDValue Z = EFLAGS.getOperand(0);
50610 EVT ZVT = Z.getValueType();
50611
50612 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50613 // the general case below.
50614 if (ConstantX) {
50615 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50616 // fake operands:
50617 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50618 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50619 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50620 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50621 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50622 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50623 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50624 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50625 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50626 SDValue(Neg.getNode(), 1));
50627 }
50628
50629 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50630 // with fake operands:
50631 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50632 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50633 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50634 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50635 SDValue One = DAG.getConstant(1, DL, ZVT);
50636 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50637 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50638 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50639 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50640 Cmp1.getValue(1));
50641 }
50642 }
50643
50644 // (cmp Z, 1) sets the carry flag if Z is 0.
50645 SDValue One = DAG.getConstant(1, DL, ZVT);
50646 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50647 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50648
50649 // Add the flags type for ADC/SBB nodes.
50650 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50651
50652 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50653 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50654 if (CC == X86::COND_NE)
50655 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50656 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50657
50658 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50659 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50660 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50661 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50662}
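// Illustrative example of the COND_NE case above (a sketch, i32):
//   X + (Z != 0) --> sbb X, -1, (cmp Z, 1)
// (cmp Z, 1) sets the carry flag only when Z == 0, so the SBB computes
// X - (-1) - CF, i.e. X + 1 when Z != 0 and X when Z == 0.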
50663
50664/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50665/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50666/// with CMP+{ADC, SBB}.
50667static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50668 bool IsSub = N->getOpcode() == ISD::SUB;
50669 SDValue X = N->getOperand(0);
50670 SDValue Y = N->getOperand(1);
50671 EVT VT = N->getValueType(0);
50672 SDLoc DL(N);
50673
50674 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50675 return ADCOrSBB;
50676
50677 // Commute and try again (negate the result for subtracts).
50678 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50679 if (IsSub)
50680 ADCOrSBB =
50681 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
50682 return ADCOrSBB;
50683 }
50684
50685 return SDValue();
50686}
50687
50688static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50689 SelectionDAG &DAG) {
50690 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50691 "Unexpected opcode");
50692
50693 // Delegate to combineAddOrSubToADCOrSBB if we have:
50694 //
50695 // (xor/or (zero_extend (setcc)) imm)
50696 //
50697 // where imm is odd if and only if we have xor, in which case the XOR/OR are
50698 // equivalent to a SUB/ADD, respectively.
50699 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
50700 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
50701 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
50702 bool IsSub = N->getOpcode() == ISD::XOR;
50703 bool N1COdd = N1C->getZExtValue() & 1;
50704 if (IsSub ? N1COdd : !N1COdd) {
50705 SDLoc DL(N);
50706 EVT VT = N->getValueType(0);
50707 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
50708 return R;
50709 }
50710 }
50711 }
50712
50713 return SDValue();
50714}
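// Illustrative example of the delegation above (a sketch): with an odd
// immediate, (xor (zext (setcc)) 1) equals 1 - (zext (setcc)), so it is passed
// to combineAddOrSubToADCOrSBB as a subtraction with X = 1.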
50715
50716static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
50717 TargetLowering::DAGCombinerInfo &DCI,
50718 const X86Subtarget &Subtarget) {
50719 SDValue N0 = N->getOperand(0);
50720 SDValue N1 = N->getOperand(1);
50721 EVT VT = N->getValueType(0);
50722 SDLoc dl(N);
50723 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50724
50725 // If this is SSE1 only convert to FOR to avoid scalarization.
50726 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50727 return DAG.getBitcast(MVT::v4i32,
50728 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
50729 DAG.getBitcast(MVT::v4f32, N0),
50730 DAG.getBitcast(MVT::v4f32, N1)));
50731 }
50732
50733 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50734 // TODO: Support multiple SrcOps.
50735 if (VT == MVT::i1) {
50736 SmallVector<SDValue, 2> SrcOps;
50737 SmallVector<APInt, 2> SrcPartials;
50738 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
50739 SrcOps.size() == 1) {
50740 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50741 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50742 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50743 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50744 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50745 if (Mask) {
50746 assert(SrcPartials[0].getBitWidth() == NumElts &&
50747 "Unexpected partial reduction mask");
50748 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
50749 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50750 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50751 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
50752 }
50753 }
50754 }
50755
50756 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50757 return R;
50758
50759 if (SDValue R = combineBitOpWithShift(N, DAG))
50760 return R;
50761
50762 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50763 return FPLogic;
50764
50765 if (DCI.isBeforeLegalizeOps())
50766 return SDValue();
50767
50768 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50769 return R;
50770
50771 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
50772 return R;
50773
50774 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
50775 return R;
50776
50777 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
50778 if ((VT == MVT::i32 || VT == MVT::i64) &&
50779 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
50780 isNullConstant(N0.getOperand(0))) {
50781 SDValue Cond = N0.getOperand(1);
50782 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
50783 Cond = Cond.getOperand(0);
50784
50785 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
50786 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
50787 uint64_t Val = CN->getZExtValue();
50788 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
50789 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
50790 CCode = X86::GetOppositeBranchCondition(CCode);
50791 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
50792
50793 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
50794 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
50795 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
50796 return R;
50797 }
50798 }
50799 }
50800 }
50801
50802 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50803 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50804 // iff the upper elements of the non-shifted arg are zero.
50805 // KUNPCK requires 16+ bool vector elements.
50806 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
50807 unsigned NumElts = VT.getVectorNumElements();
50808 unsigned HalfElts = NumElts / 2;
50809 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
50810 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
50811 N1.getConstantOperandAPInt(1) == HalfElts &&
50812 DAG.MaskedVectorIsZero(N0, UpperElts)) {
50813 return DAG.getNode(
50814 ISD::CONCAT_VECTORS, dl, VT,
50815 extractSubVector(N0, 0, DAG, dl, HalfElts),
50816 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
50817 }
50818 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
50819 N0.getConstantOperandAPInt(1) == HalfElts &&
50820 DAG.MaskedVectorIsZero(N1, UpperElts)) {
50821 return DAG.getNode(
50822 ISD::CONCAT_VECTORS, dl, VT,
50823 extractSubVector(N1, 0, DAG, dl, HalfElts),
50824 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
50825 }
50826 }
50827
50828 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50829 // Attempt to recursively combine an OR of shuffles.
50830 SDValue Op(N, 0);
50831 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50832 return Res;
50833
50834 // If either operand is a constant mask, then only the elements that aren't
50835 // allones are actually demanded by the other operand.
50836 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
50837 APInt UndefElts;
50838 SmallVector<APInt> EltBits;
50839 int NumElts = VT.getVectorNumElements();
50840 int EltSizeInBits = VT.getScalarSizeInBits();
50841 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
50842 return false;
50843
50844 APInt DemandedElts = APInt::getZero(NumElts);
50845 for (int I = 0; I != NumElts; ++I)
50846 if (!EltBits[I].isAllOnes())
50847 DemandedElts.setBit(I);
50848
50849 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
50850 };
50851 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
50852 if (N->getOpcode() != ISD::DELETED_NODE)
50853 DCI.AddToWorklist(N);
50854 return SDValue(N, 0);
50855 }
50856 }
50857
50858 // We should fold "masked merge" patterns when `andn` is not available.
50859 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
50860 if (SDValue R = foldMaskedMerge(N, DAG))
50861 return R;
50862
50863 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
50864 return R;
50865
50866 return SDValue();
50867}
50868
50869/// Try to turn tests against the signbit in the form of:
50870/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50871/// into:
50872/// SETGT(X, -1)
50873static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50874 // This is only worth doing if the output type is i8 or i1.
50875 EVT ResultType = N->getValueType(0);
50876 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50877 return SDValue();
50878
50879 SDValue N0 = N->getOperand(0);
50880 SDValue N1 = N->getOperand(1);
50881
50882 // We should be performing an xor against a truncated shift.
50883 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50884 return SDValue();
50885
50886 // Make sure we are performing an xor against one.
50887 if (!isOneConstant(N1))
50888 return SDValue();
50889
50890 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50891 SDValue Shift = N0.getOperand(0);
50892 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50893 return SDValue();
50894
50895 // Make sure we are truncating from one of i16, i32 or i64.
50896 EVT ShiftTy = Shift.getValueType();
50897 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50898 return SDValue();
50899
50900 // Make sure the shift amount extracts the sign bit.
50901 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50902 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50903 return SDValue();
50904
50905 // Create a greater-than comparison against -1.
50906 // N.B. Using SETGE against 0 works but we want a canonical-looking
50907 // comparison; using SETGT matches up with what TranslateX86CC expects.
50908 SDLoc DL(N);
50909 SDValue ShiftOp = Shift.getOperand(0);
50910 EVT ShiftOpTy = ShiftOp.getValueType();
50911 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50912 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50913 *DAG.getContext(), ResultType);
50914 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50915 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50916 if (SetCCResultType != ResultType)
50917 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50918 return Cond;
50919}
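// Illustrative example of the fold above (a sketch, i32 source, i8 result):
//   (xor (truncate (srl X, 31)), 1) --> (setgt X, -1)
// i.e. "the sign bit of X is clear", removing the shift/truncate/xor chain.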
50920
50921/// Turn vector tests of the signbit in the form of:
50922/// xor (sra X, elt_size(X)-1), -1
50923/// into:
50924/// pcmpgt X, -1
50925///
50926/// This should be called before type legalization because the pattern may not
50927/// persist after that.
50928static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50929 const X86Subtarget &Subtarget) {
50930 EVT VT = N->getValueType(0);
50931 if (!VT.isSimple())
50932 return SDValue();
50933
50934 switch (VT.getSimpleVT().SimpleTy) {
50935 default: return SDValue();
50936 case MVT::v16i8:
50937 case MVT::v8i16:
50938 case MVT::v4i32:
50939 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50940 case MVT::v32i8:
50941 case MVT::v16i16:
50942 case MVT::v8i32:
50943 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50944 }
50945
50946 // There must be an arithmetic shift right (sra) before the xor, and the xor must be a
50947 // 'not' operation.
50948 SDValue Shift = N->getOperand(0);
50949 SDValue Ones = N->getOperand(1);
50950 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50951 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50952 return SDValue();
50953
50954 // The shift should be smearing the sign bit across each vector element.
50955 auto *ShiftAmt =
50956 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50957 if (!ShiftAmt ||
50958 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50959 return SDValue();
50960
50961 // Create a greater-than comparison against -1. We don't use the more obvious
50962 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50963 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50964}
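// Editor's note: a hedged sketch, not in the original file, of the vector form
// of the same fold using SSE2 intrinsics; assumes <emmintrin.h> is available.
static __m128i signBitTestViaXor(__m128i X) {
  // xor (sra X, 31), -1  -- the pattern matched above
  return _mm_xor_si128(_mm_srai_epi32(X, 31), _mm_set1_epi32(-1));
}
static __m128i signBitTestViaPcmpgt(__m128i X) {
  // pcmpgt X, -1  -- the single instruction produced instead
  return _mm_cmpgt_epi32(X, _mm_set1_epi32(-1));
}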
50965
50966/// Detect patterns of truncation with unsigned saturation:
50967///
50968/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50969/// Return the source value x to be truncated or SDValue() if the pattern was
50970/// not matched.
50971///
50972/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50973/// where C1 >= 0 and C2 is unsigned max of destination type.
50974///
50975/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50976/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50977///
50978/// These two patterns are equivalent to:
50979/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50980/// So return the smax(x, C1) value to be truncated or SDValue() if the
50981/// pattern was not matched.
50982static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50983 const SDLoc &DL) {
50984 EVT InVT = In.getValueType();
50985
50986 // Saturation with truncation. We truncate from InVT to VT.
50987  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
50988         "Unexpected types for truncate operation");
50989
50990 // Match min/max and return limit value as a parameter.
50991 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50992 if (V.getOpcode() == Opcode &&
50993 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50994 return V.getOperand(0);
50995 return SDValue();
50996 };
50997
50998 APInt C1, C2;
50999 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
51000    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
51001    // the element size of the destination type.
51002 if (C2.isMask(VT.getScalarSizeInBits()))
51003 return UMin;
51004
51005 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
51006 if (MatchMinMax(SMin, ISD::SMAX, C1))
51007 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
51008 return SMin;
51009
51010 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
51011 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
51012 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
51013 C2.uge(C1)) {
51014 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
51015 }
51016
51017 return SDValue();
51018}
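// Editor's note: a small sketch, not part of the original file, of the C2 check
// above for an i32 -> i8 truncate; the helper and its values are illustrative.
static bool usatLimitLooksRight(const APInt &C2) {
  // For an i32 -> i8 truncate the accepted umin splat is 0xFF, i.e. an 8-bit
  // low mask; e.g. APInt(32, 0xFF) passes while APInt(32, 0x7F) does not.
  return C2.isMask(8);
}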
51019
51020/// Detect patterns of truncation with signed saturation:
51021/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
51022/// signed_max_of_dest_type)) to dest_type)
51023/// or:
51024/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
51025/// signed_min_of_dest_type)) to dest_type).
51026/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
51027/// Return the source value to be truncated or SDValue() if the pattern was not
51028/// matched.
51029static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
51030 unsigned NumDstBits = VT.getScalarSizeInBits();
51031 unsigned NumSrcBits = In.getScalarValueSizeInBits();
51032  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
51033
51034 auto MatchMinMax = [](SDValue V, unsigned Opcode,
51035 const APInt &Limit) -> SDValue {
51036 APInt C;
51037 if (V.getOpcode() == Opcode &&
51038 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51039 return V.getOperand(0);
51040 return SDValue();
51041 };
51042
51043 APInt SignedMax, SignedMin;
51044 if (MatchPackUS) {
51045 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51046 SignedMin = APInt(NumSrcBits, 0);
51047 } else {
51048 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51049 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51050 }
51051
51052 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51053 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51054 return SMax;
51055
51056 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51057 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51058 return SMin;
51059
51060 return SDValue();
51061}
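// Editor's note: an illustrative sketch, not in the original file, spelling out
// the limits computed above for an i32 -> i16 signed-saturating truncate.
static void ssatLimitsExampleI32ToI16(APInt &SignedMax, APInt &SignedMin) {
  SignedMax = APInt::getSignedMaxValue(16).sext(32); // 0x00007FFF
  SignedMin = APInt::getSignedMinValue(16).sext(32); // 0xFFFF8000
}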
51062
51063static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51064 SelectionDAG &DAG,
51065 const X86Subtarget &Subtarget) {
51066 if (!Subtarget.hasSSE2() || !VT.isVector())
51067 return SDValue();
51068
51069 EVT SVT = VT.getVectorElementType();
51070 EVT InVT = In.getValueType();
51071 EVT InSVT = InVT.getVectorElementType();
51072
51073  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
51074  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
51075  // and concatenate at the same time. Then we can use a final vpmovuswb to
51076  // clip to 0-255.
51077 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51078 InVT == MVT::v16i32 && VT == MVT::v16i8) {
51079 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51080 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51081 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51082 DL, DAG, Subtarget);
51083      assert(Mid && "Failed to pack!");
51084 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51085 }
51086 }
51087
51088 // vXi32 truncate instructions are available with AVX512F.
51089 // vXi16 truncate instructions are only available with AVX512BW.
51090 // For 256-bit or smaller vectors, we require VLX.
51091 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51092  // If the result type is 256 bits or larger and we have disabled 512-bit
51093  // registers, we should go ahead and use the pack instructions if possible.
51094 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51095 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51096 (InVT.getSizeInBits() > 128) &&
51097 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51098 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51099
51100 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
51101 VT.getSizeInBits() >= 64 &&
51102 (SVT == MVT::i8 || SVT == MVT::i16) &&
51103 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51104 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51105 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51106      // Only do this when the result is at least 64 bits or we'll be leaving
51107      // dangling PACKSSDW nodes.
51108 if (SVT == MVT::i8 && InSVT == MVT::i32) {
51109 EVT MidVT = VT.changeVectorElementType(MVT::i16);
51110 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51111 DAG, Subtarget);
51112        assert(Mid && "Failed to pack!");
51113 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51114 Subtarget);
51115        assert(V && "Failed to pack!");
51116 return V;
51117 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51118 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51119 Subtarget);
51120 }
51121 if (SDValue SSatVal = detectSSatPattern(In, VT))
51122 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51123 Subtarget);
51124 }
51125
51126 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51127 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51128 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51129 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51130 unsigned TruncOpc = 0;
51131 SDValue SatVal;
51132 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51133 SatVal = SSatVal;
51134 TruncOpc = X86ISD::VTRUNCS;
51135 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51136 SatVal = USatVal;
51137 TruncOpc = X86ISD::VTRUNCUS;
51138 }
51139 if (SatVal) {
51140 unsigned ResElts = VT.getVectorNumElements();
51141 // If the input type is less than 512 bits and we don't have VLX, we need
51142 // to widen to 512 bits.
51143 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51144 unsigned NumConcats = 512 / InVT.getSizeInBits();
51145 ResElts *= NumConcats;
51146 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51147 ConcatOps[0] = SatVal;
51148 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51149 NumConcats * InVT.getVectorNumElements());
51150 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51151 }
51152      // Widen the result if it's narrower than 128 bits.
51153 if (ResElts * SVT.getSizeInBits() < 128)
51154 ResElts = 128 / SVT.getSizeInBits();
51155 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51156 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51157 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51158 DAG.getIntPtrConstant(0, DL));
51159 }
51160 }
51161
51162 return SDValue();
51163}
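// Editor's note: a numbers-only sketch, not in the original file, of the
// widening arithmetic above for a v4i32 -> v4i8 saturating truncate when VLX
// is unavailable; no DAG nodes are built here.
static void truncateWithSatWideningExample() {
  unsigned NumConcats = 512 / 128;   // v4i32 is 128 bits, so 4 concats to 512 bits
  unsigned ResElts = 4 * NumConcats; // VTRUNCS/VTRUNCUS then yields v16i8
  // 16 elements * 8 bits = 128 bits, so no extra result widening is needed and
  // the final EXTRACT_SUBVECTOR keeps the low v4i8.
  (void)NumConcats;
  (void)ResElts;
}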
51164
51165/// This function detects the AVG pattern between vectors of unsigned i8/i16,
51166/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
51167/// ISD::AVGCEILU (AVG) instruction.
51168static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51169 const X86Subtarget &Subtarget,
51170 const SDLoc &DL) {
51171 if (!VT.isVector())
51172 return SDValue();
51173 EVT InVT = In.getValueType();
51174 unsigned NumElems = VT.getVectorNumElements();
51175
51176 EVT ScalarVT = VT.getVectorElementType();
51177 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
51178 return SDValue();
51179
51180  // InScalarVT is the intermediate type in the AVG pattern and it should be
51181  // wider than the original input type (i8/i16).
51182 EVT InScalarVT = InVT.getVectorElementType();
51183 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
51184 return SDValue();
51185
51186 if (!Subtarget.hasSSE2())
51187 return SDValue();
51188
51189 // Detect the following pattern:
51190 //
51191 // %1 = zext <N x i8> %a to <N x i32>
51192 // %2 = zext <N x i8> %b to <N x i32>
51193 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
51194 // %4 = add nuw nsw <N x i32> %3, %2
51195  // %5 = lshr <N x i32> %4, <i32 1 x N>
51196 // %6 = trunc <N x i32> %5 to <N x i8>
51197 //
51198 // In AVX512, the last instruction can also be a trunc store.
51199 if (In.getOpcode() != ISD::SRL)
51200 return SDValue();
51201
51202  // A lambda checking that the given SDValue is a constant vector and that
51203  // each element is in the range [Min, Max].
51204 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
51205 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
51206 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
51207 });
51208 };
51209
51210 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
51211 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
51212 return MaxActiveBits <= ScalarVT.getSizeInBits();
51213 };
51214
51215 // Check if each element of the vector is right-shifted by one.
51216 SDValue LHS = In.getOperand(0);
51217 SDValue RHS = In.getOperand(1);
51218 if (!IsConstVectorInRange(RHS, 1, 1))
51219 return SDValue();
51220 if (LHS.getOpcode() != ISD::ADD)
51221 return SDValue();
51222
51223 // Detect a pattern of a + b + 1 where the order doesn't matter.
51224 SDValue Operands[3];
51225 Operands[0] = LHS.getOperand(0);
51226 Operands[1] = LHS.getOperand(1);
51227
51228 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51229 ArrayRef<SDValue> Ops) {
51230 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
51231 };
51232
51233 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
51234 for (SDValue &Op : Ops)
51235 if (Op.getValueType() != VT)
51236 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
51237 // Pad to a power-of-2 vector, split+apply and extract the original vector.
51238 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
51239 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
51240 if (NumElemsPow2 != NumElems) {
51241 for (SDValue &Op : Ops) {
51242 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
51243 for (unsigned i = 0; i != NumElems; ++i) {
51244 SDValue Idx = DAG.getIntPtrConstant(i, DL);
51245 EltsOfOp[i] =
51246 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
51247 }
51248 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
51249 }
51250 }
51251 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
51252 if (NumElemsPow2 == NumElems)
51253 return Res;
51254 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51255 DAG.getIntPtrConstant(0, DL));
51256 };
51257
51258  // Take care of the case when one of the operands is a constant vector whose
51259  // elements are in the range [1, 256] (or [1, 65536] for i16 elements).
51260 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
51261 IsZExtLike(Operands[0])) {
51262 // The pattern is detected. Subtract one from the constant vector, then
51263 // demote it and emit X86ISD::AVG instruction.
51264 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
51265 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
51266 return AVGSplitter({Operands[0], Operands[1]});
51267 }
51268
51269  // Matches 'add-like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
51270  // Match the 'or' case only if it's 'add-like', i.e. it can be replaced by an add.
51271 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
51272 if (ISD::ADD == V.getOpcode()) {
51273 Op0 = V.getOperand(0);
51274 Op1 = V.getOperand(1);
51275 return true;
51276 }
51277 if (ISD::ZERO_EXTEND != V.getOpcode())
51278 return false;
51279 V = V.getOperand(0);
51280 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
51281 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
51282 return false;
51283 Op0 = V.getOperand(0);
51284 Op1 = V.getOperand(1);
51285 return true;
51286 };
51287
51288 SDValue Op0, Op1;
51289 if (FindAddLike(Operands[0], Op0, Op1))
51290 std::swap(Operands[0], Operands[1]);
51291 else if (!FindAddLike(Operands[1], Op0, Op1))
51292 return SDValue();
51293 Operands[2] = Op0;
51294 Operands[1] = Op1;
51295
51296 // Now we have three operands of two additions. Check that one of them is a
51297 // constant vector with ones, and the other two can be promoted from i8/i16.
51298 for (SDValue &Op : Operands) {
51299 if (!IsConstVectorInRange(Op, 1, 1))
51300 continue;
51301 std::swap(Op, Operands[2]);
51302
51303 // Check if Operands[0] and Operands[1] are results of type promotion.
51304 for (int j = 0; j < 2; ++j)
51305 if (Operands[j].getValueType() != VT)
51306 if (!IsZExtLike(Operands[j]))
51307 return SDValue();
51308
51309 // The pattern is detected, emit X86ISD::AVG instruction(s).
51310 return AVGSplitter({Operands[0], Operands[1]});
51311 }
51312
51313 return SDValue();
51314}
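// Editor's note: a scalar sketch, not in the original file, of the identity the
// AVG combine exploits; the widened addition keeps the +1 from wrapping, which
// is exactly what PAVGB/PAVGW (ISD::AVGCEILU) compute per lane.
static uint8_t avgCeilU8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((unsigned(A) + unsigned(B) + 1) >> 1);
}
// e.g. avgCeilU8(1, 2) == 2 and avgCeilU8(255, 255) == 255.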
51315
51316static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51317 TargetLowering::DAGCombinerInfo &DCI,
51318 const X86Subtarget &Subtarget) {
51319 LoadSDNode *Ld = cast<LoadSDNode>(N);
51320 EVT RegVT = Ld->getValueType(0);
51321 EVT MemVT = Ld->getMemoryVT();
51322 SDLoc dl(Ld);
51323 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51324
51325 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51326 // into two 16-byte operations. Also split non-temporal aligned loads on
51327 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51328 ISD::LoadExtType Ext = Ld->getExtensionType();
51329 unsigned Fast;
51330 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51331 Ext == ISD::NON_EXTLOAD &&
51332 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51333 Ld->getAlign() >= Align(16)) ||
51334 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51335 *Ld->getMemOperand(), &Fast) &&
51336 !Fast))) {
51337 unsigned NumElems = RegVT.getVectorNumElements();
51338 if (NumElems < 2)
51339 return SDValue();
51340
51341 unsigned HalfOffset = 16;
51342 SDValue Ptr1 = Ld->getBasePtr();
51343 SDValue Ptr2 =
51344 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
51345 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51346 NumElems / 2);
51347 SDValue Load1 =
51348 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51349 Ld->getOriginalAlign(),
51350 Ld->getMemOperand()->getFlags());
51351 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51352 Ld->getPointerInfo().getWithOffset(HalfOffset),
51353 Ld->getOriginalAlign(),
51354 Ld->getMemOperand()->getFlags());
51355 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51356 Load1.getValue(1), Load2.getValue(1));
51357
51358 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51359 return DCI.CombineTo(N, NewVec, TF, true);
51360 }
51361
51362 // Bool vector load - attempt to cast to an integer, as we have good
51363 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51364 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51365 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51366 unsigned NumElts = RegVT.getVectorNumElements();
51367 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51368 if (TLI.isTypeLegal(IntVT)) {
51369 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51370 Ld->getPointerInfo(),
51371 Ld->getOriginalAlign(),
51372 Ld->getMemOperand()->getFlags());
51373 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51374 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51375 }
51376 }
51377
51378 // If we also broadcast this as a subvector to a wider type, then just extract
51379 // the lowest subvector.
51380 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51381 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51382 SDValue Ptr = Ld->getBasePtr();
51383 SDValue Chain = Ld->getChain();
51384 for (SDNode *User : Ptr->uses()) {
51385 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51386 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51387 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51388 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51389 MemVT.getSizeInBits() &&
51390 !User->hasAnyUseOfValue(1) &&
51391 User->getValueSizeInBits(0).getFixedValue() >
51392 RegVT.getFixedSizeInBits()) {
51393 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51394 RegVT.getSizeInBits());
51395 Extract = DAG.getBitcast(RegVT, Extract);
51396 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51397 }
51398 }
51399 }
51400
51401 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51402 unsigned AddrSpace = Ld->getAddressSpace();
51403 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51404 AddrSpace == X86AS::PTR32_UPTR) {
51405 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51406 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51407 SDValue Cast =
51408 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51409 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
51410 Ld->getOriginalAlign(),
51411 Ld->getMemOperand()->getFlags());
51412 }
51413 }
51414
51415 return SDValue();
51416}
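// Editor's note: a hedged sketch, not in the original file, of the "split a
// slow 32-byte load into two 16-byte loads" shape above, written with AVX
// intrinsics; assumes <immintrin.h> is available.
static __m256 load256AsTwoHalves(const float *P) {
  __m128 Lo = _mm_loadu_ps(P);     // bytes 0..15
  __m128 Hi = _mm_loadu_ps(P + 4); // bytes 16..31
  return _mm256_insertf128_ps(_mm256_castps128_ps256(Lo), Hi, 1);
}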
51417
51418/// If V is a build vector of boolean constants and exactly one of those
51419/// constants is true, return the operand index of that true element.
51420/// Otherwise, return -1.
51421static int getOneTrueElt(SDValue V) {
51422 // This needs to be a build vector of booleans.
51423 // TODO: Checking for the i1 type matches the IR definition for the mask,
51424 // but the mask check could be loosened to i8 or other types. That might
51425  // also require checking more than 'allOnesValue'; e.g., the x86 HW
51426 // instructions only require that the MSB is set for each mask element.
51427 // The ISD::MSTORE comments/definition do not specify how the mask operand
51428 // is formatted.
51429 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51430 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51431 return -1;
51432
51433 int TrueIndex = -1;
51434 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51435 for (unsigned i = 0; i < NumElts; ++i) {
51436 const SDValue &Op = BV->getOperand(i);
51437 if (Op.isUndef())
51438 continue;
51439 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51440 if (!ConstNode)
51441 return -1;
51442 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51443 // If we already found a one, this is too many.
51444 if (TrueIndex >= 0)
51445 return -1;
51446 TrueIndex = i;
51447 }
51448 }
51449 return TrueIndex;
51450}
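// Editor's note: the same "exactly one true element" contract, restated on a
// plain bool array purely as an editor-added illustration (not in the file).
static int getOneTrueEltPlain(const bool *Mask, unsigned NumElts) {
  int TrueIndex = -1;
  for (unsigned i = 0; i != NumElts; ++i)
    if (Mask[i]) {
      if (TrueIndex >= 0)
        return -1; // a second true element disqualifies the mask
      TrueIndex = (int)i;
    }
  return TrueIndex;
}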
51451
51452/// Given a masked memory load/store operation, return true if it has one mask
51453/// bit set. If it has one mask bit set, then also return the memory address of
51454/// the scalar element to load/store, the vector index to insert/extract that
51455/// scalar element, and the alignment for the scalar memory access.
51456static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51457 SelectionDAG &DAG, SDValue &Addr,
51458 SDValue &Index, Align &Alignment,
51459 unsigned &Offset) {
51460 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51461 if (TrueMaskElt < 0)
51462 return false;
51463
51464 // Get the address of the one scalar element that is specified by the mask
51465 // using the appropriate offset from the base pointer.
51466 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51467 Offset = 0;
51468 Addr = MaskedOp->getBasePtr();
51469 if (TrueMaskElt != 0) {
51470 Offset = TrueMaskElt * EltVT.getStoreSize();
51471 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
51472 SDLoc(MaskedOp));
51473 }
51474
51475 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51476 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51477 EltVT.getStoreSize());
51478 return true;
51479}
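// Editor's note: a worked example, editor-added and not in the file, of the
// address/alignment arithmetic above for a v8i32 masked op whose only set mask
// bit is element 5 and whose original alignment is 32 bytes.
static void oneTrueMaskedEltExample() {
  unsigned Offset = 5 * 4;                           // TrueMaskElt * store size of i32
  Align ScalarAlign = commonAlignment(Align(32), 4); // becomes Align(4)
  (void)Offset;
  (void)ScalarAlign;
}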
51480
51481/// If exactly one element of the mask is set for a non-extending masked load,
51482/// it is a scalar load and vector insert.
51483/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51484/// mask have already been optimized in IR, so we don't bother with those here.
51485static SDValue
51486reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51487 TargetLowering::DAGCombinerInfo &DCI,
51488 const X86Subtarget &Subtarget) {
51489  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51490 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51491 // However, some target hooks may need to be added to know when the transform
51492 // is profitable. Endianness would also have to be considered.
51493
51494 SDValue Addr, VecIndex;
51495 Align Alignment;
51496 unsigned Offset;
51497 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51498 return SDValue();
51499
51500 // Load the one scalar element that is specified by the mask using the
51501 // appropriate offset from the base pointer.
51502 SDLoc DL(ML);
51503 EVT VT = ML->getValueType(0);
51504 EVT EltVT = VT.getVectorElementType();
51505
51506 EVT CastVT = VT;
51507 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51508 EltVT = MVT::f64;
51509 CastVT = VT.changeVectorElementType(EltVT);
51510 }
51511
51512 SDValue Load =
51513 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51514 ML->getPointerInfo().getWithOffset(Offset),
51515 Alignment, ML->getMemOperand()->getFlags());
51516
51517 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51518
51519 // Insert the loaded element into the appropriate place in the vector.
51520 SDValue Insert =
51521 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51522 Insert = DAG.getBitcast(VT, Insert);
51523 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51524}
51525
51526static SDValue
51527combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51528 TargetLowering::DAGCombinerInfo &DCI) {
51529  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51530 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51531 return SDValue();
51532
51533 SDLoc DL(ML);
51534 EVT VT = ML->getValueType(0);
51535
51536 // If we are loading the first and last elements of a vector, it is safe and
51537 // always faster to load the whole vector. Replace the masked load with a
51538 // vector load and select.
51539 unsigned NumElts = VT.getVectorNumElements();
51540 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51541 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51542 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51543 if (LoadFirstElt && LoadLastElt) {
51544 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51545 ML->getMemOperand());
51546 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51547 ML->getPassThru());
51548 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51549 }
51550
51551 // Convert a masked load with a constant mask into a masked load and a select.
51552 // This allows the select operation to use a faster kind of select instruction
51553 // (for example, vblendvps -> vblendps).
51554
51555 // Don't try this if the pass-through operand is already undefined. That would
51556 // cause an infinite loop because that's what we're about to create.
51557 if (ML->getPassThru().isUndef())
51558 return SDValue();
51559
51560 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51561 return SDValue();
51562
51563 // The new masked load has an undef pass-through operand. The select uses the
51564 // original pass-through operand.
51565 SDValue NewML = DAG.getMaskedLoad(
51566 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51567 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51568 ML->getAddressingMode(), ML->getExtensionType());
51569 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51570 ML->getPassThru());
51571
51572 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51573}
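// Editor's note: a hedged sketch, not in the original file, of the "whole load
// plus blend" shape produced above, using AVX intrinsics; assumes
// <immintrin.h>. Loading the full vector is only safe because the first and
// last mask lanes are known to be set, so the whole range is dereferenceable.
static __m256 maskedLoadAsBlend(const float *P, __m256 PassThru, __m256 Mask) {
  __m256 Full = _mm256_loadu_ps(P);
  return _mm256_blendv_ps(PassThru, Full, Mask); // mask MSB set -> loaded lane
}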
51574
51575static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51576 TargetLowering::DAGCombinerInfo &DCI,
51577 const X86Subtarget &Subtarget) {
51578 auto *Mld = cast<MaskedLoadSDNode>(N);
51579
51580 // TODO: Expanding load with constant mask may be optimized as well.
51581 if (Mld->isExpandingLoad())
51582 return SDValue();
51583
51584 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51585 if (SDValue ScalarLoad =
51586 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51587 return ScalarLoad;
51588
51589 // TODO: Do some AVX512 subsets benefit from this transform?
51590 if (!Subtarget.hasAVX512())
51591 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51592 return Blend;
51593 }
51594
51595 // If the mask value has been legalized to a non-boolean vector, try to
51596 // simplify ops leading up to it. We only demand the MSB of each lane.
51597 SDValue Mask = Mld->getMask();
51598 if (Mask.getScalarValueSizeInBits() != 1) {
51599 EVT VT = Mld->getValueType(0);
51600 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51601 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51602 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51603 if (N->getOpcode() != ISD::DELETED_NODE)
51604 DCI.AddToWorklist(N);
51605 return SDValue(N, 0);
51606 }
51607 if (SDValue NewMask =
51608 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51609 return DAG.getMaskedLoad(
51610 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51611 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51612 Mld->getAddressingMode(), Mld->getExtensionType());
51613 }
51614
51615 return SDValue();
51616}
51617
51618/// If exactly one element of the mask is set for a non-truncating masked store,
51619/// it is a vector extract and scalar store.
51620/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51621/// mask have already been optimized in IR, so we don't bother with those here.
51622static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51623 SelectionDAG &DAG,
51624 const X86Subtarget &Subtarget) {
51625 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51626 // However, some target hooks may need to be added to know when the transform
51627 // is profitable. Endianness would also have to be considered.
51628
51629 SDValue Addr, VecIndex;
51630 Align Alignment;
51631 unsigned Offset;
51632 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51633 return SDValue();
51634
51635 // Extract the one scalar element that is actually being stored.
51636 SDLoc DL(MS);
51637 SDValue Value = MS->getValue();
51638 EVT VT = Value.getValueType();
51639 EVT EltVT = VT.getVectorElementType();
51640 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51641 EltVT = MVT::f64;
51642 EVT CastVT = VT.changeVectorElementType(EltVT);
51643 Value = DAG.getBitcast(CastVT, Value);
51644 }
51645 SDValue Extract =
51646 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51647
51648 // Store that element at the appropriate offset from the base pointer.
51649 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51650 MS->getPointerInfo().getWithOffset(Offset),
51651 Alignment, MS->getMemOperand()->getFlags());
51652}
51653
51654static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51655 TargetLowering::DAGCombinerInfo &DCI,
51656 const X86Subtarget &Subtarget) {
51657 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51658 if (Mst->isCompressingStore())
51659 return SDValue();
51660
51661 EVT VT = Mst->getValue().getValueType();
51662 SDLoc dl(Mst);
51663 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51664
51665 if (Mst->isTruncatingStore())
51666 return SDValue();
51667
51668 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51669 return ScalarStore;
51670
51671 // If the mask value has been legalized to a non-boolean vector, try to
51672 // simplify ops leading up to it. We only demand the MSB of each lane.
51673 SDValue Mask = Mst->getMask();
51674 if (Mask.getScalarValueSizeInBits() != 1) {
51675 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51676 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51677 if (N->getOpcode() != ISD::DELETED_NODE)
51678 DCI.AddToWorklist(N);
51679 return SDValue(N, 0);
51680 }
51681 if (SDValue NewMask =
51682 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51683 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51684 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51685 Mst->getMemoryVT(), Mst->getMemOperand(),
51686 Mst->getAddressingMode());
51687 }
51688
51689 SDValue Value = Mst->getValue();
51690 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51691 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51692 Mst->getMemoryVT())) {
51693 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51694 Mst->getBasePtr(), Mst->getOffset(), Mask,
51695 Mst->getMemoryVT(), Mst->getMemOperand(),
51696 Mst->getAddressingMode(), true);
51697 }
51698
51699 return SDValue();
51700}
51701
51702static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
51703 TargetLowering::DAGCombinerInfo &DCI,
51704 const X86Subtarget &Subtarget) {
51705 StoreSDNode *St = cast<StoreSDNode>(N);
51706 EVT StVT = St->getMemoryVT();
51707 SDLoc dl(St);
51708 SDValue StoredVal = St->getValue();
51709 EVT VT = StoredVal.getValueType();
51710 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51711
51712 // Convert a store of vXi1 into a store of iX and a bitcast.
51713 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
51714 VT.getVectorElementType() == MVT::i1) {
51715
51716 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
51717 StoredVal = DAG.getBitcast(NewVT, StoredVal);
51718
51719 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51720 St->getPointerInfo(), St->getOriginalAlign(),
51721 St->getMemOperand()->getFlags());
51722 }
51723
51724 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
51725 // This will avoid a copy to k-register.
51726 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
51727 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51728 StoredVal.getOperand(0).getValueType() == MVT::i8) {
51729 SDValue Val = StoredVal.getOperand(0);
51730 // We must store zeros to the unused bits.
51731 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
51732 return DAG.getStore(St->getChain(), dl, Val,
51733 St->getBasePtr(), St->getPointerInfo(),
51734 St->getOriginalAlign(),
51735 St->getMemOperand()->getFlags());
51736 }
51737
51738 // Widen v2i1/v4i1 stores to v8i1.
51739 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
51740 Subtarget.hasAVX512()) {
51741 unsigned NumConcats = 8 / VT.getVectorNumElements();
51742 // We must store zeros to the unused bits.
51743 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
51744 Ops[0] = StoredVal;
51745 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
51746 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51747 St->getPointerInfo(), St->getOriginalAlign(),
51748 St->getMemOperand()->getFlags());
51749 }
51750
51751 // Turn vXi1 stores of constants into a scalar store.
51752 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
51753 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
51754 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
51755    // If it's a v64i1 store without 64-bit support, we need two stores.
51756 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
51757 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
51758 StoredVal->ops().slice(0, 32));
51759 Lo = combinevXi1ConstantToInteger(Lo, DAG);
51760 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
51761 StoredVal->ops().slice(32, 32));
51762 Hi = combinevXi1ConstantToInteger(Hi, DAG);
51763
51764 SDValue Ptr0 = St->getBasePtr();
51765 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
51766
51767 SDValue Ch0 =
51768 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51769 St->getOriginalAlign(),
51770 St->getMemOperand()->getFlags());
51771 SDValue Ch1 =
51772 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51773 St->getPointerInfo().getWithOffset(4),
51774 St->getOriginalAlign(),
51775 St->getMemOperand()->getFlags());
51776 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
51777 }
51778
51779 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
51780 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51781 St->getPointerInfo(), St->getOriginalAlign(),
51782 St->getMemOperand()->getFlags());
51783 }
51784
51785 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51786 // Sandy Bridge, perform two 16-byte stores.
51787 unsigned Fast;
51788 if (VT.is256BitVector() && StVT == VT &&
51789 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51790 *St->getMemOperand(), &Fast) &&
51791 !Fast) {
51792 unsigned NumElems = VT.getVectorNumElements();
51793 if (NumElems < 2)
51794 return SDValue();
51795
51796 return splitVectorStore(St, DAG);
51797 }
51798
51799 // Split under-aligned vector non-temporal stores.
51800 if (St->isNonTemporal() && StVT == VT &&
51801 St->getAlign().value() < VT.getStoreSize()) {
51802 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51803 // vectors or the legalizer can scalarize it to use MOVNTI.
51804 if (VT.is256BitVector() || VT.is512BitVector()) {
51805 unsigned NumElems = VT.getVectorNumElements();
51806 if (NumElems < 2)
51807 return SDValue();
51808 return splitVectorStore(St, DAG);
51809 }
51810
51811 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51812 // to use MOVNTI.
51813 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
51814 MVT NTVT = Subtarget.hasSSE4A()
51815 ? MVT::v2f64
51816 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
51817 return scalarizeVectorStore(St, NTVT, DAG);
51818 }
51819 }
51820
51821  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
51822  // supported, but AVX512F is, by extending to v16i32 and truncating.
51823 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51824 St->getValue().getOpcode() == ISD::TRUNCATE &&
51825 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51826 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51827 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51828 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51829 St->getValue().getOperand(0));
51830 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51831 MVT::v16i8, St->getMemOperand());
51832 }
51833
51834 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51835 if (!St->isTruncatingStore() &&
51836 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51837 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51838 StoredVal.hasOneUse() &&
51839 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51840 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51841 return EmitTruncSStore(IsSigned, St->getChain(),
51842 dl, StoredVal.getOperand(0), St->getBasePtr(),
51843 VT, St->getMemOperand(), DAG);
51844 }
51845
51846  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51847 if (!St->isTruncatingStore()) {
51848 auto IsExtractedElement = [](SDValue V) {
51849 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51850 V = V.getOperand(0);
51851 unsigned Opc = V.getOpcode();
51852 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51853 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51854 V.getOperand(0).hasOneUse())
51855 return V.getOperand(0);
51856 return SDValue();
51857 };
51858 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51859 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51860 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51861 SDValue Src = Trunc.getOperand(0);
51862 MVT DstVT = Trunc.getSimpleValueType();
51863 MVT SrcVT = Src.getSimpleValueType();
51864 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51865 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51866 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51867 if (NumTruncBits == VT.getSizeInBits() &&
51868 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51869 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51870 TruncVT, St->getMemOperand());
51871 }
51872 }
51873 }
51874 }
51875
51876 // Optimize trunc store (of multiple scalars) to shuffle and store.
51877 // First, pack all of the elements in one place. Next, store to memory
51878 // in fewer chunks.
51879 if (St->isTruncatingStore() && VT.isVector()) {
51880    // Check if we can detect an AVG pattern from the truncation. If yes,
51881    // replace the trunc store with a normal store of the result of the
51882    // X86ISD::AVG instruction.
51883 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51884 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51885 Subtarget, dl))
51886 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51887 St->getPointerInfo(), St->getOriginalAlign(),
51888 St->getMemOperand()->getFlags());
51889
51890 if (TLI.isTruncStoreLegal(VT, StVT)) {
51891 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51892 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51893 dl, Val, St->getBasePtr(),
51894 St->getMemoryVT(), St->getMemOperand(), DAG);
51895 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51896 DAG, dl))
51897 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51898 dl, Val, St->getBasePtr(),
51899 St->getMemoryVT(), St->getMemOperand(), DAG);
51900 }
51901
51902 return SDValue();
51903 }
51904
51905 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51906 unsigned AddrSpace = St->getAddressSpace();
51907 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51908 AddrSpace == X86AS::PTR32_UPTR) {
51909 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51910 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51911 SDValue Cast =
51912 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51913 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
51914 St->getPointerInfo(), St->getOriginalAlign(),
51915 St->getMemOperand()->getFlags(), St->getAAInfo());
51916 }
51917 }
51918
51919 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51920 // the FP state in cases where an emms may be missing.
51921 // A preferable solution to the general problem is to figure out the right
51922 // places to insert EMMS. This qualifies as a quick hack.
51923
51924 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51925 if (VT.getSizeInBits() != 64)
51926 return SDValue();
51927
51928 const Function &F = DAG.getMachineFunction().getFunction();
51929 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51930 bool F64IsLegal =
51931 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51932 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
51933 isa<LoadSDNode>(St->getValue()) &&
51934 cast<LoadSDNode>(St->getValue())->isSimple() &&
51935 St->getChain().hasOneUse() && St->isSimple()) {
51936 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
51937
51938 if (!ISD::isNormalLoad(Ld))
51939 return SDValue();
51940
51941 // Avoid the transformation if there are multiple uses of the loaded value.
51942 if (!Ld->hasNUsesOfValue(1, 0))
51943 return SDValue();
51944
51945 SDLoc LdDL(Ld);
51946 SDLoc StDL(N);
51947 // Lower to a single movq load/store pair.
51948 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51949 Ld->getBasePtr(), Ld->getMemOperand());
51950
51951 // Make sure new load is placed in same chain order.
51952 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51953 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51954 St->getMemOperand());
51955 }
51956
51957 // This is similar to the above case, but here we handle a scalar 64-bit
51958 // integer store that is extracted from a vector on a 32-bit target.
51959 // If we have SSE2, then we can treat it like a floating-point double
51960 // to get past legalization. The execution dependencies fixup pass will
51961 // choose the optimal machine instruction for the store if this really is
51962 // an integer or v2f32 rather than an f64.
51963 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
51964 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51965 SDValue OldExtract = St->getOperand(1);
51966 SDValue ExtOp0 = OldExtract.getOperand(0);
51967 unsigned VecSize = ExtOp0.getValueSizeInBits();
51968 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51969 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51970 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51971 BitCast, OldExtract.getOperand(1));
51972 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51973 St->getPointerInfo(), St->getOriginalAlign(),
51974 St->getMemOperand()->getFlags());
51975 }
51976
51977 return SDValue();
51978}
51979
51980static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51981 TargetLowering::DAGCombinerInfo &DCI,
51982 const X86Subtarget &Subtarget) {
51983 auto *St = cast<MemIntrinsicSDNode>(N);
51984
51985 SDValue StoredVal = N->getOperand(1);
51986 MVT VT = StoredVal.getSimpleValueType();
51987 EVT MemVT = St->getMemoryVT();
51988
51989 // Figure out which elements we demand.
51990 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51991 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51992
51993 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51994 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51995 if (N->getOpcode() != ISD::DELETED_NODE)
51996 DCI.AddToWorklist(N);
51997 return SDValue(N, 0);
51998 }
51999
52000 return SDValue();
52001}
52002
52003/// Return 'true' if this vector operation is "horizontal"
52004/// and return the operands for the horizontal operation in LHS and RHS. A
52005/// horizontal operation performs the binary operation on successive elements
52006/// of its first operand, then on successive elements of its second operand,
52007/// returning the resulting values in a vector. For example, if
52008/// A = < float a0, float a1, float a2, float a3 >
52009/// and
52010/// B = < float b0, float b1, float b2, float b3 >
52011/// then the result of doing a horizontal operation on A and B is
52012/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
52013/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
52014/// A horizontal-op B, for some already available A and B, and if so then LHS is
52015/// set to A, RHS to B, and the routine returns 'true'.
52016static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
52017 SelectionDAG &DAG, const X86Subtarget &Subtarget,
52018 bool IsCommutative,
52019 SmallVectorImpl<int> &PostShuffleMask) {
52020 // If either operand is undef, bail out. The binop should be simplified.
52021 if (LHS.isUndef() || RHS.isUndef())
52022 return false;
52023
52024 // Look for the following pattern:
52025 // A = < float a0, float a1, float a2, float a3 >
52026 // B = < float b0, float b1, float b2, float b3 >
52027 // and
52028 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
52029 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
52030 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
52031 // which is A horizontal-op B.
52032
52033 MVT VT = LHS.getSimpleValueType();
52034  assert((VT.is128BitVector() || VT.is256BitVector()) &&
52035         "Unsupported vector type for horizontal add/sub");
52036 unsigned NumElts = VT.getVectorNumElements();
52037
52038 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
52039 SmallVectorImpl<int> &ShuffleMask) {
52040 bool UseSubVector = false;
52041 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52042 Op.getOperand(0).getValueType().is256BitVector() &&
52043 llvm::isNullConstant(Op.getOperand(1))) {
52044 Op = Op.getOperand(0);
52045 UseSubVector = true;
52046 }
52047 SmallVector<SDValue, 2> SrcOps;
52048 SmallVector<int, 16> SrcMask, ScaledMask;
52049 SDValue BC = peekThroughBitcasts(Op);
52050 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
52051 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
52052 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
52053 })) {
52054 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
52055 if (!UseSubVector && SrcOps.size() <= 2 &&
52056 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
52057 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
52058 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
52059 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
52060 }
52061 if (UseSubVector && SrcOps.size() == 1 &&
52062 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
52063 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
52064 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
52065 ShuffleMask.assign(Mask.begin(), Mask.end());
52066 }
52067 }
52068 };
52069
52070 // View LHS in the form
52071 // LHS = VECTOR_SHUFFLE A, B, LMask
52072 // If LHS is not a shuffle, then pretend it is the identity shuffle:
52073 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52074 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
52075 SDValue A, B;
52076 SmallVector<int, 16> LMask;
52077 GetShuffle(LHS, A, B, LMask);
52078
52079 // Likewise, view RHS in the form
52080 // RHS = VECTOR_SHUFFLE C, D, RMask
52081 SDValue C, D;
52082 SmallVector<int, 16> RMask;
52083 GetShuffle(RHS, C, D, RMask);
52084
52085 // At least one of the operands should be a vector shuffle.
52086 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
52087 if (NumShuffles == 0)
52088 return false;
52089
52090 if (LMask.empty()) {
52091 A = LHS;
52092 for (unsigned i = 0; i != NumElts; ++i)
52093 LMask.push_back(i);
52094 }
52095
52096 if (RMask.empty()) {
52097 C = RHS;
52098 for (unsigned i = 0; i != NumElts; ++i)
52099 RMask.push_back(i);
52100 }
52101
52102  // If we have a unary mask, ensure the other op is set to null.
52103 if (isUndefOrInRange(LMask, 0, NumElts))
52104 B = SDValue();
52105 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
52106 A = SDValue();
52107
52108 if (isUndefOrInRange(RMask, 0, NumElts))
52109 D = SDValue();
52110 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
52111 C = SDValue();
52112
52113 // If A and B occur in reverse order in RHS, then canonicalize by commuting
52114 // RHS operands and shuffle mask.
52115 if (A != C) {
52116 std::swap(C, D);
52117 ShuffleVectorSDNode::commuteMask(RMask);
52118 }
52119 // Check that the shuffles are both shuffling the same vectors.
52120 if (!(A == C && B == D))
52121 return false;
52122
52123 PostShuffleMask.clear();
52124 PostShuffleMask.append(NumElts, SM_SentinelUndef);
52125
52126 // LHS and RHS are now:
52127 // LHS = shuffle A, B, LMask
52128 // RHS = shuffle A, B, RMask
52129 // Check that the masks correspond to performing a horizontal operation.
52130 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52131 // so we just repeat the inner loop if this is a 256-bit op.
52132 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52133 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52134 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52135  assert((NumEltsPer128BitChunk % 2 == 0) &&
52136         "Vector type should have an even number of elements in each lane");
52137 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52138 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52139 // Ignore undefined components.
52140 int LIdx = LMask[i + j], RIdx = RMask[i + j];
52141 if (LIdx < 0 || RIdx < 0 ||
52142 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52143 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52144 continue;
52145
52146 // Check that successive odd/even elements are being operated on. If not,
52147 // this is not a horizontal operation.
52148 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52149 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52150 return false;
52151
52152 // Compute the post-shuffle mask index based on where the element
52153 // is stored in the HOP result, and where it needs to be moved to.
52154 int Base = LIdx & ~1u;
52155 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52156 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52157
52158 // The low half of the 128-bit result must choose from A.
52159 // The high half of the 128-bit result must choose from B,
52160 // unless B is undef. In that case, we are always choosing from A.
52161 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52162 Index += NumEltsPer64BitChunk;
52163 PostShuffleMask[i + j] = Index;
52164 }
52165 }
52166
52167 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52168 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52169
52170 bool IsIdentityPostShuffle =
52171 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52172 if (IsIdentityPostShuffle)
52173 PostShuffleMask.clear();
52174
52175 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52176 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52177 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52178 return false;
52179
52180 // If the source nodes are already used in HorizOps then always accept this.
52181 // Shuffle folding should merge these back together.
52182 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
52183 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52184 });
52185 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
52186 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52187 });
52188 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
52189
52190 // Assume a SingleSource HOP if we only shuffle one input and don't need to
52191 // shuffle the result.
52192 if (!ForceHorizOp &&
52193 !shouldUseHorizontalOp(NewLHS == NewRHS &&
52194 (NumShuffles < 2 || !IsIdentityPostShuffle),
52195 DAG, Subtarget))
52196 return false;
52197
52198 LHS = DAG.getBitcast(VT, NewLHS);
52199 RHS = DAG.getBitcast(VT, NewRHS);
52200 return true;
52201}
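// Editor's note: an editor-added sketch, not in the original file, of the v4f32
// shuffle pattern described above, using SSE intrinsics; assumes <immintrin.h>.
// Adding the even-element and odd-element shuffles is exactly HADDPS(A, B).
static __m128 horizontalAddViaShuffles(__m128 A, __m128 B) {
  __m128 Evens = _mm_shuffle_ps(A, B, _MM_SHUFFLE(2, 0, 2, 0)); // a0,a2,b0,b2
  __m128 Odds  = _mm_shuffle_ps(A, B, _MM_SHUFFLE(3, 1, 3, 1)); // a1,a3,b1,b3
  return _mm_add_ps(Evens, Odds); // == _mm_hadd_ps(A, B)
}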
52202
52203// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52204static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52205 const X86Subtarget &Subtarget) {
52206 EVT VT = N->getValueType(0);
52207 unsigned Opcode = N->getOpcode();
52208 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52209 SmallVector<int, 8> PostShuffleMask;
52210
52211 switch (Opcode) {
52212 case ISD::FADD:
52213 case ISD::FSUB:
52214 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52215 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52216 SDValue LHS = N->getOperand(0);
52217 SDValue RHS = N->getOperand(1);
52218 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52219 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52220 PostShuffleMask)) {
52221 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52222 if (!PostShuffleMask.empty())
52223 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52224 DAG.getUNDEF(VT), PostShuffleMask);
52225 return HorizBinOp;
52226 }
52227 }
52228 break;
52229 case ISD::ADD:
52230 case ISD::SUB:
52231 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52232 VT == MVT::v16i16 || VT == MVT::v8i32)) {
52233 SDValue LHS = N->getOperand(0);
52234 SDValue RHS = N->getOperand(1);
52235 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52236 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52237 PostShuffleMask)) {
52238 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52239 ArrayRef<SDValue> Ops) {
52240 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52241 };
52242 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52243 {LHS, RHS}, HOpBuilder);
52244 if (!PostShuffleMask.empty())
52245 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52246 DAG.getUNDEF(VT), PostShuffleMask);
52247 return HorizBinOp;
52248 }
52249 }
52250 break;
52251 }
52252
52253 return SDValue();
52254}
52255
52256// Try to combine the following nodes
52257// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52258// <i32 -2147483648[float -0.000000e+00]> 0
52259// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52260// <(load 4 from constant-pool)> t0, t29
52261// [t30: v16i32 = bitcast t27]
52262// t6: v16i32 = xor t7, t27[t30]
52263// t11: v16f32 = bitcast t6
52264// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52265// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52266// t22: v16f32 = bitcast t7
52267// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52268// t24: v32f16 = bitcast t23
52269static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52270 const X86Subtarget &Subtarget) {
52271 EVT VT = N->getValueType(0);
52272 SDValue LHS = N->getOperand(0);
52273 SDValue RHS = N->getOperand(1);
52274 int CombineOpcode =
52275 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52276 auto isConjugationConstant = [](const Constant *c) {
52277 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
52278 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52279 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52280 switch (CI->getBitWidth()) {
52281 case 16:
52282 return false;
52283 case 32:
52284 return CI->getValue() == ConjugationInt32;
52285 case 64:
52286 return CI->getValue() == ConjugationInt64;
52287 default:
52288 llvm_unreachable("Unexpected bit width");
52289 }
52290 }
52291 if (const auto *CF = dyn_cast<ConstantFP>(c))
52292 return CF->isNegativeZeroValue();
52293 return false;
52294 };
52295 auto combineConjugation = [&](SDValue &r) {
52296 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52297 SDValue XOR = LHS.getOperand(0);
52298 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52299 SDValue XORRHS = XOR.getOperand(1);
52300 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
52301 XORRHS = XORRHS.getOperand(0);
52302 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
52303 XORRHS.getOperand(1).getNumOperands()) {
52304 ConstantPoolSDNode *CP =
52305 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
52306 if (CP && isConjugationConstant(CP->getConstVal())) {
52307 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52308 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52309 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52310 r = DAG.getBitcast(VT, FCMulC);
52311 return true;
52312 }
52313 }
52314 }
52315 }
52316 return false;
52317 };
52318 SDValue Res;
52319 if (combineConjugation(Res))
52320 return Res;
52321 std::swap(LHS, RHS);
52322 if (combineConjugation(Res))
52323 return Res;
52324 return Res;
52325}
52326
52327// Try to combine the following nodes:
52328// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52329static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52330 const X86Subtarget &Subtarget) {
52331 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52332 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52333 Flags.hasAllowContract();
52334 };
52335
52336 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52337 return DAG.getTarget().Options.NoSignedZerosFPMath ||
52338 Flags.hasNoSignedZeros();
52339 };
52340 auto IsVectorAllNegativeZero = [](const SDNode *N) {
52341 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
52342 return false;
52343 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
52344 "Unexpected vector type!");
52345 if (ConstantPoolSDNode *CP =
52346 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
52347 APInt AI = APInt(32, 0x80008000, true);
52348 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
52349 return CI->getValue() == AI;
52350 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
52351 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
52352 }
52353 return false;
52354 };
52355
52356 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52357 !AllowContract(N->getFlags()))
52358 return SDValue();
52359
52360 EVT VT = N->getValueType(0);
52361 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52362 return SDValue();
52363
52364 SDValue LHS = N->getOperand(0);
52365 SDValue RHS = N->getOperand(1);
52366 bool IsConj;
52367 SDValue FAddOp1, MulOp0, MulOp1;
52368 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52369 &IsVectorAllNegativeZero,
52370 &HasNoSignedZero](SDValue N) -> bool {
52371 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52372 return false;
52373 SDValue Op0 = N.getOperand(0);
52374 unsigned Opcode = Op0.getOpcode();
52375 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52376 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52377 MulOp0 = Op0.getOperand(0);
52378 MulOp1 = Op0.getOperand(1);
52379 IsConj = Opcode == X86ISD::VFCMULC;
52380 return true;
52381 }
52382 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52383 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52384 HasNoSignedZero(Op0->getFlags())) ||
52385 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
52386 MulOp0 = Op0.getOperand(0);
52387 MulOp1 = Op0.getOperand(1);
52388 IsConj = Opcode == X86ISD::VFCMADDC;
52389 return true;
52390 }
52391 }
52392 return false;
52393 };
52394
52395 if (GetCFmulFrom(LHS))
52396 FAddOp1 = RHS;
52397 else if (GetCFmulFrom(RHS))
52398 FAddOp1 = LHS;
52399 else
52400 return SDValue();
52401
52402 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52403 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52404 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52405 // FIXME: How do we handle when fast math flags of FADD are different from
52406 // CFMUL's?
52407 SDValue CFmul =
52408 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52409 return DAG.getBitcast(VT, CFmul);
52410}
52411
52412/// Do target-specific dag combines on floating-point adds/subs.
52413static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52414 const X86Subtarget &Subtarget) {
52415 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52416 return HOp;
52417
52418 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52419 return COp;
52420
52421 return SDValue();
52422}
52423
52424/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52425/// the codegen.
52426/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52427/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52428/// anything that is guaranteed to be transformed by DAGCombiner.
52429static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52430 const X86Subtarget &Subtarget,
52431 const SDLoc &DL) {
52432 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52433 SDValue Src = N->getOperand(0);
52434 unsigned SrcOpcode = Src.getOpcode();
52435 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52436
52437 EVT VT = N->getValueType(0);
52438 EVT SrcVT = Src.getValueType();
52439
52440 auto IsFreeTruncation = [VT](SDValue Op) {
52441 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52442
52443 // See if this has been extended from a smaller/equal size to
52444 // the truncation size, allowing a truncation to combine with the extend.
52445 unsigned Opcode = Op.getOpcode();
52446 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52447 Opcode == ISD::ZERO_EXTEND) &&
52448 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52449 return true;
52450
52451 // See if this is a single use constant which can be constant folded.
52452 // NOTE: We don't peek through bitcasts here because there is currently
52453 // no support for constant folding truncate+bitcast+vector_of_constants. So
52454 // we'll just end up with a truncate on both operands which will
52455 // get turned back into (truncate (binop)) causing an infinite loop.
52456 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52457 };
52458
52459 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52460 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52461 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52462 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52463 };
52464
52465 // Don't combine if the operation has other uses.
52466 if (!Src.hasOneUse())
52467 return SDValue();
52468
52469 // Only support vector truncation for now.
52470 // TODO: i64 scalar math would benefit as well.
52471 if (!VT.isVector())
52472 return SDValue();
52473
52474 // In most cases it's only worth pre-truncating if we're only facing the cost
52475 // of one truncation.
52476 // i.e. if one of the inputs will constant fold or the input is repeated.
52477 switch (SrcOpcode) {
52478 case ISD::MUL:
52479 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52480 // better to truncate if we have the chance.
52481 if (SrcVT.getScalarType() == MVT::i64 &&
52482 TLI.isOperationLegal(SrcOpcode, VT) &&
52483 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52484 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52485 [[fallthrough]];
52486 case ISD::AND:
52487 case ISD::XOR:
52488 case ISD::OR:
52489 case ISD::ADD:
52490 case ISD::SUB: {
52491 SDValue Op0 = Src.getOperand(0);
52492 SDValue Op1 = Src.getOperand(1);
52493 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52494 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52495 return TruncateArithmetic(Op0, Op1);
52496 break;
52497 }
52498 }
52499
52500 return SDValue();
52501}
52502
52503/// Truncate using ISD::AND mask and X86ISD::PACKUS.
52504/// e.g. trunc <8 x i32> X to <8 x i16> -->
52505/// MaskX = X & 0xffff (clear high bits to prevent saturation)
52506/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
52507static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
52508 const X86Subtarget &Subtarget,
52509 SelectionDAG &DAG) {
52510 SDValue In = N->getOperand(0);
52511 EVT InVT = In.getValueType();
52512 EVT OutVT = N->getValueType(0);
52513
52514 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
52515 OutVT.getScalarSizeInBits());
52516 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
52517 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
52518}
52519
52520/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
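/// (Illustrative note) The SIGN_EXTEND_INREG below sign-extends each element
/// from its low bits, so every value lies in the signed range of the output
/// type and PACKSS packs it unchanged - equivalent to a plain truncation.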
52521static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
52522 const X86Subtarget &Subtarget,
52523 SelectionDAG &DAG) {
52524 SDValue In = N->getOperand(0);
52525 EVT InVT = In.getValueType();
52526 EVT OutVT = N->getValueType(0);
52527 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
52528 DAG.getValueType(OutVT));
52529 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
52530}
52531
52532/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
52533/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
52534/// legalization the truncation will be translated into a BUILD_VECTOR with each
52535/// element that is extracted from a vector and then truncated, and it is
52536/// difficult to do this optimization based on them.
52537static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
52538 const X86Subtarget &Subtarget) {
52539 EVT OutVT = N->getValueType(0);
52540 if (!OutVT.isVector())
52541 return SDValue();
52542
52543 SDValue In = N->getOperand(0);
52544 if (!In.getValueType().isSimple())
52545 return SDValue();
52546
52547 EVT InVT = In.getValueType();
52548 unsigned NumElems = OutVT.getVectorNumElements();
52549
52550 // AVX512 provides fast truncate ops.
52551 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
52552 return SDValue();
52553
52554 EVT OutSVT = OutVT.getVectorElementType();
52555 EVT InSVT = InVT.getVectorElementType();
52556 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
52557 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
52558 NumElems >= 8))
52559 return SDValue();
52560
52561 // SSSE3's pshufb results in fewer instructions in the cases below.
52562 if (Subtarget.hasSSSE3() && NumElems == 8) {
52563 if (InSVT == MVT::i16)
52564 return SDValue();
52565 if (InSVT == MVT::i32 &&
52566 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
52567 return SDValue();
52568 }
52569
52570 SDLoc DL(N);
52571 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
52572 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
52573 // truncate 2 x v4i32 to v8i16.
52574 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
52575 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
52576 if (InSVT == MVT::i32)
52577 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
52578
52579 return SDValue();
52580}
52581
52582 /// This function transforms vector truncation of 'extended sign-bits' or
52583 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
52584 /// X86ISD::PACKSS/PACKUS operations.
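/// For example (illustrative): if a v8i32 input is the result of a vector
/// compare, every element is 0 or -1 (all sign bits), so a v8i32 -> v8i16
/// truncation can be performed losslessly with PACKSS.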
52585static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
52586 SelectionDAG &DAG,
52587 const X86Subtarget &Subtarget) {
52588 // Requires SSE2.
52589 if (!Subtarget.hasSSE2())
52590 return SDValue();
52591
52592 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
52593 return SDValue();
52594
52595 SDValue In = N->getOperand(0);
52596 if (!In.getValueType().isSimple())
52597 return SDValue();
52598
52599 MVT VT = N->getValueType(0).getSimpleVT();
52600 MVT SVT = VT.getScalarType();
52601
52602 MVT InVT = In.getValueType().getSimpleVT();
52603 MVT InSVT = InVT.getScalarType();
52604
52605 // Check we have a truncation suited for PACKSS/PACKUS.
52606 if (!isPowerOf2_32(VT.getVectorNumElements()))
52607 return SDValue();
52608 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
52609 return SDValue();
52610 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
52611 return SDValue();
52612
52613 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
52614 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
52615 return SDValue();
52616
52617 // AVX512 has fast truncate, but if the input is already going to be split,
52618 // there's no harm in trying pack.
52619 if (Subtarget.hasAVX512() &&
52620 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
52621 InVT.is512BitVector())) {
52622 // PACK should still be worth it for 128-bit vectors if the sources were
52623 // originally concatenated from subvectors.
52624 SmallVector<SDValue> ConcatOps;
52625 if (VT.getSizeInBits() > 128 ||
52626 !collectConcatOps(In.getNode(), ConcatOps, DAG))
52627 return SDValue();
52628 }
52629
52630 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
52631 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
52632
52633 // Use PACKUS if the input has zero-bits that extend all the way to the
52634 // packed/truncated value. e.g. masks, zext_in_reg, etc.
52635 KnownBits Known = DAG.computeKnownBits(In);
52636 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
52637 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
52638 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
52639
52640 // Use PACKSS if the input has sign-bits that extend all the way to the
52641 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
52642 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
52643
52644 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
52645 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
52646 // on and combines/simplifications can't then use it.
52647 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
52648 return SDValue();
52649
52650 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
52651 if (NumSignBits > MinSignBits)
52652 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
52653
52654 // If we have a srl that only generates signbits that we will discard in
52655 // the truncation then we can use PACKSS by converting the srl to a sra.
52656 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
52657 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
52658 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
52659 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
52660 if (*ShAmt == MinSignBits) {
52661 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
52662 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
52663 Subtarget);
52664 }
52665 }
52666
52667 return SDValue();
52668}
52669
52670// Try to form a MULHU or MULHS node by looking for
52671// (trunc (srl (mul ext, ext), 16))
52672// TODO: This is X86 specific because we want to be able to handle wide types
52673// before type legalization. But we can only do it if the vector will be
52674// legalized via widening/splitting. Type legalization can't handle promotion
52675// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52676// combiner.
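// For example (illustrative, unsigned case):
//   (v8i16 trunc (v8i32 srl (mul (zext v8i16 X), (zext v8i16 Y)), 16))
//     --> (v8i16 mulhu X, Y)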
52677static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52678 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52679 // First instruction should be a right shift of a multiply.
52680 if (Src.getOpcode() != ISD::SRL ||
52681 Src.getOperand(0).getOpcode() != ISD::MUL)
52682 return SDValue();
52683
52684 if (!Subtarget.hasSSE2())
52685 return SDValue();
52686
52687 // Only handle vXi16 types that are at least 128-bits unless they will be
52688 // widened.
52689 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52690 return SDValue();
52691
52692 // Input type should be at least vXi32.
52693 EVT InVT = Src.getValueType();
52694 if (InVT.getVectorElementType().getSizeInBits() < 32)
52695 return SDValue();
52696
52697 // Need a shift by 16.
52698 APInt ShiftAmt;
52699 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
52700 ShiftAmt != 16)
52701 return SDValue();
52702
52703 SDValue LHS = Src.getOperand(0).getOperand(0);
52704 SDValue RHS = Src.getOperand(0).getOperand(1);
52705
52706 // Count leading sign/zero bits on both inputs - if there are enough then
52707 // truncation back to vXi16 will be cheap - either as a pack/shuffle
52708 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
52709 // truncations may actually be free by peeking through to the ext source.
52710 auto IsSext = [&DAG](SDValue V) {
52711 return DAG.ComputeMaxSignificantBits(V) <= 16;
52712 };
52713 auto IsZext = [&DAG](SDValue V) {
52714 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
52715 };
52716
52717 bool IsSigned = IsSext(LHS) && IsSext(RHS);
52718 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
52719 if (!IsSigned && !IsUnsigned)
52720 return SDValue();
52721
52722 // Check if both inputs are extensions, which will be removed by truncation.
52723 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
52724 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
52725 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
52726 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
52727 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
52728 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
52729
52730 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
52731 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
52732 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
52733 // will have to split anyway.
52734 unsigned InSizeInBits = InVT.getSizeInBits();
52735 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
52736 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
52737 (InSizeInBits % 16) == 0) {
52738 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52739 InVT.getSizeInBits() / 16);
52740 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
52741 DAG.getBitcast(BCVT, RHS));
52742 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
52743 }
52744
52745 // Truncate back to source type.
52746 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
52747 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
52748
52749 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
52750 return DAG.getNode(Opc, DL, VT, LHS, RHS);
52751}
52752
52753// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
52754// from one vector with signed bytes from another vector, adds together
52755// adjacent pairs of 16-bit products, and saturates the result before
52756// truncating to 16-bits.
52757//
52758// Which looks something like this:
52759// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
52760// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
52761static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
52762 const X86Subtarget &Subtarget,
52763 const SDLoc &DL) {
52764 if (!VT.isVector() || !Subtarget.hasSSSE3())
52765 return SDValue();
52766
52767 unsigned NumElems = VT.getVectorNumElements();
52768 EVT ScalarVT = VT.getVectorElementType();
52769 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
52770 return SDValue();
52771
52772 SDValue SSatVal = detectSSatPattern(In, VT);
52773 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
52774 return SDValue();
52775
52776 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
52777 // of multiplies from even/odd elements.
52778 SDValue N0 = SSatVal.getOperand(0);
52779 SDValue N1 = SSatVal.getOperand(1);
52780
52781 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52782 return SDValue();
52783
52784 SDValue N00 = N0.getOperand(0);
52785 SDValue N01 = N0.getOperand(1);
52786 SDValue N10 = N1.getOperand(0);
52787 SDValue N11 = N1.getOperand(1);
52788
52789 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
52790 // Canonicalize zero_extend to LHS.
52791 if (N01.getOpcode() == ISD::ZERO_EXTEND)
52792 std::swap(N00, N01);
52793 if (N11.getOpcode() == ISD::ZERO_EXTEND)
52794 std::swap(N10, N11);
52795
52796 // Ensure we have a zero_extend and a sign_extend.
52797 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
52798 N01.getOpcode() != ISD::SIGN_EXTEND ||
52799 N10.getOpcode() != ISD::ZERO_EXTEND ||
52800 N11.getOpcode() != ISD::SIGN_EXTEND)
52801 return SDValue();
52802
52803 // Peek through the extends.
52804 N00 = N00.getOperand(0);
52805 N01 = N01.getOperand(0);
52806 N10 = N10.getOperand(0);
52807 N11 = N11.getOperand(0);
52808
52809 // Ensure the extend is from vXi8.
52810 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
52811 N01.getValueType().getVectorElementType() != MVT::i8 ||
52812 N10.getValueType().getVectorElementType() != MVT::i8 ||
52813 N11.getValueType().getVectorElementType() != MVT::i8)
52814 return SDValue();
52815
52816 // All inputs should be build_vectors.
52817 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52818 N01.getOpcode() != ISD::BUILD_VECTOR ||
52819 N10.getOpcode() != ISD::BUILD_VECTOR ||
52820 N11.getOpcode() != ISD::BUILD_VECTOR)
52821 return SDValue();
52822
52823 // N00/N10 are zero extended. N01/N11 are sign extended.
52824
52825 // For each element, we need to ensure we have an odd element from one vector
52826 // multiplied by the odd element of another vector and the even element from
52827 // one of the same vectors being multiplied by the even element from the
52828 // other vector. So we need to make sure for each element i, this operator
52829 // is being performed:
52830 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52831 SDValue ZExtIn, SExtIn;
52832 for (unsigned i = 0; i != NumElems; ++i) {
52833 SDValue N00Elt = N00.getOperand(i);
52834 SDValue N01Elt = N01.getOperand(i);
52835 SDValue N10Elt = N10.getOperand(i);
52836 SDValue N11Elt = N11.getOperand(i);
52837 // TODO: Be more tolerant to undefs.
52838 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52839 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52840 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52841 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52842 return SDValue();
52843 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52844 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52845 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52846 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52847 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52848 return SDValue();
52849 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52850 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52851 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52852 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52853 // Add is commutative so indices can be reordered.
52854 if (IdxN00 > IdxN10) {
52855 std::swap(IdxN00, IdxN10);
52856 std::swap(IdxN01, IdxN11);
52857 }
52858 // N0 indices must be the even element. N1 indices must be the next odd element.
52859 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52860 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52861 return SDValue();
52862 SDValue N00In = N00Elt.getOperand(0);
52863 SDValue N01In = N01Elt.getOperand(0);
52864 SDValue N10In = N10Elt.getOperand(0);
52865 SDValue N11In = N11Elt.getOperand(0);
52866 // The first time we find an input, capture it.
52867 if (!ZExtIn) {
52868 ZExtIn = N00In;
52869 SExtIn = N01In;
52870 }
52871 if (ZExtIn != N00In || SExtIn != N01In ||
52872 ZExtIn != N10In || SExtIn != N11In)
52873 return SDValue();
52874 }
52875
52876 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52877 ArrayRef<SDValue> Ops) {
52878 // Shrink by adding truncate nodes and let DAGCombine fold with the
52879 // sources.
52880 EVT InVT = Ops[0].getValueType();
52881 assert(InVT.getScalarType() == MVT::i8 &&
52882 "Unexpected scalar element type");
52883 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52884 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52885 InVT.getVectorNumElements() / 2);
52886 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
52887 };
52888 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
52889 PMADDBuilder);
52890}
52891
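// Target TRUNCATE combine (summary of the dispatch order below): try
// pre-truncating arithmetic, then pattern-match AVG, PMADDUBSW, saturating
// truncation, PMULHU/PMULHW and MMX movd, and finally fall back to the
// PACKSS/PACKUS based truncations above.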
52892static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
52893 const X86Subtarget &Subtarget) {
52894 EVT VT = N->getValueType(0);
52895 SDValue Src = N->getOperand(0);
52896 SDLoc DL(N);
52897
52898 // Attempt to pre-truncate inputs to arithmetic ops instead.
52899 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
52900 return V;
52901
52902 // Try to detect AVG pattern first.
52903 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
52904 return Avg;
52905
52906 // Try to detect PMADD
52907 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
52908 return PMAdd;
52909
52910 // Try to combine truncation with signed/unsigned saturation.
52911 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
52912 return Val;
52913
52914 // Try to combine PMULHUW/PMULHW for vXi16.
52915 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
52916 return V;
52917
52918 // The bitcast source is a direct mmx result.
52919 // Detect a truncation of a bitcast from x86mmx to i32.
52920 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
52921 SDValue BCSrc = Src.getOperand(0);
52922 if (BCSrc.getValueType() == MVT::x86mmx)
52923 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
52924 }
52925
52926 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
52927 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
52928 return V;
52929
52930 return combineVectorTruncation(N, DAG, Subtarget);
52931}
52932
52933static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
52934 TargetLowering::DAGCombinerInfo &DCI) {
52935 EVT VT = N->getValueType(0);
52936 SDValue In = N->getOperand(0);
52937 SDLoc DL(N);
52938
52939 if (SDValue SSatVal = detectSSatPattern(In, VT))
52940 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
52941 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
52942 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
52943
52944 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52945 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
52946 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52947 return SDValue(N, 0);
52948
52949 return SDValue();
52950}
52951
52952/// Returns the negated value if the node \p N flips sign of FP value.
52953///
52954/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
52955/// or FSUB(0, x)
52956/// AVX512F does not have FXOR, so FNEG is lowered as
52957/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
52958 /// In this case we go through all bitcasts.
52959/// This also recognizes splat of a negated value and returns the splat of that
52960/// value.
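/// e.g. (illustrative, AVX512F-style lowering):
///   (v4f32 bitcast (xor (bitcast X), splat 0x80000000)) is recognized as the
///   negation of X.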
52961static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
52962 if (N->getOpcode() == ISD::FNEG)
52963 return N->getOperand(0);
52964
52965 // Don't recurse exponentially.
52966 if (Depth > SelectionDAG::MaxRecursionDepth)
52967 return SDValue();
52968
52969 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
52970
52971 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
52972 EVT VT = Op->getValueType(0);
52973
52974 // Make sure the element size doesn't change.
52975 if (VT.getScalarSizeInBits() != ScalarSize)
52976 return SDValue();
52977
52978 unsigned Opc = Op.getOpcode();
52979 switch (Opc) {
52980 case ISD::VECTOR_SHUFFLE: {
52981 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
52982 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
52983 if (!Op.getOperand(1).isUndef())
52984 return SDValue();
52985 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
52986 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
52987 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
52988 cast<ShuffleVectorSDNode>(Op)->getMask());
52989 break;
52990 }
52991 case ISD::INSERT_VECTOR_ELT: {
52992 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
52993 // -V, INDEX).
52994 SDValue InsVector = Op.getOperand(0);
52995 SDValue InsVal = Op.getOperand(1);
52996 if (!InsVector.isUndef())
52997 return SDValue();
52998 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
52999 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53000 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53001 NegInsVal, Op.getOperand(2));
53002 break;
53003 }
53004 case ISD::FSUB:
53005 case ISD::XOR:
53006 case X86ISD::FXOR: {
53007 SDValue Op1 = Op.getOperand(1);
53008 SDValue Op0 = Op.getOperand(0);
53009
53010 // For XOR and FXOR, we want to check if constant
53011 // bits of Op1 are sign bit masks. For FSUB, we
53012 // have to check if constant bits of Op0 are sign
53013 // bit masks and hence we swap the operands.
53014 if (Opc == ISD::FSUB)
53015 std::swap(Op0, Op1);
53016
53017 APInt UndefElts;
53018 SmallVector<APInt, 16> EltBits;
53019 // Extract constant bits and see if they are all
53020 // sign bit masks. Ignore the undef elements.
53021 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53022 /* AllowWholeUndefs */ true,
53023 /* AllowPartialUndefs */ false)) {
53024 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
53025 if (!UndefElts[I] && !EltBits[I].isSignMask())
53026 return SDValue();
53027
53028 // Only allow bitcast from correctly-sized constant.
53029 Op0 = peekThroughBitcasts(Op0);
53030 if (Op0.getScalarValueSizeInBits() == ScalarSize)
53031 return Op0;
53032 }
53033 break;
53034 } // case
53035 } // switch
53036
53037 return SDValue();
53038}
53039
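// Map an FMA-family opcode to the equivalent opcode when the multiply
// operands, the accumulator and/or the result are negated. For example
// (illustrative): negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/false,
// /*NegRes=*/false) yields X86ISD::FNMADD.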
53040static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
53041 bool NegRes) {
53042 if (NegMul) {
53043 switch (Opcode) {
53044 default: llvm_unreachable("Unexpected opcode");
53045 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
53046 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
53047 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
53048 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
53049 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
53050 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
53051 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
53052 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
53053 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
53054 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
53055 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
53056 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
53057 }
53058 }
53059
53060 if (NegAcc) {
53061 switch (Opcode) {
53062 default: llvm_unreachable("Unexpected opcode");
53063 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
53064 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
53065 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53066 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
53067 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
53068 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53069 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
53070 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
53071 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53072 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
53073 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
53074 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53075 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
53076 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
53077 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
53078 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
53079 }
53080 }
53081
53082 if (NegRes) {
53083 switch (Opcode) {
53084 // For accuracy reasons, we never combine fneg and fma under strict FP.
53085 default: llvm_unreachable("Unexpected opcode");
53086 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
53087 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53088 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
53089 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53090 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
53091 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53092 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
53093 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53094 }
53095 }
53096
53097 return Opcode;
53098}
53099
53100/// Do target-specific dag combines on floating point negations.
53101static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
53102 TargetLowering::DAGCombinerInfo &DCI,
53103 const X86Subtarget &Subtarget) {
53104 EVT OrigVT = N->getValueType(0);
53105 SDValue Arg = isFNEG(DAG, N);
53106 if (!Arg)
53107 return SDValue();
53108
53109 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53110 EVT VT = Arg.getValueType();
53111 EVT SVT = VT.getScalarType();
53112 SDLoc DL(N);
53113
53114 // Let legalize expand this if it isn't a legal type yet.
53115 if (!TLI.isTypeLegal(VT))
53116 return SDValue();
53117
53118 // If we're negating a FMUL node on a target with FMA, then we can avoid the
53119 // use of a constant by performing (-0 - A*B) instead.
53120 // FIXME: Check rounding control flags as well once it becomes available.
53121 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
53122 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
53123 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
53124 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
53125 Arg.getOperand(1), Zero);
53126 return DAG.getBitcast(OrigVT, NewNode);
53127 }
53128
53129 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53130 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53131 if (SDValue NegArg =
53132 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
53133 return DAG.getBitcast(OrigVT, NegArg);
53134
53135 return SDValue();
53136}
53137
53138SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
53139 bool LegalOperations,
53140 bool ForCodeSize,
53141 NegatibleCost &Cost,
53142 unsigned Depth) const {
53143 // fneg patterns are removable even if they have multiple uses.
53144 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
53145 Cost = NegatibleCost::Cheaper;
53146 return DAG.getBitcast(Op.getValueType(), Arg);
53147 }
53148
53149 EVT VT = Op.getValueType();
53150 EVT SVT = VT.getScalarType();
53151 unsigned Opc = Op.getOpcode();
53152 SDNodeFlags Flags = Op.getNode()->getFlags();
53153 switch (Opc) {
53154 case ISD::FMA:
53155 case X86ISD::FMSUB:
53156 case X86ISD::FNMADD:
53157 case X86ISD::FNMSUB:
53158 case X86ISD::FMADD_RND:
53159 case X86ISD::FMSUB_RND:
53160 case X86ISD::FNMADD_RND:
53161 case X86ISD::FNMSUB_RND: {
53162 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
53163 !(SVT == MVT::f32 || SVT == MVT::f64) ||
53164 !isOperationLegal(ISD::FMA, VT))
53165 break;
53166
53167 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
53168 // if it may have signed zeros.
53169 if (!Flags.hasNoSignedZeros())
53170 break;
53171
53172 // This is always negatible for free but we might be able to remove some
53173 // extra operand negations as well.
53174 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
53175 for (int i = 0; i != 3; ++i)
53176 NewOps[i] = getCheaperNegatedExpression(
53177 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
53178
53179 bool NegA = !!NewOps[0];
53180 bool NegB = !!NewOps[1];
53181 bool NegC = !!NewOps[2];
53182 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
53183
53184 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
53185 : NegatibleCost::Neutral;
53186
53187 // Fill in the non-negated ops with the original values.
53188 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
53189 if (!NewOps[i])
53190 NewOps[i] = Op.getOperand(i);
53191 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
53192 }
53193 case X86ISD::FRCP:
53194 if (SDValue NegOp0 =
53195 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
53196 ForCodeSize, Cost, Depth + 1))
53197 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
53198 break;
53199 }
53200
53201 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
53202 ForCodeSize, Cost, Depth);
53203}
53204
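// Lower an X86 FP logic node (FOR/FXOR/FAND/FANDN) to the corresponding
// integer vector logic op via bitcasts when integer vector types are
// available, e.g. (illustrative) FXOR v4f32 --> bitcast(XOR v4i32).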
53205static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
53206 const X86Subtarget &Subtarget) {
53207 MVT VT = N->getSimpleValueType(0);
53208 // If we have integer vector types available, use the integer opcodes.
53209 if (!VT.isVector() || !Subtarget.hasSSE2())
53210 return SDValue();
53211
53212 SDLoc dl(N);
53213
53214 unsigned IntBits = VT.getScalarSizeInBits();
53215 MVT IntSVT = MVT::getIntegerVT(IntBits);
53216 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
53217
53218 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
53219 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
53220 unsigned IntOpcode;
53221 switch (N->getOpcode()) {
53222 default: llvm_unreachable("Unexpected FP logic op");
53223 case X86ISD::FOR: IntOpcode = ISD::OR; break;
53224 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
53225 case X86ISD::FAND: IntOpcode = ISD::AND; break;
53226 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
53227 }
53228 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
53229 return DAG.getBitcast(VT, IntOp);
53230}
53231
53232
53233/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
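/// e.g. (illustrative): with X86::COND_E as the condition code,
///   xor (X86ISD::SETCC E, EFLAGS), 1 --> X86ISD::SETCC NE, EFLAGS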
53234static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
53235 if (N->getOpcode() != ISD::XOR)
53236 return SDValue();
53237
53238 SDValue LHS = N->getOperand(0);
53239 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
53240 return SDValue();
53241
53242 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
53243 X86::CondCode(LHS->getConstantOperandVal(0)));
53244 SDLoc DL(N);
53245 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
53246}
53247
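// Fold (xor (ctlz_zero_undef X), BitWidth-1) and (sub BitWidth-1,
// (ctlz_zero_undef X)) into X86ISD::BSR (illustrative summary of the combine
// below): since ctlz_zero_undef(X) == BitWidth-1 - bsr(X) for non-zero X,
// both forms recover the index of the most significant set bit, which is
// exactly what BSR computes.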
53248static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
53249 const X86Subtarget &Subtarget) {
53250 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
53251 "Invalid opcode for combining with CTLZ");
53252 if (Subtarget.hasFastLZCNT())
53253 return SDValue();
53254
53255 EVT VT = N->getValueType(0);
53256 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
53257 (VT != MVT::i64 || !Subtarget.is64Bit()))
53258 return SDValue();
53259
53260 SDValue N0 = N->getOperand(0);
53261 SDValue N1 = N->getOperand(1);
53262
53263 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
53264 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
53265 return SDValue();
53266
53267 SDValue OpCTLZ;
53268 SDValue OpSizeTM1;
53269
53270 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
53271 OpCTLZ = N1;
53272 OpSizeTM1 = N0;
53273 } else if (N->getOpcode() == ISD::SUB) {
53274 return SDValue();
53275 } else {
53276 OpCTLZ = N0;
53277 OpSizeTM1 = N1;
53278 }
53279
53280 if (!OpCTLZ.hasOneUse())
53281 return SDValue();
53282 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53283 if (!C)
53284 return SDValue();
53285
53286 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53287 return SDValue();
53288 SDLoc DL(N);
53289 EVT OpVT = VT;
53290 SDValue Op = OpCTLZ.getOperand(0);
53291 if (VT == MVT::i8) {
53292 // Zero extend to i32 since there is not an i8 bsr.
53293 OpVT = MVT::i32;
53294 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53295 }
53296
53297 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53298 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53299 if (VT == MVT::i8)
53300 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53301
53302 return Op;
53303}
53304
53305static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53306 TargetLowering::DAGCombinerInfo &DCI,
53307 const X86Subtarget &Subtarget) {
53308 SDValue N0 = N->getOperand(0);
53309 SDValue N1 = N->getOperand(1);
53310 EVT VT = N->getValueType(0);
53311
53312 // If this is SSE1 only convert to FXOR to avoid scalarization.
53313 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53314 return DAG.getBitcast(MVT::v4i32,
53315 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
53316 DAG.getBitcast(MVT::v4f32, N0),
53317 DAG.getBitcast(MVT::v4f32, N1)));
53318 }
53319
53320 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53321 return Cmp;
53322
53323 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53324 return R;
53325
53326 if (SDValue R = combineBitOpWithShift(N, DAG))
53327 return R;
53328
53329 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53330 return FPLogic;
53331
53332 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
53333 return R;
53334
53335 if (DCI.isBeforeLegalizeOps())
53336 return SDValue();
53337
53338 if (SDValue SetCC = foldXor1SetCC(N, DAG))
53339 return SetCC;
53340
53341 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53342 return R;
53343
53344 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53345 return RV;
53346
53347 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53348 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53349 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53350 N0.getOperand(0).getValueType().isVector() &&
53351 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53352 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53353 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
53354 N0.getOperand(0).getValueType()));
53355 }
53356
53357 // Handle AVX512 mask widening.
53358 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53359 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53360 VT.getVectorElementType() == MVT::i1 &&
53361 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53362 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53363 return DAG.getNode(
53364 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
53365 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
53366 N0.getOperand(2));
53367 }
53368
53369 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53370 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53371 // TODO: Under what circumstances could this be performed in DAGCombine?
53372 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53373 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53374 SDValue TruncExtSrc = N0.getOperand(0);
53375 auto *N1C = dyn_cast<ConstantSDNode>(N1);
53376 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53377 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53378 SDLoc DL(N);
53379 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53380 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
53381 return DAG.getNode(ISD::XOR, DL, VT, LHS,
53382 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
53383 }
53384 }
53385
53386 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
53387 return R;
53388
53389 return combineFneg(N, DAG, DCI, Subtarget);
53390}
53391
53392static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
53393 TargetLowering::DAGCombinerInfo &DCI,
53394 const X86Subtarget &Subtarget) {
53395 EVT VT = N->getValueType(0);
53396 unsigned NumBits = VT.getSizeInBits();
53397
53398 // TODO - Constant Folding.
53399
53400 // Simplify the inputs.
53401 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53402 APInt DemandedMask(APInt::getAllOnes(NumBits));
53403 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53404 return SDValue(N, 0);
53405
53406 return SDValue();
53407}
53408
53409static bool isNullFPScalarOrVectorConst(SDValue V) {
53410 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
53411}
53412
53413/// If a value is a scalar FP zero or a vector FP zero (potentially including
53414/// undefined elements), return a zero constant that may be used to fold away
53415/// that value. In the case of a vector, the returned constant will not contain
53416/// undefined elements even if the input parameter does. This makes it suitable
53417/// to be used as a replacement operand with operations (eg, bitwise-and) where
53418/// an undef should not propagate.
53419static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
53420 const X86Subtarget &Subtarget) {
53421 if (!isNullFPScalarOrVectorConst(V))
53422 return SDValue();
53423
53424 if (V.getValueType().isVector())
53425 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
53426
53427 return V;
53428}
53429
53430static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
53431 const X86Subtarget &Subtarget) {
53432 SDValue N0 = N->getOperand(0);
53433 SDValue N1 = N->getOperand(1);
53434 EVT VT = N->getValueType(0);
53435 SDLoc DL(N);
53436
53437 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53438 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53439 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53440 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53441 return SDValue();
53442
53443 auto isAllOnesConstantFP = [](SDValue V) {
53444 if (V.getSimpleValueType().isVector())
53445 return ISD::isBuildVectorAllOnes(V.getNode());
53446 auto *C = dyn_cast<ConstantFPSDNode>(V);
53447 return C && C->getConstantFPValue()->isAllOnesValue();
53448 };
53449
53450 // fand (fxor X, -1), Y --> fandn X, Y
53451 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53452 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53453
53454 // fand X, (fxor Y, -1) --> fandn Y, X
53455 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53456 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53457
53458 return SDValue();
53459}
53460
53461/// Do target-specific dag combines on X86ISD::FAND nodes.
53462static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53463 const X86Subtarget &Subtarget) {
53464 // FAND(0.0, x) -> 0.0
53465 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53466 return V;
53467
53468 // FAND(x, 0.0) -> 0.0
53469 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53470 return V;
53471
53472 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53473 return V;
53474
53475 return lowerX86FPLogicOp(N, DAG, Subtarget);
53476}
53477
53478/// Do target-specific dag combines on X86ISD::FANDN nodes.
53479static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53480 const X86Subtarget &Subtarget) {
53481 // FANDN(0.0, x) -> x
53482 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53483 return N->getOperand(1);
53484
53485 // FANDN(x, 0.0) -> 0.0
53486 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53487 return V;
53488
53489 return lowerX86FPLogicOp(N, DAG, Subtarget);
53490}
53491
53492/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53493static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53494 TargetLowering::DAGCombinerInfo &DCI,
53495 const X86Subtarget &Subtarget) {
53496 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53497
53498 // F[X]OR(0.0, x) -> x
53499 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53500 return N->getOperand(1);
53501
53502 // F[X]OR(x, 0.0) -> x
53503 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53504 return N->getOperand(0);
53505
53506 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53507 return NewVal;
53508
53509 return lowerX86FPLogicOp(N, DAG, Subtarget);
53510}
53511
53512/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53513static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53514 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53515
53516 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53517 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53518 !DAG.getTarget().Options.NoSignedZerosFPMath)
53519 return SDValue();
53520
53521 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53522 // into FMINC and FMAXC, which are Commutative operations.
53523 unsigned NewOp = 0;
53524 switch (N->getOpcode()) {
53525 default: llvm_unreachable("unknown opcode");
53526 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53527 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53528 }
53529
53530 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53531 N->getOperand(0), N->getOperand(1));
53532}
53533
53534static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
53535 const X86Subtarget &Subtarget) {
53536 EVT VT = N->getValueType(0);
53537 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
53538 return SDValue();
53539
53540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53541
53542 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53543 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53544 (Subtarget.hasFP16() && VT == MVT::f16) ||
53545 (VT.isVector() && TLI.isTypeLegal(VT))))
53546 return SDValue();
53547
53548 SDValue Op0 = N->getOperand(0);
53549 SDValue Op1 = N->getOperand(1);
53550 SDLoc DL(N);
53551 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53552
53553 // If we don't have to respect NaN inputs, this is a direct translation to x86
53554 // min/max instructions.
53555 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53556 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53557
53558 // If one of the operands is known non-NaN use the native min/max instructions
53559 // with the non-NaN input as second operand.
53560 if (DAG.isKnownNeverNaN(Op1))
53561 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53562 if (DAG.isKnownNeverNaN(Op0))
53563 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53564
53565 // If we have to respect NaN inputs, this takes at least 3 instructions.
53566 // Favor a library call when operating on a scalar and minimizing code size.
53567 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53568 return SDValue();
53569
53570 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53571 VT);
53572
53573 // There are 4 possibilities involving NaN inputs, and these are the required
53574 // outputs:
53575 // Op1
53576 // Num NaN
53577 // ----------------
53578 // Num | Max | Op0 |
53579 // Op0 ----------------
53580 // NaN | Op1 | NaN |
53581 // ----------------
53582 //
53583 // The SSE FP max/min instructions were not designed for this case, but rather
53584 // to implement:
53585 // Min = Op1 < Op0 ? Op1 : Op0
53586 // Max = Op1 > Op0 ? Op1 : Op0
53587 //
53588 // So they always return Op0 if either input is a NaN. However, we can still
53589 // use those instructions for fmaxnum by selecting away a NaN input.
53590
53591 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53592 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53593 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53594
53595 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
53596 // are NaN, the NaN value of Op1 is the result.
53597 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53598}
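// A minimal scalar sketch of the NaN-aware FMAXNUM lowering constructed above,
// assuming IEEE-754 comparison semantics (any compare with NaN is false), so
// MAX(Op1, Op0) == (Op1 > Op0 ? Op1 : Op0) returns Op0 whenever either input
// is NaN. Illustrative only, not part of the analyzed source.
#include <cmath>
static float fmaxnumSketch(float Op0, float Op1) {
  float MaxOrOp0 = (Op1 > Op0) ? Op1 : Op0; // mirrors getNode(FMAX, Op1, Op0)
  bool Op0IsNaN = std::isnan(Op0);          // mirrors getSetCC(Op0, Op0, SETUO)
  return Op0IsNaN ? Op1 : MaxOrOp0;         // mirrors the final getSelect
}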
53599
53600static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53601 TargetLowering::DAGCombinerInfo &DCI) {
53602 EVT VT = N->getValueType(0);
53603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53604
53605 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53606 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53607 return SDValue(N, 0);
53608
53609 // Convert a full vector load into vzload when not all bits are needed.
53610 SDValue In = N->getOperand(0);
53611 MVT InVT = In.getSimpleValueType();
53612 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53613 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53614 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53615 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53616 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53617 MVT MemVT = MVT::getIntegerVT(NumBits);
53618 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53619 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53620 SDLoc dl(N);
53621 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53622 DAG.getBitcast(InVT, VZLoad));
53623 DCI.CombineTo(N, Convert);
53624 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53625 DCI.recursivelyDeleteUnusedNodes(LN);
53626 return SDValue(N, 0);
53627 }
53628 }
53629
53630 return SDValue();
53631}
53632
53633static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53634 TargetLowering::DAGCombinerInfo &DCI) {
53635 bool IsStrict = N->isTargetStrictFPOpcode();
53636 EVT VT = N->getValueType(0);
53637
53638 // Convert a full vector load into vzload when not all bits are needed.
53639 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53640 MVT InVT = In.getSimpleValueType();
53641 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53642 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53643 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53644 LoadSDNode *LN = cast<LoadSDNode>(In);
53645 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53646 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53647 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53648 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53649 SDLoc dl(N);
53650 if (IsStrict) {
53651 SDValue Convert =
53652 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53653 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53654 DCI.CombineTo(N, Convert, Convert.getValue(1));
53655 } else {
53656 SDValue Convert =
53657 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53658 DCI.CombineTo(N, Convert);
53659 }
53660 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53661 DCI.recursivelyDeleteUnusedNodes(LN);
53662 return SDValue(N, 0);
53663 }
53664 }
53665
53666 return SDValue();
53667}
53668
53669/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53670static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53671 TargetLowering::DAGCombinerInfo &DCI,
53672 const X86Subtarget &Subtarget) {
53673 SDValue N0 = N->getOperand(0);
53674 SDValue N1 = N->getOperand(1);
53675 MVT VT = N->getSimpleValueType(0);
53676 int NumElts = VT.getVectorNumElements();
53677 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53678
53679 // ANDNP(undef, x) -> 0
53680 // ANDNP(x, undef) -> 0
53681 if (N0.isUndef() || N1.isUndef())
53682 return DAG.getConstant(0, SDLoc(N), VT);
53683
53684 // ANDNP(0, x) -> x
53685 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53686 return N1;
53687
53688 // ANDNP(x, 0) -> 0
53689 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53690 return DAG.getConstant(0, SDLoc(N), VT);
53691
53692 // Turn ANDNP back to AND if input is inverted.
53693 if (SDValue Not = IsNOT(N0, DAG))
53694 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
53695
53696 // Constant Folding
53697 APInt Undefs0, Undefs1;
53698 SmallVector<APInt> EltBits0, EltBits1;
53699 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
53700 SDLoc DL(N);
53701 APInt ResultUndefs = APInt::getZero(NumElts);
53702
53703 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
53704 SmallVector<APInt> ResultBits;
53705 for (int I = 0; I != NumElts; ++I)
53706 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
53707 return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
53708 }
53709
53710 // Constant fold NOT(N0) to allow us to use AND.
53711 // Ensure this is only performed if we can confirm that the bitcasted source
53712 // has one use, to prevent an infinite loop with canonicalizeBitSelect.
53713 if (N0->hasOneUse()) {
53714 SDValue BC0 = peekThroughOneUseBitcasts(N0);
53715 if (BC0.getOpcode() != ISD::BITCAST) {
53716 for (APInt &Elt : EltBits0)
53717 Elt = ~Elt;
53718 SDValue Not = getConstVector(EltBits0, ResultUndefs, VT, DAG, DL);
53719 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
53720 }
53721 }
53722 }
53723
53724 // Attempt to recursively combine a bitmask ANDNP with shuffles.
53725 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
53726 SDValue Op(N, 0);
53727 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
53728 return Res;
53729
53730 // If either operand is a constant mask, then only the elements that aren't
53731 // zero are actually demanded by the other operand.
53732 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
53733 APInt UndefElts;
53734 SmallVector<APInt> EltBits;
53735 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
53736 APInt DemandedElts = APInt::getAllOnes(NumElts);
53737 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
53738 EltBits)) {
53739 DemandedBits.clearAllBits();
53740 DemandedElts.clearAllBits();
53741 for (int I = 0; I != NumElts; ++I) {
53742 if (UndefElts[I]) {
53743 // We can't assume an undef src element gives an undef dst - the
53744 // other src might be zero.
53745 DemandedBits.setAllBits();
53746 DemandedElts.setBit(I);
53747 } else if ((Invert && !EltBits[I].isAllOnes()) ||
53748 (!Invert && !EltBits[I].isZero())) {
53749 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
53750 DemandedElts.setBit(I);
53751 }
53752 }
53753 }
53754 return std::make_pair(DemandedBits, DemandedElts);
53755 };
53756 APInt Bits0, Elts0;
53757 APInt Bits1, Elts1;
53758 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
53759 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
53760
53761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53762 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
53763 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
53764 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
53765 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
53766 if (N->getOpcode() != ISD::DELETED_NODE)
53767 DCI.AddToWorklist(N);
53768 return SDValue(N, 0);
53769 }
53770 }
53771
53772 return SDValue();
53773}
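// A minimal sketch of the constant-folding step above, assuming the documented
// x86 semantics ANDNP(a, b) == ~a & b per element. Illustrative only, not part
// of the analyzed source.
#include <cstdint>
constexpr uint32_t andnpElt(uint32_t A, uint32_t B) { return ~A & B; }
static_assert(andnpElt(0u, 0x1234u) == 0x1234u, "ANDNP(0, x) -> x");
static_assert(andnpElt(0x1234u, 0u) == 0u, "ANDNP(x, 0) -> 0");
static_assert(andnpElt(0xff00ff00u, 0xffff0000u) == 0x00ff0000u,
              "general case: clear the bits set in the first operand");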
53774
53775static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
53776 TargetLowering::DAGCombinerInfo &DCI) {
53777 SDValue N1 = N->getOperand(1);
53778
53779 // BT ignores high bits in the bit index operand.
53780 unsigned BitWidth = N1.getValueSizeInBits();
53781 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
53782 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
53783 if (N->getOpcode() != ISD::DELETED_NODE)
53784 DCI.AddToWorklist(N);
53785 return SDValue(N, 0);
53786 }
53787
53788 return SDValue();
53789}
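// A minimal sketch of why only the low Log2(BitWidth) bits of the BT index are
// demanded above: for the register form of the bit test the index is reduced
// modulo the operand width, so the high index bits never affect the result.
// Illustrative only, not part of the analyzed source.
#include <cstdint>
constexpr uint32_t btSketch(uint32_t Src, uint32_t Idx) {
  return (Src >> (Idx & 31u)) & 1u; // Idx & 31 == Idx % 32 for a 32-bit test
}
static_assert(btSketch(0b100u, 2u) == 1u, "bit 2 is set");
static_assert(btSketch(0b100u, 2u + 32u) == btSketch(0b100u, 2u),
              "index bits above the low Log2(32) bits are ignored");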
53790
53791static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
53792 TargetLowering::DAGCombinerInfo &DCI) {
53793 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53794 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53795
53796 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53797 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53798 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
53799 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
53800 if (N->getOpcode() != ISD::DELETED_NODE)
53801 DCI.AddToWorklist(N);
53802 return SDValue(N, 0);
53803 }
53804
53805 // Convert a full vector load into vzload when not all bits are needed.
53806 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53807 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53808 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
53809 SDLoc dl(N);
53810 if (IsStrict) {
53811 SDValue Convert = DAG.getNode(
53812 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53813 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53814 DCI.CombineTo(N, Convert, Convert.getValue(1));
53815 } else {
53816 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53817 DAG.getBitcast(MVT::v8i16, VZLoad));
53818 DCI.CombineTo(N, Convert);
53819 }
53820
53821 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53822 DCI.recursivelyDeleteUnusedNodes(LN);
53823 return SDValue(N, 0);
53824 }
53825 }
53826 }
53827
53828 return SDValue();
53829}
53830
53831// Try to combine sext_in_reg of a cmov of constants by extending the constants.
53832static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
53833 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53834
53835 EVT DstVT = N->getValueType(0);
53836
53837 SDValue N0 = N->getOperand(0);
53838 SDValue N1 = N->getOperand(1);
53839 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53840
53841 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
53842 return SDValue();
53843
53844 // Look through single use any_extends / truncs.
53845 SDValue IntermediateBitwidthOp;
53846 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
53847 N0.hasOneUse()) {
53848 IntermediateBitwidthOp = N0;
53849 N0 = N0.getOperand(0);
53850 }
53851
53852 // See if we have a single use cmov.
53853 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
53854 return SDValue();
53855
53856 SDValue CMovOp0 = N0.getOperand(0);
53857 SDValue CMovOp1 = N0.getOperand(1);
53858
53859 // Make sure both operands are constants.
53860 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53861 !isa<ConstantSDNode>(CMovOp1.getNode()))
53862 return SDValue();
53863
53864 SDLoc DL(N);
53865
53866 // If we looked through an any_extend/trunc above, add one to the constants.
53867 if (IntermediateBitwidthOp) {
53868 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
53869 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
53870 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
53871 }
53872
53873 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
53874 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
53875
53876 EVT CMovVT = DstVT;
53877 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
53878 if (DstVT == MVT::i16) {
53879 CMovVT = MVT::i32;
53880 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
53881 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
53882 }
53883
53884 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
53885 N0.getOperand(2), N0.getOperand(3));
53886
53887 if (CMovVT != DstVT)
53888 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
53889
53890 return CMov;
53891}
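// A minimal sketch of the sign_extend_inreg fold applied to the CMOV constants
// above: sign-extending an i8 value held in a wider integer is a pure
// constant rewrite, so doing it on both CMOV arms removes the extension from
// the generated code. Illustrative only, not part of the analyzed source.
#include <cstdint>
constexpr int32_t sextInReg8(uint32_t V) {
  uint32_t Byte = V & 0xffu; // keep only the low i8
  return (Byte & 0x80u) ? static_cast<int32_t>(Byte) - 256
                        : static_cast<int32_t>(Byte);
}
static_assert(sextInReg8(0xf0u) == -16, "i8 0xf0 sign-extends to -16");
static_assert(sextInReg8(0x7fu) == 127, "i8 0x7f stays positive");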
53892
53893static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
53894 const X86Subtarget &Subtarget) {
53895 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53896
53897 if (SDValue V = combineSextInRegCmov(N, DAG))
53898 return V;
53899
53900 EVT VT = N->getValueType(0);
53901 SDValue N0 = N->getOperand(0);
53902 SDValue N1 = N->getOperand(1);
53903 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53904 SDLoc dl(N);
53905
53906 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
53907 // AVX2, since there is no sign-extended shift right operation on a
53908 // vector with 64-bit elements.
53909 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
53910 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
53911 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
53912 N0.getOpcode() == ISD::SIGN_EXTEND)) {
53913 SDValue N00 = N0.getOperand(0);
53914
53915 // EXTLOAD has a better solution on AVX2,
53916 // it may be replaced with X86ISD::VSEXT node.
53917 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
53918 if (!ISD::isNormalLoad(N00.getNode()))
53919 return SDValue();
53920
53921 // Attempt to promote any comparison mask ops before moving the
53922 // SIGN_EXTEND_INREG in the way.
53923 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
53924 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
53925
53926 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
53927 SDValue Tmp =
53928 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
53929 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
53930 }
53931 }
53932 return SDValue();
53933}
53934
53935/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53936/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
53937/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
53938/// opportunities to combine math ops, use an LEA, or use a complex addressing
53939/// mode. This can eliminate extend, add, and shift instructions.
53940static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53941 const X86Subtarget &Subtarget) {
53942 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53943 Ext->getOpcode() != ISD::ZERO_EXTEND)
53944 return SDValue();
53945
53946 // TODO: This should be valid for other integer types.
53947 EVT VT = Ext->getValueType(0);
53948 if (VT != MVT::i64)
53949 return SDValue();
53950
53951 SDValue Add = Ext->getOperand(0);
53952 if (Add.getOpcode() != ISD::ADD)
53953 return SDValue();
53954
53955 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53956 bool NSW = Add->getFlags().hasNoSignedWrap();
53957 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53958
53959 // We need an 'add nsw' feeding into the 'sext', or an 'add nuw' feeding
53960 // into the 'zext'.
53961 if ((Sext && !NSW) || (!Sext && !NUW))
53962 return SDValue();
53963
53964 // Having a constant operand to the 'add' ensures that we are not increasing
53965 // the instruction count because the constant is extended for free below.
53966 // A constant operand can also become the displacement field of an LEA.
53967 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
53968 if (!AddOp1)
53969 return SDValue();
53970
53971 // Don't make the 'add' bigger if there's no hope of combining it with some
53972 // other 'add' or 'shl' instruction.
53973 // TODO: It may be profitable to generate simpler LEA instructions in place
53974 // of single 'add' instructions, but the cost model for selecting an LEA
53975 // currently has a high threshold.
53976 bool HasLEAPotential = false;
53977 for (auto *User : Ext->uses()) {
53978 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53979 HasLEAPotential = true;
53980 break;
53981 }
53982 }
53983 if (!HasLEAPotential)
53984 return SDValue();
53985
53986 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53987 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
53988 SDValue AddOp0 = Add.getOperand(0);
53989 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53990 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
53991
53992 // The wider add is guaranteed to not wrap because both operands are
53993 // sign-extended.
53994 SDNodeFlags Flags;
53995 Flags.setNoSignedWrap(NSW);
53996 Flags.setNoUnsignedWrap(NUW);
53997 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53998}
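// A minimal sketch of the identity exploited above: when the narrow add cannot
// wrap (nsw for sext, nuw for zext), extending before or after the add gives
// the same wide value, which lets the constant later become an LEA
// displacement. Illustrative only, not part of the analyzed source.
#include <cstdint>
static int64_t extThenAdd(int32_t X, int32_t C) {
  return static_cast<int64_t>(X) + static_cast<int64_t>(C); // sext(x) + sext(C)
}
static int64_t addThenExt(int32_t X, int32_t C) {
  // Equivalent to extThenAdd only when X + C does not overflow in 32 bits,
  // which is exactly what the 'nsw' flag guarantees.
  return static_cast<int64_t>(X + C); // sext(add_nsw(x, C))
}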
53999
54000// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
54001// operands and the result of CMOV is not used anywhere else - promote CMOV
54002// itself instead of promoting its result. This could be beneficial, because:
54003// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
54004// (or more) pseudo-CMOVs only when they go one-after-another and
54005// getting rid of result extension code after CMOV will help that.
54006// 2) Promotion of constant CMOV arguments is free, hence the
54007// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
54008 // 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit CMOV is 3 bytes, so
54009 // this promotion is also good in terms of code size.
54010 // (The 64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
54011 // promotion.)
54012static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
54013 SDValue CMovN = Extend->getOperand(0);
54014 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
54015 return SDValue();
54016
54017 EVT TargetVT = Extend->getValueType(0);
54018 unsigned ExtendOpcode = Extend->getOpcode();
54019 SDLoc DL(Extend);
54020
54021 EVT VT = CMovN.getValueType();
54022 SDValue CMovOp0 = CMovN.getOperand(0);
54023 SDValue CMovOp1 = CMovN.getOperand(1);
54024
54025 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54026 !isa<ConstantSDNode>(CMovOp1.getNode()))
54027 return SDValue();
54028
54029 // Only extend to i32 or i64.
54030 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
54031 return SDValue();
54032
54033 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
54034 // are free.
54035 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
54036 return SDValue();
54037
54038 // If this is a zero extend to i64, we should only extend to i32 and use a free
54039 // zero extend to finish.
54040 EVT ExtendVT = TargetVT;
54041 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
54042 ExtendVT = MVT::i32;
54043
54044 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
54045 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
54046
54047 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
54048 CMovN.getOperand(2), CMovN.getOperand(3));
54049
54050 // Finish extending if needed.
54051 if (ExtendVT != TargetVT)
54052 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
54053
54054 return Res;
54055}
54056
54057// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
54058// result type.
54059static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
54060 const X86Subtarget &Subtarget) {
54061 SDValue N0 = N->getOperand(0);
54062 EVT VT = N->getValueType(0);
54063 SDLoc dl(N);
54064
54065 // Only do this combine with AVX512 for vector extends.
54066 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
54067 return SDValue();
54068
54069 // Only combine legal element types.
54070 EVT SVT = VT.getVectorElementType();
54071 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
54072 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
54073 return SDValue();
54074
54075 // We don't have a CMPP instruction for vXf16.
54076 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
54077 return SDValue();
54078 // We can only do this if the vector size is 256 bits or less.
54079 unsigned Size = VT.getSizeInBits();
54080 if (Size > 256 && Subtarget.useAVX512Regs())
54081 return SDValue();
54082
54083 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT, since
54084 // those are the only integer compares we have.
54085 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
54086 if (ISD::isUnsignedIntSetCC(CC))
54087 return SDValue();
54088
54089 // Only do this combine if the extension will be fully consumed by the setcc.
54090 EVT N00VT = N0.getOperand(0).getValueType();
54091 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
54092 if (Size != MatchingVecType.getSizeInBits())
54093 return SDValue();
54094
54095 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
54096
54097 if (N->getOpcode() == ISD::ZERO_EXTEND)
54098 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
54099
54100 return Res;
54101}
54102
54103static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
54104 TargetLowering::DAGCombinerInfo &DCI,
54105 const X86Subtarget &Subtarget) {
54106 SDValue N0 = N->getOperand(0);
54107 EVT VT = N->getValueType(0);
54108 SDLoc DL(N);
54109
54110 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54111 if (!DCI.isBeforeLegalizeOps() &&
54112 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54113 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
54114 N0->getOperand(1));
54115 bool ReplaceOtherUses = !N0.hasOneUse();
54116 DCI.CombineTo(N, Setcc);
54117 // Replace other uses with a truncate of the widened setcc_carry.
54118 if (ReplaceOtherUses) {
54119 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54120 N0.getValueType(), Setcc);
54121 DCI.CombineTo(N0.getNode(), Trunc);
54122 }
54123
54124 return SDValue(N, 0);
54125 }
54126
54127 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54128 return NewCMov;
54129
54130 if (!DCI.isBeforeLegalizeOps())
54131 return SDValue();
54132
54133 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54134 return V;
54135
54136 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
54137 DAG, DCI, Subtarget))
54138 return V;
54139
54140 if (VT.isVector()) {
54141 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54142 return R;
54143
54144 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
54145 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
54146 }
54147
54148 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54149 return NewAdd;
54150
54151 return SDValue();
54152}
54153
54154static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54155 TargetLowering::DAGCombinerInfo &DCI,
54156 const X86Subtarget &Subtarget) {
54157 SDLoc dl(N);
54158 EVT VT = N->getValueType(0);
54159 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54160
54161 // Let legalize expand this if it isn't a legal type yet.
54162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54163 if (!TLI.isTypeLegal(VT))
54164 return SDValue();
54165
54166 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54167 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54168 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54169
54170 // If the operation allows fast-math and the target does not support FMA,
54171 // split this into mul+add to avoid libcall(s).
54172 SDNodeFlags Flags = N->getFlags();
54173 if (!IsStrict && Flags.hasAllowReassociation() &&
54174 TLI.isOperationExpand(ISD::FMA, VT)) {
54175 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54176 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54177 }
54178
54179 EVT ScalarVT = VT.getScalarType();
54180 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54181 !Subtarget.hasAnyFMA()) &&
54182 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54183 return SDValue();
54184
54185 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54186 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54187 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54188 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54189 CodeSize)) {
54190 V = NegV;
54191 return true;
54192 }
54193 // Look through extract_vector_elts. If it comes from an FNEG, create a
54194 // new extract from the FNEG input.
54195 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54196 isNullConstant(V.getOperand(1))) {
54197 SDValue Vec = V.getOperand(0);
54198 if (SDValue NegV = TLI.getCheaperNegatedExpression(
54199 Vec, DAG, LegalOperations, CodeSize)) {
54200 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54201 NegV, V.getOperand(1));
54202 return true;
54203 }
54204 }
54205
54206 return false;
54207 };
54208
54209 // Do not convert the passthru input of scalar intrinsics.
54210 // FIXME: We could allow negations of the lower element only.
54211 bool NegA = invertIfNegative(A);
54212 bool NegB = invertIfNegative(B);
54213 bool NegC = invertIfNegative(C);
54214
54215 if (!NegA && !NegB && !NegC)
54216 return SDValue();
54217
54218 unsigned NewOpcode =
54219 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54220
54221 // Propagate fast-math-flags to new FMA node.
54222 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54223 if (IsStrict) {
54224 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54225 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54226 {N->getOperand(0), A, B, C});
54227 } else {
54228 if (N->getNumOperands() == 4)
54229 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54230 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54231 }
54232}
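// A minimal sketch of the reassociation split performed above when the target
// has no FMA unit: fma(a, b, c) is replaced by a * b + c, trading the single
// rounding of a fused multiply-add for two ordinary roundings, which is only
// acceptable under the hasAllowReassociation() fast-math flag checked above.
// Illustrative only, not part of the analyzed source.
static double fmaSplitSketch(double A, double B, double C) {
  return A * B + C; // two roundings instead of one, but no fma libcall
}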
54233
54234// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54235// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54236static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54237 TargetLowering::DAGCombinerInfo &DCI) {
54238 SDLoc dl(N);
54239 EVT VT = N->getValueType(0);
54240 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54241 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54242 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54243
54244 SDValue N2 = N->getOperand(2);
54245
54246 SDValue NegN2 =
54247 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54248 if (!NegN2)
54249 return SDValue();
54250 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54251
54252 if (N->getNumOperands() == 4)
54253 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54254 NegN2, N->getOperand(3));
54255 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54256 NegN2);
54257}
54258
54259static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54260 TargetLowering::DAGCombinerInfo &DCI,
54261 const X86Subtarget &Subtarget) {
54262 SDLoc dl(N);
54263 SDValue N0 = N->getOperand(0);
54264 EVT VT = N->getValueType(0);
54265
54266 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54267 // FIXME: Is this needed? We don't seem to have any tests for it.
54268 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54269 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54270 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54271 N0->getOperand(1));
54272 bool ReplaceOtherUses = !N0.hasOneUse();
54273 DCI.CombineTo(N, Setcc);
54274 // Replace other uses with a truncate of the widened setcc_carry.
54275 if (ReplaceOtherUses) {
54276 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54277 N0.getValueType(), Setcc);
54278 DCI.CombineTo(N0.getNode(), Trunc);
54279 }
54280
54281 return SDValue(N, 0);
54282 }
54283
54284 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54285 return NewCMov;
54286
54287 if (DCI.isBeforeLegalizeOps())
54288 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54289 return V;
54290
54291 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54292 DAG, DCI, Subtarget))
54293 return V;
54294
54295 if (VT.isVector())
54296 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54297 return R;
54298
54299 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54300 return NewAdd;
54301
54302 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54303 return R;
54304
54305 // TODO: Combine with any target/faux shuffle.
54306 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54307 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
54308 SDValue N00 = N0.getOperand(0);
54309 SDValue N01 = N0.getOperand(1);
54310 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54311 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54312 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54313 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54314 return concatSubVectors(N00, N01, DAG, dl);
54315 }
54316 }
54317
54318 return SDValue();
54319}
54320
54321/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
54322/// pre-promote its result type since vXi1 vectors don't get promoted
54323/// during type legalization.
54324static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
54325 SDValue RHS, ISD::CondCode CC,
54326 const SDLoc &DL, SelectionDAG &DAG,
54327 const X86Subtarget &Subtarget) {
54328 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
54329 VT.getVectorElementType() == MVT::i1 &&
54330 (OpVT.getVectorElementType() == MVT::i8 ||
54331 OpVT.getVectorElementType() == MVT::i16)) {
54332 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
54333 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
54334 }
54335 return SDValue();
54336}
54337
54338static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
54339 TargetLowering::DAGCombinerInfo &DCI,
54340 const X86Subtarget &Subtarget) {
54341 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54342 const SDValue LHS = N->getOperand(0);
54343 const SDValue RHS = N->getOperand(1);
54344 EVT VT = N->getValueType(0);
54345 EVT OpVT = LHS.getValueType();
54346 SDLoc DL(N);
54347
54348 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
54349 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
54350 Subtarget))
54351 return V;
54352
54353 if (VT == MVT::i1) {
54354 X86::CondCode X86CC;
54355 if (SDValue V =
54356 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54357 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54358 }
54359
54360 if (OpVT.isScalarInteger()) {
54361 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54362 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54363 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54364 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54365 if (N0.getOperand(0) == N1)
54366 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54367 N0.getOperand(1));
54368 if (N0.getOperand(1) == N1)
54369 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54370 N0.getOperand(0));
54371 }
54372 return SDValue();
54373 };
54374 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54375 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54376 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54377 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54378
54379 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54380 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
54381 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54382 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54383 if (N0.getOperand(0) == N1)
54384 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54385 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54386 if (N0.getOperand(1) == N1)
54387 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54388 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54389 }
54390 return SDValue();
54391 };
54392 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54393 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54394 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54395 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54396
54397 // cmpeq(trunc(x),0) --> cmpeq(x,0)
54398 // cmpne(trunc(x),0) --> cmpne(x,0)
54399 // iff x upper bits are zero.
54400 // TODO: Add support for RHS to be truncate as well?
54401 if (LHS.getOpcode() == ISD::TRUNCATE &&
54402 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54403 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
54404 EVT SrcVT = LHS.getOperand(0).getValueType();
54405 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54406 OpVT.getScalarSizeInBits());
54407 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54408 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54409 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54410 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54411 DAG.getConstant(0, DL, SrcVT), CC);
54412 }
54413
54414 // With C as a power of 2 and C != 0 and C != INT_MIN:
54415 // icmp eq Abs(X) C ->
54416 // (icmp eq A, C) | (icmp eq A, -C)
54417 // icmp ne Abs(X) C ->
54418 // (icmp ne A, C) & (icmp ne A, -C)
54419 // Both of these patterns can be better optimized in
54420 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54421 // integers which is checked above.
54422 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54423 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54424 const APInt &CInt = C->getAPIntValue();
54425 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54426 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54427 SDValue BaseOp = LHS.getOperand(0);
54428 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54429 SDValue SETCC1 = DAG.getSetCC(
54430 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54431 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54432 SETCC0, SETCC1);
54433 }
54434 }
54435 }
54436 }
54437 }
54438
54439 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54440 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54441 // Using temporaries to avoid messing up operand ordering for later
54442 // transformations if this doesn't work.
54443 SDValue Op0 = LHS;
54444 SDValue Op1 = RHS;
54445 ISD::CondCode TmpCC = CC;
54446 // Put build_vector on the right.
54447 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54448 std::swap(Op0, Op1);
54449 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54450 }
54451
54452 bool IsSEXT0 =
54453 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54454 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54455 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54456
54457 if (IsSEXT0 && IsVZero1) {
54458 assert(VT == Op0.getOperand(0).getValueType() &&
54459        "Unexpected operand type");
54460 if (TmpCC == ISD::SETGT)
54461 return DAG.getConstant(0, DL, VT);
54462 if (TmpCC == ISD::SETLE)
54463 return DAG.getConstant(1, DL, VT);
54464 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54465 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54466
54467 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54468        "Unexpected condition code!");
54469 return Op0.getOperand(0);
54470 }
54471 }
54472
54473 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets the
54474 // only ordered integer vector comparison is the signed `PCMPGT`, and on AVX512
54475 // it's often still better to use `PCMPGT` if the result is meant to stay in a
54476 // vector (if it's going to a mask, there are signed AVX512 comparisons).
54477 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54478 bool CanMakeSigned = false;
54479 if (ISD::isUnsignedIntSetCC(CC)) {
54480 KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS),
54481 DAG.computeKnownBits(RHS));
54482 // If we know LHS/RHS share the same sign bit at each element we can
54483 // make this signed.
54484 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54485 // across all lanes. So a pattern where the sign varies from lane to
54486 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54487 // missed. We could get around this by demanding each lane
54488 // independently, but this isn't the most important optimization and
54489 // that may eat into compile time.
54490 CanMakeSigned =
54491 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54492 }
54493 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54494 SDValue LHSOut = LHS;
54495 SDValue RHSOut = RHS;
54496 ISD::CondCode NewCC = CC;
54497 switch (CC) {
54498 case ISD::SETGE:
54499 case ISD::SETUGE:
54500 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54501 /*NSW*/ true))
54502 LHSOut = NewLHS;
54503 else if (SDValue NewRHS = incDecVectorConstant(
54504 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54505 RHSOut = NewRHS;
54506 else
54507 break;
54508
54509 [[fallthrough]];
54510 case ISD::SETUGT:
54511 NewCC = ISD::SETGT;
54512 break;
54513
54514 case ISD::SETLE:
54515 case ISD::SETULE:
54516 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54517 /*NSW*/ true))
54518 LHSOut = NewLHS;
54519 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54520 /*NSW*/ true))
54521 RHSOut = NewRHS;
54522 else
54523 break;
54524
54525 [[fallthrough]];
54526 case ISD::SETULT:
54527 // Will be swapped to SETGT in LowerVSETCC*.
54528 NewCC = ISD::SETLT;
54529 break;
54530 default:
54531 break;
54532 }
54533 if (NewCC != CC) {
54534 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54535 NewCC, DL, DAG, Subtarget))
54536 return R;
54537 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54538 }
54539 }
54540 }
54541
54542 if (SDValue R =
54543 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54544 return R;
54545
54546 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54547 // to avoid scalarization via legalization because v4i32 is not a legal type.
54548 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54549 LHS.getValueType() == MVT::v4f32)
54550 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54551
54552 // X pred 0.0 --> X pred -X
54553 // If the negation of X already exists, use it in the comparison. This removes
54554 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54555 // instructions in patterns with a 'select' node.
54556 if (isNullFPScalarOrVectorConst(RHS)) {
54557 SDVTList FNegVT = DAG.getVTList(OpVT);
54558 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54559 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54560 }
54561
54562 return SDValue();
54563}
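// A minimal sketch of the scalar-integer equality rewrites used above, assuming
// ordinary two's-complement bit arithmetic:
//   (X | Y) == X  <=>  (~X & Y) == 0     (MatchOrCmpEq)
//   (X & Y) == Y  <=>  (Y & ~X) == 0     (MatchAndCmpEq)
// Illustrative only, not part of the analyzed source.
#include <cstdint>
constexpr bool orCmpEq(uint32_t X, uint32_t Y) { return (X | Y) == X; }
constexpr bool orCmpEqRewritten(uint32_t X, uint32_t Y) { return (~X & Y) == 0; }
static_assert(orCmpEq(0b1010u, 0b0010u) == orCmpEqRewritten(0b1010u, 0b0010u),
              "equal case");
static_assert(orCmpEq(0b1010u, 0b0101u) == orCmpEqRewritten(0b1010u, 0b0101u),
              "not-equal case");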
54564
54565static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54566 TargetLowering::DAGCombinerInfo &DCI,
54567 const X86Subtarget &Subtarget) {
54568 SDValue Src = N->getOperand(0);
54569 MVT SrcVT = Src.getSimpleValueType();
54570 MVT VT = N->getSimpleValueType(0);
54571 unsigned NumBits = VT.getScalarSizeInBits();
54572 unsigned NumElts = SrcVT.getVectorNumElements();
54573 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54574 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54575
54576 // Perform constant folding.
54577 APInt UndefElts;
54578 SmallVector<APInt, 32> EltBits;
54579 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
54580 APInt Imm(32, 0);
54581 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54582 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54583 Imm.setBit(Idx);
54584
54585 return DAG.getConstant(Imm, SDLoc(N), VT);
54586 }
54587
54588 // Look through int->fp bitcasts that don't change the element width.
54589 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54590 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54591 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54592 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54593
54594 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54595 // with scalar comparisons.
54596 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54597 SDLoc DL(N);
54598 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54599 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54600 return DAG.getNode(ISD::XOR, DL, VT,
54601 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54602 DAG.getConstant(NotMask, DL, VT));
54603 }
54604
54605 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54606 // results with scalar comparisons.
54607 if (Src.getOpcode() == X86ISD::PCMPGT &&
54608 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54609 SDLoc DL(N);
54610 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54611 return DAG.getNode(ISD::XOR, DL, VT,
54612 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54613 DAG.getConstant(NotMask, DL, VT));
54614 }
54615
54616 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54617 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54618 // iff pow2splat(c1).
54619 // Use KnownBits to determine if only a single bit is non-zero
54620 // in each element (pow2 or zero), and shift that bit to the msb.
54621 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54622 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54623 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54624 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54625 if (KnownLHS.countMaxPopulation() == 1 &&
54626 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54627 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54628 SDLoc DL(N);
54629 MVT ShiftVT = SrcVT;
54630 SDValue ShiftLHS = Src.getOperand(0);
54631 SDValue ShiftRHS = Src.getOperand(1);
54632 if (ShiftVT.getScalarType() == MVT::i8) {
54633 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54634 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54635 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54636 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54637 }
54638 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54639 ShiftLHS, ShiftAmt, DAG);
54640 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54641 ShiftRHS, ShiftAmt, DAG);
54642 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54643 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54644 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54645 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54646 }
54647 }
54648
54649 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54650 if (N->isOnlyUserOf(Src.getNode())) {
54651 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
54652 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54653 APInt UndefElts;
54654 SmallVector<APInt, 32> EltBits;
54655 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54656 UndefElts, EltBits)) {
54657 APInt Mask = APInt::getZero(NumBits);
54658 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54659 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54660 Mask.setBit(Idx);
54661 }
54662 SDLoc DL(N);
54663 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54664 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54665 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54666 DAG.getConstant(Mask, DL, VT));
54667 }
54668 }
54669 }
54670
54671 // Simplify the inputs.
54672 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54673 APInt DemandedMask(APInt::getAllOnes(NumBits));
54674 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54675 return SDValue(N, 0);
54676
54677 return SDValue();
54678}
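// A minimal scalar model of MOVMSK (collect one sign bit per element) and of
// the movmsk(not(x)) -> xor(movmsk(x), low-ones) fold above: inverting every
// element flips exactly the collected sign bits. Illustrative only, not part
// of the analyzed source.
#include <cstdint>
static uint32_t movmskSketch(const int32_t *Elts, unsigned NumElts) {
  uint32_t Mask = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    if (Elts[I] < 0)        // element sign bit set
      Mask |= 1u << I;
  return Mask;              // movmsk(~x) == movmsk(x) ^ ((1u << NumElts) - 1)
}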
54679
54680static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54681 TargetLowering::DAGCombinerInfo &DCI,
54682 const X86Subtarget &Subtarget) {
54683 MVT VT = N->getSimpleValueType(0);
54684 unsigned NumBits = VT.getScalarSizeInBits();
54685
54686 // Simplify the inputs.
54687 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54688 APInt DemandedMask(APInt::getAllOnes(NumBits));
54689 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54690 return SDValue(N, 0);
54691
54692 return SDValue();
54693}
54694
54695static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54696 TargetLowering::DAGCombinerInfo &DCI,
54697 const X86Subtarget &Subtarget) {
54698 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
54699 SDValue BasePtr = MemOp->getBasePtr();
54700 SDValue Index = MemOp->getIndex();
54701 SDValue Scale = MemOp->getScale();
54702 SDValue Mask = MemOp->getMask();
54703
54704 // Attempt to fold an index scale into the scale value directly.
54705 // For smaller indices, implicit sext is performed BEFORE scale, preventing
54706 // this fold under most circumstances.
54707 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
54708 if ((Index.getOpcode() == X86ISD::VSHLI ||
54709 (Index.getOpcode() == ISD::ADD &&
54710 Index.getOperand(0) == Index.getOperand(1))) &&
54711 isa<ConstantSDNode>(Scale) &&
54712 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
54713 unsigned ShiftAmt =
54714 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
54715 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54716 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
54717 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
54718 SDValue NewIndex = Index.getOperand(0);
54719 SDValue NewScale =
54720 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
54721 if (N->getOpcode() == X86ISD::MGATHER)
54722 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
54723 MemOp->getOperand(1), Mask,
54724 MemOp->getBasePtr(), NewIndex, NewScale,
54725 MemOp->getChain(), Subtarget);
54726 if (N->getOpcode() == X86ISD::MSCATTER)
54727 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
54728 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
54729 NewIndex, NewScale, MemOp->getChain(), Subtarget);
54730 }
54731 }
54732
54733 // With vector masks we only demand the upper bit of the mask.
54734 if (Mask.getScalarValueSizeInBits() != 1) {
54735 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54736 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54737 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54738 if (N->getOpcode() != ISD::DELETED_NODE)
54739 DCI.AddToWorklist(N);
54740 return SDValue(N, 0);
54741 }
54742 }
54743
54744 return SDValue();
54745}
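// A minimal sketch of the index-scale fold above: a left shift of the index
// commutes with the gather/scatter scale, (Index << Shift) * Scale ==
// Index * (Scale << Shift), so the shift can be absorbed as long as the new
// scale is still one of 1, 2, 4 or 8. Illustrative only, not part of the
// analyzed source.
#include <cstdint>
static_assert((uint64_t(7) << 1) * 4 == uint64_t(7) * (4 << 1),
              "a shift-by-1 (or Index + Index) folds into scale 4 -> 8");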
54746
54747static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
54748 SDValue Index, SDValue Base, SDValue Scale,
54749 SelectionDAG &DAG) {
54750 SDLoc DL(GorS);
54751
54752 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
54753 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54754 Gather->getMask(), Base, Index, Scale } ;
54755 return DAG.getMaskedGather(Gather->getVTList(),
54756 Gather->getMemoryVT(), DL, Ops,
54757 Gather->getMemOperand(),
54758 Gather->getIndexType(),
54759 Gather->getExtensionType());
54760 }
54761 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
54762 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54763 Scatter->getMask(), Base, Index, Scale };
54764 return DAG.getMaskedScatter(Scatter->getVTList(),
54765 Scatter->getMemoryVT(), DL,
54766 Ops, Scatter->getMemOperand(),
54767 Scatter->getIndexType(),
54768 Scatter->isTruncatingStore());
54769}
54770
54771static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
54772 TargetLowering::DAGCombinerInfo &DCI) {
54773 SDLoc DL(N);
54774 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
54775 SDValue Index = GorS->getIndex();
54776 SDValue Base = GorS->getBasePtr();
54777 SDValue Scale = GorS->getScale();
54778
54779 if (DCI.isBeforeLegalize()) {
54780 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54781
54782 // Shrink constant indices if they are larger than 32-bits.
54783 // Only do this before legalize types since v2i64 could become v2i32.
54784 // FIXME: We could check that the type is legal if we're after legalize
54785 // types, but then we would need to construct test cases where that happens.
54786 // FIXME: We could support more than just constant vectors, but we need to be
54787 // careful with costing. A truncate that can be optimized out would be fine.
54788 // Otherwise we might only want to create a truncate if it avoids a split.
54789 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
54790 if (BV->isConstant() && IndexWidth > 32 &&
54791 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54792 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54793 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54794 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54795 }
54796 }
54797
54798 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
54799 // there are sufficient sign bits. Only do this before legalize types to
54800 // avoid creating illegal types in truncate.
54801 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
54802 Index.getOpcode() == ISD::ZERO_EXTEND) &&
54803 IndexWidth > 32 &&
54804 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
54805 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54806 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54807 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54808 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54809 }
54810 }
54811
54812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54813 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
54814 // Try to move splat constant adders from the index operand to the base
54815 // pointer operand, taking care to multiply by the scale. We can only do
54816 // this when the index element type is the same as the pointer type;
54817 // otherwise we would need to be sure the math doesn't wrap before the scale.
54818 if (Index.getOpcode() == ISD::ADD &&
54819 Index.getValueType().getVectorElementType() == PtrVT &&
54820 isa<ConstantSDNode>(Scale)) {
54821 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54822 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
54823 BitVector UndefElts;
54824 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54825 // FIXME: Allow non-constant?
54826 if (UndefElts.none()) {
54827 // Apply the scale.
54828 APInt Adder = C->getAPIntValue() * ScaleAmt;
54829 // Add it to the existing base.
54830 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
54831 DAG.getConstant(Adder, DL, PtrVT));
54832 Index = Index.getOperand(0);
54833 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54834 }
54835 }
54836
54837 // It's also possible that the base is just a constant. In that case, just
54838 // replace it with 0 and move the displacement into the index.
54839 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54840 isOneConstant(Scale)) {
54841 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
54842 // Combine the constant build_vector and the constant base.
54843 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54844 Index.getOperand(1), Splat);
54845 // Add to the LHS of the original Index add.
54846 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54847 Index.getOperand(0), Splat);
54848 Base = DAG.getConstant(0, DL, Base.getValueType());
54849 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54850 }
54851 }
54852 }
54853
54854 if (DCI.isBeforeLegalizeOps()) {
54855 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54856
54857 // Make sure the index is either i32 or i64
54858 if (IndexWidth != 32 && IndexWidth != 64) {
54859 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54860 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54861 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54862 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54863 }
54864 }
54865
54866 // With vector masks we only demand the upper bit of the mask.
54867 SDValue Mask = GorS->getMask();
54868 if (Mask.getScalarValueSizeInBits() != 1) {
54869 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54870 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54871 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54872 if (N->getOpcode() != ISD::DELETED_NODE)
54873 DCI.AddToWorklist(N);
54874 return SDValue(N, 0);
54875 }
54876 }
54877
54878 return SDValue();
54879}
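// A minimal sketch of the splat-adder rewrite above, which is plain address
// arithmetic once the index element type matches the pointer width (so nothing
// can wrap before the scale is applied):
//   Base + (Index + C) * Scale == (Base + C * Scale) + Index * Scale
// Illustrative only, not part of the analyzed source.
#include <cstdint>
static_assert(uint64_t(100) + (uint64_t(5) + 3) * 8 ==
                  (uint64_t(100) + 3 * 8) + uint64_t(5) * 8,
              "the splat constant moves into the base pointer, scaled");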
54880
54881// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54882static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54883 const X86Subtarget &Subtarget) {
54884 SDLoc DL(N);
54885 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54886 SDValue EFLAGS = N->getOperand(1);
54887
54888 // Try to simplify the EFLAGS and condition code operands.
54889 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54890 return getSETCC(CC, Flags, DL, DAG);
54891
54892 return SDValue();
54893}
54894
54895/// Optimize branch condition evaluation.
54896static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54897 const X86Subtarget &Subtarget) {
54898 SDLoc DL(N);
54899 SDValue EFLAGS = N->getOperand(3);
54900 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54901
54902 // Try to simplify the EFLAGS and condition code operands.
54903 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54904 // RAUW them under us.
54905 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54906 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54907 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54908 N->getOperand(1), Cond, Flags);
54909 }
54910
54911 return SDValue();
54912}
54913
54914// TODO: Could we move this to DAGCombine?
54915static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54916 SelectionDAG &DAG) {
54917 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54918 // to optimize away the operation when it is applied to a constant.
54919 //
54920 // The general transformation is:
54921 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54922 // AND(VECTOR_CMP(x,y), constant2)
54923 // constant2 = UNARYOP(constant)
54924
54925 // Early exit if this isn't a vector operation, the operand of the
54926 // unary operation isn't a bitwise AND, or if the sizes of the operations
54927 // aren't the same.
54928 EVT VT = N->getValueType(0);
54929 bool IsStrict = N->isStrictFPOpcode();
54930 unsigned NumEltBits = VT.getScalarSizeInBits();
54931 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54932 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54933 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54934 VT.getSizeInBits() != Op0.getValueSizeInBits())
54935 return SDValue();
54936
54937 // Now check that the other operand of the AND is a constant. We could
54938 // make the transformation for non-constant splats as well, but it's unclear
54939 // that would be a benefit as it would not eliminate any operations, just
54940 // perform one more step in scalar code before moving to the vector unit.
54941 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54942 // Bail out if the vector isn't a constant.
54943 if (!BV->isConstant())
54944 return SDValue();
54945
54946 // Everything checks out. Build up the new and improved node.
54947 SDLoc DL(N);
54948 EVT IntVT = BV->getValueType(0);
54949 // Create a new constant of the appropriate type for the transformed
54950 // DAG.
54951 SDValue SourceConst;
54952 if (IsStrict)
54953 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54954 {N->getOperand(0), SDValue(BV, 0)});
54955 else
54956 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54957 // The AND node needs bitcasts to/from an integer vector type around it.
54958 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54959 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54960 MaskConst);
54961 SDValue Res = DAG.getBitcast(VT, NewAnd);
54962 if (IsStrict)
54963 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54964 return Res;
54965 }
54966
54967 return SDValue();
54968}
54969
54970/// If we are converting a value to floating-point, try to replace scalar
54971/// truncate of an extracted vector element with a bitcast. This tries to keep
54972/// the sequence on XMM registers rather than moving between vector and GPRs.
54973static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
54974 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54975 // to allow being called by any similar cast opcode.
54976 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54977 SDValue Trunc = N->getOperand(0);
54978 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54979 return SDValue();
54980
54981 SDValue ExtElt = Trunc.getOperand(0);
54982 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54983 !isNullConstant(ExtElt.getOperand(1)))
54984 return SDValue();
54985
54986 EVT TruncVT = Trunc.getValueType();
54987 EVT SrcVT = ExtElt.getValueType();
54988 unsigned DestWidth = TruncVT.getSizeInBits();
54989 unsigned SrcWidth = SrcVT.getSizeInBits();
54990 if (SrcWidth % DestWidth != 0)
54991 return SDValue();
54992
54993 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54994 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54995 unsigned VecWidth = SrcVecVT.getSizeInBits();
54996 unsigned NumElts = VecWidth / DestWidth;
54997 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54998 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54999 SDLoc DL(N);
55000 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
55001 BitcastVec, ExtElt.getOperand(1));
55002 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
55003}
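// Editorial illustration (not part of the original source): with a v2i64
// source vector X and a truncate to i32, the fold above rewrites
//   sitofp (trunc i32 (extractelement X, 0))
//     --> sitofp (extractelement (bitcast X to v4i32), 0)
// On little-endian x86 the low 32 bits of X's element 0 are exactly element 0
// of the v4i32 bitcast, so the value is unchanged while the data can stay in
// an XMM register instead of round-tripping through a GPR.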
55004
55005static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
55006 const X86Subtarget &Subtarget) {
55007 bool IsStrict = N->isStrictFPOpcode();
55008 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55009 EVT VT = N->getValueType(0);
55010 EVT InVT = Op0.getValueType();
55011
55012 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
55013 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
55014 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
55015 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55016 unsigned ScalarSize = InVT.getScalarSizeInBits();
55017 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55018 return SDValue();
55019 SDLoc dl(N);
55020 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55021 ScalarSize < 16 ? MVT::i16
55022 : ScalarSize < 32 ? MVT::i32
55023 : MVT::i64,
55024 InVT.getVectorNumElements());
55025 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55026 if (IsStrict)
55027 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
55028 {N->getOperand(0), P});
55029 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
55030 }
55031
55032 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
55033 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
55034 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
55035 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55036 VT.getScalarType() != MVT::f16) {
55037 SDLoc dl(N);
55038 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55039 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55040
55041 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
55042 if (IsStrict)
55043 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55044 {N->getOperand(0), P});
55045 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55046 }
55047
55048 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
55049 // optimize it to a SINT_TO_FP when the sign bit is known zero, so perform
55050 // the optimization here.
55051 if (DAG.SignBitIsZero(Op0)) {
55052 if (IsStrict)
55053 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
55054 {N->getOperand(0), Op0});
55055 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
55056 }
55057
55058 return SDValue();
55059}
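// Editorial note (not part of the original source): the zext-then-signed
// conversion above is exact because zero-extending an i8/i16 lane into an i32
// lane leaves the i32 sign bit clear, so SINT_TO_FP and UINT_TO_FP agree on
// the extended value. For example, the u8 value 200 zero-extends to the i32
// value 200, and both conversions produce 200.0.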
55060
55061static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
55062 TargetLowering::DAGCombinerInfo &DCI,
55063 const X86Subtarget &Subtarget) {
55064 // First try to optimize away the conversion entirely when it's
55065 // conditionally from a constant. Vectors only.
55066 bool IsStrict = N->isStrictFPOpcode();
55067 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
55068 return Res;
55069
55070 // Now move on to more general possibilities.
55071 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55072 EVT VT = N->getValueType(0);
55073 EVT InVT = Op0.getValueType();
55074
55075 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
55076 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
55077 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55078 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55079 unsigned ScalarSize = InVT.getScalarSizeInBits();
55080 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55081 return SDValue();
55082 SDLoc dl(N);
55083 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55084 ScalarSize < 16 ? MVT::i16
55085 : ScalarSize < 32 ? MVT::i32
55086 : MVT::i64,
55087 InVT.getVectorNumElements());
55088 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55089 if (IsStrict)
55090 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55091 {N->getOperand(0), P});
55092 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55093 }
55094
55095 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55096 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55097 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55098 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55099 VT.getScalarType() != MVT::f16) {
55100 SDLoc dl(N);
55101 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55102 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55103 if (IsStrict)
55104 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55105 {N->getOperand(0), P});
55106 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55107 }
55108
55109 // Without AVX512DQ we only support i64 to float scalar conversion. For both
55110 // vectors and scalars, see if we know that the upper bits are all the sign
55111 // bit, in which case we can truncate the input to i32 and convert from that.
55112 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55113 unsigned BitWidth = InVT.getScalarSizeInBits();
55114 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55115 if (NumSignBits >= (BitWidth - 31)) {
55116 EVT TruncVT = MVT::i32;
55117 if (InVT.isVector())
55118 TruncVT = InVT.changeVectorElementType(TruncVT);
55119 SDLoc dl(N);
55120 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55121 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55122 if (IsStrict)
55123 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55124 {N->getOperand(0), Trunc});
55125 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55126 }
55127 // If we're after legalize and the type is v2i32 we need to shuffle and
55128 // use CVTSI2P.
55129 assert(InVT == MVT::v2i64 && "Unexpected VT!");
55130 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55131 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55132 { 0, 2, -1, -1 });
55133 if (IsStrict)
55134 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55135 {N->getOperand(0), Shuf});
55136 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55137 }
55138 }
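// Editorial note (not part of the original source): the truncation above is
// exact because NumSignBits >= BitWidth - 31 means the wide value is already
// representable as a signed i32. For example, the i64 value -7 has 61
// identical leading sign bits, truncates to the i32 value -7, and converts to
// the same -7.0.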
55139
55140 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55141 // a 32-bit target where SSE doesn't support i64->FP operations.
55142 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55143 Op0.getOpcode() == ISD::LOAD) {
55144 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55145
55146 // This transformation is not supported if the result type is f16 or f128.
55147 if (VT == MVT::f16 || VT == MVT::f128)
55148 return SDValue();
55149
55150 // If we have AVX512DQ we can use packed conversion instructions unless
55151 // the VT is f80.
55152 if (Subtarget.hasDQI() && VT != MVT::f80)
55153 return SDValue();
55154
55155 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55156 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55157 std::pair<SDValue, SDValue> Tmp =
55158 Subtarget.getTargetLowering()->BuildFILD(
55159 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55160 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55161 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55162 return Tmp.first;
55163 }
55164 }
55165
55166 if (IsStrict)
55167 return SDValue();
55168
55169 if (SDValue V = combineToFPTruncExtElt(N, DAG))
55170 return V;
55171
55172 return SDValue();
55173}
55174
55175static bool needCarryOrOverflowFlag(SDValue Flags) {
55176 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55177
55178 for (const SDNode *User : Flags->uses()) {
55179 X86::CondCode CC;
55180 switch (User->getOpcode()) {
55181 default:
55182 // Be conservative.
55183 return true;
55184 case X86ISD::SETCC:
55185 case X86ISD::SETCC_CARRY:
55186 CC = (X86::CondCode)User->getConstantOperandVal(0);
55187 break;
55188 case X86ISD::BRCOND:
55189 case X86ISD::CMOV:
55190 CC = (X86::CondCode)User->getConstantOperandVal(2);
55191 break;
55192 }
55193
55194 switch (CC) {
55195 default: break;
55196 case X86::COND_A: case X86::COND_AE:
55197 case X86::COND_B: case X86::COND_BE:
55198 case X86::COND_O: case X86::COND_NO:
55199 case X86::COND_G: case X86::COND_GE:
55200 case X86::COND_L: case X86::COND_LE:
55201 return true;
55202 }
55203 }
55204
55205 return false;
55206}
55207
55208static bool onlyZeroFlagUsed(SDValue Flags) {
55209 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55210
55211 for (const SDNode *User : Flags->uses()) {
55212 unsigned CCOpNo;
55213 switch (User->getOpcode()) {
55214 default:
55215 // Be conservative.
55216 return false;
55217 case X86ISD::SETCC:
55218 case X86ISD::SETCC_CARRY:
55219 CCOpNo = 0;
55220 break;
55221 case X86ISD::BRCOND:
55222 case X86ISD::CMOV:
55223 CCOpNo = 2;
55224 break;
55225 }
55226
55227 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55228 if (CC != X86::COND_E && CC != X86::COND_NE)
55229 return false;
55230 }
55231
55232 return true;
55233}
55234
55235static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
55236 // Only handle test patterns.
55237 if (!isNullConstant(N->getOperand(1)))
55238 return SDValue();
55239
55240 // If we have a CMP of a truncated binop, see if we can make a smaller binop
55241 // and use its flags directly.
55242 // TODO: Maybe we should try promoting compares that only use the zero flag
55243 // first if we can prove the upper bits with computeKnownBits?
55244 SDLoc dl(N);
55245 SDValue Op = N->getOperand(0);
55246 EVT VT = Op.getValueType();
55247
55248 // If we have a constant logical shift that's only used in a comparison
55249 // against zero, turn it into an equivalent AND. This allows it to be turned
55250 // into a TEST instruction later.
55251 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55252 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55253 onlyZeroFlagUsed(SDValue(N, 0))) {
55254 unsigned BitWidth = VT.getSizeInBits();
55255 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55256 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55257 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55258 APInt Mask = Op.getOpcode() == ISD::SRL
55259 ? APInt::getHighBitsSet(BitWidth, MaskBits)
55260 : APInt::getLowBitsSet(BitWidth, MaskBits);
55261 if (Mask.isSignedIntN(32)) {
55262 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55263 DAG.getConstant(Mask, dl, VT));
55264 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55265 DAG.getConstant(0, dl, VT));
55266 }
55267 }
55268 }
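// Editorial illustration (not part of the original source): for an i32 value
// shifted right by 5 and only compared against zero, MaskBits is 27 and the
// mask is the top 27 bits, relying on the scalar identity sketched by this
// hypothetical helper:
//   static bool SrlZeroMatchesMask(uint32_t X) {
//     return ((X >> 5) == 0) == ((X & 0xFFFFFFE0u) == 0); // always true
//   }
// The AND form can later be matched as a TEST instruction; the SHL case is
// symmetric with a low-bits mask (APInt::getLowBitsSet).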
55269
55270 // Peek through any zero-extend if we're only testing for a zero result.
55271 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55272 SDValue Src = Op.getOperand(0);
55273 EVT SrcVT = Src.getValueType();
55274 if (SrcVT.getScalarSizeInBits() >= 8 &&
55275 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
55276 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55277 DAG.getConstant(0, dl, SrcVT));
55278 }
55279
55280 // Look for a truncate.
55281 if (Op.getOpcode() != ISD::TRUNCATE)
55282 return SDValue();
55283
55284 SDValue Trunc = Op;
55285 Op = Op.getOperand(0);
55286
55287 // See if we can compare with zero against the truncation source,
55288 // which should help using the Z flag from many ops. Only do this for
55289 // i32 truncated op to prevent partial-reg compares of promoted ops.
55290 EVT OpVT = Op.getValueType();
55291 APInt UpperBits =
55292 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55293 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55294 onlyZeroFlagUsed(SDValue(N, 0))) {
55295 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55296 DAG.getConstant(0, dl, OpVT));
55297 }
55298
55299 // After this the truncate and arithmetic op must have a single use.
55300 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55301 return SDValue();
55302
55303 unsigned NewOpc;
55304 switch (Op.getOpcode()) {
55305 default: return SDValue();
55306 case ISD::AND:
55307 // Skip AND with a constant. We have special handling for AND with an
55308 // immediate during isel to generate TEST instructions.
55309 if (isa<ConstantSDNode>(Op.getOperand(1)))
55310 return SDValue();
55311 NewOpc = X86ISD::AND;
55312 break;
55313 case ISD::OR: NewOpc = X86ISD::OR; break;
55314 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55315 case ISD::ADD:
55316 // If the carry or overflow flag is used, we can't truncate.
55317 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55318 return SDValue();
55319 NewOpc = X86ISD::ADD;
55320 break;
55321 case ISD::SUB:
55322 // If the carry or overflow flag is used, we can't truncate.
55323 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55324 return SDValue();
55325 NewOpc = X86ISD::SUB;
55326 break;
55327 }
55328
55329 // We found an op we can narrow. Truncate its inputs.
55330 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55331 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55332
55333 // Use a X86 specific opcode to avoid DAG combine messing with it.
55334 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55335 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55336
55337 // For AND, keep a CMP so that we can match the test pattern.
55338 if (NewOpc == X86ISD::AND)
55339 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55340 DAG.getConstant(0, dl, VT));
55341
55342 // Return the flags.
55343 return Op.getValue(1);
55344}
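// Editorial illustration (not part of the original source) of the narrowing
// above: for cmp (trunc i8 (add i32 %a, %b)), 0 where only the Z flag is
// consumed, the low 8 bits of the 32-bit sum equal the 8-bit sum of the
// truncated operands, so the flags of an 8-bit X86ISD::ADD of trunc(%a) and
// trunc(%b) can be used directly and the wider add may become dead.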
55345
55346static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55347 TargetLowering::DAGCombinerInfo &DCI) {
55348 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55349        "Expected X86ISD::ADD or X86ISD::SUB");
55350
55351 SDLoc DL(N);
55352 SDValue LHS = N->getOperand(0);
55353 SDValue RHS = N->getOperand(1);
55354 MVT VT = LHS.getSimpleValueType();
55355 bool IsSub = X86ISD::SUB == N->getOpcode();
55356 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55357
55358 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55359 if (!N->hasAnyUseOfValue(1)) {
55360 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55361 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55362 }
55363
55364 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55365 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55366 SDValue Ops[] = {N0, N1};
55367 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55368 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55369 SDValue Op(N, 0);
55370 if (Negate)
55371 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
55372 DCI.CombineTo(GenericAddSub, Op);
55373 }
55374 };
55375 MatchGeneric(LHS, RHS, false);
55376 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55377
55378 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55379 // EFLAGS result doesn't change.
55380 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55381 /*ZeroSecondOpOnly*/ true);
55382}
55383
55384static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55385 SDValue LHS = N->getOperand(0);
55386 SDValue RHS = N->getOperand(1);
55387 SDValue BorrowIn = N->getOperand(2);
55388
55389 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55390 MVT VT = N->getSimpleValueType(0);
55391 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55392 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55393 }
55394
55395 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55396 // iff the flag result is dead.
55397 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55398 !N->hasAnyUseOfValue(1))
55399 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55400 LHS.getOperand(1), BorrowIn);
55401
55402 return SDValue();
55403}
55404
55405// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55406static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55407 TargetLowering::DAGCombinerInfo &DCI) {
55408 SDValue LHS = N->getOperand(0);
55409 SDValue RHS = N->getOperand(1);
55410 SDValue CarryIn = N->getOperand(2);
55411 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55412 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55413
55414 // Canonicalize constant to RHS.
55415 if (LHSC && !RHSC)
55416 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55417 CarryIn);
55418
55419 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55420 // the result is either zero or one (depending on the input carry bit).
55421 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
55422 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55423 // We don't have a good way to replace an EFLAGS use, so only do this when
55424 // dead right now.
55425 SDValue(N, 1).use_empty()) {
55426 SDLoc DL(N);
55427 EVT VT = N->getValueType(0);
55428 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55429 SDValue Res1 = DAG.getNode(
55430 ISD::AND, DL, VT,
55431 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55432 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55433 DAG.getConstant(1, DL, VT));
55434 return DCI.CombineTo(N, Res1, CarryOut);
55435 }
55436
55437 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55438 // iff the flag result is dead.
55439 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55440 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55441 SDLoc DL(N);
55442 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55443 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55444 DAG.getConstant(0, DL, LHS.getValueType()),
55445 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55446 }
55447
55448 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55449 MVT VT = N->getSimpleValueType(0);
55450 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55451 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55452 }
55453
55454 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55455 // iff the flag result is dead.
55456 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55457 !N->hasAnyUseOfValue(1))
55458 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55459 LHS.getOperand(1), CarryIn);
55460
55461 return SDValue();
55462}
55463
55464static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55465 const SDLoc &DL, EVT VT,
55466 const X86Subtarget &Subtarget) {
55467 // Example of pattern we try to detect:
55468 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55469 //(add (build_vector (extract_elt t, 0),
55470 // (extract_elt t, 2),
55471 // (extract_elt t, 4),
55472 // (extract_elt t, 6)),
55473 // (build_vector (extract_elt t, 1),
55474 // (extract_elt t, 3),
55475 // (extract_elt t, 5),
55476 // (extract_elt t, 7)))
55477
55478 if (!Subtarget.hasSSE2())
55479 return SDValue();
55480
55481 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55482 Op1.getOpcode() != ISD::BUILD_VECTOR)
55483 return SDValue();
55484
55485 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55486 VT.getVectorNumElements() < 4 ||
55487 !isPowerOf2_32(VT.getVectorNumElements()))
55488 return SDValue();
55489
55490 // Check if one of Op0,Op1 is of the form:
55491 // (build_vector (extract_elt Mul, 0),
55492 // (extract_elt Mul, 2),
55493 // (extract_elt Mul, 4),
55494 // ...
55495 // the other is of the form:
55496 // (build_vector (extract_elt Mul, 1),
55497 // (extract_elt Mul, 3),
55498 // (extract_elt Mul, 5),
55499 // ...
55500 // and identify Mul.
55501 SDValue Mul;
55502 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55503 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55504 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55505 // TODO: Be more tolerant to undefs.
55506 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55507 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55508 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55509 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55510 return SDValue();
55511 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55512 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55513 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55514 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55515 if (!Const0L || !Const1L || !Const0H || !Const1H)
55516 return SDValue();
55517 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55518 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55519 // Commutativity of mul allows factors of a product to reorder.
55520 if (Idx0L > Idx1L)
55521 std::swap(Idx0L, Idx1L);
55522 if (Idx0H > Idx1H)
55523 std::swap(Idx0H, Idx1H);
55524 // Commutativity of add allows pairs of factors to reorder.
55525 if (Idx0L > Idx0H) {
55526 std::swap(Idx0L, Idx0H);
55527 std::swap(Idx1L, Idx1H);
55528 }
55529 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55530 Idx1H != 2 * i + 3)
55531 return SDValue();
55532 if (!Mul) {
55533 // First time an extract_elt's source vector is visited. Must be a MUL
55534 // with 2X number of vector elements than the BUILD_VECTOR.
55535 // Both extracts must be from same MUL.
55536 Mul = Op0L->getOperand(0);
55537 if (Mul->getOpcode() != ISD::MUL ||
55538 Mul.getValueType().getVectorNumElements() != 2 * e)
55539 return SDValue();
55540 }
55541 // Check that the extract is from the same MUL previously seen.
55542 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55543 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55544 return SDValue();
55545 }
55546
55547 // Check if the Mul source can be safely shrunk.
55548 ShrinkMode Mode;
55549 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55550 Mode == ShrinkMode::MULU16)
55551 return SDValue();
55552
55553 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55554 VT.getVectorNumElements() * 2);
55555 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55556 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55557
55558 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55559 ArrayRef<SDValue> Ops) {
55560 EVT InVT = Ops[0].getValueType();
55561 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55562 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55563 InVT.getVectorNumElements() / 2);
55564 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55565 };
55566 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55567}
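// Editorial note (not part of the original source): VPMADDWD multiplies
// adjacent pairs of signed i16 elements and sums each pair into an i32 lane,
//   result[i] = a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1],
// so with a = {1, 2, ...} and b = {10, 20, ...} the first i32 lane is
// 1*10 + 2*20 = 50, which is exactly the add-of-even/odd-extracts pattern
// matched above.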
55568
55569// Attempt to turn this pattern into PMADDWD.
55570// (add (mul (sext (build_vector)), (sext (build_vector))),
55571// (mul (sext (build_vector)), (sext (build_vector)))
55572static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55573 const SDLoc &DL, EVT VT,
55574 const X86Subtarget &Subtarget) {
55575 if (!Subtarget.hasSSE2())
55576 return SDValue();
55577
55578 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55579 return SDValue();
55580
55581 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55582 VT.getVectorNumElements() < 4 ||
55583 !isPowerOf2_32(VT.getVectorNumElements()))
55584 return SDValue();
55585
55586 SDValue N00 = N0.getOperand(0);
55587 SDValue N01 = N0.getOperand(1);
55588 SDValue N10 = N1.getOperand(0);
55589 SDValue N11 = N1.getOperand(1);
55590
55591 // All inputs need to be sign extends.
55592 // TODO: Support ZERO_EXTEND from known positive?
55593 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55594 N01.getOpcode() != ISD::SIGN_EXTEND ||
55595 N10.getOpcode() != ISD::SIGN_EXTEND ||
55596 N11.getOpcode() != ISD::SIGN_EXTEND)
55597 return SDValue();
55598
55599 // Peek through the extends.
55600 N00 = N00.getOperand(0);
55601 N01 = N01.getOperand(0);
55602 N10 = N10.getOperand(0);
55603 N11 = N11.getOperand(0);
55604
55605 // Must be extending from vXi16.
55606 EVT InVT = N00.getValueType();
55607 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55608 N10.getValueType() != InVT || N11.getValueType() != InVT)
55609 return SDValue();
55610
55611 // All inputs should be build_vectors.
55612 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55613 N01.getOpcode() != ISD::BUILD_VECTOR ||
55614 N10.getOpcode() != ISD::BUILD_VECTOR ||
55615 N11.getOpcode() != ISD::BUILD_VECTOR)
55616 return SDValue();
55617
55618 // For each element, we need to ensure we have an odd element from one vector
55619 // multiplied by the odd element of another vector and the even element from
55620 // one of the same vectors being multiplied by the even element from the
55621 // other vector. So we need to make sure that, for each element i, this
55622 // operation is being performed:
55623 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55624 SDValue In0, In1;
55625 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55626 SDValue N00Elt = N00.getOperand(i);
55627 SDValue N01Elt = N01.getOperand(i);
55628 SDValue N10Elt = N10.getOperand(i);
55629 SDValue N11Elt = N11.getOperand(i);
55630 // TODO: Be more tolerant to undefs.
55631 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55632 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55633 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55634 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55635 return SDValue();
55636 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55637 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55638 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55639 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55640 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55641 return SDValue();
55642 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55643 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55644 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55645 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55646 // Add is commutative so indices can be reordered.
55647 if (IdxN00 > IdxN10) {
55648 std::swap(IdxN00, IdxN10);
55649 std::swap(IdxN01, IdxN11);
55650 }
55651 // N0 indices must be the even elements; N1 indices must be the next odd elements.
55652 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55653 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55654 return SDValue();
55655 SDValue N00In = N00Elt.getOperand(0);
55656 SDValue N01In = N01Elt.getOperand(0);
55657 SDValue N10In = N10Elt.getOperand(0);
55658 SDValue N11In = N11Elt.getOperand(0);
55659
55660 // First time we find an input capture it.
55661 if (!In0) {
55662 In0 = N00In;
55663 In1 = N01In;
55664
55665 // The input vectors must be at least as wide as the output.
55666 // If they are larger than the output, we extract a subvector below.
55667 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55668 In1.getValueSizeInBits() < VT.getSizeInBits())
55669 return SDValue();
55670 }
55671 // Mul is commutative so the input vectors can be in any order.
55672 // Canonicalize to make the compares easier.
55673 if (In0 != N00In)
55674 std::swap(N00In, N01In);
55675 if (In0 != N10In)
55676 std::swap(N10In, N11In);
55677 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55678 return SDValue();
55679 }
55680
55681 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55682 ArrayRef<SDValue> Ops) {
55683 EVT OpVT = Ops[0].getValueType();
55684 assert(OpVT.getScalarType() == MVT::i16 &&
55685        "Unexpected scalar element type");
55686 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55687 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55688 OpVT.getVectorNumElements() / 2);
55689 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55690 };
55691
55692 // If the output is narrower than an input, extract the low part of the input
55693 // vector.
55694 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55695 VT.getVectorNumElements() * 2);
55696 if (OutVT16.bitsLT(In0.getValueType())) {
55697 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
55698 DAG.getIntPtrConstant(0, DL));
55699 }
55700 if (OutVT16.bitsLT(In1.getValueType())) {
55701 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
55702 DAG.getIntPtrConstant(0, DL));
55703 }
55704 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
55705 PMADDBuilder);
55706}
55707
55708// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55709// If upper element in each pair of both VPMADDWD are zero then we can merge
55710// the operand elements and use the implicit add of VPMADDWD.
55711// TODO: Add support for VPMADDUBSW (which isn't commutable).
55712static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
55713 const SDLoc &DL, EVT VT) {
55714 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
55715 return SDValue();
55716
55717 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
55718 if (VT.getSizeInBits() > 128)
55719 return SDValue();
55720
55721 unsigned NumElts = VT.getVectorNumElements();
55722 MVT OpVT = N0.getOperand(0).getSimpleValueType();
55723 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
55724 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
55725
55726 bool Op0HiZero =
55727 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
55728 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
55729 bool Op1HiZero =
55730 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
55731 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
55732
55733 // TODO: Check for zero lower elements once we have actual codegen that
55734 // creates them.
55735 if (!Op0HiZero || !Op1HiZero)
55736 return SDValue();
55737
55738 // Create a shuffle mask packing the lower elements from each VPMADDWD.
55739 SmallVector<int> Mask;
55740 for (int i = 0; i != (int)NumElts; ++i) {
55741 Mask.push_back(2 * i);
55742 Mask.push_back(2 * (i + NumElts));
55743 }
55744
55745 SDValue LHS =
55746 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
55747 SDValue RHS =
55748 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
55749 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
55750}
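// Editorial note (not part of the original source): when the odd element of
// each pair is known zero in at least one operand of each VPMADDWD, every
// pair contributes only its even product, so
//   PMADDWD(X,Y)[i] + PMADDWD(Z,W)[i] == X[2i]*Y[2i] + Z[2i]*W[2i],
// which is exactly PMADDWD applied to the two interleaving shuffles built
// above.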
55751
55752/// CMOV of constants requires materializing constant operands in registers.
55753/// Try to fold those constants into an 'add' instruction to reduce instruction
55754 /// count. We do this with CMOV rather than the generic 'select' because there are
55755/// earlier folds that may be used to turn select-of-constants into logic hacks.
55756static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
55757 const X86Subtarget &Subtarget) {
55758 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55759 // better because we eliminate 1-2 instructions. This transform is still
55760 // an improvement without zero operands because we trade 2 constant moves and
55761 // 1 add for 2 adds (LEAs), as long as the constants can be represented as
55762 // immediate asm operands (i.e. they fit in 32 bits).
55763 auto isSuitableCmov = [](SDValue V) {
55764 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
55765 return false;
55766 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
55767 !isa<ConstantSDNode>(V.getOperand(1)))
55768 return false;
55769 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
55770 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
55771 V.getConstantOperandAPInt(1).isSignedIntN(32));
55772 };
55773
55774 // Match an appropriate CMOV as the first operand of the add.
55775 SDValue Cmov = N->getOperand(0);
55776 SDValue OtherOp = N->getOperand(1);
55777 if (!isSuitableCmov(Cmov))
55778 std::swap(Cmov, OtherOp);
55779 if (!isSuitableCmov(Cmov))
55780 return SDValue();
55781
55782 // Don't remove a load folding opportunity for the add. That would neutralize
55783 // any improvements from removing constant materializations.
55784 if (X86::mayFoldLoad(OtherOp, Subtarget))
55785 return SDValue();
55786
55787 EVT VT = N->getValueType(0);
55788 SDLoc DL(N);
55789 SDValue FalseOp = Cmov.getOperand(0);
55790 SDValue TrueOp = Cmov.getOperand(1);
55791
55792 // We will push the add through the select, but we can potentially do better
55793 // if we know there is another add in the sequence and this is pointer math.
55794 // In that case, we can absorb an add into the trailing memory op and avoid
55795 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55796 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
55797 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
55798 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55799 all_of(N->uses(), [&](SDNode *Use) {
55800 auto *MemNode = dyn_cast<MemSDNode>(Use);
55801 return MemNode && MemNode->getBasePtr().getNode() == N;
55802 })) {
55803 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55804 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55805 // it is possible that choosing op1 might be better.
55806 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55807 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55808 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55809 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55810 Cmov.getOperand(2), Cmov.getOperand(3));
55811 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55812 }
55813
55814 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55815 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55816 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55817 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55818 Cmov.getOperand(3));
55819}
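// Editorial illustration (not part of the original source): for
// add (cmov 0, 42), %x the generic fold above produces
// cmov %x, (add %x, 42); the zero operand makes one of the pushed adds fold
// away, so both constant materializations disappear and a single add (or LEA)
// feeds the cmov.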
55820
55821static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55822 TargetLowering::DAGCombinerInfo &DCI,
55823 const X86Subtarget &Subtarget) {
55824 EVT VT = N->getValueType(0);
55825 SDValue Op0 = N->getOperand(0);
55826 SDValue Op1 = N->getOperand(1);
55827 SDLoc DL(N);
55828
55829 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
55830 return Select;
55831
55832 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55833 return MAdd;
55834 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55835 return MAdd;
55836 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55837 return MAdd;
55838
55839 // Try to synthesize horizontal adds from adds of shuffles.
55840 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55841 return V;
55842
55843 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55844 // (sub Y, (sext (vXi1 X))).
55845 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55846 // generic DAG combine without a legal type check, but adding this there
55847 // caused regressions.
55848 if (VT.isVector()) {
55849 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55850 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55851 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55852 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55853 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55854 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55855 }
55856
55857 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55858 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55859 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55860 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55861 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55862 }
55863 }
55864
55865 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55866 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55867 X86::isZeroNode(Op0.getOperand(1))) {
55868 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55869 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55870 Op0.getOperand(0), Op0.getOperand(2));
55871 }
55872
55873 return combineAddOrSubToADCOrSBB(N, DAG);
55874}
55875
55876// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55877// condition comes from the subtract node that produced -X. This matches the
55878// cmov expansion for absolute value. By swapping the operands we convert abs
55879// to nabs.
55880static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55881 SDValue N0 = N->getOperand(0);
55882 SDValue N1 = N->getOperand(1);
55883
55884 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55885 return SDValue();
55886
55887 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55888 if (CC != X86::COND_S && CC != X86::COND_NS)
55889 return SDValue();
55890
55891 // Condition should come from a negate operation.
55892 SDValue Cond = N1.getOperand(3);
55893 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55894 return SDValue();
55895 assert(Cond.getResNo() == 1 && "Unexpected result number");
55896
55897 // Get the X and -X from the negate.
55898 SDValue NegX = Cond.getValue(0);
55899 SDValue X = Cond.getOperand(1);
55900
55901 SDValue FalseOp = N1.getOperand(0);
55902 SDValue TrueOp = N1.getOperand(1);
55903
55904 // Cmov operands should be X and NegX. Order doesn't matter.
55905 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55906 return SDValue();
55907
55908 // Build a new CMOV with the operands swapped.
55909 SDLoc DL(N);
55910 MVT VT = N->getSimpleValueType(0);
55911 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55912 N1.getOperand(2), Cond);
55913 // Convert sub to add.
55914 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55915}
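// Editorial note (not part of the original source): because the cmov's two
// operands are X and -X selected on the sign produced by the negate, swapping
// them negates the selected value, so
//   sub Y, cmov(X, -X)  ==  add Y, cmov(-X, X)
// for either condition, turning an abs selection into a nabs selection and the
// sub into an add.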
55916
55917static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55918 SDValue Op0 = N->getOperand(0);
55919 SDValue Op1 = N->getOperand(1);
55920
55921 // (sub C (zero_extend (setcc)))
55922 // =>
55923 // (add (zero_extend (setcc inverted)), C-1) if C is a nonzero immediate
55924 // Don't disturb (sub 0 setcc), which is easily done with neg.
55925 EVT VT = N->getValueType(0);
55926 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55927 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55928 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55929 Op1.getOperand(0).hasOneUse()) {
55930 SDValue SetCC = Op1.getOperand(0);
55931 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55932 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55933 uint64_t NewImm = Op0C->getZExtValue() - 1;
55934 SDLoc DL(Op1);
55935 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55936 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55937 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55938 DAG.getConstant(NewImm, DL, VT));
55939 }
55940
55941 return SDValue();
55942}
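// Editorial check (not part of the original source) of the fold above with
// C = 5: the original computes 5 - (cc ? 1 : 0), i.e. 4 when cc holds and 5
// otherwise; the replacement computes (!cc ? 1 : 0) + 4, which yields the same
// two values, so the transform is exact for any nonzero immediate C.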
55943
55944static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55945 TargetLowering::DAGCombinerInfo &DCI,
55946 const X86Subtarget &Subtarget) {
55947 SDValue Op0 = N->getOperand(0);
55948 SDValue Op1 = N->getOperand(1);
55949
55950 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55951 auto IsNonOpaqueConstant = [&](SDValue Op) {
55952 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55953 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55954 return !Cst->isOpaque();
55955 return true;
55956 }
55957 return false;
55958 };
55959
55960 // X86 can't encode an immediate LHS of a sub. See if we can push the
55961 // negation into a preceding instruction. If the RHS of the sub is a XOR with
55962 // one use and a constant, invert the immediate, saving one register.
55963 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
55964 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55965 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
55966 SDLoc DL(N);
55967 EVT VT = Op0.getValueType();
55968 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55969 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55970 SDValue NewAdd =
55971 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55972 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55973 }
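// Editorial derivation (not part of the original source): in two's complement
//   -(X ^ C2) == ~(X ^ C2) + 1 == (X ^ ~C2) + 1,
// so C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1), which is exactly the xor/add form
// built above with the inverted immediate.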
55974
55975 if (SDValue V = combineSubABS(N, DAG))
55976 return V;
55977
55978 // Try to synthesize horizontal subs from subs of shuffles.
55979 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55980 return V;
55981
55982 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55983 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55984 X86::isZeroNode(Op1.getOperand(1))) {
55985 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55986 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55987 Op1.getOperand(0), Op1.getOperand(2));
55988 }
55989
55990 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55991 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
55992 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55993 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55994 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55995 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55996 Op1.getOperand(1), Op1.getOperand(2));
55997 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55998 Op1.getOperand(0));
55999 }
56000
56001 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
56002 return V;
56003
56004 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
56005 return V;
56006
56007 return combineSubSetcc(N, DAG);
56008}
56009
56010static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
56011 const X86Subtarget &Subtarget) {
56012 MVT VT = N->getSimpleValueType(0);
56013 SDLoc DL(N);
56014
56015 if (N->getOperand(0) == N->getOperand(1)) {
56016 if (N->getOpcode() == X86ISD::PCMPEQ)
56017 return DAG.getConstant(-1, DL, VT);
56018 if (N->getOpcode() == X86ISD::PCMPGT)
56019 return DAG.getConstant(0, DL, VT);
56020 }
56021
56022 return SDValue();
56023}
56024
56025/// Helper that combines an array of subvector ops as if they were the operands
56026 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56027/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
56028static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56029 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56030 TargetLowering::DAGCombinerInfo &DCI,
56031 const X86Subtarget &Subtarget) {
56032 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56033 unsigned EltSizeInBits = VT.getScalarSizeInBits();
56034
56035 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56036 return DAG.getUNDEF(VT);
56037
56038 if (llvm::all_of(Ops, [](SDValue Op) {
56039 return ISD::isBuildVectorAllZeros(Op.getNode());
56040 }))
56041 return getZeroVector(VT, Subtarget, DAG, DL);
56042
56043 SDValue Op0 = Ops[0];
56044 bool IsSplat = llvm::all_equal(Ops);
56045
56046 // Repeated subvectors.
56047 if (IsSplat &&
56048 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56049 // If this broadcast is inserted into both halves, use a larger broadcast.
56050 if (Op0.getOpcode() == X86ISD::VBROADCAST)
56051 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56052
56053 // If this simple subvector or scalar/subvector broadcast_load is inserted
56054 // into both halves, use a larger broadcast_load. Update other uses to use
56055 // an extracted subvector.
56056 if (ISD::isNormalLoad(Op0.getNode()) ||
56057 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56058 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56059 auto *Mem = cast<MemSDNode>(Op0);
56060 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56061 ? X86ISD::VBROADCAST_LOAD
56062 : X86ISD::SUBV_BROADCAST_LOAD;
56063 if (SDValue BcastLd =
56064 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56065 SDValue BcastSrc =
56066 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56067 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56068 return BcastLd;
56069 }
56070 }
56071
56072 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56073 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56074 (Subtarget.hasAVX2() ||
56075 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56076 VT.getScalarType(), Subtarget)))
56077 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56078 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56079 Op0.getOperand(0),
56080 DAG.getIntPtrConstant(0, DL)));
56081
56082 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56083 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56084 (Subtarget.hasAVX2() ||
56085 (EltSizeInBits >= 32 &&
56086 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56087 Op0.getOperand(0).getValueType() == VT.getScalarType())
56088 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56089
56090 // concat_vectors(extract_subvector(broadcast(x)),
56091 // extract_subvector(broadcast(x))) -> broadcast(x)
56092 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56093 Op0.getOperand(0).getValueType() == VT) {
56094 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
56095 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
56096 return Op0.getOperand(0);
56097 }
56098 }
56099
56100 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56101 // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
56102 // TODO: This should go in combineX86ShufflesRecursively eventually.
56103 if (VT.is256BitVector() && Ops.size() == 2) {
56104 SDValue Src0 = peekThroughBitcasts(Ops[0]);
56105 SDValue Src1 = peekThroughBitcasts(Ops[1]);
56106 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56107 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56108 EVT SrcVT0 = Src0.getOperand(0).getValueType();
56109 EVT SrcVT1 = Src1.getOperand(0).getValueType();
56110 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56111 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
56112 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56113 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56114 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56115 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56116 DAG.getBitcast(VT, Src0.getOperand(0)),
56117 DAG.getBitcast(VT, Src1.getOperand(0)),
56118 DAG.getTargetConstant(0x31, DL, MVT::i8));
56119 }
56120 }
56121 }
56122
56123 // Repeated opcode.
56124 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56125 // but it currently struggles with different vector widths.
56126 if (llvm::all_of(Ops, [Op0](SDValue Op) {
56127 return Op.getOpcode() == Op0.getOpcode();
56128 })) {
56129 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56130 SmallVector<SDValue> Subs;
56131 for (SDValue SubOp : SubOps)
56132 Subs.push_back(SubOp.getOperand(I));
56133 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56134 };
56135 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56136 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56137 SDValue Sub = SubOps[I].getOperand(Op);
56138 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56139 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
56140 Sub.getOperand(0).getValueType() != VT ||
56141 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
56142 return false;
56143 }
56144 return true;
56145 };
56146
56147 unsigned NumOps = Ops.size();
56148 switch (Op0.getOpcode()) {
56149 case X86ISD::VBROADCAST: {
56150 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56151 return Op.getOperand(0).getValueType().is128BitVector();
56152 })) {
56153 if (VT == MVT::v4f64 || VT == MVT::v4i64)
56154 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56155 ConcatSubOperand(VT, Ops, 0),
56156 ConcatSubOperand(VT, Ops, 0));
56157 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56158 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56159 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56160 : X86ISD::PSHUFD,
56161 DL, VT, ConcatSubOperand(VT, Ops, 0),
56162 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56163 }
56164 break;
56165 }
56166 case X86ISD::MOVDDUP:
56167 case X86ISD::MOVSHDUP:
56168 case X86ISD::MOVSLDUP: {
56169 if (!IsSplat)
56170 return DAG.getNode(Op0.getOpcode(), DL, VT,
56171 ConcatSubOperand(VT, Ops, 0));
56172 break;
56173 }
56174 case X86ISD::SHUFP: {
56175 // Add SHUFPD support if/when necessary.
56176 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56177 llvm::all_of(Ops, [Op0](SDValue Op) {
56178 return Op.getOperand(2) == Op0.getOperand(2);
56179 })) {
56180 return DAG.getNode(Op0.getOpcode(), DL, VT,
56181 ConcatSubOperand(VT, Ops, 0),
56182 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56183 }
56184 break;
56185 }
56186 case X86ISD::PSHUFHW:
56187 case X86ISD::PSHUFLW:
56188 case X86ISD::PSHUFD:
56189 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56190 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56191 return DAG.getNode(Op0.getOpcode(), DL, VT,
56192 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56193 }
56194 [[fallthrough]];
56195 case X86ISD::VPERMILPI:
56196 if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
56197 (VT.is256BitVector() ||
56198 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56199 all_of(Ops, [&Op0](SDValue Op) {
56200 return Op0.getOperand(1) == Op.getOperand(1);
56201 })) {
56202 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56203 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56204 Res =
56205 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56206 return DAG.getBitcast(VT, Res);
56207 }
56208 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
56209 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56210 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56211 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
56212 return DAG.getNode(Op0.getOpcode(), DL, VT,
56213 ConcatSubOperand(VT, Ops, 0),
56214 DAG.getTargetConstant(Idx, DL, MVT::i8));
56215 }
56216 break;
56217 case X86ISD::PSHUFB:
56218 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56219 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56220 return DAG.getNode(Op0.getOpcode(), DL, VT,
56221 ConcatSubOperand(VT, Ops, 0),
56222 ConcatSubOperand(VT, Ops, 1));
56223 }
56224 break;
56225 case X86ISD::VPERMV:
56226 if (!IsSplat && NumOps == 2 &&
56227 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
56228 MVT OpVT = Op0.getSimpleValueType();
56229 int NumSrcElts = OpVT.getVectorNumElements();
56230 SmallVector<int, 64> ConcatMask;
56231 for (unsigned i = 0; i != NumOps; ++i) {
56232 SmallVector<int, 64> SubMask;
56233 SmallVector<SDValue, 2> SubOps;
56234 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56235 SubMask))
56236 break;
56237 for (int M : SubMask) {
56238 if (0 <= M)
56239 M += i * NumSrcElts;
56240 ConcatMask.push_back(M);
56241 }
56242 }
56243 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56244 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56245 Ops[1].getOperand(1), DAG, DL);
56246 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56247 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56248 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56249 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56250 }
56251 }
56252 break;
56253 case X86ISD::VPERMV3:
56254 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56255 MVT OpVT = Op0.getSimpleValueType();
56256 int NumSrcElts = OpVT.getVectorNumElements();
56257 SmallVector<int, 64> ConcatMask;
56258 for (unsigned i = 0; i != NumOps; ++i) {
56259 SmallVector<int, 64> SubMask;
56260 SmallVector<SDValue, 2> SubOps;
56261 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56262 SubMask))
56263 break;
56264 for (int M : SubMask) {
56265 if (0 <= M) {
56266 M += M < NumSrcElts ? 0 : NumSrcElts;
56267 M += i * NumSrcElts;
56268 }
56269 ConcatMask.push_back(M);
56270 }
56271 }
56272 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56273 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56274 Ops[1].getOperand(0), DAG, DL);
56275 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56276 Ops[1].getOperand(2), DAG, DL);
56277 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56278 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56279 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56280 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56281 }
56282 }
56283 break;
56284 case ISD::TRUNCATE:
56285 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56286 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56287 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56288 SrcVT == Ops[1].getOperand(0).getValueType() &&
56289 Subtarget.useAVX512Regs() &&
56290 Subtarget.getPreferVectorWidth() >= 512 &&
56291 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
56292 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56293 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56294 ConcatSubOperand(NewSrcVT, Ops, 0));
56295 }
56296 }
56297 break;
56298 case X86ISD::VSHLI:
56299 case X86ISD::VSRLI:
56300 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
56301 // TODO: Move this to LowerShiftByScalarImmediate?
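// For example, on AVX1 the fold below turns (v4i64 shl X, 32) into a shuffle
// of bitcast<v8i32>(X) with zero, moving each element's low 32 bits into its
// high half and zero-filling the low half.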
56302 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56303 llvm::all_of(Ops, [](SDValue Op) {
56304 return Op.getConstantOperandAPInt(1) == 32;
56305 })) {
56306 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56307 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56308 if (Op0.getOpcode() == X86ISD::VSHLI) {
56309 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56310 {8, 0, 8, 2, 8, 4, 8, 6});
56311 } else {
56312 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56313 {1, 8, 3, 8, 5, 8, 7, 8});
56314 }
56315 return DAG.getBitcast(VT, Res);
56316 }
56317 [[fallthrough]];
56318 case X86ISD::VSRAI:
56319 case X86ISD::VSHL:
56320 case X86ISD::VSRL:
56321 case X86ISD::VSRA:
56322 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56323 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56324 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56325 llvm::all_of(Ops, [Op0](SDValue Op) {
56326 return Op0.getOperand(1) == Op.getOperand(1);
56327 })) {
56328 return DAG.getNode(Op0.getOpcode(), DL, VT,
56329 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56330 }
56331 break;
56332 case X86ISD::VPERMI:
56333 case X86ISD::VROTLI:
56334 case X86ISD::VROTRI:
56335 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56336 llvm::all_of(Ops, [Op0](SDValue Op) {
56337 return Op0.getOperand(1) == Op.getOperand(1);
56338 })) {
56339 return DAG.getNode(Op0.getOpcode(), DL, VT,
56340 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56341 }
56342 break;
56343 case ISD::AND:
56344 case ISD::OR:
56345 case ISD::XOR:
56346 case X86ISD::ANDNP:
56347 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56348 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56349 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56350 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56351 NumOps * SrcVT.getVectorNumElements());
56352 return DAG.getNode(Op0.getOpcode(), DL, VT,
56353 ConcatSubOperand(SrcVT, Ops, 0),
56354 ConcatSubOperand(SrcVT, Ops, 1));
56355 }
56356 break;
56357 case X86ISD::GF2P8AFFINEQB:
56358 if (!IsSplat &&
56359 (VT.is256BitVector() ||
56360 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56361 llvm::all_of(Ops, [Op0](SDValue Op) {
56362 return Op0.getOperand(2) == Op.getOperand(2);
56363 })) {
56364 return DAG.getNode(Op0.getOpcode(), DL, VT,
56365 ConcatSubOperand(VT, Ops, 0),
56366 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56367 }
56368 break;
56369 case ISD::ADD:
56370 case ISD::SUB:
56371 case ISD::MUL:
56372 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56373 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56374 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56375 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56376 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56377 NumOps * SrcVT.getVectorNumElements());
56378 return DAG.getNode(Op0.getOpcode(), DL, VT,
56379 ConcatSubOperand(SrcVT, Ops, 0),
56380 ConcatSubOperand(SrcVT, Ops, 1));
56381 }
56382 break;
56383 case ISD::FADD:
56384 case ISD::FSUB:
56385 case ISD::FMUL:
56386 case ISD::FDIV:
56387 if (!IsSplat && (VT.is256BitVector() ||
56388 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56389 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56390 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56391 NumOps * SrcVT.getVectorNumElements());
56392 return DAG.getNode(Op0.getOpcode(), DL, VT,
56393 ConcatSubOperand(SrcVT, Ops, 0),
56394 ConcatSubOperand(SrcVT, Ops, 1));
56395 }
56396 break;
56397 case X86ISD::HADD:
56398 case X86ISD::HSUB:
56399 case X86ISD::FHADD:
56400 case X86ISD::FHSUB:
56401 case X86ISD::PACKSS:
56402 case X86ISD::PACKUS:
56403 if (!IsSplat && VT.is256BitVector() &&
56404 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56405 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56406 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56407 NumOps * SrcVT.getVectorNumElements());
56408 return DAG.getNode(Op0.getOpcode(), DL, VT,
56409 ConcatSubOperand(SrcVT, Ops, 0),
56410 ConcatSubOperand(SrcVT, Ops, 1));
56411 }
56412 break;
56413 case X86ISD::PALIGNR:
56414 if (!IsSplat &&
56415 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56416 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56417 llvm::all_of(Ops, [Op0](SDValue Op) {
56418 return Op0.getOperand(2) == Op.getOperand(2);
56419 })) {
56420 return DAG.getNode(Op0.getOpcode(), DL, VT,
56421 ConcatSubOperand(VT, Ops, 0),
56422 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56423 }
56424 break;
56425 case ISD::VSELECT:
56426 if (!IsSplat && Subtarget.hasAVX512() &&
56427 (VT.is256BitVector() ||
56428 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56429 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
56430 EVT SelVT = Ops[0].getOperand(0).getValueType();
56431 if (SelVT.getVectorElementType() == MVT::i1) {
56432 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
56433 Ops.size() * SelVT.getVectorNumElements());
56434 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56435 return DAG.getNode(Op0.getOpcode(), DL, VT,
56436 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56437 ConcatSubOperand(VT, Ops, 1),
56438 ConcatSubOperand(VT, Ops, 2));
56439 }
56440 }
56441 [[fallthrough]];
56442 case X86ISD::BLENDV:
56443 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
56444 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56445 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56446 EVT SelVT = Ops[0].getOperand(0).getValueType();
56447 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56448 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56449 return DAG.getNode(Op0.getOpcode(), DL, VT,
56450 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56451 ConcatSubOperand(VT, Ops, 1),
56452 ConcatSubOperand(VT, Ops, 2));
56453 }
56454 break;
56455 }
56456 }
56457
56458 // Fold subvector loads into one.
56459 // If needed, look through bitcasts to get to the load.
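// For example, concat(load v4f32 [P], load v4f32 [P+16]) can become a single
// v8f32 load [P] when the loads are consecutive and a fast 256-bit access is
// allowed by the target.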
56460 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56461 unsigned Fast;
56462 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56463 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
56464 *FirstLd->getMemOperand(), &Fast) &&
56465 Fast) {
56466 if (SDValue Ld =
56467 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56468 return Ld;
56469 }
56470 }
56471
56472 // Attempt to fold target constant loads.
56473 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56474 SmallVector<APInt> EltBits;
56475 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56476 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56477 APInt OpUndefElts;
56478 SmallVector<APInt> OpEltBits;
56479 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56480 OpEltBits, true, false))
56481 break;
56482 EltBits.append(OpEltBits);
56483 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56484 }
56485 if (EltBits.size() == VT.getVectorNumElements())
56486 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
56487 }
56488
56489 return SDValue();
56490}
56491
56492static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56493 TargetLowering::DAGCombinerInfo &DCI,
56494 const X86Subtarget &Subtarget) {
56495 EVT VT = N->getValueType(0);
56496 EVT SrcVT = N->getOperand(0).getValueType();
56497 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56498 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56499
56500 if (VT.getVectorElementType() == MVT::i1) {
56501 // Attempt to constant fold.
56502 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56503 APInt Constant = APInt::getZero(VT.getSizeInBits());
56504 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56505 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56506 if (!C) break;
56507 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56508 if (I == (E - 1)) {
56509 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56510 if (TLI.isTypeLegal(IntVT))
56511 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56512 }
56513 }
56514
56515 // Don't do anything else for i1 vectors.
56516 return SDValue();
56517 }
56518
56519 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56520 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56521 DCI, Subtarget))
56522 return R;
56523 }
56524
56525 return SDValue();
56526}
56527
56528static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56529 TargetLowering::DAGCombinerInfo &DCI,
56530 const X86Subtarget &Subtarget) {
56531 if (DCI.isBeforeLegalizeOps())
56532 return SDValue();
56533
56534 MVT OpVT = N->getSimpleValueType(0);
56535
56536 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56537
56538 SDLoc dl(N);
56539 SDValue Vec = N->getOperand(0);
56540 SDValue SubVec = N->getOperand(1);
56541
56542 uint64_t IdxVal = N->getConstantOperandVal(2);
56543 MVT SubVecVT = SubVec.getSimpleValueType();
56544
56545 if (Vec.isUndef() && SubVec.isUndef())
56546 return DAG.getUNDEF(OpVT);
56547
56548 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56549 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56550 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56551 return getZeroVector(OpVT, Subtarget, DAG, dl);
56552
56553 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56554 // If we're inserting into a zero vector and then into a larger zero vector,
56555 // just insert into the larger zero vector directly.
56556 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56557 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56558 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56559 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56560 getZeroVector(OpVT, Subtarget, DAG, dl),
56561 SubVec.getOperand(1),
56562 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56563 }
56564
56565 // If we're inserting into a zero vector and our input was extracted from an
56566 // insert into a zero vector of the same type, and the extraction was at
56567 // least as large as the original insertion, just insert the original
56568 // subvector into a zero vector.
56569 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56570 isNullConstant(SubVec.getOperand(1)) &&
56571 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56572 SDValue Ins = SubVec.getOperand(0);
56573 if (isNullConstant(Ins.getOperand(2)) &&
56574 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56575 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56576 SubVecVT.getFixedSizeInBits())
56577 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56578 getZeroVector(OpVT, Subtarget, DAG, dl),
56579 Ins.getOperand(1), N->getOperand(2));
56580 }
56581 }
56582
56583 // Stop here if this is an i1 vector.
56584 if (IsI1Vector)
56585 return SDValue();
56586
56587 // Eliminate an intermediate vector widening:
56588 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56589 // insert_subvector X, Y, Idx
56590 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56591 // there?
56592 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56593 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56594 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56595 SubVec.getOperand(1), N->getOperand(2));
56596
56597 // If this is an insert of an extract, combine to a shuffle. Don't do this
56598 // if the insert or extract can be represented with a subregister operation.
56599 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56600 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56601 (IdxVal != 0 ||
56602 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56603 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56604 if (ExtIdxVal != 0) {
56605 int VecNumElts = OpVT.getVectorNumElements();
56606 int SubVecNumElts = SubVecVT.getVectorNumElements();
56607 SmallVector<int, 64> Mask(VecNumElts);
56608 // First create an identity shuffle mask.
56609 for (int i = 0; i != VecNumElts; ++i)
56610 Mask[i] = i;
56611 // Now insert the extracted portion.
56612 for (int i = 0; i != SubVecNumElts; ++i)
56613 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56614
56615 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56616 }
56617 }
56618
56619 // Match concat_vector style patterns.
56620 SmallVector<SDValue, 2> SubVectorOps;
56621 if (collectConcatOps(N, SubVectorOps, DAG)) {
56622 if (SDValue Fold =
56623 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56624 return Fold;
56625
56626 // If we're inserting all zeros into the upper half, change this to
56627 // a concat with zero. We will match this to a move
56628 // with implicit upper bit zeroing during isel.
56629 // We do this here because we don't want combineConcatVectorOps to
56630 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56631 if (SubVectorOps.size() == 2 &&
56632 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56633 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56634 getZeroVector(OpVT, Subtarget, DAG, dl),
56635 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56636 }
56637
56638 // If this is a broadcast insert into an upper undef, use a larger broadcast.
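// e.g. insert_subvector(v8f32 undef, (v4f32 vbroadcast X), 4)
//        --> (v8f32 vbroadcast X)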
56639 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56640 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56641
56642 // If this is a broadcast load inserted into an upper undef, use a larger
56643 // broadcast load.
56644 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56645 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56646 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56647 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56648 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56649 SDValue BcastLd =
56650 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56651 MemIntr->getMemoryVT(),
56652 MemIntr->getMemOperand());
56653 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56654 return BcastLd;
56655 }
56656
56657 // If we're splatting the lower half subvector of a full vector load into the
56658 // upper half, attempt to create a subvector broadcast.
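// e.g. insert_subvector((v8f32 load [P]), (v4f32 load [P]), 4)
//        --> (v8f32 subv_broadcast_load [P])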
56659 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56660 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56661 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56662 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56663 if (VecLd && SubLd &&
56664 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56665 SubVec.getValueSizeInBits() / 8, 0))
56666 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56667 SubLd, 0, DAG);
56668 }
56669
56670 return SDValue();
56671}
56672
56673/// If we are extracting a subvector of a vector select and the select condition
56674/// is composed of concatenated vectors, try to narrow the select width. This
56675/// is a common pattern for AVX1 integer code because 256-bit selects may be
56676/// legal, but there is almost no integer math/logic available for 256-bit.
56677/// This function should only be called with legal types (otherwise, the calls
56678/// to get simple value types will assert).
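/// For example, for a 128-bit extraction at index 0:
///   extract_subvector(vselect(concat(C0, C1), T, F), 0)
///     --> vselect(C0, extract_subvector(T, 0), extract_subvector(F, 0))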
56679static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
56680 SDValue Sel = Ext->getOperand(0);
56681 SmallVector<SDValue, 4> CatOps;
56682 if (Sel.getOpcode() != ISD::VSELECT ||
56683 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
56684 return SDValue();
56685
56686 // Note: We assume simple value types because this should only be called with
56687 // legal operations/types.
56688 // TODO: This can be extended to handle extraction to 256-bits.
56689 MVT VT = Ext->getSimpleValueType(0);
56690 if (!VT.is128BitVector())
56691 return SDValue();
56692
56693 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56694 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56695 return SDValue();
56696
56697 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56698 MVT SelVT = Sel.getSimpleValueType();
56699 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56700 "Unexpected vector type with legal operations");
56701
56702 unsigned SelElts = SelVT.getVectorNumElements();
56703 unsigned CastedElts = WideVT.getVectorNumElements();
56704 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56705 if (SelElts % CastedElts == 0) {
56706 // The select has the same or more (narrower) elements than the extract
56707 // operand. The extraction index gets scaled by that factor.
56708 ExtIdx *= (SelElts / CastedElts);
56709 } else if (CastedElts % SelElts == 0) {
56710 // The select has fewer (wider) elements than the extract operand. Make sure
56711 // that the extraction index can be divided evenly.
56712 unsigned IndexDivisor = CastedElts / SelElts;
56713 if (ExtIdx % IndexDivisor != 0)
56714 return SDValue();
56715 ExtIdx /= IndexDivisor;
56716 } else {
56717 llvm_unreachable("Element count of simple vector types are not divisible?");
56718 }
56719
56720 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56721 unsigned NarrowElts = SelElts / NarrowingFactor;
56722 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56723 SDLoc DL(Ext);
56724 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56725 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56726 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56727 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56728 return DAG.getBitcast(VT, NarrowSel);
56729}
56730
56731static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56732 TargetLowering::DAGCombinerInfo &DCI,
56733 const X86Subtarget &Subtarget) {
56734 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56735 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56736 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56737 // We let generic combining take over from there to simplify the
56738 // insert/extract and 'not'.
56739 // This pattern emerges during AVX1 legalization. We handle it before lowering
56740 // to avoid complications like splitting constant vector loads.
56741
56742 // Capture the original wide type in the likely case that we need to bitcast
56743 // back to this type.
56744 if (!N->getValueType(0).isSimple())
56745 return SDValue();
56746
56747 MVT VT = N->getSimpleValueType(0);
56748 SDValue InVec = N->getOperand(0);
56749 unsigned IdxVal = N->getConstantOperandVal(1);
56750 SDValue InVecBC = peekThroughBitcasts(InVec);
56751 EVT InVecVT = InVec.getValueType();
56752 unsigned SizeInBits = VT.getSizeInBits();
56753 unsigned InSizeInBits = InVecVT.getSizeInBits();
56754 unsigned NumSubElts = VT.getVectorNumElements();
56755 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56756
56757 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56758 TLI.isTypeLegal(InVecVT) &&
56759 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56760 auto isConcatenatedNot = [](SDValue V) {
56761 V = peekThroughBitcasts(V);
56762 if (!isBitwiseNot(V))
56763 return false;
56764 SDValue NotOp = V->getOperand(0);
56765 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56766 };
56767 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56768 isConcatenatedNot(InVecBC.getOperand(1))) {
56769 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56770 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
56771 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
56772 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56773 }
56774 }
56775
56776 if (DCI.isBeforeLegalizeOps())
56777 return SDValue();
56778
56779 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
56780 return V;
56781
56782 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56783 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56784
56785 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56786 if (VT.getScalarType() == MVT::i1)
56787 return DAG.getConstant(1, SDLoc(N), VT);
56788 return getOnesVector(VT, DAG, SDLoc(N));
56789 }
56790
56791 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56792 return DAG.getBuildVector(VT, SDLoc(N),
56793 InVec->ops().slice(IdxVal, NumSubElts));
56794
56795 // If we are extracting from an insert into a larger vector, replace with a
56796 // smaller insert if we don't access less than the original subvector. Don't
56797 // do this for i1 vectors.
56798 // TODO: Relax the matching indices requirement?
56799 if (VT.getVectorElementType() != MVT::i1 &&
56800 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56801 IdxVal == InVec.getConstantOperandVal(2) &&
56802 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56803 SDLoc DL(N);
56804 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56805 InVec.getOperand(0), N->getOperand(1));
56806 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56807 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56808 InVec.getOperand(1),
56809 DAG.getVectorIdxConstant(NewIdxVal, DL));
56810 }
56811
56812 // If we're extracting an upper subvector from a broadcast, we should just
56813 // extract the lowest subvector instead, which should allow
56814 // SimplifyDemandedVectorElts to do more simplifications.
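// e.g. extract_subvector(vbroadcast(X), 4) --> extract_subvector(vbroadcast(X), 0).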
56815 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56816 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56817 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56818 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56819
56820 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56821 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56822 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56823 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56824
56825 // Attempt to extract from the source of a shuffle vector.
56826 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56827 SmallVector<int, 32> ShuffleMask;
56828 SmallVector<int, 32> ScaledMask;
56829 SmallVector<SDValue, 2> ShuffleInputs;
56830 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56831 // Decode the shuffle mask and scale it so it's shuffling subvectors.
56832 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56833 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56834 unsigned SubVecIdx = IdxVal / NumSubElts;
56835 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56836 return DAG.getUNDEF(VT);
56837 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56838 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56839 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56840 if (Src.getValueSizeInBits() == InSizeInBits) {
56841 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56842 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56843 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56844 SDLoc(N), SizeInBits);
56845 }
56846 }
56847 }
56848
56849 // If we're extracting the lowest subvector and we're the only user,
56850 // we may be able to perform this with a smaller vector width.
56851 unsigned InOpcode = InVec.getOpcode();
56852 if (InVec.hasOneUse()) {
56853 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56854 // v2f64 CVTDQ2PD(v4i32).
56855 if (InOpcode == ISD::SINT_TO_FP &&
56856 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56857 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
56858 }
56859 // v2f64 CVTUDQ2PD(v4i32).
56860 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56861 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56862 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
56863 }
56864 // v2f64 CVTPS2PD(v4f32).
56865 if (InOpcode == ISD::FP_EXTEND &&
56866 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56867 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
56868 }
56869 }
56870 if (IdxVal == 0 &&
56871 (InOpcode == ISD::ANY_EXTEND ||
56872 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
56873 InOpcode == ISD::ZERO_EXTEND ||
56874 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
56875 InOpcode == ISD::SIGN_EXTEND ||
56876 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56877 (SizeInBits == 128 || SizeInBits == 256) &&
56878 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56879 SDLoc DL(N);
56880 SDValue Ext = InVec.getOperand(0);
56881 if (Ext.getValueSizeInBits() > SizeInBits)
56882 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56883 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56884 return DAG.getNode(ExtOp, DL, VT, Ext);
56885 }
56886 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56887 InVec.getOperand(0).getValueType().is256BitVector() &&
56888 InVec.getOperand(1).getValueType().is256BitVector() &&
56889 InVec.getOperand(2).getValueType().is256BitVector()) {
56890 SDLoc DL(N);
56891 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56892 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56893 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56894 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56895 }
56896 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56897 (VT.is128BitVector() || VT.is256BitVector())) {
56898 SDLoc DL(N);
56899 SDValue InVecSrc = InVec.getOperand(0);
56900 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56901 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56902 return DAG.getNode(InOpcode, DL, VT, Ext);
56903 }
56904 if (InOpcode == X86ISD::MOVDDUP &&
56905 (VT.is128BitVector() || VT.is256BitVector())) {
56906 SDLoc DL(N);
56907 SDValue Ext0 =
56908 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56909 return DAG.getNode(InOpcode, DL, VT, Ext0);
56910 }
56911 }
56912
56913 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
56914 // as this is very likely to fold into a shuffle/truncation.
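// e.g. extract_subvector(vsrli(v4i64 X, 32), Idx)
//        --> vsrli(extract_subvector(X, Idx), 32)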
56915 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56916 InVecVT.getScalarSizeInBits() == 64 &&
56917 InVec.getConstantOperandAPInt(1) == 32) {
56918 SDLoc DL(N);
56919 SDValue Ext =
56920 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56921 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56922 }
56923
56924 return SDValue();
56925}
56926
56927static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
56928 EVT VT = N->getValueType(0);
56929 SDValue Src = N->getOperand(0);
56930 SDLoc DL(N);
56931
56932 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56933 // This occurs frequently in our masked scalar intrinsic code and our
56934 // floating point select lowering with AVX512.
56935 // TODO: SimplifyDemandedBits instead?
56936 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56937 isOneConstant(Src.getOperand(1)))
56938 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56939
56940 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56941 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56942 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56943 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
56944 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
56945 if (C->isZero())
56946 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56947 Src.getOperand(1));
56948
56949 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
56950 // TODO: Move to DAGCombine/SimplifyDemandedBits?
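// e.g. (v2i64 scalar_to_vector(i64 zext(i32 X)))
//        --> bitcast(vzext_movl(v4i32 scalar_to_vector(X)))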
56951 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
56952 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56953 if (Op.getValueType() != MVT::i64)
56954 return SDValue();
56955 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
56956 if (Op.getOpcode() == Opc &&
56957 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
56958 return Op.getOperand(0);
56959 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
56960 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
56961 if (Ld->getExtensionType() == Ext &&
56962 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
56963 return Op;
56964 if (IsZeroExt) {
56965 KnownBits Known = DAG.computeKnownBits(Op);
56966 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
56967 return Op;
56968 }
56969 return SDValue();
56970 };
56971
56972 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
56973 return DAG.getBitcast(
56974 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56975 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
56976
56977 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
56978 return DAG.getBitcast(
56979 VT,
56980 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
56981 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56982 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
56983 }
56984
56985 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
56986 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
56987 Src.getOperand(0).getValueType() == MVT::x86mmx)
56988 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
56989
56990 // See if we're broadcasting the scalar value, in which case just reuse that.
56991 // Ensure the same SDValue from the SDNode use is being used.
56992 if (VT.getScalarType() == Src.getValueType())
56993 for (SDNode *User : Src->uses())
56994 if (User->getOpcode() == X86ISD::VBROADCAST &&
56995 Src == User->getOperand(0)) {
56996 unsigned SizeInBits = VT.getFixedSizeInBits();
56997 unsigned BroadcastSizeInBits =
56998 User->getValueSizeInBits(0).getFixedValue();
56999 if (BroadcastSizeInBits == SizeInBits)
57000 return SDValue(User, 0);
57001 if (BroadcastSizeInBits > SizeInBits)
57002 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57003 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57004 // coverage.
57005 }
57006
57007 return SDValue();
57008}
57009
57010// Simplify PMULDQ and PMULUDQ operations.
57011static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57012 TargetLowering::DAGCombinerInfo &DCI,
57013 const X86Subtarget &Subtarget) {
57014 SDValue LHS = N->getOperand(0);
57015 SDValue RHS = N->getOperand(1);
57016
57017 // Canonicalize constant to RHS.
57018 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57019 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57020 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57021
57022 // Multiply by zero.
57023 // Don't return RHS as it may contain UNDEFs.
57024 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57025 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57026
57027 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
57028 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57029 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57030 return SDValue(N, 0);
57031
57032 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57033 // convert it to any_extend_invec, due to the LegalOperations check, do the
57034 // conversion directly to a vector shuffle manually. This exposes combine
57035 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57036 // combineX86ShufflesRecursively on SSE4.1 targets.
57037 // FIXME: This is basically a hack around several other issues related to
57038 // ANY_EXTEND_VECTOR_INREG.
57039 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57040 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57041 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57042 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57043 SDLoc dl(N);
57044 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57045 LHS.getOperand(0), { 0, -1, 1, -1 });
57046 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57047 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57048 }
57049 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57050 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57051 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57052 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57053 SDLoc dl(N);
57054 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57055 RHS.getOperand(0), { 0, -1, 1, -1 });
57056 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57057 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57058 }
57059
57060 return SDValue();
57061}
57062
57063// Simplify VPMADDUBSW/VPMADDWD operations.
57064static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57065 TargetLowering::DAGCombinerInfo &DCI) {
57066 EVT VT = N->getValueType(0);
57067 SDValue LHS = N->getOperand(0);
57068 SDValue RHS = N->getOperand(1);
57069
57070 // Multiply by zero.
57071 // Don't return LHS/RHS as it may contain UNDEFs.
57072 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57073 ISD::isBuildVectorAllZeros(RHS.getNode()))
57074 return DAG.getConstant(0, SDLoc(N), VT);
57075
57076 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57077 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57078 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57079 return SDValue(N, 0);
57080
57081 return SDValue();
57082}
57083
57084static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57085 TargetLowering::DAGCombinerInfo &DCI,
57086 const X86Subtarget &Subtarget) {
57087 EVT VT = N->getValueType(0);
57088 SDValue In = N->getOperand(0);
57089 unsigned Opcode = N->getOpcode();
57090 unsigned InOpcode = In.getOpcode();
57091 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57092 SDLoc DL(N);
57093
57094 // Try to merge vector loads and extend_inreg to an extload.
57095 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57096 In.hasOneUse()) {
57097 auto *Ld = cast<LoadSDNode>(In);
57098 if (Ld->isSimple()) {
57099 MVT SVT = In.getSimpleValueType().getVectorElementType();
57100 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57101 ? ISD::SEXTLOAD
57102 : ISD::ZEXTLOAD;
57103 EVT MemVT = VT.changeVectorElementType(SVT);
57104 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57105 SDValue Load = DAG.getExtLoad(
57106 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57107 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57108 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57109 return Load;
57110 }
57111 }
57112 }
57113
57114 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57115 if (Opcode == InOpcode)
57116 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57117
57118 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57119 // -> EXTEND_VECTOR_INREG(X).
57120 // TODO: Handle non-zero subvector indices.
57121 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57122 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57123 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57124 In.getValueSizeInBits())
57125 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57126
57127 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57128 // TODO: Move to DAGCombine?
57129 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57130 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57131 In.getValueSizeInBits() == VT.getSizeInBits()) {
57132 unsigned NumElts = VT.getVectorNumElements();
57133 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57134 EVT EltVT = In.getOperand(0).getValueType();
57135 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57136 for (unsigned I = 0; I != NumElts; ++I)
57137 Elts[I * Scale] = In.getOperand(I);
57138 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57139 }
57140
57141 // Attempt to combine as a shuffle on SSE41+ targets.
57142 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
57143 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
57144 Subtarget.hasSSE41()) {
57145 SDValue Op(N, 0);
57146 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57147 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57148 return Res;
57149 }
57150
57151 return SDValue();
57152}
57153
57154static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57155 TargetLowering::DAGCombinerInfo &DCI) {
57156 EVT VT = N->getValueType(0);
57157
57158 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57159 return DAG.getConstant(0, SDLoc(N), VT);
57160
57161 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57162 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57163 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57164 return SDValue(N, 0);
57165
57166 return SDValue();
57167}
57168
57169// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57170 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
57171 // extra instructions between the conversions due to going to scalar and back.
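// i.e. (f32 fp16_to_fp(fp_to_fp16(X))) becomes roughly:
//   extractelt(cvtph2ps(cvtps2ph(scalar_to_vector(X), 4)), 0)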
57172static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57173 const X86Subtarget &Subtarget) {
57174 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57175 return SDValue();
57176
57177 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57178 return SDValue();
57179
57180 if (N->getValueType(0) != MVT::f32 ||
57181 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57182 return SDValue();
57183
57184 SDLoc dl(N);
57185 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57186 N->getOperand(0).getOperand(0));
57187 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57188 DAG.getTargetConstant(4, dl, MVT::i32));
57189 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57190 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57191 DAG.getIntPtrConstant(0, dl));
57192}
57193
57194static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57195 const X86Subtarget &Subtarget) {
57196 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57197 return SDValue();
57198
57199 if (Subtarget.hasFP16())
57200 return SDValue();
57201
57202 bool IsStrict = N->isStrictFPOpcode();
57203 EVT VT = N->getValueType(0);
57204 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57205 EVT SrcVT = Src.getValueType();
57206
57207 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57208 return SDValue();
57209
57210 if (VT.getVectorElementType() != MVT::f32 &&
57211 VT.getVectorElementType() != MVT::f64)
57212 return SDValue();
57213
57214 unsigned NumElts = VT.getVectorNumElements();
57215 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57216 return SDValue();
57217
57218 SDLoc dl(N);
57219
57220 // Convert the input to vXi16.
57221 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57222 Src = DAG.getBitcast(IntVT, Src);
57223
57224 // Widen to at least 8 input elements.
57225 if (NumElts < 8) {
57226 unsigned NumConcats = 8 / NumElts;
57227 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57228 : DAG.getConstant(0, dl, IntVT);
57229 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57230 Ops[0] = Src;
57231 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57232 }
57233
57234 // Destination is vXf32 with at least 4 elements.
57235 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57236 std::max(4U, NumElts));
57237 SDValue Cvt, Chain;
57238 if (IsStrict) {
57239 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57240 {N->getOperand(0), Src});
57241 Chain = Cvt.getValue(1);
57242 } else {
57243 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57244 }
57245
57246 if (NumElts < 4) {
57247 assert(NumElts == 2 && "Unexpected size");
57248 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57249 DAG.getIntPtrConstant(0, dl));
57250 }
57251
57252 if (IsStrict) {
57253 // Extend to the original VT if necessary.
57254 if (Cvt.getValueType() != VT) {
57255 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57256 {Chain, Cvt});
57257 Chain = Cvt.getValue(1);
57258 }
57259 return DAG.getMergeValues({Cvt, Chain}, dl);
57260 }
57261
57262 // Extend to the original VT if necessary.
57263 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57264}
57265
57266// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57267// from. Limit this to cases where the loads have the same input chain and the
57268// output chains are unused. This avoids any memory ordering issues.
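// For example, if a v4f32 and a v8f32 broadcast load share the same pointer,
// chain and memory VT, the narrower node can reuse the wider one and simply
// extract its low half.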
57269static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57270 TargetLowering::DAGCombinerInfo &DCI) {
57271 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57272 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57273 "Unknown broadcast load type");
57274
57275 // Only do this if the chain result is unused.
57276 if (N->hasAnyUseOfValue(1))
57277 return SDValue();
57278
57279 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57280
57281 SDValue Ptr = MemIntrin->getBasePtr();
57282 SDValue Chain = MemIntrin->getChain();
57283 EVT VT = N->getSimpleValueType(0);
57284 EVT MemVT = MemIntrin->getMemoryVT();
57285
57286 // Look at other users of our base pointer and try to find a wider broadcast.
57287 // The input chain and the size of the memory VT must match.
57288 for (SDNode *User : Ptr->uses())
57289 if (User != N && User->getOpcode() == N->getOpcode() &&
57290 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57291 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57292 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57293 MemVT.getSizeInBits() &&
57294 !User->hasAnyUseOfValue(1) &&
57295 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57296 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57297 VT.getSizeInBits());
57298 Extract = DAG.getBitcast(VT, Extract);
57299 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57300 }
57301
57302 return SDValue();
57303}
57304
57305static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57306 const X86Subtarget &Subtarget) {
57307 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57308 return SDValue();
57309
57310 bool IsStrict = N->isStrictFPOpcode();
57311 EVT VT = N->getValueType(0);
57312 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57313 EVT SrcVT = Src.getValueType();
57314
57315 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57316 SrcVT.getVectorElementType() != MVT::f32)
57317 return SDValue();
57318
57319 SDLoc dl(N);
57320
57321 SDValue Cvt, Chain;
57322 unsigned NumElts = VT.getVectorNumElements();
57323 if (Subtarget.hasFP16()) {
57324 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
57325 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
57326 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
57327 SDValue Cvt0, Cvt1;
57328 SDValue Op0 = Src.getOperand(0);
57329 SDValue Op1 = Src.getOperand(1);
57330 bool IsOp0Strict = Op0->isStrictFPOpcode();
57331 if (Op0.getOpcode() != Op1.getOpcode() ||
57332 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57333 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57334 return SDValue();
57335 }
57336 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57337 if (IsStrict) {
57338 assert(IsOp0Strict && "Op0 must be strict node");
57339 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57340 ? X86ISD::STRICT_CVTSI2P
57341 : X86ISD::STRICT_CVTUI2P;
57342 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57343 {Op0.getOperand(0), Op0.getOperand(1)});
57344 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57345 {Op1.getOperand(0), Op1.getOperand(1)});
57346 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57347 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57348 }
57349 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57350 : X86ISD::CVTUI2P;
57351 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57352 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57353 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57354 }
57355 return SDValue();
57356 }
57357
57358 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57359 return SDValue();
57360
57361 // Widen to at least 4 input elements.
57362 if (NumElts < 4)
57363 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57364 DAG.getConstantFP(0.0, dl, SrcVT));
57365
57366 // Destination is v8i16 with at least 8 elements.
57367 EVT CvtVT =
57368 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57369 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57370 if (IsStrict) {
57371 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57372 {N->getOperand(0), Src, Rnd});
57373 Chain = Cvt.getValue(1);
57374 } else {
57375 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57376 }
57377
57378 // Extract down to real number of elements.
57379 if (NumElts < 8) {
57380 EVT IntVT = VT.changeVectorElementTypeToInteger();
57381 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57382 DAG.getIntPtrConstant(0, dl));
57383 }
57384
57385 Cvt = DAG.getBitcast(VT, Cvt);
57386
57387 if (IsStrict)
57388 return DAG.getMergeValues({Cvt, Chain}, dl);
57389
57390 return Cvt;
57391}
57392
57393static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57394 SDValue Src = N->getOperand(0);
57395
57396 // Turn MOVDQ2Q+simple_load into an mmx load.
57397 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57398 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57399
57400 if (LN->isSimple()) {
57401 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57402 LN->getBasePtr(),
57403 LN->getPointerInfo(),
57404 LN->getOriginalAlign(),
57405 LN->getMemOperand()->getFlags());
57406 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57407 return NewLd;
57408 }
57409 }
57410
57411 return SDValue();
57412}
57413
57414static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57415 TargetLowering::DAGCombinerInfo &DCI) {
57416 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57417 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57418 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57419 return SDValue(N, 0);
57420
57421 return SDValue();
57422}
57423
57424SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57425 DAGCombinerInfo &DCI) const {
57426 SelectionDAG &DAG = DCI.DAG;
57427 switch (N->getOpcode()) {
57428 default: break;
57429 case ISD::SCALAR_TO_VECTOR:
57430 return combineScalarToVector(N, DAG);
57431 case ISD::EXTRACT_VECTOR_ELT:
57432 case X86ISD::PEXTRW:
57433 case X86ISD::PEXTRB:
57434 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57435 case ISD::CONCAT_VECTORS:
57436 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57437 case ISD::INSERT_SUBVECTOR:
57438 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57439 case ISD::EXTRACT_SUBVECTOR:
57440 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57441 case ISD::VSELECT:
57442 case ISD::SELECT:
57443 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57444 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57445 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57446 case X86ISD::CMP: return combineCMP(N, DAG);
57447 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57448 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57449 case X86ISD::ADD:
57450 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
57451 case X86ISD::SBB: return combineSBB(N, DAG);
57452 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57453 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57454 case ISD::SHL: return combineShiftLeft(N, DAG);
57455 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57456 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57457 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57458 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57459 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57460 case X86ISD::BEXTR:
57461 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57462 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57463 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57464 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57465 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57466 case X86ISD::VEXTRACT_STORE:
57467 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57468 case ISD::SINT_TO_FP:
57469 case ISD::STRICT_SINT_TO_FP:
57470 return combineSIntToFP(N, DAG, DCI, Subtarget);
57471 case ISD::UINT_TO_FP:
57472 case ISD::STRICT_UINT_TO_FP:
57473 return combineUIntToFP(N, DAG, Subtarget);
57474 case ISD::FADD:
57475 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57476 case X86ISD::VFCMULC:
57477 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57478 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57479 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57480 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57481 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57482 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57483 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57484 case X86ISD::FXOR:
57485 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57486 case X86ISD::FMIN:
57487 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57488 case ISD::FMINNUM:
57489 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57490 case X86ISD::CVTSI2P:
57491 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57492 case X86ISD::CVTP2SI:
57493 case X86ISD::CVTP2UI:
57494 case X86ISD::STRICT_CVTTP2SI:
57495 case X86ISD::CVTTP2SI:
57496 case X86ISD::STRICT_CVTTP2UI:
57497 case X86ISD::CVTTP2UI:
57498 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57499 case X86ISD::STRICT_CVTPH2PS:
57500 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57501 case X86ISD::BT: return combineBT(N, DAG, DCI);
57502 case ISD::ANY_EXTEND:
57503 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57504 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57505 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57506 case ISD::ANY_EXTEND_VECTOR_INREG:
57507 case ISD::SIGN_EXTEND_VECTOR_INREG:
57508 case ISD::ZERO_EXTEND_VECTOR_INREG:
57509 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57510 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57511 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57512 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57513 case X86ISD::PACKSS:
57514 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57515 case X86ISD::HADD:
57516 case X86ISD::HSUB:
57517 case X86ISD::FHADD:
57518 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57519 case X86ISD::VSHL:
57520 case X86ISD::VSRA:
57521 case X86ISD::VSRL:
57522 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57523 case X86ISD::VSHLI:
57524 case X86ISD::VSRAI:
57525 case X86ISD::VSRLI:
57526 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57527 case ISD::INSERT_VECTOR_ELT:
57528 case X86ISD::PINSRB:
57529 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57530 case X86ISD::SHUFP: // Handle all target specific shuffles
57531 case X86ISD::INSERTPS:
57532 case X86ISD::EXTRQI:
57533 case X86ISD::INSERTQI:
57534 case X86ISD::VALIGN:
57535 case X86ISD::PALIGNR:
57536 case X86ISD::VSHLDQ:
57537 case X86ISD::VSRLDQ:
57538 case X86ISD::BLENDI:
57539 case X86ISD::UNPCKH:
57540 case X86ISD::UNPCKL:
57541 case X86ISD::MOVHLPS:
57542 case X86ISD::MOVLHPS:
57543 case X86ISD::PSHUFB:
57544 case X86ISD::PSHUFD:
57545 case X86ISD::PSHUFHW:
57546 case X86ISD::PSHUFLW:
57547 case X86ISD::MOVSHDUP:
57548 case X86ISD::MOVSLDUP:
57549 case X86ISD::MOVDDUP:
57550 case X86ISD::MOVSS:
57551 case X86ISD::MOVSD:
57552 case X86ISD::MOVSH:
57553 case X86ISD::VBROADCAST:
57554 case X86ISD::VPPERM:
57555 case X86ISD::VPERMI:
57556 case X86ISD::VPERMV:
57557 case X86ISD::VPERMV3:
57558 case X86ISD::VPERMIL2:
57559 case X86ISD::VPERMILPI:
57560 case X86ISD::VPERMILPV:
57561 case X86ISD::VPERM2X128:
57562 case X86ISD::SHUF128:
57563 case X86ISD::VZEXT_MOVL:
57564 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
57565 case X86ISD::FMADD_RND:
57566 case X86ISD::FMSUB:
57567 case X86ISD::STRICT_FMSUB:
57568 case X86ISD::FMSUB_RND:
57569 case X86ISD::FNMADD:
57570 case X86ISD::STRICT_FNMADD:
57571 case X86ISD::FNMADD_RND:
57572 case X86ISD::FNMSUB:
57573 case X86ISD::STRICT_FNMSUB:
57574 case X86ISD::FNMSUB_RND:
57575 case ISD::FMA:
57576 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57577 case X86ISD::FMADDSUB_RND:
57578 case X86ISD::FMSUBADD_RND:
57579 case X86ISD::FMADDSUB:
57580 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57581 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57582 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57583 case X86ISD::MGATHER:
57584 case X86ISD::MSCATTER:
57585 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
57586 case ISD::MGATHER:
57587 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57588 case X86ISD::PCMPEQ:
57589 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57590 case X86ISD::PMULDQ:
57591 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57592 case X86ISD::VPMADDUBSW:
57593 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57594 case X86ISD::KSHIFTL:
57595 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57596 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57597 case ISD::STRICT_FP_EXTEND:
57598 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57599 case ISD::STRICT_FP_ROUND:
57600 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57601 case X86ISD::VBROADCAST_LOAD:
57602 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57603 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57604 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57605 }
57606
57607 return SDValue();
57608}
57609
57610bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57611 return false;
57612}
57613
57614bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57615 if (!isTypeLegal(VT))
57616 return false;
57617
57618 // There are no vXi8 shifts.
57619 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57620 return false;
57621
57622 // TODO: Almost no 8-bit ops are desirable because they have no actual
57623 // size/speed advantages vs. 32-bit ops, but they do have a major
57624 // potential disadvantage by causing partial register stalls.
57625 //
57626 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57627 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57628 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57629 // check for a constant operand to the multiply.
57630 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57631 return false;
57632
57633 // i16 instruction encodings are longer and some i16 instructions are slow,
57634 // so those are not desirable.
57635 if (VT == MVT::i16) {
57636 switch (Opc) {
57637 default:
57638 break;
57639 case ISD::LOAD:
57640 case ISD::SIGN_EXTEND:
57641 case ISD::ZERO_EXTEND:
57642 case ISD::ANY_EXTEND:
57643 case ISD::SHL:
57644 case ISD::SRA:
57645 case ISD::SRL:
57646 case ISD::SUB:
57647 case ISD::ADD:
57648 case ISD::MUL:
57649 case ISD::AND:
57650 case ISD::OR:
57651 case ISD::XOR:
57652 return false;
57653 }
57654 }
57655
57656 // Any legal type not explicitly accounted for above here is desirable.
57657 return true;
57658}
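
As a condensed illustration of the filtering above (a sketch only, not code from this file): isDesirableSketch and its string-based opcode/type names are hypothetical stand-ins, and the sketch deliberately omits the legality check, the extend opcodes and the vXi8 shift rule.

#include <cassert>
#include <set>
#include <string>

// Sketch: which (opcode, type) pairs the hook above rejects (subset only).
static bool isDesirableSketch(const std::string &Opc, const std::string &VT) {
  // 8-bit MUL/SHL are rejected outright.
  if ((Opc == "MUL" || Opc == "SHL") && VT == "i8")
    return false;
  // Most common i16 ops are rejected so they get promoted to i32 instead.
  static const std::set<std::string> BadI16 = {"LOAD", "SHL", "SRA", "SRL",
                                               "ADD",  "SUB", "MUL", "AND",
                                               "OR",   "XOR"};
  if (VT == "i16" && BadI16.count(Opc))
    return false;
  return true;
}

int main() {
  assert(!isDesirableSketch("ADD", "i16")); // undesirable: promote to i32
  assert(isDesirableSketch("ADD", "i32"));  // 32-bit ops are fine
}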
57659
57660SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
57661 SDValue Value, SDValue Addr,
57662 SelectionDAG &DAG) const {
57663 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57664 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57665 if (IsCFProtectionSupported) {
57666 // In case control-flow branch protection is enabled, we need to add a
57667 // notrack prefix to the indirect branch.
57668 // In order to do that, we create an NT_BRIND SDNode.
57669 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
57670 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
57671 }
57672
57673 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
57674}
57675
57676TargetLowering::AndOrSETCCFoldKind
57677X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57678 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57679 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57680 EVT VT = LogicOp->getValueType(0);
57681 EVT OpVT = SETCC0->getOperand(0).getValueType();
57682 if (!VT.isInteger())
57683 return AndOrSETCCFoldKind::None;
57684
57685 if (VT.isVector())
57686 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57687 (isOperationLegal(ISD::ABS, OpVT)
57688 ? AndOrSETCCFoldKind::ABS
57689 : AndOrSETCCFoldKind::None));
57690
57691 // Don't use `NotAnd` as even though `not` is generally shorter code size than
57692 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
57693 // `NotAnd` applies, `AddAnd` does as well.
57694 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
57695 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57696 return AndOrSETCCFoldKind::AddAnd;
57697}
57698
57699bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57700 EVT VT = Op.getValueType();
57701 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57702 isa<ConstantSDNode>(Op.getOperand(1));
57703
57704 // i16 is legal, but undesirable since i16 instruction encodings are longer
57705 // and some i16 instructions are slow.
57706 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57707 // using LEA and/or other ALU ops.
57708 if (VT != MVT::i16 && !Is8BitMulByConstant)
57709 return false;
57710
57711 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57712 if (!Op.hasOneUse())
57713 return false;
57714 SDNode *User = *Op->use_begin();
57715 if (!ISD::isNormalStore(User))
57716 return false;
57717 auto *Ld = cast<LoadSDNode>(Load);
57718 auto *St = cast<StoreSDNode>(User);
57719 return Ld->getBasePtr() == St->getBasePtr();
57720 };
57721
57722 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57723 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57724 return false;
57725 if (!Op.hasOneUse())
57726 return false;
57727 SDNode *User = *Op->use_begin();
57728 if (User->getOpcode() != ISD::ATOMIC_STORE)
57729 return false;
57730 auto *Ld = cast<AtomicSDNode>(Load);
57731 auto *St = cast<AtomicSDNode>(User);
57732 return Ld->getBasePtr() == St->getBasePtr();
57733 };
57734
57735 bool Commute = false;
57736 switch (Op.getOpcode()) {
57737 default: return false;
57738 case ISD::SIGN_EXTEND:
57739 case ISD::ZERO_EXTEND:
57740 case ISD::ANY_EXTEND:
57741 break;
57742 case ISD::SHL:
57743 case ISD::SRA:
57744 case ISD::SRL: {
57745 SDValue N0 = Op.getOperand(0);
57746 // Look out for (store (shl (load), x)).
57747 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57748 return false;
57749 break;
57750 }
57751 case ISD::ADD:
57752 case ISD::MUL:
57753 case ISD::AND:
57754 case ISD::OR:
57755 case ISD::XOR:
57756 Commute = true;
57757 [[fallthrough]];
57758 case ISD::SUB: {
57759 SDValue N0 = Op.getOperand(0);
57760 SDValue N1 = Op.getOperand(1);
57761 // Avoid disabling potential load folding opportunities.
57762 if (X86::mayFoldLoad(N1, Subtarget) &&
57763 (!Commute || !isa<ConstantSDNode>(N0) ||
57764 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57765 return false;
57766 if (X86::mayFoldLoad(N0, Subtarget) &&
57767 ((Commute && !isa<ConstantSDNode>(N1)) ||
57768 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57769 return false;
57770 if (IsFoldableAtomicRMW(N0, Op) ||
57771 (Commute && IsFoldableAtomicRMW(N1, Op)))
57772 return false;
57773 }
57774 }
57775
57776 PVT = MVT::i32;
57777 return true;
57778}
57779
57780//===----------------------------------------------------------------------===//
57781// X86 Inline Assembly Support
57782//===----------------------------------------------------------------------===//
57783
57784// Helper to match a string separated by whitespace.
57785static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57786 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57787
57788 for (StringRef Piece : Pieces) {
57789 if (!S.startswith(Piece)) // Check if the piece matches.
57790 return false;
57791
57792 S = S.substr(Piece.size());
57793 StringRef::size_type Pos = S.find_first_not_of(" \t");
57794 if (Pos == 0) // We matched a prefix.
57795 return false;
57796
57797 S = S.substr(Pos);
57798 }
57799
57800 return S.empty();
57801}
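
A rough standalone sketch of the matching loop above, using plain std::string rather than StringRef; matchAsmSketch is a hypothetical name and the behaviour shown is a simplified approximation.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Sketch: succeed only if S consists of exactly the given pieces separated by
// whitespace; matching only a prefix of a piece (no whitespace after) fails.
static bool matchAsmSketch(std::string S,
                           const std::vector<std::string> &Pieces) {
  S = S.substr(std::min(S.find_first_not_of(" \t"), S.size())); // trim lead
  for (const std::string &Piece : Pieces) {
    if (S.compare(0, Piece.size(), Piece) != 0)
      return false;
    S = S.substr(Piece.size());
    std::string::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // no whitespace follows, so only a prefix matched
      return false;
    S = S.substr(std::min(Pos, S.size()));
  }
  return S.empty();
}

int main() {
  assert(matchAsmSketch("bswap $0", {"bswap", "$0"}));
  assert(!matchAsmSketch("bswapx $0", {"bswap", "$0"}));
}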
57802
57803static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57804
57805 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57806 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57807 llvm::is_contained(AsmPieces, "~{flags}") &&
57808 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57809
57810 if (AsmPieces.size() == 3)
57811 return true;
57812 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57813 return true;
57814 }
57815 }
57816 return false;
57817}
57818
57819bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57820 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57821
57822 const std::string &AsmStr = IA->getAsmString();
57823
57824 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57825 if (!Ty || Ty->getBitWidth() % 16 != 0)
57826 return false;
57827
57828 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57829 SmallVector<StringRef, 4> AsmPieces;
57830 SplitString(AsmStr, AsmPieces, ";\n");
57831
57832 switch (AsmPieces.size()) {
57833 default: return false;
57834 case 1:
57835 // FIXME: this should verify that we are targeting a 486 or better. If not,
57836 // we will turn this bswap into something that will be lowered to logical
57837 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57838 // lower so don't worry about this.
57839 // bswap $0
57840 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57841 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57842 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57843 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57844 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57845 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57846 // No need to check constraints, nothing other than the equivalent of
57847 // "=r,0" would be valid here.
57848 return IntrinsicLowering::LowerToByteSwap(CI);
57849 }
57850
57851 // rorw $$8, ${0:w} --> llvm.bswap.i16
57852 if (CI->getType()->isIntegerTy(16) &&
57853 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57854 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57855 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57856 AsmPieces.clear();
57857 StringRef ConstraintsStr = IA->getConstraintString();
57858 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57859 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57860 if (clobbersFlagRegisters(AsmPieces))
57861 return IntrinsicLowering::LowerToByteSwap(CI);
57862 }
57863 break;
57864 case 3:
57865 if (CI->getType()->isIntegerTy(32) &&
57866 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57867 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57868 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57869 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57870 AsmPieces.clear();
57871 StringRef ConstraintsStr = IA->getConstraintString();
57872 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57873 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57874 if (clobbersFlagRegisters(AsmPieces))
57875 return IntrinsicLowering::LowerToByteSwap(CI);
57876 }
57877
57878 if (CI->getType()->isIntegerTy(64)) {
57879 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57880 if (Constraints.size() >= 2 &&
57881 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57882 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57883 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57884 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57885 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57886 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57887 return IntrinsicLowering::LowerToByteSwap(CI);
57888 }
57889 }
57890 break;
57891 }
57892 return false;
57893}
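
To make the constraint-string handling above concrete, here is a minimal standalone sketch of the split-then-check step; splitSketch and clobbersSketch are hypothetical helpers, and the "~{cc},~{dirflag},~{fpsr},~{flags}" clobber list is an assumed example of what remains after dropping the leading "=r,0," prefix.

#include <algorithm>
#include <cassert>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: split a string on a single delimiter.
static std::vector<std::string> splitSketch(const std::string &S, char Delim) {
  std::vector<std::string> Out;
  std::stringstream SS(S);
  std::string Piece;
  while (std::getline(SS, Piece, Delim))
    Out.push_back(Piece);
  return Out;
}

// Sketch of the clobber check: require ~{cc}, ~{flags} and ~{fpsr}, and, when
// a fourth piece is present, ~{dirflag} as well.
static bool clobbersSketch(const std::vector<std::string> &Pieces) {
  auto Has = [&](const char *P) {
    return std::find(Pieces.begin(), Pieces.end(), P) != Pieces.end();
  };
  if (Pieces.size() != 3 && Pieces.size() != 4)
    return false;
  if (!Has("~{cc}") || !Has("~{flags}") || !Has("~{fpsr}"))
    return false;
  return Pieces.size() == 3 || Has("~{dirflag}");
}

int main() {
  // Assumed example clobber list for a rorw/rolw byte-swap pattern.
  auto Pieces = splitSketch("~{cc},~{dirflag},~{fpsr},~{flags}", ',');
  assert(clobbersSketch(Pieces));
}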
57894
57895static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57896 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57897 .Case("{@cca}", X86::COND_A)
57898 .Case("{@ccae}", X86::COND_AE)
57899 .Case("{@ccb}", X86::COND_B)
57900 .Case("{@ccbe}", X86::COND_BE)
57901 .Case("{@ccc}", X86::COND_B)
57902 .Case("{@cce}", X86::COND_E)
57903 .Case("{@ccz}", X86::COND_E)
57904 .Case("{@ccg}", X86::COND_G)
57905 .Case("{@ccge}", X86::COND_GE)
57906 .Case("{@ccl}", X86::COND_L)
57907 .Case("{@ccle}", X86::COND_LE)
57908 .Case("{@ccna}", X86::COND_BE)
57909 .Case("{@ccnae}", X86::COND_B)
57910 .Case("{@ccnb}", X86::COND_AE)
57911 .Case("{@ccnbe}", X86::COND_A)
57912 .Case("{@ccnc}", X86::COND_AE)
57913 .Case("{@ccne}", X86::COND_NE)
57914 .Case("{@ccnz}", X86::COND_NE)
57915 .Case("{@ccng}", X86::COND_LE)
57916 .Case("{@ccnge}", X86::COND_L)
57917 .Case("{@ccnl}", X86::COND_GE)
57918 .Case("{@ccnle}", X86::COND_G)
57919 .Case("{@ccno}", X86::COND_NO)
57920 .Case("{@ccnp}", X86::COND_NP)
57921 .Case("{@ccns}", X86::COND_NS)
57922 .Case("{@cco}", X86::COND_O)
57923 .Case("{@ccp}", X86::COND_P)
57924 .Case("{@ccs}", X86::COND_S)
57925 .Default(X86::COND_INVALID);
57926 return Cond;
57927}
57928
57929/// Given a constraint letter, return the type of constraint for this target.
57930X86TargetLowering::ConstraintType
57931X86TargetLowering::getConstraintType(StringRef Constraint) const {
57932 if (Constraint.size() == 1) {
57933 switch (Constraint[0]) {
57934 case 'R':
57935 case 'q':
57936 case 'Q':
57937 case 'f':
57938 case 't':
57939 case 'u':
57940 case 'y':
57941 case 'x':
57942 case 'v':
57943 case 'l':
57944 case 'k': // AVX512 masking registers.
57945 return C_RegisterClass;
57946 case 'a':
57947 case 'b':
57948 case 'c':
57949 case 'd':
57950 case 'S':
57951 case 'D':
57952 case 'A':
57953 return C_Register;
57954 case 'I':
57955 case 'J':
57956 case 'K':
57957 case 'N':
57958 case 'G':
57959 case 'L':
57960 case 'M':
57961 return C_Immediate;
57962 case 'C':
57963 case 'e':
57964 case 'Z':
57965 return C_Other;
57966 default:
57967 break;
57968 }
57969 }
57970 else if (Constraint.size() == 2) {
57971 switch (Constraint[0]) {
57972 default:
57973 break;
57974 case 'Y':
57975 switch (Constraint[1]) {
57976 default:
57977 break;
57978 case 'z':
57979 return C_Register;
57980 case 'i':
57981 case 'm':
57982 case 'k':
57983 case 't':
57984 case '2':
57985 return C_RegisterClass;
57986 }
57987 }
57988 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57989 return C_Other;
57990 return TargetLowering::getConstraintType(Constraint);
57991}
57992
57993/// Examine constraint type and operand type and determine a weight value.
57994/// This object must already have been set up with the operand type
57995/// and the current alternative constraint selected.
57996TargetLowering::ConstraintWeight
57997 X86TargetLowering::getSingleConstraintMatchWeight(
57998 AsmOperandInfo &info, const char *constraint) const {
57999 ConstraintWeight weight = CW_Invalid;
58000 Value *CallOperandVal = info.CallOperandVal;
58001 // If we don't have a value, we can't do a match,
58002 // but allow it at the lowest weight.
58003 if (!CallOperandVal)
58004 return CW_Default;
58005 Type *type = CallOperandVal->getType();
58006 // Look at the constraint type.
58007 switch (*constraint) {
58008 default:
58009 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
58010 [[fallthrough]];
58011 case 'R':
58012 case 'q':
58013 case 'Q':
58014 case 'a':
58015 case 'b':
58016 case 'c':
58017 case 'd':
58018 case 'S':
58019 case 'D':
58020 case 'A':
58021 if (CallOperandVal->getType()->isIntegerTy())
58022 weight = CW_SpecificReg;
58023 break;
58024 case 'f':
58025 case 't':
58026 case 'u':
58027 if (type->isFloatingPointTy())
58028 weight = CW_SpecificReg;
58029 break;
58030 case 'y':
58031 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58032 weight = CW_SpecificReg;
58033 break;
58034 case 'Y':
58035 if (StringRef(constraint).size() != 2)
58036 break;
58037 switch (constraint[1]) {
58038 default:
58039 return CW_Invalid;
58040 // XMM0
58041 case 'z':
58042 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58043 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58044 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58045 return CW_SpecificReg;
58046 return CW_Invalid;
58047 // Conditional OpMask regs (AVX512)
58048 case 'k':
58049 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58050 return CW_Register;
58051 return CW_Invalid;
58052 // Any MMX reg
58053 case 'm':
58054 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58055 return weight;
58056 return CW_Invalid;
58057 // Any SSE reg when ISA >= SSE2, same as 'x'
58058 case 'i':
58059 case 't':
58060 case '2':
58061 if (!Subtarget.hasSSE2())
58062 return CW_Invalid;
58063 break;
58064 }
58065 break;
58066 case 'v':
58067 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58068 weight = CW_Register;
58069 [[fallthrough]];
58070 case 'x':
58071 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58072 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58073 weight = CW_Register;
58074 break;
58075 case 'k':
58076 // Enable conditional vector operations using %k<#> registers.
58077 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58078 weight = CW_Register;
58079 break;
58080 case 'I':
58081 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
58082 if (C->getZExtValue() <= 31)
58083 weight = CW_Constant;
58084 }
58085 break;
58086 case 'J':
58087 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58088 if (C->getZExtValue() <= 63)
58089 weight = CW_Constant;
58090 }
58091 break;
58092 case 'K':
58093 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58094 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58095 weight = CW_Constant;
58096 }
58097 break;
58098 case 'L':
58099 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58100 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58101 weight = CW_Constant;
58102 }
58103 break;
58104 case 'M':
58105 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58106 if (C->getZExtValue() <= 3)
58107 weight = CW_Constant;
58108 }
58109 break;
58110 case 'N':
58111 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58112 if (C->getZExtValue() <= 0xff)
58113 weight = CW_Constant;
58114 }
58115 break;
58116 case 'G':
58117 case 'C':
58118 if (isa<ConstantFP>(CallOperandVal)) {
58119 weight = CW_Constant;
58120 }
58121 break;
58122 case 'e':
58123 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58124 if ((C->getSExtValue() >= -0x80000000LL) &&
58125 (C->getSExtValue() <= 0x7fffffffLL))
58126 weight = CW_Constant;
58127 }
58128 break;
58129 case 'Z':
58130 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58131 if (C->getZExtValue() <= 0xffffffff)
58132 weight = CW_Constant;
58133 }
58134 break;
58135 }
58136 return weight;
58137}
58138
58139/// Try to replace an X constraint, which matches anything, with another that
58140/// has more specific requirements based on the type of the corresponding
58141/// operand.
58142const char *X86TargetLowering::
58143LowerXConstraint(EVT ConstraintVT) const {
58144 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58145 // 'f' like normal targets.
58146 if (ConstraintVT.isFloatingPoint()) {
58147 if (Subtarget.hasSSE1())
58148 return "x";
58149 }
58150
58151 return TargetLowering::LowerXConstraint(ConstraintVT);
58152}
58153
58154// Lower @cc targets via setcc.
58155SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58156 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58157 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58158 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58159 if (Cond == X86::COND_INVALID)
58160 return SDValue();
58161 // Check that return type is valid.
58162 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58163 OpInfo.ConstraintVT.getSizeInBits() < 8)
58164 report_fatal_error("Glue output operand is of invalid type");
58165
58166 // Get EFLAGS register. Only update chain when copyfrom is glued.
58167 if (Glue.getNode()) {
58168 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58169 Chain = Glue.getValue(1);
58170 } else
58171 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58172 // Extract CC code.
58173 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58174 // Extend to 32-bits
58175 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58176
58177 return Result;
58178}
58179
58180/// Lower the specified operand into the Ops vector.
58181/// If it is invalid, don't add anything to Ops.
58182void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58183 std::string &Constraint,
58184 std::vector<SDValue>&Ops,
58185 SelectionDAG &DAG) const {
58186 SDValue Result;
58187
58188 // Only support length 1 constraints for now.
58189 if (Constraint.length() > 1) return;
58190
58191 char ConstraintLetter = Constraint[0];
58192 switch (ConstraintLetter) {
58193 default: break;
58194 case 'I':
58195 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58196 if (C->getZExtValue() <= 31) {
58197 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58198 Op.getValueType());
58199 break;
58200 }
58201 }
58202 return;
58203 case 'J':
58204 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58205 if (C->getZExtValue() <= 63) {
58206 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58207 Op.getValueType());
58208 break;
58209 }
58210 }
58211 return;
58212 case 'K':
58213 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58214 if (isInt<8>(C->getSExtValue())) {
58215 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58216 Op.getValueType());
58217 break;
58218 }
58219 }
58220 return;
58221 case 'L':
58222 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58223 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58224 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58225 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58226 Op.getValueType());
58227 break;
58228 }
58229 }
58230 return;
58231 case 'M':
58232 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58233 if (C->getZExtValue() <= 3) {
58234 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58235 Op.getValueType());
58236 break;
58237 }
58238 }
58239 return;
58240 case 'N':
58241 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58242 if (C->getZExtValue() <= 255) {
58243 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58244 Op.getValueType());
58245 break;
58246 }
58247 }
58248 return;
58249 case 'O':
58250 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58251 if (C->getZExtValue() <= 127) {
58252 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58253 Op.getValueType());
58254 break;
58255 }
58256 }
58257 return;
58258 case 'e': {
58259 // 32-bit signed value
58260 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58261 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58262 C->getSExtValue())) {
58263 // Widen to 64 bits here to get it sign extended.
58264 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58265 break;
58266 }
58267 // FIXME gcc accepts some relocatable values here too, but only in certain
58268 // memory models; it's complicated.
58269 }
58270 return;
58271 }
58272 case 'Z': {
58273 // 32-bit unsigned value
58274 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58275 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58276 C->getZExtValue())) {
58277 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58278 Op.getValueType());
58279 break;
58280 }
58281 }
58282 // FIXME gcc accepts some relocatable values here too, but only in certain
58283 // memory models; it's complicated.
58284 return;
58285 }
58286 case 'i': {
58287 // Literal immediates are always ok.
58288 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58289 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58290 BooleanContent BCont = getBooleanContents(MVT::i64);
58291 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58292 : ISD::SIGN_EXTEND;
58293 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58294 : CST->getSExtValue();
58295 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58296 break;
58297 }
58298
58299 // In any sort of PIC mode addresses need to be computed at runtime by
58300 // adding in a register or some sort of table lookup. These can't
58301 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58302 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58303 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58304 return;
58305
58306 // If we are in non-pic codegen mode, we allow the address of a global (with
58307 // an optional displacement) to be used with 'i'.
58308 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58309 // If we require an extra load to get this address, as in PIC mode, we
58310 // can't accept it.
58311 if (isGlobalStubReference(
58312 Subtarget.classifyGlobalReference(GA->getGlobal())))
58313 return;
58314 break;
58315 }
58316 }
58317
58318 if (Result.getNode()) {
58319 Ops.push_back(Result);
58320 return;
58321 }
58322 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58323}
58324
58325/// Check if \p RC is a general purpose register class.
58326 /// I.e., GR* or one of their variants.
58327static bool isGRClass(const TargetRegisterClass &RC) {
58328 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58329 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58330 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58331 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58332 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58333}
58334
58335/// Check if \p RC is a vector register class.
58336 /// I.e., FR* / VR* or one of their variants.
58337static bool isFRClass(const TargetRegisterClass &RC) {
58338 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58339 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58340 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58341 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58342 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58343 RC.hasSuperClassEq(&X86::VR512RegClass);
58344}
58345
58346/// Check if \p RC is a mask register class.
58347 /// I.e., VK* or one of their variants.
58348static bool isVKClass(const TargetRegisterClass &RC) {
58349 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58350 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58351 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58352 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58353 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58354 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58355 RC.hasSuperClassEq(&X86::VK64RegClass);
58356}
58357
58358std::pair<unsigned, const TargetRegisterClass *>
58359X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58360 StringRef Constraint,
58361 MVT VT) const {
58362 // First, see if this is a constraint that directly corresponds to an LLVM
58363 // register class.
58364 if (Constraint.size() == 1) {
58365 // GCC Constraint Letters
58366 switch (Constraint[0]) {
58367 default: break;
58368 // 'A' means [ER]AX + [ER]DX.
58369 case 'A':
58370 if (Subtarget.is64Bit())
58371 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58372 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58373 "Expecting 64, 32 or 16 bit subtarget");
58374 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58375
58376 // TODO: Slight differences here in allocation order and leaving
58377 // RIP in the class. Do they matter any more here than they do
58378 // in the normal allocation?
58379 case 'k':
58380 if (Subtarget.hasAVX512()) {
58381 if (VT == MVT::i1)
58382 return std::make_pair(0U, &X86::VK1RegClass);
58383 if (VT == MVT::i8)
58384 return std::make_pair(0U, &X86::VK8RegClass);
58385 if (VT == MVT::i16)
58386 return std::make_pair(0U, &X86::VK16RegClass);
58387 }
58388 if (Subtarget.hasBWI()) {
58389 if (VT == MVT::i32)
58390 return std::make_pair(0U, &X86::VK32RegClass);
58391 if (VT == MVT::i64)
58392 return std::make_pair(0U, &X86::VK64RegClass);
58393 }
58394 break;
58395 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58396 if (Subtarget.is64Bit()) {
58397 if (VT == MVT::i8 || VT == MVT::i1)
58398 return std::make_pair(0U, &X86::GR8RegClass);
58399 if (VT == MVT::i16)
58400 return std::make_pair(0U, &X86::GR16RegClass);
58401 if (VT == MVT::i32 || VT == MVT::f32)
58402 return std::make_pair(0U, &X86::GR32RegClass);
58403 if (VT != MVT::f80 && !VT.isVector())
58404 return std::make_pair(0U, &X86::GR64RegClass);
58405 break;
58406 }
58407 [[fallthrough]];
58408 // 32-bit fallthrough
58409 case 'Q': // Q_REGS
58410 if (VT == MVT::i8 || VT == MVT::i1)
58411 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58412 if (VT == MVT::i16)
58413 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58414 if (VT == MVT::i32 || VT == MVT::f32 ||
58415 (!VT.isVector() && !Subtarget.is64Bit()))
58416 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58417 if (VT != MVT::f80 && !VT.isVector())
58418 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58419 break;
58420 case 'r': // GENERAL_REGS
58421 case 'l': // INDEX_REGS
58422 if (VT == MVT::i8 || VT == MVT::i1)
58423 return std::make_pair(0U, &X86::GR8RegClass);
58424 if (VT == MVT::i16)
58425 return std::make_pair(0U, &X86::GR16RegClass);
58426 if (VT == MVT::i32 || VT == MVT::f32 ||
58427 (!VT.isVector() && !Subtarget.is64Bit()))
58428 return std::make_pair(0U, &X86::GR32RegClass);
58429 if (VT != MVT::f80 && !VT.isVector())
58430 return std::make_pair(0U, &X86::GR64RegClass);
58431 break;
58432 case 'R': // LEGACY_REGS
58433 if (VT == MVT::i8 || VT == MVT::i1)
58434 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58435 if (VT == MVT::i16)
58436 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58437 if (VT == MVT::i32 || VT == MVT::f32 ||
58438 (!VT.isVector() && !Subtarget.is64Bit()))
58439 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58440 if (VT != MVT::f80 && !VT.isVector())
58441 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58442 break;
58443 case 'f': // FP Stack registers.
58444 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58445 // value to the correct fpstack register class.
58446 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58447 return std::make_pair(0U, &X86::RFP32RegClass);
58448 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58449 return std::make_pair(0U, &X86::RFP64RegClass);
58450 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58451 return std::make_pair(0U, &X86::RFP80RegClass);
58452 break;
58453 case 'y': // MMX_REGS if MMX allowed.
58454 if (!Subtarget.hasMMX()) break;
58455 return std::make_pair(0U, &X86::VR64RegClass);
58456 case 'v':
58457 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58458 if (!Subtarget.hasSSE1()) break;
58459 bool VConstraint = (Constraint[0] == 'v');
58460
58461 switch (VT.SimpleTy) {
58462 default: break;
58463 // Scalar SSE types.
58464 case MVT::f16:
58465 if (VConstraint && Subtarget.hasFP16())
58466 return std::make_pair(0U, &X86::FR16XRegClass);
58467 break;
58468 case MVT::f32:
58469 case MVT::i32:
58470 if (VConstraint && Subtarget.hasVLX())
58471 return std::make_pair(0U, &X86::FR32XRegClass);
58472 return std::make_pair(0U, &X86::FR32RegClass);
58473 case MVT::f64:
58474 case MVT::i64:
58475 if (VConstraint && Subtarget.hasVLX())
58476 return std::make_pair(0U, &X86::FR64XRegClass);
58477 return std::make_pair(0U, &X86::FR64RegClass);
58478 case MVT::i128:
58479 if (Subtarget.is64Bit()) {
58480 if (VConstraint && Subtarget.hasVLX())
58481 return std::make_pair(0U, &X86::VR128XRegClass);
58482 return std::make_pair(0U, &X86::VR128RegClass);
58483 }
58484 break;
58485 // Vector types and fp128.
58486 case MVT::v8f16:
58487 if (!Subtarget.hasFP16())
58488 break;
58489 [[fallthrough]];
58490 case MVT::f128:
58491 case MVT::v16i8:
58492 case MVT::v8i16:
58493 case MVT::v4i32:
58494 case MVT::v2i64:
58495 case MVT::v4f32:
58496 case MVT::v2f64:
58497 if (VConstraint && Subtarget.hasVLX())
58498 return std::make_pair(0U, &X86::VR128XRegClass);
58499 return std::make_pair(0U, &X86::VR128RegClass);
58500 // AVX types.
58501 case MVT::v16f16:
58502 if (!Subtarget.hasFP16())
58503 break;
58504 [[fallthrough]];
58505 case MVT::v32i8:
58506 case MVT::v16i16:
58507 case MVT::v8i32:
58508 case MVT::v4i64:
58509 case MVT::v8f32:
58510 case MVT::v4f64:
58511 if (VConstraint && Subtarget.hasVLX())
58512 return std::make_pair(0U, &X86::VR256XRegClass);
58513 if (Subtarget.hasAVX())
58514 return std::make_pair(0U, &X86::VR256RegClass);
58515 break;
58516 case MVT::v32f16:
58517 if (!Subtarget.hasFP16())
58518 break;
58519 [[fallthrough]];
58520 case MVT::v64i8:
58521 case MVT::v32i16:
58522 case MVT::v8f64:
58523 case MVT::v16f32:
58524 case MVT::v16i32:
58525 case MVT::v8i64:
58526 if (!Subtarget.hasAVX512()) break;
58527 if (VConstraint)
58528 return std::make_pair(0U, &X86::VR512RegClass);
58529 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58530 }
58531 break;
58532 }
58533 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58534 switch (Constraint[1]) {
58535 default:
58536 break;
58537 case 'i':
58538 case 't':
58539 case '2':
58540 return getRegForInlineAsmConstraint(TRI, "x", VT);
58541 case 'm':
58542 if (!Subtarget.hasMMX()) break;
58543 return std::make_pair(0U, &X86::VR64RegClass);
58544 case 'z':
58545 if (!Subtarget.hasSSE1()) break;
58546 switch (VT.SimpleTy) {
58547 default: break;
58548 // Scalar SSE types.
58549 case MVT::f16:
58550 if (!Subtarget.hasFP16())
58551 break;
58552 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58553 case MVT::f32:
58554 case MVT::i32:
58555 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58556 case MVT::f64:
58557 case MVT::i64:
58558 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58559 case MVT::v8f16:
58560 if (!Subtarget.hasFP16())
58561 break;
58562 [[fallthrough]];
58563 case MVT::f128:
58564 case MVT::v16i8:
58565 case MVT::v8i16:
58566 case MVT::v4i32:
58567 case MVT::v2i64:
58568 case MVT::v4f32:
58569 case MVT::v2f64:
58570 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58571 // AVX types.
58572 case MVT::v16f16:
58573 if (!Subtarget.hasFP16())
58574 break;
58575 [[fallthrough]];
58576 case MVT::v32i8:
58577 case MVT::v16i16:
58578 case MVT::v8i32:
58579 case MVT::v4i64:
58580 case MVT::v8f32:
58581 case MVT::v4f64:
58582 if (Subtarget.hasAVX())
58583 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58584 break;
58585 case MVT::v32f16:
58586 if (!Subtarget.hasFP16())
58587 break;
58588 [[fallthrough]];
58589 case MVT::v64i8:
58590 case MVT::v32i16:
58591 case MVT::v8f64:
58592 case MVT::v16f32:
58593 case MVT::v16i32:
58594 case MVT::v8i64:
58595 if (Subtarget.hasAVX512())
58596 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58597 break;
58598 }
58599 break;
58600 case 'k':
58601 // This register class doesn't allocate k0 for masked vector operation.
58602 if (Subtarget.hasAVX512()) {
58603 if (VT == MVT::i1)
58604 return std::make_pair(0U, &X86::VK1WMRegClass);
58605 if (VT == MVT::i8)
58606 return std::make_pair(0U, &X86::VK8WMRegClass);
58607 if (VT == MVT::i16)
58608 return std::make_pair(0U, &X86::VK16WMRegClass);
58609 }
58610 if (Subtarget.hasBWI()) {
58611 if (VT == MVT::i32)
58612 return std::make_pair(0U, &X86::VK32WMRegClass);
58613 if (VT == MVT::i64)
58614 return std::make_pair(0U, &X86::VK64WMRegClass);
58615 }
58616 break;
58617 }
58618 }
58619
58620 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58621 return std::make_pair(0U, &X86::GR32RegClass);
58622
58623 // Use the default implementation in TargetLowering to convert the register
58624 // constraint into a member of a register class.
58625 std::pair<Register, const TargetRegisterClass*> Res;
58626 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58627
58628 // Not found as a standard register?
58629 if (!Res.second) {
58630 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58631 // to/from f80.
58632 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58633 // Map st(0) -> st(7) -> ST0
58634 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58635 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58636 Constraint[3] == '(' &&
58637 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58638 Constraint[5] == ')' && Constraint[6] == '}') {
58639 // st(7) is not allocatable and thus not a member of RFP80. Return
58640 // singleton class in cases where we have a reference to it.
58641 if (Constraint[4] == '7')
58642 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58643 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58644 &X86::RFP80RegClass);
58645 }
58646
58647 // GCC allows "st(0)" to be called just plain "st".
58648 if (StringRef("{st}").equals_insensitive(Constraint))
58649 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58650 }
58651
58652 // flags -> EFLAGS
58653 if (StringRef("{flags}").equals_insensitive(Constraint))
58654 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58655
58656 // dirflag -> DF
58657 // Only allow for clobber.
58658 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58659 VT == MVT::Other)
58660 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58661
58662 // fpsr -> FPSW
58663 if (StringRef("{fpsr}").equals_insensitive(Constraint))
58664 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58665
58666 return Res;
58667 }
58668
58669 // Make sure it isn't a register that requires 64-bit mode.
58670 if (!Subtarget.is64Bit() &&
58671 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58672 TRI->getEncodingValue(Res.first) >= 8) {
58673 // Register requires REX prefix, but we're in 32-bit mode.
58674 return std::make_pair(0, nullptr);
58675 }
58676
58677 // Make sure it isn't a register that requires AVX512.
58678 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58679 TRI->getEncodingValue(Res.first) & 0x10) {
58680 // Register requires EVEX prefix.
58681 return std::make_pair(0, nullptr);
58682 }
58683
58684 // Otherwise, check to see if this is a register class of the wrong value
58685 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58686 // turn into {ax},{dx}.
58687 // MVT::Other is used to specify clobber names.
58688 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58689 return Res; // Correct type already, nothing to do.
58690
58691 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
58692 // return "eax". This should even work for things like getting 64bit integer
58693 // registers when given an f64 type.
58694 const TargetRegisterClass *Class = Res.second;
58695 // The generic code will match the first register class that contains the
58696 // given register. Thus, based on the ordering of the tablegened file,
58697 // the "plain" GR classes might not come first.
58698 // Therefore, use a helper method.
58699 if (isGRClass(*Class)) {
58700 unsigned Size = VT.getSizeInBits();
58701 if (Size == 1) Size = 8;
58702 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58703 return std::make_pair(0, nullptr);
58704 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58705 if (DestReg.isValid()) {
58706 bool is64Bit = Subtarget.is64Bit();
58707 const TargetRegisterClass *RC =
58708 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58709 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58710 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58711 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58712 if (Size == 64 && !is64Bit) {
58713 // Model GCC's behavior here and select a fixed pair of 32-bit
58714 // registers.
58715 switch (DestReg) {
58716 case X86::RAX:
58717 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58718 case X86::RDX:
58719 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58720 case X86::RCX:
58721 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58722 case X86::RBX:
58723 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58724 case X86::RSI:
58725 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58726 case X86::RDI:
58727 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58728 case X86::RBP:
58729 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58730 default:
58731 return std::make_pair(0, nullptr);
58732 }
58733 }
58734 if (RC && RC->contains(DestReg))
58735 return std::make_pair(DestReg, RC);
58736 return Res;
58737 }
58738 // No register found/type mismatch.
58739 return std::make_pair(0, nullptr);
58740 } else if (isFRClass(*Class)) {
58741 // Handle references to XMM physical registers that got mapped into the
58742 // wrong class. This can happen with constraints like {xmm0} where the
58743 // target independent register mapper will just pick the first match it can
58744 // find, ignoring the required type.
58745
58746 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58747 if (VT == MVT::f16)
58748 Res.second = &X86::FR16XRegClass;
58749 else if (VT == MVT::f32 || VT == MVT::i32)
58750 Res.second = &X86::FR32XRegClass;
58751 else if (VT == MVT::f64 || VT == MVT::i64)
58752 Res.second = &X86::FR64XRegClass;
58753 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58754 Res.second = &X86::VR128XRegClass;
58755 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58756 Res.second = &X86::VR256XRegClass;
58757 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58758 Res.second = &X86::VR512RegClass;
58759 else {
58760 // Type mismatch and not a clobber: Return an error;
58761 Res.first = 0;
58762 Res.second = nullptr;
58763 }
58764 } else if (isVKClass(*Class)) {
58765 if (VT == MVT::i1)
58766 Res.second = &X86::VK1RegClass;
58767 else if (VT == MVT::i8)
58768 Res.second = &X86::VK8RegClass;
58769 else if (VT == MVT::i16)
58770 Res.second = &X86::VK16RegClass;
58771 else if (VT == MVT::i32)
58772 Res.second = &X86::VK32RegClass;
58773 else if (VT == MVT::i64)
58774 Res.second = &X86::VK64RegClass;
58775 else {
58776 // Type mismatch and not a clobber: Return an error;
58777 Res.first = 0;
58778 Res.second = nullptr;
58779 }
58780 }
58781
58782 return Res;
58783}
58784
58785bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58786 // Integer division on x86 is expensive. However, when aggressively optimizing
58787 // for code size, we prefer to use a div instruction, as it is usually smaller
58788 // than the alternative sequence.
58789 // The exception to this is vector division. Since x86 doesn't have vector
58790 // integer division, leaving the division as-is is a loss even in terms of
58791 // size, because it will have to be scalarized, while the alternative code
58792 // sequence can be performed in vector form.
58793 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58794 return OptSize && !VT.isVector();
58795}
58796
58797void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58798 if (!Subtarget.is64Bit())
58799 return;
58800
58801 // Update IsSplitCSR in X86MachineFunctionInfo.
58802 X86MachineFunctionInfo *AFI =
58803 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58804 AFI->setIsSplitCSR(true);
58805}
58806
58807void X86TargetLowering::insertCopiesSplitCSR(
58808 MachineBasicBlock *Entry,
58809 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58810 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58811 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58812 if (!IStart)
58813 return;
58814
58815 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58816 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58817 MachineBasicBlock::iterator MBBI = Entry->begin();
58818 for (const MCPhysReg *I = IStart; *I; ++I) {
58819 const TargetRegisterClass *RC = nullptr;
58820 if (X86::GR64RegClass.contains(*I))
58821 RC = &X86::GR64RegClass;
58822 else
58823 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58824
58825 Register NewVR = MRI->createVirtualRegister(RC);
58826 // Create copy from CSR to a virtual register.
58827 // FIXME: this currently does not emit CFI pseudo-instructions, it works
58828 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58829 // nounwind. If we want to generalize this later, we may need to emit
58830 // CFI pseudo-instructions.
58831 assert(
58832 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58833 "Function should be nounwind in insertCopiesSplitCSR!");
58834 Entry->addLiveIn(*I);
58835 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
58836 .addReg(*I);
58837
58838 // Insert the copy-back instructions right before the terminator.
58839 for (auto *Exit : Exits)
58840 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
58841 TII->get(TargetOpcode::COPY), *I)
58842 .addReg(NewVR);
58843 }
58844}
58845
58846bool X86TargetLowering::supportSwiftError() const {
58847 return Subtarget.is64Bit();
58848}
58849
58850/// Returns true if stack probing through a function call is requested.
58851bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58852 return !getStackProbeSymbolName(MF).empty();
58853}
58854
58855/// Returns true if stack probing through inline assembly is requested.
58856bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58857
58858 // No inline stack probe for Windows, they have their own mechanism.
58859 if (Subtarget.isOSWindows() ||
58860 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58861 return false;
58862
58863 // If the function specifically requests inline stack probes, emit them.
58864 if (MF.getFunction().hasFnAttribute("probe-stack"))
58865 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58866 "inline-asm";
58867
58868 return false;
58869}
58870
58871/// Returns the name of the symbol used to emit stack probes or the empty
58872/// string if not applicable.
58873StringRef
58874X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58875 // Inline stack probes disable the stack probe call.
58876 if (hasInlineStackProbe(MF))
58877 return "";
58878
58879 // If the function specifically requests stack probes, emit them.
58880 if (MF.getFunction().hasFnAttribute("probe-stack"))
58881 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58882
58883 // Generally, if we aren't on Windows, the platform ABI does not include
58884 // support for stack probes, so don't emit them.
58885 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58886 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58887 return "";
58888
58889 // We need a stack probe to conform to the Windows ABI. Choose the right
58890 // symbol.
58891 if (Subtarget.is64Bit())
58892 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58893 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58894}
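
The final chkstk symbol choice reduces to a small two-by-two table; probeSymbolSketch below is a hypothetical standalone sketch of just that selection.

#include <cassert>
#include <string>

// Sketch of the Windows probe-symbol selection above: 64-bit vs. 32-bit,
// MinGW/Cygwin ("CygMing") vs. MSVC-style environments.
static std::string probeSymbolSketch(bool Is64Bit, bool IsCygMing) {
  if (Is64Bit)
    return IsCygMing ? "___chkstk_ms" : "__chkstk";
  return IsCygMing ? "_alloca" : "_chkstk";
}

int main() {
  assert(probeSymbolSketch(true, false) == "__chkstk");
  assert(probeSymbolSketch(false, true) == "_alloca");
}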
58895
58896unsigned
58897X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58898 // The default stack probe size is 4096 if the function has no stackprobesize
58899 // attribute.
58900 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58901 4096);
58902}
58903
58904Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58905 if (ML->isInnermost() &&
58906 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58907 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
58908 return TargetLowering::getPrefLoopAlignment();
58909}